diff --git a/docker/hoodie/hadoop/base/pom.xml b/docker/hoodie/hadoop/base/pom.xml index 52dd2a8d1..131a6259d 100644 --- a/docker/hoodie/hadoop/base/pom.xml +++ b/docker/hoodie/hadoop/base/pom.xml @@ -30,6 +30,7 @@ UTF-8 true + ${project.parent.parent.basedir} diff --git a/docker/hoodie/hadoop/datanode/pom.xml b/docker/hoodie/hadoop/datanode/pom.xml index 23cb64dc6..baf42a717 100644 --- a/docker/hoodie/hadoop/datanode/pom.xml +++ b/docker/hoodie/hadoop/datanode/pom.xml @@ -30,6 +30,7 @@ UTF-8 true + ${project.parent.parent.basedir} diff --git a/docker/hoodie/hadoop/historyserver/pom.xml b/docker/hoodie/hadoop/historyserver/pom.xml index d35e94040..67dd17b27 100644 --- a/docker/hoodie/hadoop/historyserver/pom.xml +++ b/docker/hoodie/hadoop/historyserver/pom.xml @@ -30,6 +30,7 @@ UTF-8 true + ${project.parent.parent.basedir} diff --git a/docker/hoodie/hadoop/hive_base/pom.xml b/docker/hoodie/hadoop/hive_base/pom.xml index 2f7c2b5fa..44debacc9 100644 --- a/docker/hoodie/hadoop/hive_base/pom.xml +++ b/docker/hoodie/hadoop/hive_base/pom.xml @@ -30,6 +30,7 @@ UTF-8 true + ${project.parent.parent.basedir} diff --git a/docker/hoodie/hadoop/namenode/pom.xml b/docker/hoodie/hadoop/namenode/pom.xml index a996f5754..a611a5615 100644 --- a/docker/hoodie/hadoop/namenode/pom.xml +++ b/docker/hoodie/hadoop/namenode/pom.xml @@ -30,6 +30,7 @@ UTF-8 true + ${project.parent.parent.basedir} diff --git a/docker/hoodie/hadoop/pom.xml b/docker/hoodie/hadoop/pom.xml index fff962fef..21ae28263 100644 --- a/docker/hoodie/hadoop/pom.xml +++ b/docker/hoodie/hadoop/pom.xml @@ -56,6 +56,7 @@ 0.217 1.4.3 true + ${project.parent.basedir} diff --git a/docker/hoodie/hadoop/prestobase/pom.xml b/docker/hoodie/hadoop/prestobase/pom.xml index d3c1d0f0b..4de12057f 100644 --- a/docker/hoodie/hadoop/prestobase/pom.xml +++ b/docker/hoodie/hadoop/prestobase/pom.xml @@ -32,6 +32,7 @@ UTF-8 true + ${project.parent.parent.basedir} diff --git a/docker/hoodie/hadoop/spark_base/pom.xml b/docker/hoodie/hadoop/spark_base/pom.xml index 32b33e020..070bf72f6 100644 --- a/docker/hoodie/hadoop/spark_base/pom.xml +++ b/docker/hoodie/hadoop/spark_base/pom.xml @@ -30,6 +30,7 @@ UTF-8 true + ${project.parent.parent.basedir} diff --git a/docker/hoodie/hadoop/sparkadhoc/pom.xml b/docker/hoodie/hadoop/sparkadhoc/pom.xml index 80a811cec..10400c700 100644 --- a/docker/hoodie/hadoop/sparkadhoc/pom.xml +++ b/docker/hoodie/hadoop/sparkadhoc/pom.xml @@ -30,6 +30,7 @@ UTF-8 true + ${project.parent.parent.basedir} diff --git a/docker/hoodie/hadoop/sparkmaster/pom.xml b/docker/hoodie/hadoop/sparkmaster/pom.xml index b0e93601f..8806ea09f 100644 --- a/docker/hoodie/hadoop/sparkmaster/pom.xml +++ b/docker/hoodie/hadoop/sparkmaster/pom.xml @@ -30,6 +30,7 @@ UTF-8 true + ${project.parent.parent.basedir} diff --git a/docker/hoodie/hadoop/sparkworker/pom.xml b/docker/hoodie/hadoop/sparkworker/pom.xml index 5f833035b..8dae84442 100644 --- a/docker/hoodie/hadoop/sparkworker/pom.xml +++ b/docker/hoodie/hadoop/sparkworker/pom.xml @@ -30,6 +30,7 @@ UTF-8 true + ${project.parent.parent.basedir} diff --git a/hudi-cli/pom.xml b/hudi-cli/pom.xml index 91df8ac0a..44e11c594 100644 --- a/hudi-cli/pom.xml +++ b/hudi-cli/pom.xml @@ -29,6 +29,7 @@ 1.2.0.RELEASE org.springframework.shell.Bootstrap + ${project.parent.basedir} diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/HoodiePrintHelper.java b/hudi-cli/src/main/java/org/apache/hudi/cli/HoodiePrintHelper.java index a5b64953a..635097f8a 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/HoodiePrintHelper.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/HoodiePrintHelper.java @@ -52,19 +52,16 @@ public class HoodiePrintHelper { * @param rows List of rows * @return Serialized form for printing */ - public static String print(TableHeader rowHeader, - Map> fieldNameToConverterMap, - String sortByField, boolean isDescending, Integer limit, boolean headerOnly, - List rows) { + public static String print(TableHeader rowHeader, Map> fieldNameToConverterMap, + String sortByField, boolean isDescending, Integer limit, boolean headerOnly, List rows) { if (headerOnly) { return HoodiePrintHelper.print(rowHeader); } - Table table = new Table(rowHeader, fieldNameToConverterMap, - Option.ofNullable(sortByField.isEmpty() ? null : sortByField), - Option.ofNullable(isDescending), - Option.ofNullable(limit <= 0 ? null : limit)).addAllRows(rows).flip(); + Table table = + new Table(rowHeader, fieldNameToConverterMap, Option.ofNullable(sortByField.isEmpty() ? null : sortByField), + Option.ofNullable(isDescending), Option.ofNullable(limit <= 0 ? null : limit)).addAllRows(rows).flip(); return HoodiePrintHelper.print(table); } @@ -79,9 +76,8 @@ public class HoodiePrintHelper { String[] header = new String[buffer.getFieldNames().size()]; buffer.getFieldNames().toArray(header); - String[][] rows = buffer.getRenderRows().stream() - .map(l -> l.stream().toArray(String[]::new)) - .toArray(String[][]::new); + String[][] rows = + buffer.getRenderRows().stream().map(l -> l.stream().toArray(String[]::new)).toArray(String[][]::new); return printTextTable(header, rows); } @@ -94,7 +90,7 @@ public class HoodiePrintHelper { private static String print(TableHeader header) { String[] head = new String[header.getFieldNames().size()]; header.getFieldNames().toArray(head); - return printTextTable(head, new String[][]{}); + return printTextTable(head, new String[][] {}); } /** diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/Table.java b/hudi-cli/src/main/java/org/apache/hudi/cli/Table.java index 79ee7cc92..3569451f6 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/Table.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/Table.java @@ -31,8 +31,7 @@ import java.util.stream.IntStream; import org.apache.hudi.common.util.Option; /** - * Table to be rendered. This class takes care of ordering - * rows and limiting before renderer renders it. + * Table to be rendered. This class takes care of ordering rows and limiting before renderer renders it. */ public class Table implements Iterable> { @@ -53,11 +52,8 @@ public class Table implements Iterable> { // Rows ready for Rendering private List> renderRows; - public Table(TableHeader rowHeader, - Map> fieldNameToConverterMap, - Option orderingFieldNameOptional, - Option isDescendingOptional, - Option limitOptional) { + public Table(TableHeader rowHeader, Map> fieldNameToConverterMap, + Option orderingFieldNameOptional, Option isDescendingOptional, Option limitOptional) { this.rowHeader = rowHeader; this.fieldNameToConverterMap = fieldNameToConverterMap; this.orderingFieldNameOptional = orderingFieldNameOptional; @@ -68,6 +64,7 @@ public class Table implements Iterable> { /** * Main API to add row to the table + * * @param row Row */ public Table add(List row) { @@ -86,6 +83,7 @@ public class Table implements Iterable> { /** * Add all rows + * * @param rows Rows to be aded * @return */ @@ -96,6 +94,7 @@ public class Table implements Iterable> { /** * Add all rows + * * @param rows Rows to be added * @return */ @@ -115,6 +114,7 @@ public class Table implements Iterable> { /** * Sorting of rows by a specified field + * * @return */ private List> orderRows() { diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ArchivedCommitsCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ArchivedCommitsCommand.java index c599c98c8..9a0eff16d 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ArchivedCommitsCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ArchivedCommitsCommand.java @@ -59,8 +59,8 @@ public class ArchivedCommitsCommand implements CommandMarker { @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit, @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = { - "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly) + @CliOption(key = {"headeronly"}, help = "Print Header Only", + unspecifiedDefaultValue = "false") final boolean headerOnly) throws IOException { System.out.println("===============> Showing only " + limit + " archived commits <==============="); String basePath = HoodieCLI.tableMetadata.getBasePath(); @@ -71,12 +71,12 @@ public class ArchivedCommitsCommand implements CommandMarker { FileStatus[] fsStatuses = FSUtils.getFs(basePath, HoodieCLI.conf).globStatus(archivePath); List allStats = new ArrayList<>(); for (FileStatus fs : fsStatuses) { - //read the archived file + // read the archived file Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(basePath, HoodieCLI.conf), new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema()); List readRecords = new ArrayList<>(); - //read the avro blocks + // read the avro blocks while (reader.hasNext()) { HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next(); List records = blk.getRecords(); @@ -86,9 +86,8 @@ public class ArchivedCommitsCommand implements CommandMarker { .filter(r -> r.get("actionType").toString().equals(HoodieTimeline.COMMIT_ACTION) || r.get("actionType").toString().equals(HoodieTimeline.DELTA_COMMIT_ACTION)) .flatMap(r -> { - HoodieCommitMetadata metadata = - (HoodieCommitMetadata) SpecificData.get().deepCopy(HoodieCommitMetadata.SCHEMA$, - r.get("hoodieCommitMetadata")); + HoodieCommitMetadata metadata = (HoodieCommitMetadata) SpecificData.get() + .deepCopy(HoodieCommitMetadata.SCHEMA$, r.get("hoodieCommitMetadata")); final String instantTime = r.get("commitTime").toString(); final String action = r.get("actionType").toString(); return metadata.getPartitionToWriteStats().values().stream().flatMap(hoodieWriteStats -> { @@ -118,22 +117,13 @@ public class ArchivedCommitsCommand implements CommandMarker { allStats.addAll(readCommits); reader.close(); } - TableHeader header = new TableHeader().addTableHeaderField("action") - .addTableHeaderField("instant") - .addTableHeaderField("partition") - .addTableHeaderField("file_id") - .addTableHeaderField("prev_instant") - .addTableHeaderField("num_writes") - .addTableHeaderField("num_inserts") - .addTableHeaderField("num_deletes") - .addTableHeaderField("num_update_writes") - .addTableHeaderField("total_log_files") - .addTableHeaderField("total_log_blocks") - .addTableHeaderField("total_corrupt_log_blocks") - .addTableHeaderField("total_rollback_blocks") - .addTableHeaderField("total_log_records") - .addTableHeaderField("total_updated_records_compacted") - .addTableHeaderField("total_write_bytes") + TableHeader header = new TableHeader().addTableHeaderField("action").addTableHeaderField("instant") + .addTableHeaderField("partition").addTableHeaderField("file_id").addTableHeaderField("prev_instant") + .addTableHeaderField("num_writes").addTableHeaderField("num_inserts").addTableHeaderField("num_deletes") + .addTableHeaderField("num_update_writes").addTableHeaderField("total_log_files") + .addTableHeaderField("total_log_blocks").addTableHeaderField("total_corrupt_log_blocks") + .addTableHeaderField("total_rollback_blocks").addTableHeaderField("total_log_records") + .addTableHeaderField("total_updated_records_compacted").addTableHeaderField("total_write_bytes") .addTableHeaderField("total_write_errors"); return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, allStats); @@ -141,41 +131,39 @@ public class ArchivedCommitsCommand implements CommandMarker { @CliCommand(value = "show archived commits", help = "Read commits from archived files and show details") public String showCommits( - @CliOption(key = {"skipMetadata"}, help = "Skip displaying commit metadata", unspecifiedDefaultValue = "true") - boolean skipMetadata, + @CliOption(key = {"skipMetadata"}, help = "Skip displaying commit metadata", + unspecifiedDefaultValue = "true") boolean skipMetadata, @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "10") final Integer limit, @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = { - "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly) + @CliOption(key = {"headeronly"}, help = "Print Header Only", + unspecifiedDefaultValue = "false") final boolean headerOnly) throws IOException { System.out.println("===============> Showing only " + limit + " archived commits <==============="); String basePath = HoodieCLI.tableMetadata.getBasePath(); - FileStatus[] fsStatuses = FSUtils.getFs(basePath, HoodieCLI.conf) - .globStatus(new Path(basePath + "/.hoodie/.commits_.archive*")); + FileStatus[] fsStatuses = + FSUtils.getFs(basePath, HoodieCLI.conf).globStatus(new Path(basePath + "/.hoodie/.commits_.archive*")); List allCommits = new ArrayList<>(); for (FileStatus fs : fsStatuses) { - //read the archived file + // read the archived file HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(basePath, HoodieCLI.conf), new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema()); List readRecords = new ArrayList<>(); - //read the avro blocks + // read the avro blocks while (reader.hasNext()) { HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next(); List records = blk.getRecords(); readRecords.addAll(records); } - List readCommits = readRecords.stream().map(r -> (GenericRecord) r).map(r -> - readCommit(r, skipMetadata)) - .collect(Collectors.toList()); + List readCommits = readRecords.stream().map(r -> (GenericRecord) r) + .map(r -> readCommit(r, skipMetadata)).collect(Collectors.toList()); allCommits.addAll(readCommits); reader.close(); } - TableHeader header = new TableHeader().addTableHeaderField("CommitTime") - .addTableHeaderField("CommitType"); + TableHeader header = new TableHeader().addTableHeaderField("CommitTime").addTableHeaderField("CommitType"); if (!skipMetadata) { header = header.addTableHeaderField("CommitDetails"); diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CleansCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CleansCommand.java index 3217ba55d..0143c547f 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CleansCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CleansCommand.java @@ -63,8 +63,8 @@ public class CleansCommand implements CommandMarker { @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit, @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = { - "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly) + @CliOption(key = {"headeronly"}, help = "Print Header Only", + unspecifiedDefaultValue = "false") final boolean headerOnly) throws IOException { HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline(); @@ -74,17 +74,15 @@ public class CleansCommand implements CommandMarker { Collections.reverse(cleans); for (int i = 0; i < cleans.size(); i++) { HoodieInstant clean = cleans.get(i); - HoodieCleanMetadata cleanMetadata = AvroUtils - .deserializeHoodieCleanMetadata(timeline.getInstantDetails(clean).get()); - rows.add(new Comparable[]{clean.getTimestamp(), cleanMetadata.getEarliestCommitToRetain(), + HoodieCleanMetadata cleanMetadata = + AvroUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(clean).get()); + rows.add(new Comparable[] {clean.getTimestamp(), cleanMetadata.getEarliestCommitToRetain(), cleanMetadata.getTotalFilesDeleted(), cleanMetadata.getTimeTakenInMillis()}); } - TableHeader header = new TableHeader() - .addTableHeaderField("CleanTime") - .addTableHeaderField("EarliestCommandRetained") - .addTableHeaderField("Total Files Deleted") - .addTableHeaderField("Total Time Taken"); + TableHeader header = + new TableHeader().addTableHeaderField("CleanTime").addTableHeaderField("EarliestCommandRetained") + .addTableHeaderField("Total Files Deleted").addTableHeaderField("Total Time Taken"); return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows); } @@ -95,13 +93,12 @@ public class CleansCommand implements CommandMarker { } @CliCommand(value = "clean showpartitions", help = "Show partition level details of a clean") - public String showCleanPartitions( - @CliOption(key = {"clean"}, help = "clean to show") final String commitTime, + public String showCleanPartitions(@CliOption(key = {"clean"}, help = "clean to show") final String commitTime, @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit, @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = { - "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly) + @CliOption(key = {"headeronly"}, help = "Print Header Only", + unspecifiedDefaultValue = "false") final boolean headerOnly) throws Exception { HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline(); @@ -112,8 +109,8 @@ public class CleansCommand implements CommandMarker { return "Clean " + commitTime + " not found in metadata " + timeline; } - HoodieCleanMetadata cleanMetadata = AvroUtils.deserializeHoodieCleanMetadata( - timeline.getInstantDetails(cleanInstant).get()); + HoodieCleanMetadata cleanMetadata = + AvroUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(cleanInstant).get()); List rows = new ArrayList<>(); for (Map.Entry entry : cleanMetadata.getPartitionMetadata().entrySet()) { String path = entry.getKey(); @@ -121,14 +118,11 @@ public class CleansCommand implements CommandMarker { String policy = stats.getPolicy(); Integer totalSuccessDeletedFiles = stats.getSuccessDeleteFiles().size(); Integer totalFailedDeletedFiles = stats.getFailedDeleteFiles().size(); - rows.add(new Comparable[]{path, policy, totalSuccessDeletedFiles, totalFailedDeletedFiles}); + rows.add(new Comparable[] {path, policy, totalSuccessDeletedFiles, totalFailedDeletedFiles}); } - TableHeader header = new TableHeader() - .addTableHeaderField("Partition Path") - .addTableHeaderField("Cleaning policy") - .addTableHeaderField("Total Files Successfully Deleted") - .addTableHeaderField("Total Failed Deletions"); + TableHeader header = new TableHeader().addTableHeaderField("Partition Path").addTableHeaderField("Cleaning policy") + .addTableHeaderField("Total Files Successfully Deleted").addTableHeaderField("Total Failed Deletions"); return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows); } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CommitsCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CommitsCommand.java index 8a7420b5a..448e9956f 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CommitsCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CommitsCommand.java @@ -69,12 +69,13 @@ public class CommitsCommand implements CommandMarker { } @CliCommand(value = "commits show", help = "Show the commits") - public String showCommits(@CliOption(key = { - "limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit, + public String showCommits( + @CliOption(key = {"limit"}, mandatory = false, help = "Limit commits", + unspecifiedDefaultValue = "-1") final Integer limit, @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = { - "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly) + @CliOption(key = {"headeronly"}, help = "Print Header Only", + unspecifiedDefaultValue = "false") final boolean headerOnly) throws IOException { HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline(); @@ -84,16 +85,12 @@ public class CommitsCommand implements CommandMarker { Collections.reverse(commits); for (int i = 0; i < commits.size(); i++) { HoodieInstant commit = commits.get(i); - HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(commit).get(), - HoodieCommitMetadata.class); - rows.add(new Comparable[]{commit.getTimestamp(), - commitMetadata.fetchTotalBytesWritten(), - commitMetadata.fetchTotalFilesInsert(), - commitMetadata.fetchTotalFilesUpdated(), - commitMetadata.fetchTotalPartitionsWritten(), - commitMetadata.fetchTotalRecordsWritten(), - commitMetadata.fetchTotalUpdateRecordsWritten(), - commitMetadata.fetchTotalWriteErrors()}); + HoodieCommitMetadata commitMetadata = + HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(commit).get(), HoodieCommitMetadata.class); + rows.add(new Comparable[] {commit.getTimestamp(), commitMetadata.fetchTotalBytesWritten(), + commitMetadata.fetchTotalFilesInsert(), commitMetadata.fetchTotalFilesUpdated(), + commitMetadata.fetchTotalPartitionsWritten(), commitMetadata.fetchTotalRecordsWritten(), + commitMetadata.fetchTotalUpdateRecordsWritten(), commitMetadata.fetchTotalWriteErrors()}); } Map> fieldNameToConverterMap = new HashMap<>(); @@ -101,15 +98,10 @@ public class CommitsCommand implements CommandMarker { return NumericUtils.humanReadableByteCount((Double.valueOf(entry.toString()))); }); - TableHeader header = new TableHeader() - .addTableHeaderField("CommitTime") - .addTableHeaderField("Total Bytes Written") - .addTableHeaderField("Total Files Added") - .addTableHeaderField("Total Files Updated") - .addTableHeaderField("Total Partitions Written") - .addTableHeaderField("Total Records Written") - .addTableHeaderField("Total Update Records Written") - .addTableHeaderField("Total Errors"); + TableHeader header = new TableHeader().addTableHeaderField("CommitTime").addTableHeaderField("Total Bytes Written") + .addTableHeaderField("Total Files Added").addTableHeaderField("Total Files Updated") + .addTableHeaderField("Total Partitions Written").addTableHeaderField("Total Records Written") + .addTableHeaderField("Total Update Records Written").addTableHeaderField("Total Errors"); return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows); } @@ -132,8 +124,8 @@ public class CommitsCommand implements CommandMarker { } SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); - sparkLauncher - .addAppArgs(SparkMain.SparkCommand.ROLLBACK.toString(), commitTime, HoodieCLI.tableMetadata.getBasePath()); + sparkLauncher.addAppArgs(SparkMain.SparkCommand.ROLLBACK.toString(), commitTime, + HoodieCLI.tableMetadata.getBasePath()); Process process = sparkLauncher.launch(); InputStreamConsumer.captureOutput(process); int exitCode = process.waitFor(); @@ -146,13 +138,12 @@ public class CommitsCommand implements CommandMarker { } @CliCommand(value = "commit showpartitions", help = "Show partition level details of a commit") - public String showCommitPartitions( - @CliOption(key = {"commit"}, help = "Commit to show") final String commitTime, + public String showCommitPartitions(@CliOption(key = {"commit"}, help = "Commit to show") final String commitTime, @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit, @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = { - "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly) + @CliOption(key = {"headeronly"}, help = "Print Header Only", + unspecifiedDefaultValue = "false") final boolean headerOnly) throws Exception { HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline(); @@ -185,8 +176,7 @@ public class CommitsCommand implements CommandMarker { totalBytesWritten += stat.getTotalWriteBytes(); totalWriteErrors += stat.getTotalWriteErrors(); } - rows.add(new Comparable[]{path, totalFilesAdded, totalFilesUpdated, - totalRecordsInserted, totalRecordsUpdated, + rows.add(new Comparable[] {path, totalFilesAdded, totalFilesUpdated, totalRecordsInserted, totalRecordsUpdated, totalBytesWritten, totalWriteErrors}); } @@ -195,26 +185,21 @@ public class CommitsCommand implements CommandMarker { return NumericUtils.humanReadableByteCount((Long.valueOf(entry.toString()))); }); - TableHeader header = new TableHeader() - .addTableHeaderField("Partition Path") - .addTableHeaderField("Total Files Added") - .addTableHeaderField("Total Files Updated") - .addTableHeaderField("Total Records Inserted") - .addTableHeaderField("Total Records Updated") - .addTableHeaderField("Total Bytes Written") - .addTableHeaderField("Total Errors"); + TableHeader header = new TableHeader().addTableHeaderField("Partition Path") + .addTableHeaderField("Total Files Added").addTableHeaderField("Total Files Updated") + .addTableHeaderField("Total Records Inserted").addTableHeaderField("Total Records Updated") + .addTableHeaderField("Total Bytes Written").addTableHeaderField("Total Errors"); return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows); } @CliCommand(value = "commit showfiles", help = "Show file level details of a commit") - public String showCommitFiles( - @CliOption(key = {"commit"}, help = "Commit to show") final String commitTime, + public String showCommitFiles(@CliOption(key = {"commit"}, help = "Commit to show") final String commitTime, @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit, @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = { - "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly) + @CliOption(key = {"headeronly"}, help = "Print Header Only", + unspecifiedDefaultValue = "false") final boolean headerOnly) throws Exception { HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline(); @@ -231,23 +216,15 @@ public class CommitsCommand implements CommandMarker { String path = entry.getKey(); List stats = entry.getValue(); for (HoodieWriteStat stat : stats) { - rows.add(new Comparable[]{path, stat.getFileId(), stat.getPrevCommit(), stat.getNumUpdateWrites(), - stat.getNumWrites(), stat.getTotalWriteBytes(), - stat.getTotalWriteErrors(), - stat.getFileSizeInBytes() - }); + rows.add(new Comparable[] {path, stat.getFileId(), stat.getPrevCommit(), stat.getNumUpdateWrites(), + stat.getNumWrites(), stat.getTotalWriteBytes(), stat.getTotalWriteErrors(), stat.getFileSizeInBytes()}); } } - TableHeader header = new TableHeader() - .addTableHeaderField("Partition Path") - .addTableHeaderField("File ID") - .addTableHeaderField("Previous Commit") - .addTableHeaderField("Total Records Updated") - .addTableHeaderField("Total Records Written") - .addTableHeaderField("Total Bytes Written") - .addTableHeaderField("Total Errors") - .addTableHeaderField("File Size"); + TableHeader header = new TableHeader().addTableHeaderField("Partition Path").addTableHeaderField("File ID") + .addTableHeaderField("Previous Commit").addTableHeaderField("Total Records Updated") + .addTableHeaderField("Total Records Written").addTableHeaderField("Total Bytes Written") + .addTableHeaderField("Total Errors").addTableHeaderField("File Size"); return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows); } @@ -270,8 +247,8 @@ public class CommitsCommand implements CommandMarker { String sourceLatestCommit = sourceTimeline.getInstants().iterator().hasNext() ? "0" : sourceTimeline.lastInstant().get().getTimestamp(); - if (sourceLatestCommit != null && HoodieTimeline.compareTimestamps(targetLatestCommit, sourceLatestCommit, - HoodieTimeline.GREATER)) { + if (sourceLatestCommit != null + && HoodieTimeline.compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) { // source is behind the target List commitsToCatchup = targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE) .getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList()); diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CompactionCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CompactionCommand.java index 6a32edc99..c89a68982 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CompactionCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CompactionCommand.java @@ -75,16 +75,15 @@ public class CompactionCommand implements CommandMarker { @CliCommand(value = "compactions show all", help = "Shows all compactions that are in active timeline") public String compactionsAll( - @CliOption(key = { - "includeExtraMetadata"}, help = "Include extra metadata", unspecifiedDefaultValue = "false") final - boolean includeExtraMetadata, - @CliOption(key = { - "limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit, + @CliOption(key = {"includeExtraMetadata"}, help = "Include extra metadata", + unspecifiedDefaultValue = "false") final boolean includeExtraMetadata, + @CliOption(key = {"limit"}, mandatory = false, help = "Limit commits", + unspecifiedDefaultValue = "-1") final Integer limit, @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = { - "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final - boolean headerOnly) throws IOException { + @CliOption(key = {"headeronly"}, help = "Print Header Only", + unspecifiedDefaultValue = "false") final boolean headerOnly) + throws IOException { HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline(); HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionTimeline(); HoodieTimeline commitTimeline = activeTimeline.getCommitTimeline().filterCompletedInstants(); @@ -99,15 +98,14 @@ public class CompactionCommand implements CommandMarker { if (!instant.getAction().equals(HoodieTimeline.COMPACTION_ACTION)) { try { // This could be a completed compaction. Assume a compaction request file is present but skip if fails - workload = AvroUtils.deserializeCompactionPlan( - activeTimeline.getInstantAuxiliaryDetails( - HoodieTimeline.getCompactionRequestedInstant(instant.getTimestamp())).get()); + workload = AvroUtils.deserializeCompactionPlan(activeTimeline + .getInstantAuxiliaryDetails(HoodieTimeline.getCompactionRequestedInstant(instant.getTimestamp())).get()); } catch (HoodieIOException ioe) { // SKIP } } else { - workload = AvroUtils.deserializeCompactionPlan(activeTimeline.getInstantAuxiliaryDetails( - HoodieTimeline.getCompactionRequestedInstant(instant.getTimestamp())).get()); + workload = AvroUtils.deserializeCompactionPlan(activeTimeline + .getInstantAuxiliaryDetails(HoodieTimeline.getCompactionRequestedInstant(instant.getTimestamp())).get()); } if (null != workload) { @@ -116,22 +114,18 @@ public class CompactionCommand implements CommandMarker { state = State.COMPLETED; } if (includeExtraMetadata) { - rows.add(new Comparable[]{instant.getTimestamp(), - state.toString(), + rows.add(new Comparable[] {instant.getTimestamp(), state.toString(), workload.getOperations() == null ? 0 : workload.getOperations().size(), workload.getExtraMetadata().toString()}); } else { - rows.add(new Comparable[]{instant.getTimestamp(), - state.toString(), + rows.add(new Comparable[] {instant.getTimestamp(), state.toString(), workload.getOperations() == null ? 0 : workload.getOperations().size()}); } } } Map> fieldNameToConverterMap = new HashMap<>(); - TableHeader header = new TableHeader() - .addTableHeaderField("Compaction Instant Time") - .addTableHeaderField("State") + TableHeader header = new TableHeader().addTableHeaderField("Compaction Instant Time").addTableHeaderField("State") .addTableHeaderField("Total FileIds to be Compacted"); if (includeExtraMetadata) { header = header.addTableHeaderField("Extra Metadata"); @@ -141,48 +135,37 @@ public class CompactionCommand implements CommandMarker { @CliCommand(value = "compaction show", help = "Shows compaction details for a specific compaction instant") public String compactionShow( - @CliOption(key = "instant", mandatory = true, help = "Base path for the target hoodie dataset") final - String compactionInstantTime, - @CliOption(key = { - "limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit, + @CliOption(key = "instant", mandatory = true, + help = "Base path for the target hoodie dataset") final String compactionInstantTime, + @CliOption(key = {"limit"}, mandatory = false, help = "Limit commits", + unspecifiedDefaultValue = "-1") final Integer limit, @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = { - "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly) + @CliOption(key = {"headeronly"}, help = "Print Header Only", + unspecifiedDefaultValue = "false") final boolean headerOnly) throws Exception { HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline(); - HoodieCompactionPlan workload = AvroUtils.deserializeCompactionPlan( - activeTimeline.getInstantAuxiliaryDetails( - HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime)).get()); + HoodieCompactionPlan workload = AvroUtils.deserializeCompactionPlan(activeTimeline + .getInstantAuxiliaryDetails(HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime)).get()); List rows = new ArrayList<>(); if ((null != workload) && (null != workload.getOperations())) { for (HoodieCompactionOperation op : workload.getOperations()) { - rows.add(new Comparable[]{op.getPartitionPath(), - op.getFileId(), - op.getBaseInstantTime(), - op.getDataFilePath(), - op.getDeltaFilePaths().size(), - op.getMetrics() == null ? "" : op.getMetrics().toString() - }); + rows.add(new Comparable[] {op.getPartitionPath(), op.getFileId(), op.getBaseInstantTime(), op.getDataFilePath(), + op.getDeltaFilePaths().size(), op.getMetrics() == null ? "" : op.getMetrics().toString()}); } } Map> fieldNameToConverterMap = new HashMap<>(); - TableHeader header = new TableHeader() - .addTableHeaderField("Partition Path") - .addTableHeaderField("File Id") - .addTableHeaderField("Base Instant") - .addTableHeaderField("Data File Path") - .addTableHeaderField("Total Delta Files") - .addTableHeaderField("getMetrics"); + TableHeader header = new TableHeader().addTableHeaderField("Partition Path").addTableHeaderField("File Id") + .addTableHeaderField("Base Instant").addTableHeaderField("Data File Path") + .addTableHeaderField("Total Delta Files").addTableHeaderField("getMetrics"); return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows); } @CliCommand(value = "compaction schedule", help = "Schedule Compaction") - public String scheduleCompact( - @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "1G", help = "Spark executor memory") - final String sparkMemory) throws Exception { + public String scheduleCompact(@CliOption(key = "sparkMemory", unspecifiedDefaultValue = "1G", + help = "Spark executor memory") final String sparkMemory) throws Exception { boolean initialized = HoodieCLI.initConf(); HoodieCLI.initFS(initialized); @@ -190,8 +173,8 @@ public class CompactionCommand implements CommandMarker { String compactionInstantTime = HoodieActiveTimeline.createNewCommitTime(); if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) { - String sparkPropertiesPath = Utils.getDefaultPropertiesFile( - scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())); + String sparkPropertiesPath = + Utils.getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())); SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); sparkLauncher.addAppArgs(SparkCommand.COMPACT_SCHEDULE.toString(), HoodieCLI.tableMetadata.getBasePath(), HoodieCLI.tableMetadata.getTableConfig().getTableName(), compactionInstantTime, sparkMemory); @@ -209,33 +192,34 @@ public class CompactionCommand implements CommandMarker { @CliCommand(value = "compaction run", help = "Run Compaction for given instant time") public String compact( - @CliOption(key = {"parallelism"}, mandatory = true, help = "Parallelism for hoodie compaction") - final String parallelism, - @CliOption(key = "schemaFilePath", mandatory = true, help = "Path for Avro schema file") - final String schemaFilePath, - @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G", help = "Spark executor memory") - final String sparkMemory, - @CliOption(key = "retry", unspecifiedDefaultValue = "1", help = "Number of retries") - final String retry, - @CliOption(key = "compactionInstant", mandatory = false, help = "Base path for the target hoodie dataset") - String compactionInstantTime) throws Exception { + @CliOption(key = {"parallelism"}, mandatory = true, + help = "Parallelism for hoodie compaction") final String parallelism, + @CliOption(key = "schemaFilePath", mandatory = true, + help = "Path for Avro schema file") final String schemaFilePath, + @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G", + help = "Spark executor memory") final String sparkMemory, + @CliOption(key = "retry", unspecifiedDefaultValue = "1", help = "Number of retries") final String retry, + @CliOption(key = "compactionInstant", mandatory = false, + help = "Base path for the target hoodie dataset") String compactionInstantTime) + throws Exception { boolean initialized = HoodieCLI.initConf(); HoodieCLI.initFS(initialized); if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) { if (null == compactionInstantTime) { // pick outstanding one with lowest timestamp - Option firstPendingInstant = HoodieCLI.tableMetadata.reloadActiveTimeline() - .filterCompletedAndCompactionInstants().filter(instant -> instant.getAction() - .equals(HoodieTimeline.COMPACTION_ACTION)).firstInstant().map(HoodieInstant::getTimestamp); + Option firstPendingInstant = + HoodieCLI.tableMetadata.reloadActiveTimeline().filterCompletedAndCompactionInstants() + .filter(instant -> instant.getAction().equals(HoodieTimeline.COMPACTION_ACTION)).firstInstant() + .map(HoodieInstant::getTimestamp); if (!firstPendingInstant.isPresent()) { return "NO PENDING COMPACTION TO RUN"; } compactionInstantTime = firstPendingInstant.get(); } - String sparkPropertiesPath = Utils.getDefaultPropertiesFile( - scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())); + String sparkPropertiesPath = + Utils.getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())); SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); sparkLauncher.addAppArgs(SparkCommand.COMPACT_RUN.toString(), HoodieCLI.tableMetadata.getBasePath(), HoodieCLI.tableMetadata.getTableConfig().getTableName(), compactionInstantTime, parallelism, schemaFilePath, @@ -279,8 +263,8 @@ public class CompactionCommand implements CommandMarker { @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit, @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField, @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending, - @CliOption(key = { - "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") boolean headerOnly) + @CliOption(key = {"headeronly"}, help = "Print Header Only", + unspecifiedDefaultValue = "false") boolean headerOnly) throws Exception { boolean initialized = HoodieCLI.initConf(); HoodieCLI.initFS(initialized); @@ -290,12 +274,11 @@ public class CompactionCommand implements CommandMarker { String output = null; if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) { try { - String sparkPropertiesPath = Utils.getDefaultPropertiesFile( - scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())); + String sparkPropertiesPath = Utils + .getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())); SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); - sparkLauncher.addAppArgs(SparkCommand.COMPACT_VALIDATE.toString(), - HoodieCLI.tableMetadata.getBasePath(), compactionInstant, outputPathStr, parallelism, master, - sparkMemory); + sparkLauncher.addAppArgs(SparkCommand.COMPACT_VALIDATE.toString(), HoodieCLI.tableMetadata.getBasePath(), + compactionInstant, outputPathStr, parallelism, master, sparkMemory); Process process = sparkLauncher.launch(); InputStreamConsumer.captureOutput(process); int exitCode = process.waitFor(); @@ -307,8 +290,7 @@ public class CompactionCommand implements CommandMarker { String message = "\n\n\t COMPACTION PLAN " + (valid ? "VALID" : "INVALID") + "\n\n"; List rows = new ArrayList<>(); res.stream().forEach(r -> { - Comparable[] row = new Comparable[]{r.getOperation().getFileId(), - r.getOperation().getBaseInstantTime(), + Comparable[] row = new Comparable[] {r.getOperation().getFileId(), r.getOperation().getBaseInstantTime(), r.getOperation().getDataFilePath().isPresent() ? r.getOperation().getDataFilePath().get() : "", r.getOperation().getDeltaFilePaths().size(), r.isSuccess(), r.getException().isPresent() ? r.getException().get().getMessage() : ""}; @@ -316,12 +298,8 @@ public class CompactionCommand implements CommandMarker { }); Map> fieldNameToConverterMap = new HashMap<>(); - TableHeader header = new TableHeader() - .addTableHeaderField("File Id") - .addTableHeaderField("Base Instant Time") - .addTableHeaderField("Base Data File") - .addTableHeaderField("Num Delta Files") - .addTableHeaderField("Valid") + TableHeader header = new TableHeader().addTableHeaderField("File Id").addTableHeaderField("Base Instant Time") + .addTableHeaderField("Base Data File").addTableHeaderField("Num Delta Files").addTableHeaderField("Valid") .addTableHeaderField("Error"); output = message + HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, @@ -349,8 +327,8 @@ public class CompactionCommand implements CommandMarker { @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit, @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField, @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending, - @CliOption(key = { - "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") boolean headerOnly) + @CliOption(key = {"headeronly"}, help = "Print Header Only", + unspecifiedDefaultValue = "false") boolean headerOnly) throws Exception { boolean initialized = HoodieCLI.initConf(); HoodieCLI.initFS(initialized); @@ -360,12 +338,12 @@ public class CompactionCommand implements CommandMarker { String output = ""; if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) { try { - String sparkPropertiesPath = Utils.getDefaultPropertiesFile( - scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())); + String sparkPropertiesPath = Utils + .getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())); SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); - sparkLauncher.addAppArgs(SparkCommand.COMPACT_UNSCHEDULE_PLAN.toString(), - HoodieCLI.tableMetadata.getBasePath(), compactionInstant, outputPathStr, parallelism, master, - sparkMemory, Boolean.valueOf(skipV).toString(), Boolean.valueOf(dryRun).toString()); + sparkLauncher.addAppArgs(SparkCommand.COMPACT_UNSCHEDULE_PLAN.toString(), HoodieCLI.tableMetadata.getBasePath(), + compactionInstant, outputPathStr, parallelism, master, sparkMemory, Boolean.valueOf(skipV).toString(), + Boolean.valueOf(dryRun).toString()); Process process = sparkLauncher.launch(); InputStreamConsumer.captureOutput(process); int exitCode = process.waitFor(); @@ -373,8 +351,8 @@ public class CompactionCommand implements CommandMarker { return "Failed to unschedule compaction for " + compactionInstant; } List res = deSerializeOperationResult(outputPathStr, HoodieCLI.fs); - output = getRenamesToBePrinted(res, limit, sortByField, descending, headerOnly, - "unschedule pending compaction"); + output = + getRenamesToBePrinted(res, limit, sortByField, descending, headerOnly, "unschedule pending compaction"); } finally { // Delete tmp file used to serialize result if (HoodieCLI.fs.exists(outputPath)) { @@ -407,12 +385,12 @@ public class CompactionCommand implements CommandMarker { String output = ""; if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) { try { - String sparkPropertiesPath = Utils.getDefaultPropertiesFile( - scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())); + String sparkPropertiesPath = Utils + .getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())); SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); - sparkLauncher.addAppArgs(SparkCommand.COMPACT_UNSCHEDULE_FILE.toString(), - HoodieCLI.tableMetadata.getBasePath(), fileId, outputPathStr, "1", master, - sparkMemory, Boolean.valueOf(skipV).toString(), Boolean.valueOf(dryRun).toString()); + sparkLauncher.addAppArgs(SparkCommand.COMPACT_UNSCHEDULE_FILE.toString(), HoodieCLI.tableMetadata.getBasePath(), + fileId, outputPathStr, "1", master, sparkMemory, Boolean.valueOf(skipV).toString(), + Boolean.valueOf(dryRun).toString()); Process process = sparkLauncher.launch(); InputStreamConsumer.captureOutput(process); int exitCode = process.waitFor(); @@ -445,8 +423,8 @@ public class CompactionCommand implements CommandMarker { @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit, @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField, @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending, - @CliOption(key = { - "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") boolean headerOnly) + @CliOption(key = {"headeronly"}, help = "Print Header Only", + unspecifiedDefaultValue = "false") boolean headerOnly) throws Exception { boolean initialized = HoodieCLI.initConf(); HoodieCLI.initFS(initialized); @@ -455,12 +433,11 @@ public class CompactionCommand implements CommandMarker { String output = ""; if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) { try { - String sparkPropertiesPath = Utils.getDefaultPropertiesFile( - scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())); + String sparkPropertiesPath = Utils + .getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())); SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); - sparkLauncher.addAppArgs(SparkCommand.COMPACT_REPAIR.toString(), - HoodieCLI.tableMetadata.getBasePath(), compactionInstant, outputPathStr, parallelism, master, - sparkMemory, Boolean.valueOf(dryRun).toString()); + sparkLauncher.addAppArgs(SparkCommand.COMPACT_REPAIR.toString(), HoodieCLI.tableMetadata.getBasePath(), + compactionInstant, outputPathStr, parallelism, master, sparkMemory, Boolean.valueOf(dryRun).toString()); Process process = sparkLauncher.launch(); InputStreamConsumer.captureOutput(process); int exitCode = process.waitFor(); @@ -481,41 +458,35 @@ public class CompactionCommand implements CommandMarker { } } - private String getRenamesToBePrinted(List res, Integer limit, - String sortByField, boolean descending, boolean headerOnly, String operation) { + private String getRenamesToBePrinted(List res, Integer limit, String sortByField, boolean descending, + boolean headerOnly, String operation) { - Option result = Option.fromJavaOptional( - res.stream().map(r -> r.isExecuted() && r.isSuccess()).reduce(Boolean::logicalAnd)); + Option result = + Option.fromJavaOptional(res.stream().map(r -> r.isExecuted() && r.isSuccess()).reduce(Boolean::logicalAnd)); if (result.isPresent()) { System.out.println("There were some file renames that needed to be done to " + operation); if (result.get()) { System.out.println("All renames successfully completed to " + operation + " done !!"); } else { - System.out.println("Some renames failed. DataSet could be in inconsistent-state. " - + "Try running compaction repair"); + System.out + .println("Some renames failed. DataSet could be in inconsistent-state. " + "Try running compaction repair"); } List rows = new ArrayList<>(); res.stream().forEach(r -> { - Comparable[] row = new Comparable[] { - r.getOperation().fileId, r.getOperation().srcPath, r.getOperation().destPath, - r.isExecuted(), r.isSuccess(), r.getException().isPresent() ? r.getException().get().getMessage() : "" - }; + Comparable[] row = + new Comparable[] {r.getOperation().fileId, r.getOperation().srcPath, r.getOperation().destPath, + r.isExecuted(), r.isSuccess(), r.getException().isPresent() ? r.getException().get().getMessage() : ""}; rows.add(row); }); Map> fieldNameToConverterMap = new HashMap<>(); - TableHeader header = new TableHeader() - .addTableHeaderField("File Id") - .addTableHeaderField("Source File Path") - .addTableHeaderField("Destination File Path") - .addTableHeaderField("Rename Executed?") - .addTableHeaderField("Rename Succeeded?") - .addTableHeaderField("Error"); + TableHeader header = new TableHeader().addTableHeaderField("File Id").addTableHeaderField("Source File Path") + .addTableHeaderField("Destination File Path").addTableHeaderField("Rename Executed?") + .addTableHeaderField("Rename Succeeded?").addTableHeaderField("Error"); - return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, - limit, headerOnly, rows); + return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows); } else { return "No File renames needed to " + operation + ". Operation successful."; } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/DatasetsCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/DatasetsCommand.java index 19e1e9525..b6b1a38ef 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/DatasetsCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/DatasetsCommand.java @@ -52,13 +52,12 @@ public class DatasetsCommand implements CommandMarker { @CliOption(key = {"maxCheckIntervalMs"}, mandatory = false, unspecifiedDefaultValue = "300000", help = "Max wait time for eventual consistency") final Integer maxConsistencyIntervalMs, @CliOption(key = {"maxCheckIntervalMs"}, mandatory = false, unspecifiedDefaultValue = "7", - help = "Max checks for eventual consistency") final Integer maxConsistencyChecks) throws IOException { - HoodieCLI.setConsistencyGuardConfig( - ConsistencyGuardConfig.newBuilder() - .withConsistencyCheckEnabled(eventuallyConsistent) + help = "Max checks for eventual consistency") final Integer maxConsistencyChecks) + throws IOException { + HoodieCLI + .setConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(eventuallyConsistent) .withInitialConsistencyCheckIntervalMs(initialConsistencyIntervalMs) - .withMaxConsistencyCheckIntervalMs(maxConsistencyIntervalMs) - .withMaxConsistencyChecks(maxConsistencyChecks) + .withMaxConsistencyCheckIntervalMs(maxConsistencyIntervalMs).withMaxConsistencyChecks(maxConsistencyChecks) .build()); HoodieCLI.initConf(); HoodieCLI.connectTo(path); @@ -70,8 +69,8 @@ public class DatasetsCommand implements CommandMarker { /** * Create a Hoodie Table if it does not exist * - * @param path Base Path - * @param name Hoodie Table Name + * @param path Base Path + * @param name Hoodie Table Name * @param tableTypeStr Hoodie Table Type * @param payloadClass Payload Class */ @@ -82,7 +81,8 @@ public class DatasetsCommand implements CommandMarker { @CliOption(key = {"tableType"}, unspecifiedDefaultValue = "COPY_ON_WRITE", help = "Hoodie Table Type. Must be one of : COPY_ON_WRITE or MERGE_ON_READ") final String tableTypeStr, @CliOption(key = {"payloadClass"}, unspecifiedDefaultValue = "org.apache.hudi.common.model.HoodieAvroPayload", - help = "Payload Class") final String payloadClass) throws IOException { + help = "Payload Class") final String payloadClass) + throws IOException { boolean initialized = HoodieCLI.initConf(); HoodieCLI.initFS(initialized); @@ -117,15 +117,13 @@ public class DatasetsCommand implements CommandMarker { */ @CliCommand(value = "desc", help = "Describle Hoodie Table properties") public String descTable() { - TableHeader header = new TableHeader() - .addTableHeaderField("Property") - .addTableHeaderField("Value"); + TableHeader header = new TableHeader().addTableHeaderField("Property").addTableHeaderField("Value"); List rows = new ArrayList<>(); - rows.add(new Comparable[]{"basePath", HoodieCLI.tableMetadata.getBasePath()}); - rows.add(new Comparable[]{"metaPath", HoodieCLI.tableMetadata.getMetaPath()}); - rows.add(new Comparable[]{"fileSystem", HoodieCLI.tableMetadata.getFs().getScheme()}); + rows.add(new Comparable[] {"basePath", HoodieCLI.tableMetadata.getBasePath()}); + rows.add(new Comparable[] {"metaPath", HoodieCLI.tableMetadata.getMetaPath()}); + rows.add(new Comparable[] {"fileSystem", HoodieCLI.tableMetadata.getFs().getScheme()}); HoodieCLI.tableMetadata.getTableConfig().getProps().entrySet().forEach(e -> { - rows.add(new Comparable[]{e.getKey(), e.getValue()}); + rows.add(new Comparable[] {e.getKey(), e.getValue()}); }); return HoodiePrintHelper.print(header, new HashMap<>(), "", false, -1, false, rows); } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/FileSystemViewCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/FileSystemViewCommand.java index d86390d7f..6e9bde6ce 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/FileSystemViewCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/FileSystemViewCommand.java @@ -52,24 +52,23 @@ public class FileSystemViewCommand implements CommandMarker { @CliCommand(value = "show fsview all", help = "Show entire file-system view") public String showAllFileSlices( - @CliOption(key = {"pathRegex"}, - help = "regex to select files, eg: 2016/08/02", unspecifiedDefaultValue = "*/*/*") String globRegex, + @CliOption(key = {"pathRegex"}, help = "regex to select files, eg: 2016/08/02", + unspecifiedDefaultValue = "*/*/*") String globRegex, @CliOption(key = {"readOptimizedOnly"}, help = "Only display read-optimized view", unspecifiedDefaultValue = "false") boolean readOptimizedOnly, @CliOption(key = {"maxInstant"}, help = "File-Slices upto this instant are displayed", unspecifiedDefaultValue = "") String maxInstant, - @CliOption(key = { - "includeMax"}, help = "Include Max Instant", unspecifiedDefaultValue = "false") boolean includeMaxInstant, - @CliOption(key = { - "includeInflight"}, help = "Include Inflight Instants", unspecifiedDefaultValue = "false") - boolean includeInflight, - @CliOption(key = {"excludeCompaction"}, help = "Exclude compaction Instants", unspecifiedDefaultValue = "false") - boolean excludeCompaction, + @CliOption(key = {"includeMax"}, help = "Include Max Instant", + unspecifiedDefaultValue = "false") boolean includeMaxInstant, + @CliOption(key = {"includeInflight"}, help = "Include Inflight Instants", + unspecifiedDefaultValue = "false") boolean includeInflight, + @CliOption(key = {"excludeCompaction"}, help = "Exclude compaction Instants", + unspecifiedDefaultValue = "false") boolean excludeCompaction, @CliOption(key = {"limit"}, help = "Limit rows to be displayed", unspecifiedDefaultValue = "-1") Integer limit, @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = { - "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly) + @CliOption(key = {"headeronly"}, help = "Print Header Only", + unspecifiedDefaultValue = "false") final boolean headerOnly) throws IOException { HoodieTableFileSystemView fsView = buildFileSystemView(globRegex, maxInstant, readOptimizedOnly, includeMaxInstant, @@ -97,15 +96,10 @@ public class FileSystemViewCommand implements CommandMarker { fieldNameToConverterMap.put("Total Delta File Size", converterFunction); fieldNameToConverterMap.put("Data-File Size", converterFunction); - TableHeader header = new TableHeader() - .addTableHeaderField("Partition") - .addTableHeaderField("FileId") - .addTableHeaderField("Base-Instant") - .addTableHeaderField("Data-File") - .addTableHeaderField("Data-File Size"); + TableHeader header = new TableHeader().addTableHeaderField("Partition").addTableHeaderField("FileId") + .addTableHeaderField("Base-Instant").addTableHeaderField("Data-File").addTableHeaderField("Data-File Size"); if (!readOptimizedOnly) { - header = header.addTableHeaderField("Num Delta Files") - .addTableHeaderField("Total Delta File Size") + header = header.addTableHeaderField("Num Delta Files").addTableHeaderField("Total Delta File Size") .addTableHeaderField("Delta Files"); } return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows); @@ -113,25 +107,24 @@ public class FileSystemViewCommand implements CommandMarker { @CliCommand(value = "show fsview latest", help = "Show latest file-system view") public String showLatestFileSlices( - @CliOption(key = {"partitionPath"}, - help = "A valid paritition path", mandatory = true) String partition, + @CliOption(key = {"partitionPath"}, help = "A valid paritition path", mandatory = true) String partition, @CliOption(key = {"readOptimizedOnly"}, help = "Only display read-optimized view", unspecifiedDefaultValue = "false") boolean readOptimizedOnly, @CliOption(key = {"maxInstant"}, help = "File-Slices upto this instant are displayed", unspecifiedDefaultValue = "") String maxInstant, @CliOption(key = {"merge"}, help = "Merge File Slices due to pending compaction", unspecifiedDefaultValue = "true") final boolean merge, - @CliOption(key = {"includeMax"}, help = "Include Max Instant", unspecifiedDefaultValue = "false") - boolean includeMaxInstant, - @CliOption(key = {"includeInflight"}, help = "Include Inflight Instants", unspecifiedDefaultValue = "false") - boolean includeInflight, - @CliOption(key = {"excludeCompaction"}, help = "Exclude compaction Instants", unspecifiedDefaultValue = "false") - boolean excludeCompaction, + @CliOption(key = {"includeMax"}, help = "Include Max Instant", + unspecifiedDefaultValue = "false") boolean includeMaxInstant, + @CliOption(key = {"includeInflight"}, help = "Include Inflight Instants", + unspecifiedDefaultValue = "false") boolean includeInflight, + @CliOption(key = {"excludeCompaction"}, help = "Exclude compaction Instants", + unspecifiedDefaultValue = "false") boolean excludeCompaction, @CliOption(key = {"limit"}, help = "Limit rows to be displayed", unspecifiedDefaultValue = "-1") Integer limit, @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = { - "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly) + @CliOption(key = {"headeronly"}, help = "Print Header Only", + unspecifiedDefaultValue = "false") final boolean headerOnly) throws IOException { HoodieTableFileSystemView fsView = buildFileSystemView(partition, maxInstant, readOptimizedOnly, includeMaxInstant, @@ -163,28 +156,25 @@ public class FileSystemViewCommand implements CommandMarker { if (!readOptimizedOnly) { row[idx++] = fs.getLogFiles().count(); row[idx++] = fs.getLogFiles().mapToLong(lf -> lf.getFileSize()).sum(); - long logFilesScheduledForCompactionTotalSize = fs.getLogFiles() - .filter(lf -> lf.getBaseCommitTime().equals(fs.getBaseInstantTime())) - .mapToLong(lf -> lf.getFileSize()).sum(); + long logFilesScheduledForCompactionTotalSize = + fs.getLogFiles().filter(lf -> lf.getBaseCommitTime().equals(fs.getBaseInstantTime())) + .mapToLong(lf -> lf.getFileSize()).sum(); row[idx++] = logFilesScheduledForCompactionTotalSize; - long logFilesUnscheduledTotalSize = fs.getLogFiles() - .filter(lf -> !lf.getBaseCommitTime().equals(fs.getBaseInstantTime())) - .mapToLong(lf -> lf.getFileSize()).sum(); + long logFilesUnscheduledTotalSize = + fs.getLogFiles().filter(lf -> !lf.getBaseCommitTime().equals(fs.getBaseInstantTime())) + .mapToLong(lf -> lf.getFileSize()).sum(); row[idx++] = logFilesUnscheduledTotalSize; double logSelectedForCompactionToBaseRatio = dataFileSize > 0 ? logFilesScheduledForCompactionTotalSize / (dataFileSize * 1.0) : -1; row[idx++] = logSelectedForCompactionToBaseRatio; - double logUnscheduledToBaseRatio = - dataFileSize > 0 ? logFilesUnscheduledTotalSize / (dataFileSize * 1.0) : -1; + double logUnscheduledToBaseRatio = dataFileSize > 0 ? logFilesUnscheduledTotalSize / (dataFileSize * 1.0) : -1; row[idx++] = logUnscheduledToBaseRatio; - row[idx++] = fs.getLogFiles() - .filter(lf -> lf.getBaseCommitTime().equals(fs.getBaseInstantTime())) + row[idx++] = fs.getLogFiles().filter(lf -> lf.getBaseCommitTime().equals(fs.getBaseInstantTime())) .collect(Collectors.toList()).toString(); - row[idx++] = fs.getLogFiles() - .filter(lf -> !lf.getBaseCommitTime().equals(fs.getBaseInstantTime())) + row[idx++] = fs.getLogFiles().filter(lf -> !lf.getBaseCommitTime().equals(fs.getBaseInstantTime())) .collect(Collectors.toList()).toString(); } rows.add(row); @@ -200,16 +190,11 @@ public class FileSystemViewCommand implements CommandMarker { fieldNameToConverterMap.put("Delta Size - compaction unscheduled", converterFunction); } - TableHeader header = new TableHeader() - .addTableHeaderField("Partition") - .addTableHeaderField("FileId") - .addTableHeaderField("Base-Instant") - .addTableHeaderField("Data-File") - .addTableHeaderField("Data-File Size"); + TableHeader header = new TableHeader().addTableHeaderField("Partition").addTableHeaderField("FileId") + .addTableHeaderField("Base-Instant").addTableHeaderField("Data-File").addTableHeaderField("Data-File Size"); if (!readOptimizedOnly) { - header = header.addTableHeaderField("Num Delta Files") - .addTableHeaderField("Total Delta Size") + header = header.addTableHeaderField("Num Delta Files").addTableHeaderField("Total Delta Size") .addTableHeaderField("Delta Size - compaction scheduled") .addTableHeaderField("Delta Size - compaction unscheduled") .addTableHeaderField("Delta To Base Ratio - compaction scheduled") @@ -222,19 +207,20 @@ public class FileSystemViewCommand implements CommandMarker { /** * Build File System View + * * @param globRegex Path Regex - * @param maxInstant Max Instants to be used for displaying file-instants + * @param maxInstant Max Instants to be used for displaying file-instants * @param readOptimizedOnly Include only read optimized view * @param includeMaxInstant Include Max instant - * @param includeInflight Include inflight instants + * @param includeInflight Include inflight instants * @param excludeCompaction Exclude Compaction instants * @return * @throws IOException */ private HoodieTableFileSystemView buildFileSystemView(String globRegex, String maxInstant, boolean readOptimizedOnly, boolean includeMaxInstant, boolean includeInflight, boolean excludeCompaction) throws IOException { - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(HoodieCLI.tableMetadata.getHadoopConf(), - HoodieCLI.tableMetadata.getBasePath(), true); + HoodieTableMetaClient metaClient = + new HoodieTableMetaClient(HoodieCLI.tableMetadata.getHadoopConf(), HoodieCLI.tableMetadata.getBasePath(), true); FileSystem fs = HoodieCLI.fs; String globPath = String.format("%s/%s/*", HoodieCLI.tableMetadata.getBasePath(), globRegex); FileStatus[] statuses = fs.globStatus(new Path(globPath)); diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HDFSParquetImportCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HDFSParquetImportCommand.java index 9ec1b4edd..e78b7356f 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HDFSParquetImportCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HDFSParquetImportCommand.java @@ -43,17 +43,17 @@ public class HDFSParquetImportCommand implements CommandMarker { @CliOption(key = "upsert", mandatory = false, unspecifiedDefaultValue = "false", help = "Uses upsert API instead of the default insert API of WriteClient") boolean useUpsert, @CliOption(key = "srcPath", mandatory = true, help = "Base path for the input dataset") final String srcPath, - @CliOption(key = "targetPath", mandatory = true, help = "Base path for the target hoodie dataset") final String - targetPath, + @CliOption(key = "targetPath", mandatory = true, + help = "Base path for the target hoodie dataset") final String targetPath, @CliOption(key = "tableName", mandatory = true, help = "Table name") final String tableName, @CliOption(key = "tableType", mandatory = true, help = "Table type") final String tableType, @CliOption(key = "rowKeyField", mandatory = true, help = "Row key field name") final String rowKeyField, - @CliOption(key = "partitionPathField", mandatory = true, help = "Partition path field name") final String - partitionPathField, - @CliOption(key = { - "parallelism"}, mandatory = true, help = "Parallelism for hoodie insert") final String parallelism, - @CliOption(key = "schemaFilePath", mandatory = true, help = "Path for Avro schema file") final String - schemaFilePath, + @CliOption(key = "partitionPathField", mandatory = true, + help = "Partition path field name") final String partitionPathField, + @CliOption(key = {"parallelism"}, mandatory = true, + help = "Parallelism for hoodie insert") final String parallelism, + @CliOption(key = "schemaFilePath", mandatory = true, + help = "Path for Avro schema file") final String schemaFilePath, @CliOption(key = "format", mandatory = true, help = "Format for the input data") final String format, @CliOption(key = "sparkMemory", mandatory = true, help = "Spark executor memory") final String sparkMemory, @CliOption(key = "retry", mandatory = true, help = "Number of retries") final String retry) throws Exception { @@ -62,8 +62,8 @@ public class HDFSParquetImportCommand implements CommandMarker { boolean initialized = HoodieCLI.initConf(); HoodieCLI.initFS(initialized); - String sparkPropertiesPath = Utils.getDefaultPropertiesFile( - JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala()); + String sparkPropertiesPath = + Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala()); SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); @@ -72,8 +72,8 @@ public class HDFSParquetImportCommand implements CommandMarker { cmd = SparkCommand.UPSERT.toString(); } - sparkLauncher.addAppArgs(cmd, srcPath, targetPath, tableName, tableType, rowKeyField, - partitionPathField, parallelism, schemaFilePath, sparkMemory, retry); + sparkLauncher.addAppArgs(cmd, srcPath, targetPath, tableName, tableType, rowKeyField, partitionPathField, + parallelism, schemaFilePath, sparkMemory, retry); Process process = sparkLauncher.launch(); InputStreamConsumer.captureOutput(process); int exitCode = process.waitFor(); diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieLogFileCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieLogFileCommand.java index cc822353e..927909fe5 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieLogFileCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieLogFileCommand.java @@ -69,30 +69,29 @@ public class HoodieLogFileCommand implements CommandMarker { @CliCommand(value = "show logfile metadata", help = "Read commit metadata from log files") public String showLogFileCommits( - @CliOption(key = "logFilePathPattern", mandatory = true, help = "Fully qualified path for the log file") final - String logFilePathPattern, + @CliOption(key = "logFilePathPattern", mandatory = true, + help = "Fully qualified path for the log file") final String logFilePathPattern, @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit, @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = {"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") - final boolean headerOnly) throws IOException { + @CliOption(key = {"headeronly"}, help = "Print Header Only", + unspecifiedDefaultValue = "false") final boolean headerOnly) + throws IOException { FileSystem fs = HoodieCLI.tableMetadata.getFs(); List logFilePaths = Arrays.stream(fs.globStatus(new Path(logFilePathPattern))) .map(status -> status.getPath().toString()).collect(Collectors.toList()); - Map, Map>, Integer>>> - commitCountAndMetadata = Maps.newHashMap(); + Map, Map>, Integer>>> commitCountAndMetadata = + Maps.newHashMap(); int totalEntries = 0; int numCorruptBlocks = 0; int dummyInstantTimeCount = 0; for (String logFilePath : logFilePaths) { FileStatus[] fsStatus = fs.listStatus(new Path(logFilePath)); - Schema writerSchema = new AvroSchemaConverter().convert( - SchemaUtil.readSchemaFromLogFile(HoodieCLI.tableMetadata.getFs(), new Path(logFilePath))); - Reader reader = HoodieLogFormat - .newReader(fs, new HoodieLogFile(fsStatus[0].getPath()), writerSchema); + Schema writerSchema = new AvroSchemaConverter() + .convert(SchemaUtil.readSchemaFromLogFile(HoodieCLI.tableMetadata.getFs(), new Path(logFilePath))); + Reader reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(fsStatus[0].getPath()), writerSchema); // read the avro blocks while (reader.hasNext()) { @@ -126,8 +125,8 @@ public class HoodieLogFileCommand implements CommandMarker { new Tuple3<>(n.getBlockType(), new Tuple2<>(n.getLogBlockHeader(), n.getLogBlockFooter()), recordCount)); totalEntries++; } else { - List, Map>, - Integer>> list = new ArrayList<>(); + List, Map>, Integer>> list = + new ArrayList<>(); list.add( new Tuple3<>(n.getBlockType(), new Tuple2<>(n.getLogBlockHeader(), n.getLogBlockFooter()), recordCount)); commitCountAndMetadata.put(instantTime, list); @@ -139,12 +138,11 @@ public class HoodieLogFileCommand implements CommandMarker { List rows = new ArrayList<>(); int i = 0; ObjectMapper objectMapper = new ObjectMapper(); - for (Map.Entry, Map>, Integer>>> entry - : commitCountAndMetadata.entrySet()) { + for (Map.Entry, Map>, Integer>>> entry : commitCountAndMetadata + .entrySet()) { String instantTime = entry.getKey().toString(); - for (Tuple3, - Map>, Integer> tuple3 : entry.getValue()) { + for (Tuple3, Map>, Integer> tuple3 : entry + .getValue()) { Comparable[] output = new Comparable[5]; output[0] = instantTime; output[1] = tuple3._3(); @@ -156,21 +154,18 @@ public class HoodieLogFileCommand implements CommandMarker { } } - TableHeader header = new TableHeader() - .addTableHeaderField("InstantTime") - .addTableHeaderField("RecordCount") - .addTableHeaderField("BlockType") - .addTableHeaderField("HeaderMetadata") - .addTableHeaderField("FooterMetadata"); + TableHeader header = new TableHeader().addTableHeaderField("InstantTime").addTableHeaderField("RecordCount") + .addTableHeaderField("BlockType").addTableHeaderField("HeaderMetadata").addTableHeaderField("FooterMetadata"); return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows); } @CliCommand(value = "show logfile records", help = "Read records from log files") - public String showLogFileRecords(@CliOption(key = { - "limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10") final Integer limit, - @CliOption(key = "logFilePathPattern", mandatory = true, help = "Fully qualified paths for the log files") - final String logFilePathPattern, + public String showLogFileRecords( + @CliOption(key = {"limit"}, mandatory = false, help = "Limit commits", + unspecifiedDefaultValue = "10") final Integer limit, + @CliOption(key = "logFilePathPattern", mandatory = true, + help = "Fully qualified paths for the log files") final String logFilePathPattern, @CliOption(key = "mergeRecords", mandatory = false, help = "If the records in the log files should be merged", unspecifiedDefaultValue = "false") final Boolean shouldMerge) throws IOException { @@ -184,22 +179,21 @@ public class HoodieLogFileCommand implements CommandMarker { // TODO : readerSchema can change across blocks/log files, fix this inside Scanner AvroSchemaConverter converter = new AvroSchemaConverter(); // get schema from last log file - Schema readerSchema = converter.convert( - SchemaUtil.readSchemaFromLogFile(fs, new Path(logFilePaths.get(logFilePaths.size() - 1)))); + Schema readerSchema = + converter.convert(SchemaUtil.readSchemaFromLogFile(fs, new Path(logFilePaths.get(logFilePaths.size() - 1)))); List allRecords = new ArrayList<>(); if (shouldMerge) { System.out.println("===========================> MERGING RECORDS <==================="); - HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, - HoodieCLI.tableMetadata.getBasePath(), logFilePaths, readerSchema, - HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline().lastInstant().get() - .getTimestamp(), - Long.valueOf(HoodieMemoryConfig.DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES), - Boolean.valueOf(HoodieCompactionConfig.DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED), - Boolean.valueOf(HoodieCompactionConfig.DEFAULT_COMPACTION_REVERSE_LOG_READ_ENABLED), - Integer.valueOf(HoodieMemoryConfig.DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE), - HoodieMemoryConfig.DEFAULT_SPILLABLE_MAP_BASE_PATH); + HoodieMergedLogRecordScanner scanner = + new HoodieMergedLogRecordScanner(fs, HoodieCLI.tableMetadata.getBasePath(), logFilePaths, readerSchema, + HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline().lastInstant().get().getTimestamp(), + Long.valueOf(HoodieMemoryConfig.DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES), + Boolean.valueOf(HoodieCompactionConfig.DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED), + Boolean.valueOf(HoodieCompactionConfig.DEFAULT_COMPACTION_REVERSE_LOG_READ_ENABLED), + Integer.valueOf(HoodieMemoryConfig.DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE), + HoodieMemoryConfig.DEFAULT_SPILLABLE_MAP_BASE_PATH); for (HoodieRecord hoodieRecord : scanner) { Option record = hoodieRecord.getData().getInsertValue(readerSchema); if (allRecords.size() >= limit) { @@ -209,10 +203,10 @@ public class HoodieLogFileCommand implements CommandMarker { } } else { for (String logFile : logFilePaths) { - Schema writerSchema = new AvroSchemaConverter().convert( - SchemaUtil.readSchemaFromLogFile(HoodieCLI.tableMetadata.getFs(), new Path(logFile))); - HoodieLogFormat.Reader reader = HoodieLogFormat - .newReader(fs, new HoodieLogFile(new Path(logFile)), writerSchema); + Schema writerSchema = new AvroSchemaConverter() + .convert(SchemaUtil.readSchemaFromLogFile(HoodieCLI.tableMetadata.getFs(), new Path(logFile))); + HoodieLogFormat.Reader reader = + HoodieLogFormat.newReader(fs, new HoodieLogFile(new Path(logFile)), writerSchema); // read the avro blocks while (reader.hasNext()) { HoodieLogBlock n = reader.next(); diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieSyncCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieSyncCommand.java index 614789e2f..ecd15b891 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieSyncCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieSyncCommand.java @@ -44,19 +44,16 @@ public class HoodieSyncCommand implements CommandMarker { public String validateSync( @CliOption(key = {"mode"}, unspecifiedDefaultValue = "complete", help = "Check mode") final String mode, @CliOption(key = {"sourceDb"}, unspecifiedDefaultValue = "rawdata", help = "source database") final String srcDb, - @CliOption(key = { - "targetDb"}, unspecifiedDefaultValue = "dwh_hoodie", help = "target database") final String tgtDb, - @CliOption(key = { - "partitionCount"}, unspecifiedDefaultValue = "5", help = "total number of recent partitions to validate") - final int partitionCount, - @CliOption(key = { - "hiveServerUrl"}, mandatory = true, help = "hiveServerURL to connect to") final String hiveServerUrl, - @CliOption(key = { - "hiveUser"}, mandatory = false, unspecifiedDefaultValue = "", help = "hive username to connect to") final - String hiveUser, - @CliOption(key = { - "hivePass"}, mandatory = true, unspecifiedDefaultValue = "", help = "hive password to connect to") final - String hivePass) + @CliOption(key = {"targetDb"}, unspecifiedDefaultValue = "dwh_hoodie", + help = "target database") final String tgtDb, + @CliOption(key = {"partitionCount"}, unspecifiedDefaultValue = "5", + help = "total number of recent partitions to validate") final int partitionCount, + @CliOption(key = {"hiveServerUrl"}, mandatory = true, + help = "hiveServerURL to connect to") final String hiveServerUrl, + @CliOption(key = {"hiveUser"}, mandatory = false, unspecifiedDefaultValue = "", + help = "hive username to connect to") final String hiveUser, + @CliOption(key = {"hivePass"}, mandatory = true, unspecifiedDefaultValue = "", + help = "hive password to connect to") final String hivePass) throws Exception { HoodieTableMetaClient target = HoodieCLI.syncTableMetadata; HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsTimeline(); @@ -77,8 +74,8 @@ public class HoodieSyncCommand implements CommandMarker { String sourceLatestCommit = sourceTimeline.getInstants().iterator().hasNext() ? "0" : sourceTimeline.lastInstant().get().getTimestamp(); - if (sourceLatestCommit != null && HoodieTimeline.compareTimestamps(targetLatestCommit, sourceLatestCommit, - HoodieTimeline.GREATER)) { + if (sourceLatestCommit != null + && HoodieTimeline.compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) { // source is behind the target List commitsToCatchup = targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE) .getInstants().collect(Collectors.toList()); @@ -89,8 +86,8 @@ public class HoodieSyncCommand implements CommandMarker { long newInserts = CommitUtil.countNewRecords(target, commitsToCatchup.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList())); return "Count difference now is (count(" + target.getTableConfig().getTableName() + ") - count(" - + source.getTableConfig().getTableName() - + ") == " + (targetCount - sourceCount) + ". Catch up count is " + newInserts; + + source.getTableConfig().getTableName() + ") == " + (targetCount - sourceCount) + ". Catch up count is " + + newInserts; } } else { List commitsToCatchup = sourceTimeline.findInstantsAfter(targetLatestCommit, Integer.MAX_VALUE) @@ -102,8 +99,8 @@ public class HoodieSyncCommand implements CommandMarker { long newInserts = CommitUtil.countNewRecords(source, commitsToCatchup.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList())); return "Count difference now is (count(" + source.getTableConfig().getTableName() + ") - count(" - + target.getTableConfig().getTableName() - + ") == " + (sourceCount - targetCount) + ". Catch up count is " + newInserts; + + target.getTableConfig().getTableName() + ") == " + (sourceCount - targetCount) + ". Catch up count is " + + newInserts; } } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/RepairsCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/RepairsCommand.java index ad9b8269a..84757d92e 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/RepairsCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/RepairsCommand.java @@ -47,16 +47,15 @@ public class RepairsCommand implements CommandMarker { return HoodieCLI.tableMetadata != null; } - @CliCommand(value = "repair deduplicate", help = "De-duplicate a partition path contains duplicates & produce " - + "repaired files to replace with") - public String deduplicate(@CliOption(key = { - "duplicatedPartitionPath"}, help = "Partition Path containing the duplicates", mandatory = true) final String - duplicatedPartitionPath, - @CliOption(key = { - "repairedOutputPath"}, help = "Location to place the repaired files", mandatory = true) final String - repairedOutputPath, - @CliOption(key = { - "sparkProperties"}, help = "Spark Properites File Path", mandatory = true) final String sparkPropertiesPath) + @CliCommand(value = "repair deduplicate", + help = "De-duplicate a partition path contains duplicates & produce " + "repaired files to replace with") + public String deduplicate( + @CliOption(key = {"duplicatedPartitionPath"}, help = "Partition Path containing the duplicates", + mandatory = true) final String duplicatedPartitionPath, + @CliOption(key = {"repairedOutputPath"}, help = "Location to place the repaired files", + mandatory = true) final String repairedOutputPath, + @CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path", + mandatory = true) final String sparkPropertiesPath) throws Exception { SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); sparkLauncher.addAppArgs(SparkMain.SparkCommand.DEDUPLICATE.toString(), duplicatedPartitionPath, repairedOutputPath, @@ -73,14 +72,15 @@ public class RepairsCommand implements CommandMarker { @CliCommand(value = "repair addpartitionmeta", help = "Add partition metadata to a dataset, if not present") - public String addPartitionMeta(@CliOption(key = { - "dryrun"}, help = "Should we actually add or just print what would be done", unspecifiedDefaultValue = "true") - final boolean dryRun) throws IOException { + public String addPartitionMeta( + @CliOption(key = {"dryrun"}, help = "Should we actually add or just print what would be done", + unspecifiedDefaultValue = "true") final boolean dryRun) + throws IOException { - String latestCommit = HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline().lastInstant().get() - .getTimestamp(); - List partitionPaths = FSUtils.getAllPartitionFoldersThreeLevelsDown(HoodieCLI.fs, - HoodieCLI.tableMetadata.getBasePath()); + String latestCommit = + HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline().lastInstant().get().getTimestamp(); + List partitionPaths = + FSUtils.getAllPartitionFoldersThreeLevelsDown(HoodieCLI.fs, HoodieCLI.tableMetadata.getBasePath()); Path basePath = new Path(HoodieCLI.tableMetadata.getBasePath()); String[][] rows = new String[partitionPaths.size() + 1][]; @@ -94,8 +94,8 @@ public class RepairsCommand implements CommandMarker { if (!HoodiePartitionMetadata.hasPartitionMetadata(HoodieCLI.fs, partitionPath)) { row[1] = "No"; if (!dryRun) { - HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(HoodieCLI.fs, latestCommit, basePath, - partitionPath); + HoodiePartitionMetadata partitionMetadata = + new HoodiePartitionMetadata(HoodieCLI.fs, latestCommit, basePath, partitionPath); partitionMetadata.trySave(0); } } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/RollbacksCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/RollbacksCommand.java index bd568de17..90a716f3c 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/RollbacksCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/RollbacksCommand.java @@ -50,8 +50,8 @@ public class RollbacksCommand implements CommandMarker { @CliOption(key = {"limit"}, help = "Limit #rows to be displayed", unspecifiedDefaultValue = "10") Integer limit, @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = { - "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly) + @CliOption(key = {"headeronly"}, help = "Print Header Only", + unspecifiedDefaultValue = "false") final boolean headerOnly) throws IOException { HoodieActiveTimeline activeTimeline = new RollbackTimeline(HoodieCLI.tableMetadata); HoodieTimeline rollback = activeTimeline.getRollbackTimeline().filterCompletedInstants(); @@ -59,8 +59,8 @@ public class RollbacksCommand implements CommandMarker { final List rows = new ArrayList<>(); rollback.getInstants().forEach(instant -> { try { - HoodieRollbackMetadata metadata = AvroUtils.deserializeAvroMetadata( - activeTimeline.getInstantDetails(instant).get(), HoodieRollbackMetadata.class); + HoodieRollbackMetadata metadata = AvroUtils + .deserializeAvroMetadata(activeTimeline.getInstantDetails(instant).get(), HoodieRollbackMetadata.class); metadata.getCommitsRollback().forEach(c -> { Comparable[] row = new Comparable[5]; row[0] = metadata.getStartRollbackTime(); @@ -74,11 +74,8 @@ public class RollbacksCommand implements CommandMarker { e.printStackTrace(); } }); - TableHeader header = new TableHeader() - .addTableHeaderField("Instant") - .addTableHeaderField("Rolledback Instant") - .addTableHeaderField("Total Files Deleted") - .addTableHeaderField("Time taken in millis") + TableHeader header = new TableHeader().addTableHeaderField("Instant").addTableHeaderField("Rolledback Instant") + .addTableHeaderField("Total Files Deleted").addTableHeaderField("Time taken in millis") .addTableHeaderField("Total Partitions"); return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows); } @@ -89,17 +86,18 @@ public class RollbacksCommand implements CommandMarker { @CliOption(key = {"limit"}, help = "Limit #rows to be displayed", unspecifiedDefaultValue = "10") Integer limit, @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = { - "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly) + @CliOption(key = {"headeronly"}, help = "Print Header Only", + unspecifiedDefaultValue = "false") final boolean headerOnly) throws IOException { HoodieActiveTimeline activeTimeline = new RollbackTimeline(HoodieCLI.tableMetadata); final List rows = new ArrayList<>(); HoodieRollbackMetadata metadata = AvroUtils.deserializeAvroMetadata( - activeTimeline.getInstantDetails(new HoodieInstant(State.COMPLETED, ROLLBACK_ACTION, rollbackInstant)) - .get(), HoodieRollbackMetadata.class); + activeTimeline.getInstantDetails(new HoodieInstant(State.COMPLETED, ROLLBACK_ACTION, rollbackInstant)).get(), + HoodieRollbackMetadata.class); metadata.getPartitionMetadata().entrySet().forEach(e -> { - Stream.concat(e.getValue().getSuccessDeleteFiles().stream().map(f -> Pair.of(f, true)), - e.getValue().getFailedDeleteFiles().stream().map(f -> Pair.of(f, false))) + Stream + .concat(e.getValue().getSuccessDeleteFiles().stream().map(f -> Pair.of(f, true)), + e.getValue().getFailedDeleteFiles().stream().map(f -> Pair.of(f, false))) .forEach(fileWithDeleteStatus -> { Comparable[] row = new Comparable[5]; row[0] = metadata.getStartRollbackTime(); @@ -111,12 +109,8 @@ public class RollbacksCommand implements CommandMarker { }); }); - TableHeader header = new TableHeader() - .addTableHeaderField("Instant") - .addTableHeaderField("Rolledback Instants") - .addTableHeaderField("Partition") - .addTableHeaderField("Deleted File") - .addTableHeaderField("Succeeded"); + TableHeader header = new TableHeader().addTableHeaderField("Instant").addTableHeaderField("Rolledback Instants") + .addTableHeaderField("Partition").addTableHeaderField("Deleted File").addTableHeaderField("Succeeded"); return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows); } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/SavepointsCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/SavepointsCommand.java index 8434650b4..ae451ef9b 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/SavepointsCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/SavepointsCommand.java @@ -62,8 +62,8 @@ public class SavepointsCommand implements CommandMarker { @CliAvailabilityIndicator({"savepoint rollback"}) public boolean isRollbackToSavepointAvailable() { - return HoodieCLI.tableMetadata != null && !HoodieCLI.tableMetadata.getActiveTimeline().getSavePointTimeline() - .filterCompletedInstants().empty(); + return HoodieCLI.tableMetadata != null + && !HoodieCLI.tableMetadata.getActiveTimeline().getSavePointTimeline().filterCompletedInstants().empty(); } @CliCommand(value = "savepoints show", help = "Show the savepoints") @@ -137,8 +137,8 @@ public class SavepointsCommand implements CommandMarker { } private static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath) throws Exception { - HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withIndexConfig( - HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build(); + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build(); return new HoodieWriteClient(jsc, config, false); } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/SparkMain.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/SparkMain.java index 2a11fef10..0f4cf7968 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/SparkMain.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/SparkMain.java @@ -43,8 +43,7 @@ public class SparkMain { * Commands */ enum SparkCommand { - ROLLBACK, DEDUPLICATE, ROLLBACK_TO_SAVEPOINT, SAVEPOINT, IMPORT, UPSERT, COMPACT_SCHEDULE, COMPACT_RUN, - COMPACT_UNSCHEDULE_PLAN, COMPACT_UNSCHEDULE_FILE, COMPACT_VALIDATE, COMPACT_REPAIR + ROLLBACK, DEDUPLICATE, ROLLBACK_TO_SAVEPOINT, SAVEPOINT, IMPORT, UPSERT, COMPACT_SCHEDULE, COMPACT_RUN, COMPACT_UNSCHEDULE_PLAN, COMPACT_UNSCHEDULE_FILE, COMPACT_VALIDATE, COMPACT_REPAIR } public static void main(String[] args) throws Exception { @@ -76,13 +75,12 @@ public class SparkMain { break; case COMPACT_RUN: assert (args.length == 8); - returnCode = compact(jsc, args[1], args[2], args[3], Integer.parseInt(args[4]), - args[5], args[6], Integer.parseInt(args[7]), false); + returnCode = compact(jsc, args[1], args[2], args[3], Integer.parseInt(args[4]), args[5], args[6], + Integer.parseInt(args[7]), false); break; case COMPACT_SCHEDULE: assert (args.length == 5); - returnCode = compact(jsc, args[1], args[2], args[3], 1, - "", args[4], 0, true); + returnCode = compact(jsc, args[1], args[2], args[3], 1, "", args[4], 0, true); break; case COMPACT_VALIDATE: assert (args.length == 7); @@ -113,8 +111,7 @@ public class SparkMain { System.exit(returnCode); } - private static int dataLoad(JavaSparkContext jsc, String command, - String srcPath, String targetPath, String tableName, + private static int dataLoad(JavaSparkContext jsc, String command, String srcPath, String targetPath, String tableName, String tableType, String rowKey, String partitionKey, int parallelism, String schemaFile, String sparkMaster, String sparkMemory, int retry) throws Exception { Config cfg = new Config(); @@ -180,9 +177,9 @@ public class SparkMain { new HoodieCompactionAdminTool(cfg).run(jsc); } - private static void doCompactUnscheduleFile(JavaSparkContext jsc, String basePath, String fileId, - String outputPath, int parallelism, String sparkMaster, String sparkMemory, boolean skipValidation, - boolean dryRun) throws Exception { + private static void doCompactUnscheduleFile(JavaSparkContext jsc, String basePath, String fileId, String outputPath, + int parallelism, String sparkMaster, String sparkMemory, boolean skipValidation, boolean dryRun) + throws Exception { HoodieCompactionAdminTool.Config cfg = new HoodieCompactionAdminTool.Config(); cfg.basePath = basePath; cfg.operation = Operation.UNSCHEDULE_FILE; @@ -244,8 +241,8 @@ public class SparkMain { } private static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath) throws Exception { - HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withIndexConfig( - HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build(); + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build(); return new HoodieWriteClient(jsc, config); } } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/StatsCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/StatsCommand.java index 9b8f41cb9..8b7546b24 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/StatsCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/StatsCommand.java @@ -63,8 +63,9 @@ public class StatsCommand implements CommandMarker { @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit, @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = {"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") - final boolean headerOnly) throws IOException { + @CliOption(key = {"headeronly"}, help = "Print Header Only", + unspecifiedDefaultValue = "false") final boolean headerOnly) + throws IOException { long totalRecordsUpserted = 0; long totalRecordsWritten = 0; @@ -82,7 +83,7 @@ public class StatsCommand implements CommandMarker { if (commit.fetchTotalUpdateRecordsWritten() > 0) { waf = df.format((float) commit.fetchTotalRecordsWritten() / commit.fetchTotalUpdateRecordsWritten()); } - rows.add(new Comparable[]{commitTime.getTimestamp(), commit.fetchTotalUpdateRecordsWritten(), + rows.add(new Comparable[] {commitTime.getTimestamp(), commit.fetchTotalUpdateRecordsWritten(), commit.fetchTotalRecordsWritten(), waf}); totalRecordsUpserted += commit.fetchTotalUpdateRecordsWritten(); totalRecordsWritten += commit.fetchTotalRecordsWritten(); @@ -91,33 +92,28 @@ public class StatsCommand implements CommandMarker { if (totalRecordsUpserted > 0) { waf = df.format((float) totalRecordsWritten / totalRecordsUpserted); } - rows.add(new Comparable[]{"Total", totalRecordsUpserted, totalRecordsWritten, waf}); + rows.add(new Comparable[] {"Total", totalRecordsUpserted, totalRecordsWritten, waf}); - TableHeader header = new TableHeader() - .addTableHeaderField("CommitTime") - .addTableHeaderField("Total Upserted") - .addTableHeaderField("Total Written") - .addTableHeaderField("Write Amplifiation Factor"); + TableHeader header = new TableHeader().addTableHeaderField("CommitTime").addTableHeaderField("Total Upserted") + .addTableHeaderField("Total Written").addTableHeaderField("Write Amplifiation Factor"); return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows); } private Comparable[] printFileSizeHistogram(String commitTime, Snapshot s) { - return new Comparable[]{commitTime, s.getMin(), - s.getValue(0.1), s.getMedian(), - s.getMean(), s.get95thPercentile(), - s.getMax(), s.size(), - s.getStdDev()}; + return new Comparable[] {commitTime, s.getMin(), s.getValue(0.1), s.getMedian(), s.getMean(), s.get95thPercentile(), + s.getMax(), s.size(), s.getStdDev()}; } @CliCommand(value = "stats filesizes", help = "File Sizes. Display summary stats on sizes of files") public String fileSizeStats( - @CliOption(key = {"partitionPath"}, - help = "regex to select files, eg: 2016/08/02", unspecifiedDefaultValue = "*/*/*") final String globRegex, + @CliOption(key = {"partitionPath"}, help = "regex to select files, eg: 2016/08/02", + unspecifiedDefaultValue = "*/*/*") final String globRegex, @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit, @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = {"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") - final boolean headerOnly) throws IOException { + @CliOption(key = {"headeronly"}, help = "Print Header Only", + unspecifiedDefaultValue = "false") final boolean headerOnly) + throws IOException { FileSystem fs = HoodieCLI.fs; String globPath = String.format("%s/%s/*", HoodieCLI.tableMetadata.getBasePath(), globRegex); @@ -145,8 +141,8 @@ public class StatsCommand implements CommandMarker { Snapshot s = globalHistogram.getSnapshot(); rows.add(printFileSizeHistogram("ALL", s)); - Function converterFunction = entry -> - NumericUtils.humanReadableByteCount((Double.valueOf(entry.toString()))); + Function converterFunction = + entry -> NumericUtils.humanReadableByteCount((Double.valueOf(entry.toString()))); Map> fieldNameToConverterMap = new HashMap<>(); fieldNameToConverterMap.put("Min", converterFunction); fieldNameToConverterMap.put("10th", converterFunction); @@ -156,16 +152,9 @@ public class StatsCommand implements CommandMarker { fieldNameToConverterMap.put("Max", converterFunction); fieldNameToConverterMap.put("StdDev", converterFunction); - TableHeader header = new TableHeader() - .addTableHeaderField("CommitTime") - .addTableHeaderField("Min") - .addTableHeaderField("10th") - .addTableHeaderField("50th") - .addTableHeaderField("avg") - .addTableHeaderField("95th") - .addTableHeaderField("Max") - .addTableHeaderField("NumFiles") - .addTableHeaderField("StdDev"); + TableHeader header = new TableHeader().addTableHeaderField("CommitTime").addTableHeaderField("Min") + .addTableHeaderField("10th").addTableHeaderField("50th").addTableHeaderField("avg").addTableHeaderField("95th") + .addTableHeaderField("Max").addTableHeaderField("NumFiles").addTableHeaderField("StdDev"); return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows); } } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/utils/HiveUtil.java b/hudi-cli/src/main/java/org/apache/hudi/cli/utils/HiveUtil.java index 4b4a44aa6..dfeae269b 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/utils/HiveUtil.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/utils/HiveUtil.java @@ -48,12 +48,11 @@ public class HiveUtil { ResultSet rs = null; Statement stmt = conn.createStatement(); try { - //stmt.execute("set mapred.job.queue.name="); + // stmt.execute("set mapred.job.queue.name="); stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat"); stmt.execute("set hive.stats.autogather=false"); rs = stmt.executeQuery( - "select count(`_hoodie_commit_time`) as cnt from " + dbName + "." - + source.getTableConfig().getTableName()); + "select count(`_hoodie_commit_time`) as cnt from " + dbName + "." + source.getTableConfig().getTableName()); long count = -1; if (rs.next()) { count = rs.getLong("cnt"); @@ -88,7 +87,7 @@ public class HiveUtil { ResultSet rs = null; Statement stmt = conn.createStatement(); try { - //stmt.execute("set mapred.job.queue.name="); + // stmt.execute("set mapred.job.queue.name="); stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat"); stmt.execute("set hive.stats.autogather=false"); rs = stmt.executeQuery( diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/utils/SparkUtil.java b/hudi-cli/src/main/java/org/apache/hudi/cli/utils/SparkUtil.java index 06e06af0a..e8d2a08c4 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/utils/SparkUtil.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/utils/SparkUtil.java @@ -40,8 +40,8 @@ public class SparkUtil { public static SparkLauncher initLauncher(String propertiesFile) throws URISyntaxException { String currentJar = new File(SparkUtil.class.getProtectionDomain().getCodeSource().getLocation().toURI().getPath()) .getAbsolutePath(); - SparkLauncher sparkLauncher = new SparkLauncher().setAppResource(currentJar) - .setMainClass(SparkMain.class.getName()); + SparkLauncher sparkLauncher = + new SparkLauncher().setAppResource(currentJar).setMainClass(SparkMain.class.getName()); if (!StringUtils.isNullOrEmpty(propertiesFile)) { sparkLauncher.setPropertiesFile(propertiesFile); diff --git a/hudi-client/pom.xml b/hudi-client/pom.xml index d05948eef..d350777cf 100644 --- a/hudi-client/pom.xml +++ b/hudi-client/pom.xml @@ -26,6 +26,10 @@ hudi-client jar + + ${project.parent.basedir} + + diff --git a/hudi-client/src/main/java/org/apache/hudi/AbstractHoodieClient.java b/hudi-client/src/main/java/org/apache/hudi/AbstractHoodieClient.java index c6634783f..f44e06280 100644 --- a/hudi-client/src/main/java/org/apache/hudi/AbstractHoodieClient.java +++ b/hudi-client/src/main/java/org/apache/hudi/AbstractHoodieClient.java @@ -32,8 +32,8 @@ import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaSparkContext; /** - * Abstract class taking care of holding common member variables (FileSystem, SparkContext, HoodieConfigs) - * Also, manages embedded timeline-server if enabled. + * Abstract class taking care of holding common member variables (FileSystem, SparkContext, HoodieConfigs) Also, manages + * embedded timeline-server if enabled. */ public abstract class AbstractHoodieClient implements Serializable, AutoCloseable { @@ -45,10 +45,9 @@ public abstract class AbstractHoodieClient implements Serializable, AutoCloseabl protected final String basePath; /** - * Timeline Server has the same lifetime as that of Client. - * Any operations done on the same timeline service will be able to take advantage - * of the cached file-system view. New completed actions will be synced automatically - * in an incremental fashion. + * Timeline Server has the same lifetime as that of Client. Any operations done on the same timeline service will be + * able to take advantage of the cached file-system view. New completed actions will be synced automatically in an + * incremental fashion. */ private transient Option timelineServer; private final boolean shouldStopTimelineServer; diff --git a/hudi-client/src/main/java/org/apache/hudi/CompactionAdminClient.java b/hudi-client/src/main/java/org/apache/hudi/CompactionAdminClient.java index d3ac06d5f..7fc34c433 100644 --- a/hudi-client/src/main/java/org/apache/hudi/CompactionAdminClient.java +++ b/hudi-client/src/main/java/org/apache/hudi/CompactionAdminClient.java @@ -69,8 +69,7 @@ public class CompactionAdminClient extends AbstractHoodieClient { super(jsc, HoodieWriteConfig.newBuilder().withPath(basePath).build()); } - public CompactionAdminClient(JavaSparkContext jsc, String basePath, - Option timelineServer) { + public CompactionAdminClient(JavaSparkContext jsc, String basePath, Option timelineServer) { super(jsc, HoodieWriteConfig.newBuilder().withPath(basePath).build(), timelineServer); } @@ -78,11 +77,11 @@ public class CompactionAdminClient extends AbstractHoodieClient { * Validate all compaction operations in a compaction plan. Verifies the file-slices are consistent with corresponding * compaction operations. * - * @param metaClient Hoodie Table Meta Client + * @param metaClient Hoodie Table Meta Client * @param compactionInstant Compaction Instant */ - public List validateCompactionPlan(HoodieTableMetaClient metaClient, - String compactionInstant, int parallelism) throws IOException { + public List validateCompactionPlan(HoodieTableMetaClient metaClient, String compactionInstant, + int parallelism) throws IOException { HoodieCompactionPlan plan = getCompactionPlan(metaClient, compactionInstant); HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline()); @@ -108,19 +107,17 @@ public class CompactionAdminClient extends AbstractHoodieClient { * This operation MUST be executed with compactions and writer turned OFF. * * @param compactionInstant Compaction Instant - * @param skipValidation Skip validation step - * @param parallelism Parallelism - * @param dryRun Dry Run + * @param skipValidation Skip validation step + * @param parallelism Parallelism + * @param dryRun Dry Run */ - public List unscheduleCompactionPlan( - String compactionInstant, boolean skipValidation, int parallelism, boolean dryRun) throws Exception { + public List unscheduleCompactionPlan(String compactionInstant, boolean skipValidation, + int parallelism, boolean dryRun) throws Exception { HoodieTableMetaClient metaClient = createMetaClient(false); - List> renameActions = - getRenamingActionsForUnschedulingCompactionPlan(metaClient, compactionInstant, parallelism, - Option.empty(), skipValidation); + List> renameActions = getRenamingActionsForUnschedulingCompactionPlan(metaClient, + compactionInstant, parallelism, Option.empty(), skipValidation); - List res = - runRenamingOps(metaClient, renameActions, parallelism, dryRun); + List res = runRenamingOps(metaClient, renameActions, parallelism, dryRun); Option success = Option.fromJavaOptional(res.stream().map(r -> (r.isExecuted() && r.isSuccess())).reduce(Boolean::logicalAnd)); @@ -145,21 +142,20 @@ public class CompactionAdminClient extends AbstractHoodieClient { } /** - * Remove a fileId from pending compaction. Removes the associated compaction operation and rename delta-files - * that were generated for that file-id after the compaction operation was scheduled. + * Remove a fileId from pending compaction. Removes the associated compaction operation and rename delta-files that + * were generated for that file-id after the compaction operation was scheduled. * * This operation MUST be executed with compactions and writer turned OFF. * - * @param fgId FileGroupId to be unscheduled + * @param fgId FileGroupId to be unscheduled * @param skipValidation Skip validation - * @param dryRun Dry Run Mode + * @param dryRun Dry Run Mode */ - public List unscheduleCompactionFileId(HoodieFileGroupId fgId, - boolean skipValidation, boolean dryRun) throws Exception { + public List unscheduleCompactionFileId(HoodieFileGroupId fgId, boolean skipValidation, boolean dryRun) + throws Exception { HoodieTableMetaClient metaClient = createMetaClient(false); List> renameActions = - getRenamingActionsForUnschedulingCompactionForFileId(metaClient, fgId, - Option.empty(), skipValidation); + getRenamingActionsForUnschedulingCompactionForFileId(metaClient, fgId, Option.empty(), skipValidation); List res = runRenamingOps(metaClient, renameActions, 1, dryRun); @@ -167,15 +163,15 @@ public class CompactionAdminClient extends AbstractHoodieClient { // Ready to remove this file-Id from compaction request Pair compactionOperationWithInstant = CompactionUtils.getAllPendingCompactionOperations(metaClient).get(fgId); - HoodieCompactionPlan plan = CompactionUtils - .getCompactionPlan(metaClient, compactionOperationWithInstant.getKey()); - List newOps = plan.getOperations().stream() - .filter(op -> (!op.getFileId().equals(fgId.getFileId())) - && (!op.getPartitionPath().equals(fgId.getPartitionPath()))).collect(Collectors.toList()); + HoodieCompactionPlan plan = + CompactionUtils.getCompactionPlan(metaClient, compactionOperationWithInstant.getKey()); + List newOps = plan.getOperations().stream().filter( + op -> (!op.getFileId().equals(fgId.getFileId())) && (!op.getPartitionPath().equals(fgId.getPartitionPath()))) + .collect(Collectors.toList()); HoodieCompactionPlan newPlan = HoodieCompactionPlan.newBuilder().setOperations(newOps).setExtraMetadata(plan.getExtraMetadata()).build(); - HoodieInstant inflight = new HoodieInstant(State.INFLIGHT, COMPACTION_ACTION, - compactionOperationWithInstant.getLeft()); + HoodieInstant inflight = + new HoodieInstant(State.INFLIGHT, COMPACTION_ACTION, compactionOperationWithInstant.getLeft()); Path inflightPath = new Path(metaClient.getMetaPath(), inflight.getFileName()); if (metaClient.getFs().exists(inflightPath)) { // revert if in inflight state @@ -189,28 +185,28 @@ public class CompactionAdminClient extends AbstractHoodieClient { } /** - * Renames delta files to make file-slices consistent with the timeline as dictated by Hoodie metadata. - * Use when compaction unschedule fails partially. + * Renames delta files to make file-slices consistent with the timeline as dictated by Hoodie metadata. Use when + * compaction unschedule fails partially. * * This operation MUST be executed with compactions and writer turned OFF. + * * @param compactionInstant Compaction Instant to be repaired - * @param dryRun Dry Run Mode + * @param dryRun Dry Run Mode */ - public List repairCompaction(String compactionInstant, - int parallelism, boolean dryRun) throws Exception { + public List repairCompaction(String compactionInstant, int parallelism, boolean dryRun) + throws Exception { HoodieTableMetaClient metaClient = createMetaClient(false); - List validationResults = - validateCompactionPlan(metaClient, compactionInstant, parallelism); - List failed = validationResults.stream() - .filter(v -> !v.isSuccess()).collect(Collectors.toList()); + List validationResults = validateCompactionPlan(metaClient, compactionInstant, parallelism); + List failed = + validationResults.stream().filter(v -> !v.isSuccess()).collect(Collectors.toList()); if (failed.isEmpty()) { return new ArrayList<>(); } - final HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, - metaClient.getCommitsAndCompactionTimeline()); - List> renameActions = failed.stream().flatMap(v -> - getRenamingActionsToAlignWithCompactionOperation(metaClient, compactionInstant, + final HoodieTableFileSystemView fsView = + new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline()); + List> renameActions = + failed.stream().flatMap(v -> getRenamingActionsToAlignWithCompactionOperation(metaClient, compactionInstant, v.getOperation(), Option.of(fsView)).stream()).collect(Collectors.toList()); return runRenamingOps(metaClient, renameActions, parallelism, dryRun); } @@ -218,11 +214,10 @@ public class CompactionAdminClient extends AbstractHoodieClient { /** * Construction Compaction Plan from compaction instant */ - private static HoodieCompactionPlan getCompactionPlan(HoodieTableMetaClient metaClient, - String compactionInstant) throws IOException { - HoodieCompactionPlan compactionPlan = AvroUtils.deserializeCompactionPlan( - metaClient.getActiveTimeline().getInstantAuxiliaryDetails( - HoodieTimeline.getCompactionRequestedInstant(compactionInstant)).get()); + private static HoodieCompactionPlan getCompactionPlan(HoodieTableMetaClient metaClient, String compactionInstant) + throws IOException { + HoodieCompactionPlan compactionPlan = AvroUtils.deserializeCompactionPlan(metaClient.getActiveTimeline() + .getInstantAuxiliaryDetails(HoodieTimeline.getCompactionRequestedInstant(compactionInstant)).get()); return compactionPlan; } @@ -230,28 +225,26 @@ public class CompactionAdminClient extends AbstractHoodieClient { * Get Renaming actions to ensure the log-files of merged file-slices is aligned with compaction operation. This * method is used to recover from failures during unschedule compaction operations. * - * @param metaClient Hoodie Table Meta Client + * @param metaClient Hoodie Table Meta Client * @param compactionInstant Compaction Instant - * @param op Compaction Operation - * @param fsViewOpt File System View + * @param op Compaction Operation + * @param fsViewOpt File System View */ protected static List> getRenamingActionsToAlignWithCompactionOperation( HoodieTableMetaClient metaClient, String compactionInstant, CompactionOperation op, Option fsViewOpt) { - HoodieTableFileSystemView fileSystemView = fsViewOpt.isPresent() ? fsViewOpt.get() : - new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline()); + HoodieTableFileSystemView fileSystemView = fsViewOpt.isPresent() ? fsViewOpt.get() + : new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline()); HoodieInstant lastInstant = metaClient.getCommitsAndCompactionTimeline().lastInstant().get(); FileSlice merged = fileSystemView.getLatestMergedFileSlicesBeforeOrOn(op.getPartitionPath(), lastInstant.getTimestamp()) .filter(fs -> fs.getFileId().equals(op.getFileId())).findFirst().get(); - final int maxVersion = - op.getDeltaFilePaths().stream().map(lf -> FSUtils.getFileVersionFromLog(new Path(lf))) - .reduce((x, y) -> x > y ? x : y).orElse(0); + final int maxVersion = op.getDeltaFilePaths().stream().map(lf -> FSUtils.getFileVersionFromLog(new Path(lf))) + .reduce((x, y) -> x > y ? x : y).orElse(0); List logFilesToBeMoved = merged.getLogFiles().filter(lf -> lf.getLogVersion() > maxVersion).collect(Collectors.toList()); return logFilesToBeMoved.stream().map(lf -> { - Preconditions.checkArgument(lf.getLogVersion() - maxVersion > 0, - "Expect new log version to be sane"); + Preconditions.checkArgument(lf.getLogVersion() - maxVersion > 0, "Expect new log version to be sane"); HoodieLogFile newLogFile = new HoodieLogFile(new Path(lf.getPath().getParent(), FSUtils.makeLogFileName(lf.getFileId(), "." + FSUtils.getFileExtensionFromLog(lf.getPath()), compactionInstant, lf.getLogVersion() - maxVersion, HoodieLogFormat.UNKNOWN_WRITE_TOKEN))); @@ -280,16 +273,15 @@ public class CompactionAdminClient extends AbstractHoodieClient { /** * Check if a compaction operation is valid * - * @param metaClient Hoodie Table Meta client + * @param metaClient Hoodie Table Meta client * @param compactionInstant Compaction Instant - * @param operation Compaction Operation - * @param fsViewOpt File System View + * @param operation Compaction Operation + * @param fsViewOpt File System View */ - private ValidationOpResult validateCompactionOperation(HoodieTableMetaClient metaClient, - String compactionInstant, CompactionOperation operation, Option fsViewOpt) - throws IOException { - HoodieTableFileSystemView fileSystemView = fsViewOpt.isPresent() ? fsViewOpt.get() : - new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline()); + private ValidationOpResult validateCompactionOperation(HoodieTableMetaClient metaClient, String compactionInstant, + CompactionOperation operation, Option fsViewOpt) throws IOException { + HoodieTableFileSystemView fileSystemView = fsViewOpt.isPresent() ? fsViewOpt.get() + : new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline()); Option lastInstant = metaClient.getCommitsAndCompactionTimeline().lastInstant(); try { if (lastInstant.isPresent()) { @@ -300,45 +292,42 @@ public class CompactionAdminClient extends AbstractHoodieClient { FileSlice fs = fileSliceOptional.get(); Option df = fs.getDataFile(); if (operation.getDataFilePath().isPresent()) { - String expPath = metaClient.getFs().getFileStatus(new Path(operation.getDataFilePath().get())).getPath() - .toString(); - Preconditions.checkArgument(df.isPresent(), "Data File must be present. File Slice was : " - + fs + ", operation :" + operation); + String expPath = + metaClient.getFs().getFileStatus(new Path(operation.getDataFilePath().get())).getPath().toString(); + Preconditions.checkArgument(df.isPresent(), + "Data File must be present. File Slice was : " + fs + ", operation :" + operation); Preconditions.checkArgument(df.get().getPath().equals(expPath), "Base Path in operation is specified as " + expPath + " but got path " + df.get().getPath()); } Set logFilesInFileSlice = fs.getLogFiles().collect(Collectors.toSet()); - Set logFilesInCompactionOp = operation.getDeltaFilePaths().stream() - .map(dp -> { - try { - FileStatus[] fileStatuses = metaClient.getFs().listStatus(new Path(dp)); - Preconditions.checkArgument(fileStatuses.length == 1, "Expect only 1 file-status"); - return new HoodieLogFile(fileStatuses[0]); - } catch (FileNotFoundException fe) { - throw new CompactionValidationException(fe.getMessage()); - } catch (IOException ioe) { - throw new HoodieIOException(ioe.getMessage(), ioe); - } - }).collect(Collectors.toSet()); - Set missing = - logFilesInCompactionOp.stream().filter(lf -> !logFilesInFileSlice.contains(lf)) - .collect(Collectors.toSet()); + Set logFilesInCompactionOp = operation.getDeltaFilePaths().stream().map(dp -> { + try { + FileStatus[] fileStatuses = metaClient.getFs().listStatus(new Path(dp)); + Preconditions.checkArgument(fileStatuses.length == 1, "Expect only 1 file-status"); + return new HoodieLogFile(fileStatuses[0]); + } catch (FileNotFoundException fe) { + throw new CompactionValidationException(fe.getMessage()); + } catch (IOException ioe) { + throw new HoodieIOException(ioe.getMessage(), ioe); + } + }).collect(Collectors.toSet()); + Set missing = logFilesInCompactionOp.stream().filter(lf -> !logFilesInFileSlice.contains(lf)) + .collect(Collectors.toSet()); Preconditions.checkArgument(missing.isEmpty(), - "All log files specified in compaction operation is not present. Missing :" + missing - + ", Exp :" + logFilesInCompactionOp + ", Got :" + logFilesInFileSlice); - Set diff = - logFilesInFileSlice.stream().filter(lf -> !logFilesInCompactionOp.contains(lf)) - .collect(Collectors.toSet()); + "All log files specified in compaction operation is not present. Missing :" + missing + ", Exp :" + + logFilesInCompactionOp + ", Got :" + logFilesInFileSlice); + Set diff = logFilesInFileSlice.stream().filter(lf -> !logFilesInCompactionOp.contains(lf)) + .collect(Collectors.toSet()); Preconditions.checkArgument(diff.stream().allMatch(lf -> lf.getBaseCommitTime().equals(compactionInstant)), "There are some log-files which are neither specified in compaction plan " + "nor present after compaction request instant. Some of these :" + diff); } else { - throw new CompactionValidationException("Unable to find file-slice for file-id (" + operation.getFileId() - + " Compaction operation is invalid."); + throw new CompactionValidationException( + "Unable to find file-slice for file-id (" + operation.getFileId() + " Compaction operation is invalid."); } } else { - throw new CompactionValidationException("Unable to find any committed instant. Compaction Operation may " - + "be pointing to stale file-slices"); + throw new CompactionValidationException( + "Unable to find any committed instant. Compaction Operation may " + "be pointing to stale file-slices"); } } catch (CompactionValidationException | IllegalArgumentException e) { return new ValidationOpResult(operation, false, Option.of(e)); @@ -349,7 +338,7 @@ public class CompactionAdminClient extends AbstractHoodieClient { /** * Execute Renaming operation * - * @param metaClient HoodieTable MetaClient + * @param metaClient HoodieTable MetaClient * @param renameActions List of rename operations */ private List runRenamingOps(HoodieTableMetaClient metaClient, @@ -374,8 +363,7 @@ public class CompactionAdminClient extends AbstractHoodieClient { }).collect(); } else { log.info("Dry-Run Mode activated for rename operations"); - return renameActions.parallelStream() - .map(lfPair -> new RenameOpResult(lfPair, false, false, Option.empty())) + return renameActions.parallelStream().map(lfPair -> new RenameOpResult(lfPair, false, false, Option.empty())) .collect(Collectors.toList()); } } @@ -385,28 +373,28 @@ public class CompactionAdminClient extends AbstractHoodieClient { * Generate renaming actions for unscheduling a pending compaction plan. NOTE: Can only be used safely when no writer * (ingestion/compaction) is running. * - * @param metaClient Hoodie Table MetaClient + * @param metaClient Hoodie Table MetaClient * @param compactionInstant Compaction Instant to be unscheduled - * @param fsViewOpt Cached File System View - * @param skipValidation Skip Validation + * @param fsViewOpt Cached File System View + * @param skipValidation Skip Validation * @return list of pairs of log-files (old, new) and for each pair, rename must be done to successfully unschedule - * compaction. + * compaction. */ protected List> getRenamingActionsForUnschedulingCompactionPlan( HoodieTableMetaClient metaClient, String compactionInstant, int parallelism, Option fsViewOpt, boolean skipValidation) throws IOException { - HoodieTableFileSystemView fsView = fsViewOpt.isPresent() ? fsViewOpt.get() : - new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline()); + HoodieTableFileSystemView fsView = fsViewOpt.isPresent() ? fsViewOpt.get() + : new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline()); HoodieCompactionPlan plan = getCompactionPlan(metaClient, compactionInstant); if (plan.getOperations() != null) { - log.info("Number of Compaction Operations :" + plan.getOperations().size() - + " for instant :" + compactionInstant); + log.info( + "Number of Compaction Operations :" + plan.getOperations().size() + " for instant :" + compactionInstant); List ops = plan.getOperations().stream() .map(CompactionOperation::convertFromAvroRecordInstance).collect(Collectors.toList()); return jsc.parallelize(ops, parallelism).flatMap(op -> { try { - return getRenamingActionsForUnschedulingCompactionOperation(metaClient, compactionInstant, - op, Option.of(fsView), skipValidation).iterator(); + return getRenamingActionsForUnschedulingCompactionOperation(metaClient, compactionInstant, op, + Option.of(fsView), skipValidation).iterator(); } catch (IOException ioe) { throw new HoodieIOException(ioe.getMessage(), ioe); } catch (CompactionValidationException ve) { @@ -422,20 +410,20 @@ public class CompactionAdminClient extends AbstractHoodieClient { * Generate renaming actions for unscheduling a compaction operation NOTE: Can only be used safely when no writer * (ingestion/compaction) is running. * - * @param metaClient Hoodie Table MetaClient + * @param metaClient Hoodie Table MetaClient * @param compactionInstant Compaction Instant - * @param operation Compaction Operation - * @param fsViewOpt Cached File System View - * @param skipValidation Skip Validation + * @param operation Compaction Operation + * @param fsViewOpt Cached File System View + * @param skipValidation Skip Validation * @return list of pairs of log-files (old, new) and for each pair, rename must be done to successfully unschedule - * compaction. + * compaction. */ public List> getRenamingActionsForUnschedulingCompactionOperation( HoodieTableMetaClient metaClient, String compactionInstant, CompactionOperation operation, Option fsViewOpt, boolean skipValidation) throws IOException { List> result = new ArrayList<>(); - HoodieTableFileSystemView fileSystemView = fsViewOpt.isPresent() ? fsViewOpt.get() : - new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline()); + HoodieTableFileSystemView fileSystemView = fsViewOpt.isPresent() ? fsViewOpt.get() + : new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline()); if (!skipValidation) { validateCompactionOperation(metaClient, compactionInstant, operation, Option.of(fileSystemView)); } @@ -444,15 +432,13 @@ public class CompactionAdminClient extends AbstractHoodieClient { fileSystemView.getLatestMergedFileSlicesBeforeOrOn(operation.getPartitionPath(), lastInstant.getTimestamp()) .filter(fs -> fs.getFileId().equals(operation.getFileId())).findFirst().get(); List logFilesToRepair = - merged.getLogFiles().filter(lf -> lf.getBaseCommitTime().equals(compactionInstant)) - .sorted(HoodieLogFile.getLogFileComparator()) - .collect(Collectors.toList()); + merged.getLogFiles().filter(lf -> lf.getBaseCommitTime().equals(compactionInstant)) + .sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList()); FileSlice fileSliceForCompaction = fileSystemView.getLatestFileSlicesBeforeOrOn(operation.getPartitionPath(), operation.getBaseInstantTime(), true) .filter(fs -> fs.getFileId().equals(operation.getFileId())).findFirst().get(); - int maxUsedVersion = - fileSliceForCompaction.getLogFiles().findFirst().map(HoodieLogFile::getLogVersion) - .orElse(HoodieLogFile.LOGFILE_BASE_VERSION - 1); + int maxUsedVersion = fileSliceForCompaction.getLogFiles().findFirst().map(HoodieLogFile::getLogVersion) + .orElse(HoodieLogFile.LOGFILE_BASE_VERSION - 1); String logExtn = fileSliceForCompaction.getLogFiles().findFirst().map(lf -> "." + lf.getFileExtension()) .orElse(HoodieLogFile.DELTA_EXTENSION); String parentPath = fileSliceForCompaction.getDataFile().map(df -> new Path(df.getPath()).getParent().toString()) @@ -471,16 +457,16 @@ public class CompactionAdminClient extends AbstractHoodieClient { * Generate renaming actions for unscheduling a fileId from pending compaction. NOTE: Can only be used safely when no * writer (ingestion/compaction) is running. * - * @param metaClient Hoodie Table MetaClient - * @param fgId FileGroupId to remove compaction - * @param fsViewOpt Cached File System View + * @param metaClient Hoodie Table MetaClient + * @param fgId FileGroupId to remove compaction + * @param fsViewOpt Cached File System View * @param skipValidation Skip Validation * @return list of pairs of log-files (old, new) and for each pair, rename must be done to successfully unschedule - * compaction. + * compaction. */ public List> getRenamingActionsForUnschedulingCompactionForFileId( - HoodieTableMetaClient metaClient, HoodieFileGroupId fgId, - Option fsViewOpt, boolean skipValidation) throws IOException { + HoodieTableMetaClient metaClient, HoodieFileGroupId fgId, Option fsViewOpt, + boolean skipValidation) throws IOException { Map> allPendingCompactions = CompactionUtils.getAllPendingCompactionOperations(metaClient); if (allPendingCompactions.containsKey(fgId)) { @@ -496,20 +482,19 @@ public class CompactionAdminClient extends AbstractHoodieClient { */ public static class RenameOpResult extends OperationResult { - public RenameOpResult() { + public RenameOpResult() {} + + public RenameOpResult(Pair op, boolean success, Option exception) { + super( + new RenameInfo(op.getKey().getFileId(), op.getKey().getPath().toString(), op.getRight().getPath().toString()), + success, exception); } - public RenameOpResult(Pair op, boolean success, + public RenameOpResult(Pair op, boolean executed, boolean success, Option exception) { - super(new RenameInfo(op.getKey().getFileId(), op.getKey().getPath().toString(), - op.getRight().getPath().toString()), success, exception); - } - - public RenameOpResult( - Pair op, boolean executed, boolean success, - Option exception) { - super(new RenameInfo(op.getKey().getFileId(), op.getKey().getPath().toString(), - op.getRight().getPath().toString()), executed, success, exception); + super( + new RenameInfo(op.getKey().getFileId(), op.getKey().getPath().toString(), op.getRight().getPath().toString()), + executed, success, exception); } } @@ -518,11 +503,9 @@ public class CompactionAdminClient extends AbstractHoodieClient { */ public static class ValidationOpResult extends OperationResult { - public ValidationOpResult() { - } + public ValidationOpResult() {} - public ValidationOpResult( - CompactionOperation operation, boolean success, Option exception) { + public ValidationOpResult(CompactionOperation operation, boolean success, Option exception) { super(operation, success, exception); } } @@ -533,8 +516,7 @@ public class CompactionAdminClient extends AbstractHoodieClient { public String srcPath; public String destPath; - public RenameInfo() { - } + public RenameInfo() {} public RenameInfo(String fileId, String srcPath, String destPath) { this.fileId = fileId; diff --git a/hudi-client/src/main/java/org/apache/hudi/HoodieReadClient.java b/hudi-client/src/main/java/org/apache/hudi/HoodieReadClient.java index 43f4c5760..819279b25 100644 --- a/hudi-client/src/main/java/org/apache/hudi/HoodieReadClient.java +++ b/hudi-client/src/main/java/org/apache/hudi/HoodieReadClient.java @@ -58,9 +58,8 @@ public class HoodieReadClient extends AbstractHoo private static final Logger logger = LogManager.getLogger(HoodieReadClient.class); /** - * TODO: We need to persist the index type into hoodie.properties and be able to access the index - * just with a simple basepath pointing to the dataset. Until, then just always assume a - * BloomIndex + * TODO: We need to persist the index type into hoodie.properties and be able to access the index just with a simple + * basepath pointing to the dataset. Until, then just always assume a BloomIndex */ private final transient HoodieIndex index; private final HoodieTimeline commitTimeline; @@ -70,13 +69,11 @@ public class HoodieReadClient extends AbstractHoo /** * @param basePath path to Hoodie dataset */ - public HoodieReadClient(JavaSparkContext jsc, String basePath, - Option timelineService) { + public HoodieReadClient(JavaSparkContext jsc, String basePath, Option timelineService) { this(jsc, HoodieWriteConfig.newBuilder().withPath(basePath) // by default we use HoodieBloomIndex - .withIndexConfig( - HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) - .build(), timelineService); + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build(), + timelineService); } /** @@ -130,8 +127,7 @@ public class HoodieReadClient extends AbstractHoo private void assertSqlContext() { if (!sqlContextOpt.isPresent()) { - throw new IllegalStateException( - "SQLContext must be set, when performing dataframe operations"); + throw new IllegalStateException("SQLContext must be set, when performing dataframe operations"); } } @@ -152,17 +148,16 @@ public class HoodieReadClient extends AbstractHoo */ public Dataset readROView(JavaRDD hoodieKeys, int parallelism) { assertSqlContext(); - JavaPairRDD>> lookupResultRDD = index - .fetchRecordLocation(hoodieKeys, jsc, hoodieTable); - JavaPairRDD> keyToFileRDD = lookupResultRDD - .mapToPair(r -> new Tuple2<>(r._1, convertToDataFilePath(r._2))); + JavaPairRDD>> lookupResultRDD = + index.fetchRecordLocation(hoodieKeys, jsc, hoodieTable); + JavaPairRDD> keyToFileRDD = + lookupResultRDD.mapToPair(r -> new Tuple2<>(r._1, convertToDataFilePath(r._2))); List paths = keyToFileRDD.filter(keyFileTuple -> keyFileTuple._2().isPresent()) .map(keyFileTuple -> keyFileTuple._2().get()).collect(); // record locations might be same for multiple keys, so need a unique list Set uniquePaths = new HashSet<>(paths); - Dataset originalDF = sqlContextOpt.get().read() - .parquet(uniquePaths.toArray(new String[uniquePaths.size()])); + Dataset originalDF = sqlContextOpt.get().read().parquet(uniquePaths.toArray(new String[uniquePaths.size()])); StructType schema = originalDF.schema(); JavaPairRDD keyRowRDD = originalDF.javaRDD().mapToPair(row -> { HoodieKey key = new HoodieKey(row.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD), @@ -176,18 +171,16 @@ public class HoodieReadClient extends AbstractHoo } /** - * Checks if the given [Keys] exists in the hoodie table and returns [Key, Option[FullFilePath]] - * If the optional FullFilePath value is not present, then the key is not found. If the - * FullFilePath value is present, it is the path component (without scheme) of the URI underlying - * file + * Checks if the given [Keys] exists in the hoodie table and returns [Key, Option[FullFilePath]] If the optional + * FullFilePath value is not present, then the key is not found. If the FullFilePath value is present, it is the path + * component (without scheme) of the URI underlying file */ public JavaPairRDD> checkExists(JavaRDD hoodieKeys) { return index.fetchRecordLocation(hoodieKeys, jsc, hoodieTable); } /** - * Filter out HoodieRecords that already exists in the output folder. This is useful in - * deduplication. + * Filter out HoodieRecords that already exists in the output folder. This is useful in deduplication. * * @param hoodieRecords Input RDD of Hoodie records. * @return A subset of hoodieRecords RDD, with existing records filtered out. @@ -198,27 +191,27 @@ public class HoodieReadClient extends AbstractHoo } /** - * Looks up the index and tags each incoming record with a location of a file that contains the - * row (if it is actually present). Input RDD should contain no duplicates if needed. + * Looks up the index and tags each incoming record with a location of a file that contains the row (if it is actually + * present). Input RDD should contain no duplicates if needed. * * @param hoodieRecords Input RDD of Hoodie records * @return Tagged RDD of Hoodie records */ - public JavaRDD> tagLocation(JavaRDD> hoodieRecords) - throws HoodieIndexException { + public JavaRDD> tagLocation(JavaRDD> hoodieRecords) throws HoodieIndexException { return index.tagLocation(hoodieRecords, jsc, hoodieTable); } /** * Return all pending compactions with instant time for clients to decide what to compact next. + * * @return */ public List> getPendingCompactions() { - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), - hoodieTable.getMetaClient().getBasePath(), true); + HoodieTableMetaClient metaClient = + new HoodieTableMetaClient(jsc.hadoopConfiguration(), hoodieTable.getMetaClient().getBasePath(), true); return CompactionUtils.getAllPendingCompactionPlans(metaClient).stream() - .map(instantWorkloadPair -> - Pair.of(instantWorkloadPair.getKey().getTimestamp(), instantWorkloadPair.getValue())) + .map( + instantWorkloadPair -> Pair.of(instantWorkloadPair.getKey().getTimestamp(), instantWorkloadPair.getValue())) .collect(Collectors.toList()); } } diff --git a/hudi-client/src/main/java/org/apache/hudi/HoodieWriteClient.java b/hudi-client/src/main/java/org/apache/hudi/HoodieWriteClient.java index 4a8482194..86fc7d020 100644 --- a/hudi-client/src/main/java/org/apache/hudi/HoodieWriteClient.java +++ b/hudi-client/src/main/java/org/apache/hudi/HoodieWriteClient.java @@ -87,11 +87,10 @@ import org.apache.spark.storage.StorageLevel; import scala.Tuple2; /** - * Hoodie Write Client helps you build datasets on HDFS [insert()] and then perform efficient - * mutations on a HDFS dataset [upsert()] + * Hoodie Write Client helps you build datasets on HDFS [insert()] and then perform efficient mutations on a HDFS + * dataset [upsert()] *

- * Note that, at any given time, there can only be one Spark job performing these operatons on a - * Hoodie dataset. + * Note that, at any given time, there can only be one Spark job performing these operatons on a Hoodie dataset. */ public class HoodieWriteClient extends AbstractHoodieClient { @@ -117,19 +116,17 @@ public class HoodieWriteClient extends AbstractHo * @param clientConfig * @param rollbackInFlight */ - public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig, - boolean rollbackInFlight) { + public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig, boolean rollbackInFlight) { this(jsc, clientConfig, rollbackInFlight, HoodieIndex.createIndex(clientConfig, jsc)); } @VisibleForTesting - HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig, - boolean rollbackInFlight, HoodieIndex index) { + HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig, boolean rollbackInFlight, HoodieIndex index) { this(jsc, clientConfig, rollbackInFlight, index, Option.empty()); } - public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig, - boolean rollbackInFlight, HoodieIndex index, Option timelineService) { + public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig, boolean rollbackInFlight, + HoodieIndex index, Option timelineService) { super(jsc, clientConfig, timelineService); this.index = index; this.metrics = new HoodieMetrics(config, config.getTableName()); @@ -137,26 +134,22 @@ public class HoodieWriteClient extends AbstractHo } public static SparkConf registerClasses(SparkConf conf) { - conf.registerKryoClasses( - new Class[]{HoodieWriteConfig.class, HoodieRecord.class, HoodieKey.class}); + conf.registerKryoClasses(new Class[] {HoodieWriteConfig.class, HoodieRecord.class, HoodieKey.class}); return conf; } /** - * Filter out HoodieRecords that already exists in the output folder. This is useful in - * deduplication. + * Filter out HoodieRecords that already exists in the output folder. This is useful in deduplication. * * @param hoodieRecords Input RDD of Hoodie records. * @return A subset of hoodieRecords RDD, with existing records filtered out. */ public JavaRDD> filterExists(JavaRDD> hoodieRecords) { // Create a Hoodie table which encapsulated the commits and files visible - HoodieTable table = HoodieTable.getHoodieTable( - createMetaClient(true), config, jsc); + HoodieTable table = HoodieTable.getHoodieTable(createMetaClient(true), config, jsc); indexTimer = metrics.getIndexCtx(); JavaRDD> recordsWithLocation = index.tagLocation(hoodieRecords, jsc, table); - metrics.updateIndexMetrics("lookup", metrics.getDurationInMs(indexTimer == null ? 0L : - indexTimer.stop())); + metrics.updateIndexMetrics("lookup", metrics.getDurationInMs(indexTimer == null ? 0L : indexTimer.stop())); indexTimer = null; return recordsWithLocation.filter(v1 -> !v1.isCurrentLocationKnown()); } @@ -168,14 +161,13 @@ public class HoodieWriteClient extends AbstractHo HoodieTable table = getTableAndInitCtx(records); try { // De-dupe/merge if needed - JavaRDD> dedupedRecords = combineOnCondition( - config.shouldCombineBeforeUpsert(), records, config.getUpsertShuffleParallelism()); + JavaRDD> dedupedRecords = + combineOnCondition(config.shouldCombineBeforeUpsert(), records, config.getUpsertShuffleParallelism()); indexTimer = metrics.getIndexCtx(); // perform index loop up to get existing location of records JavaRDD> taggedRecords = index.tagLocation(dedupedRecords, jsc, table); - metrics.updateIndexMetrics("lookup", metrics.getDurationInMs(indexTimer == null ? 0L : - indexTimer.stop())); + metrics.updateIndexMetrics("lookup", metrics.getDurationInMs(indexTimer == null ? 0L : indexTimer.stop())); indexTimer = null; return upsertRecordsInternal(taggedRecords, commitTime, table, true); } catch (Throwable e) { @@ -189,15 +181,13 @@ public class HoodieWriteClient extends AbstractHo /** * Upserts the given prepared records into the Hoodie table, at the supplied commitTime. *

- * This implementation requires that the input records are already tagged, and de-duped if - * needed. + * This implementation requires that the input records are already tagged, and de-duped if needed. * * @param preppedRecords Prepared HoodieRecords to upsert * @param commitTime Commit Time handle * @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts */ - public JavaRDD upsertPreppedRecords(JavaRDD> preppedRecords, - final String commitTime) { + public JavaRDD upsertPreppedRecords(JavaRDD> preppedRecords, final String commitTime) { HoodieTable table = getTableAndInitCtx(preppedRecords); try { return upsertRecordsInternal(preppedRecords, commitTime, table, true); @@ -205,17 +195,15 @@ public class HoodieWriteClient extends AbstractHo if (e instanceof HoodieUpsertException) { throw (HoodieUpsertException) e; } - throw new HoodieUpsertException( - "Failed to upsert prepared records for commit time " + commitTime, e); + throw new HoodieUpsertException("Failed to upsert prepared records for commit time " + commitTime, e); } } /** - * Inserts the given HoodieRecords, into the table. This API is intended to be used for normal - * writes. + * Inserts the given HoodieRecords, into the table. This API is intended to be used for normal writes. *

- * This implementation skips the index check and is able to leverage benefits such as small file - * handling/blocking alignment, as with upsert(), by profiling the workload + * This implementation skips the index check and is able to leverage benefits such as small file handling/blocking + * alignment, as with upsert(), by profiling the workload * * @param records HoodieRecords to insert * @param commitTime Commit Time handle @@ -225,8 +213,8 @@ public class HoodieWriteClient extends AbstractHo HoodieTable table = getTableAndInitCtx(records); try { // De-dupe/merge if needed - JavaRDD> dedupedRecords = combineOnCondition( - config.shouldCombineBeforeInsert(), records, config.getInsertShuffleParallelism()); + JavaRDD> dedupedRecords = + combineOnCondition(config.shouldCombineBeforeInsert(), records, config.getInsertShuffleParallelism()); return upsertRecordsInternal(dedupedRecords, commitTime, table, false); } catch (Throwable e) { @@ -240,16 +228,15 @@ public class HoodieWriteClient extends AbstractHo /** * Inserts the given prepared records into the Hoodie table, at the supplied commitTime. *

- * This implementation skips the index check, skips de-duping and is able to leverage benefits - * such as small file handling/blocking alignment, as with insert(), by profiling the workload. - * The prepared HoodieRecords should be de-duped if needed. + * This implementation skips the index check, skips de-duping and is able to leverage benefits such as small file + * handling/blocking alignment, as with insert(), by profiling the workload. The prepared HoodieRecords should be + * de-duped if needed. * * @param preppedRecords HoodieRecords to insert * @param commitTime Commit Time handle * @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts */ - public JavaRDD insertPreppedRecords(JavaRDD> preppedRecords, - final String commitTime) { + public JavaRDD insertPreppedRecords(JavaRDD> preppedRecords, final String commitTime) { HoodieTable table = getTableAndInitCtx(preppedRecords); try { return upsertRecordsInternal(preppedRecords, commitTime, table, false); @@ -257,44 +244,38 @@ public class HoodieWriteClient extends AbstractHo if (e instanceof HoodieInsertException) { throw e; } - throw new HoodieInsertException( - "Failed to insert prepared records for commit time " + commitTime, e); + throw new HoodieInsertException("Failed to insert prepared records for commit time " + commitTime, e); } } /** - * Loads the given HoodieRecords, as inserts into the table. This is suitable for doing big bulk - * loads into a Hoodie table for the very first time (e.g: converting an existing dataset to - * Hoodie). + * Loads the given HoodieRecords, as inserts into the table. This is suitable for doing big bulk loads into a Hoodie + * table for the very first time (e.g: converting an existing dataset to Hoodie). *

- * This implementation uses sortBy (which does range partitioning based on reservoir sampling) and - * attempts to control the numbers of files with less memory compared to the {@link - * HoodieWriteClient#insert(JavaRDD, String)} + * This implementation uses sortBy (which does range partitioning based on reservoir sampling) and attempts to control + * the numbers of files with less memory compared to the {@link HoodieWriteClient#insert(JavaRDD, String)} * * @param records HoodieRecords to insert * @param commitTime Commit Time handle * @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts */ - public JavaRDD bulkInsert(JavaRDD> records, - final String commitTime) { + public JavaRDD bulkInsert(JavaRDD> records, final String commitTime) { return bulkInsert(records, commitTime, Option.empty()); } /** - * Loads the given HoodieRecords, as inserts into the table. This is suitable for doing big bulk - * loads into a Hoodie table for the very first time (e.g: converting an existing dataset to - * Hoodie). + * Loads the given HoodieRecords, as inserts into the table. This is suitable for doing big bulk loads into a Hoodie + * table for the very first time (e.g: converting an existing dataset to Hoodie). *

- * This implementation uses sortBy (which does range partitioning based on reservoir sampling) and - * attempts to control the numbers of files with less memory compared to the {@link - * HoodieWriteClient#insert(JavaRDD, String)}. Optionally it allows users to specify their own - * partitioner. If specified then it will be used for repartitioning records. See {@link - * UserDefinedBulkInsertPartitioner}. + * This implementation uses sortBy (which does range partitioning based on reservoir sampling) and attempts to control + * the numbers of files with less memory compared to the {@link HoodieWriteClient#insert(JavaRDD, String)}. Optionally + * it allows users to specify their own partitioner. If specified then it will be used for repartitioning records. See + * {@link UserDefinedBulkInsertPartitioner}. * * @param records HoodieRecords to insert * @param commitTime Commit Time handle - * @param bulkInsertPartitioner If specified then it will be used to partition input records - * before they are inserted into hoodie. + * @param bulkInsertPartitioner If specified then it will be used to partition input records before they are inserted + * into hoodie. * @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts */ public JavaRDD bulkInsert(JavaRDD> records, final String commitTime, @@ -302,8 +283,8 @@ public class HoodieWriteClient extends AbstractHo HoodieTable table = getTableAndInitCtx(records); try { // De-dupe/merge if needed - JavaRDD> dedupedRecords = combineOnCondition( - config.shouldCombineBeforeInsert(), records, config.getInsertShuffleParallelism()); + JavaRDD> dedupedRecords = + combineOnCondition(config.shouldCombineBeforeInsert(), records, config.getInsertShuffleParallelism()); return bulkInsertInternal(dedupedRecords, commitTime, table, bulkInsertPartitioner); } catch (Throwable e) { @@ -315,24 +296,23 @@ public class HoodieWriteClient extends AbstractHo } /** - * Loads the given HoodieRecords, as inserts into the table. This is suitable for doing big bulk - * loads into a Hoodie table for the very first time (e.g: converting an existing dataset to - * Hoodie). The input records should contain no duplicates if needed. + * Loads the given HoodieRecords, as inserts into the table. This is suitable for doing big bulk loads into a Hoodie + * table for the very first time (e.g: converting an existing dataset to Hoodie). The input records should contain no + * duplicates if needed. *

- * This implementation uses sortBy (which does range partitioning based on reservoir sampling) and - * attempts to control the numbers of files with less memory compared to the {@link - * HoodieWriteClient#insert(JavaRDD, String)}. Optionally it allows users to specify their own - * partitioner. If specified then it will be used for repartitioning records. See {@link - * UserDefinedBulkInsertPartitioner}. + * This implementation uses sortBy (which does range partitioning based on reservoir sampling) and attempts to control + * the numbers of files with less memory compared to the {@link HoodieWriteClient#insert(JavaRDD, String)}. Optionally + * it allows users to specify their own partitioner. If specified then it will be used for repartitioning records. See + * {@link UserDefinedBulkInsertPartitioner}. * * @param preppedRecords HoodieRecords to insert * @param commitTime Commit Time handle - * @param bulkInsertPartitioner If specified then it will be used to partition input records - * before they are inserted into hoodie. + * @param bulkInsertPartitioner If specified then it will be used to partition input records before they are inserted + * into hoodie. * @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts */ - public JavaRDD bulkInsertPreppedRecords(JavaRDD> preppedRecords, - final String commitTime, Option bulkInsertPartitioner) { + public JavaRDD bulkInsertPreppedRecords(JavaRDD> preppedRecords, final String commitTime, + Option bulkInsertPartitioner) { HoodieTable table = getTableAndInitCtx(preppedRecords); try { return bulkInsertInternal(preppedRecords, commitTime, table, bulkInsertPartitioner); @@ -340,19 +320,16 @@ public class HoodieWriteClient extends AbstractHo if (e instanceof HoodieInsertException) { throw e; } - throw new HoodieInsertException( - "Failed to bulk insert prepared records for commit time " + commitTime, e); + throw new HoodieInsertException("Failed to bulk insert prepared records for commit time " + commitTime, e); } } - private JavaRDD bulkInsertInternal(JavaRDD> dedupedRecords, - String commitTime, HoodieTable table, - Option bulkInsertPartitioner) { + private JavaRDD bulkInsertInternal(JavaRDD> dedupedRecords, String commitTime, + HoodieTable table, Option bulkInsertPartitioner) { final JavaRDD> repartitionedRecords; final int parallelism = config.getBulkInsertShuffleParallelism(); if (bulkInsertPartitioner.isPresent()) { - repartitionedRecords = bulkInsertPartitioner.get() - .repartitionRecords(dedupedRecords, parallelism); + repartitionedRecords = bulkInsertPartitioner.get().repartitionRecords(dedupedRecords, parallelism); } else { // Now, sort the records and line them up nicely for loading. repartitionedRecords = dedupedRecords.sortBy(record -> { @@ -363,10 +340,9 @@ public class HoodieWriteClient extends AbstractHo }, true, parallelism); } - //generate new file ID prefixes for each output partition - final List fileIDPrefixes = IntStream.range(0, parallelism) - .mapToObj(i -> FSUtils.createNewFileIdPfx()) - .collect(Collectors.toList()); + // generate new file ID prefixes for each output partition + final List fileIDPrefixes = + IntStream.range(0, parallelism).mapToObj(i -> FSUtils.createNewFileIdPfx()).collect(Collectors.toList()); JavaRDD writeStatusRDD = repartitionedRecords .mapPartitionsWithIndex(new BulkInsertMapFunction(commitTime, config, table, fileIDPrefixes), true) @@ -375,8 +351,7 @@ public class HoodieWriteClient extends AbstractHo return updateIndexAndCommitIfNeeded(writeStatusRDD, table, commitTime); } - private void commitOnAutoCommit(String commitTime, JavaRDD resultRDD, - String actionType) { + private void commitOnAutoCommit(String commitTime, JavaRDD resultRDD, String actionType) { if (config.shouldAutoCommit()) { logger.info("Auto commit enabled: Committing " + commitTime); boolean commitResult = commit(commitTime, resultRDD, Option.empty(), actionType); @@ -388,8 +363,8 @@ public class HoodieWriteClient extends AbstractHo } } - private JavaRDD> combineOnCondition(boolean condition, - JavaRDD> records, int parallelism) { + private JavaRDD> combineOnCondition(boolean condition, JavaRDD> records, + int parallelism) { if (condition) { return deduplicateRecords(records, parallelism); } @@ -397,14 +372,13 @@ public class HoodieWriteClient extends AbstractHo } /** - * Save the workload profile in an intermediate file (here re-using commit files) This is useful - * when performing rollback for MOR datasets. Only updates are recorded in the workload profile - * metadata since updates to log blocks are unknown across batches Inserts (which are new parquet - * files) are rolled back based on commit time. // TODO : Create a new WorkloadProfile metadata - * file instead of using HoodieCommitMetadata + * Save the workload profile in an intermediate file (here re-using commit files) This is useful when performing + * rollback for MOR datasets. Only updates are recorded in the workload profile metadata since updates to log blocks + * are unknown across batches Inserts (which are new parquet files) are rolled back based on commit time. // TODO : + * Create a new WorkloadProfile metadata file instead of using HoodieCommitMetadata */ - private void saveWorkloadProfileMetadataToInflight(WorkloadProfile profile, HoodieTable table, - String commitTime) throws HoodieCommitException { + private void saveWorkloadProfileMetadataToInflight(WorkloadProfile profile, HoodieTable table, String commitTime) + throws HoodieCommitException { try { HoodieCommitMetadata metadata = new HoodieCommitMetadata(); profile.getPartitionPaths().stream().forEach(path -> { @@ -422,16 +396,14 @@ public class HoodieWriteClient extends AbstractHo HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); Option instant = activeTimeline.getCommitsTimeline().filterInflightsExcludingCompaction().lastInstant(); - activeTimeline.saveToInflight(instant.get(), - Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8))); + activeTimeline.saveToInflight(instant.get(), Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8))); } catch (IOException io) { - throw new HoodieCommitException( - "Failed to commit " + commitTime + " unable to save inflight metadata ", io); + throw new HoodieCommitException("Failed to commit " + commitTime + " unable to save inflight metadata ", io); } } - private JavaRDD upsertRecordsInternal(JavaRDD> preppedRecords, - String commitTime, HoodieTable hoodieTable, final boolean isUpsert) { + private JavaRDD upsertRecordsInternal(JavaRDD> preppedRecords, String commitTime, + HoodieTable hoodieTable, final boolean isUpsert) { // Cache the tagged records, so we don't end up computing both // TODO: Consistent contract in HoodieWriteClient regarding preppedRecord storage level handling @@ -451,14 +423,13 @@ public class HoodieWriteClient extends AbstractHo // partition using the insert partitioner final Partitioner partitioner = getPartitioner(hoodieTable, isUpsert, profile); JavaRDD> partitionedRecords = partition(preppedRecords, partitioner); - JavaRDD writeStatusRDD = partitionedRecords - .mapPartitionsWithIndex((partition, recordItr) -> { - if (isUpsert) { - return hoodieTable.handleUpsertPartition(commitTime, partition, recordItr, partitioner); - } else { - return hoodieTable.handleInsertPartition(commitTime, partition, recordItr, partitioner); - } - }, true).flatMap(List::iterator); + JavaRDD writeStatusRDD = partitionedRecords.mapPartitionsWithIndex((partition, recordItr) -> { + if (isUpsert) { + return hoodieTable.handleUpsertPartition(commitTime, partition, recordItr, partitioner); + } else { + return hoodieTable.handleInsertPartition(commitTime, partition, recordItr, partitioner); + } + }, true).flatMap(List::iterator); return updateIndexAndCommitIfNeeded(writeStatusRDD, hoodieTable, commitTime); } @@ -471,26 +442,24 @@ public class HoodieWriteClient extends AbstractHo } } - private JavaRDD updateIndexAndCommitIfNeeded(JavaRDD writeStatusRDD, - HoodieTable table, String commitTime) { + private JavaRDD updateIndexAndCommitIfNeeded(JavaRDD writeStatusRDD, HoodieTable table, + String commitTime) { // cache writeStatusRDD before updating index, so that all actions before this are not triggered again for future // RDD actions that are performed after updating the index. writeStatusRDD = writeStatusRDD.persist(config.getWriteStatusStorageLevel()); indexTimer = metrics.getIndexCtx(); // Update the index back JavaRDD statuses = index.updateLocation(writeStatusRDD, jsc, table); - metrics.updateIndexMetrics("update", metrics.getDurationInMs(indexTimer == null ? 0L : - indexTimer.stop())); + metrics.updateIndexMetrics("update", metrics.getDurationInMs(indexTimer == null ? 0L : indexTimer.stop())); indexTimer = null; // Trigger the insert and collect statuses commitOnAutoCommit(commitTime, statuses, table.getMetaClient().getCommitActionType()); return statuses; } - private JavaRDD> partition(JavaRDD> dedupedRecords, - Partitioner partitioner) { - return dedupedRecords.mapToPair(record -> new Tuple2<>( - new Tuple2<>(record.getKey(), Option.ofNullable(record.getCurrentLocation())), record)) + private JavaRDD> partition(JavaRDD> dedupedRecords, Partitioner partitioner) { + return dedupedRecords.mapToPair( + record -> new Tuple2<>(new Tuple2<>(record.getKey(), Option.ofNullable(record.getCurrentLocation())), record)) .partitionBy(partitioner).map(Tuple2::_2); } @@ -515,8 +484,7 @@ public class HoodieWriteClient extends AbstractHo logger.info("Commiting " + commitTime); // Create a Hoodie table which encapsulated the commits and files visible - HoodieTable table = HoodieTable.getHoodieTable( - createMetaClient(true), config, jsc); + HoodieTable table = HoodieTable.getHoodieTable(createMetaClient(true), config, jsc); HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); HoodieCommitMetadata metadata = new HoodieCommitMetadata(); @@ -545,8 +513,7 @@ public class HoodieWriteClient extends AbstractHo } // We cannot have unbounded commit files. Archive commits if we have to archive - HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(config, - createMetaClient(true)); + HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(config, createMetaClient(true)); archiveLog.archiveIfRequired(jsc); if (config.isAutoClean()) { // Call clean to cleanup if there is anything to cleanup after the commit, @@ -557,30 +524,27 @@ public class HoodieWriteClient extends AbstractHo } if (writeContext != null) { long durationInMs = metrics.getDurationInMs(writeContext.stop()); - metrics - .updateCommitMetrics(HoodieActiveTimeline.COMMIT_FORMATTER.parse(commitTime).getTime(), - durationInMs, metadata, actionType); + metrics.updateCommitMetrics(HoodieActiveTimeline.COMMIT_FORMATTER.parse(commitTime).getTime(), durationInMs, + metadata, actionType); writeContext = null; } logger.info("Committed " + commitTime); } catch (IOException e) { - throw new HoodieCommitException( - "Failed to complete commit " + config.getBasePath() + " at time " + commitTime, e); + throw new HoodieCommitException("Failed to complete commit " + config.getBasePath() + " at time " + commitTime, + e); } catch (ParseException e) { - throw new HoodieCommitException( - "Failed to complete commit " + config.getBasePath() + " at time " + commitTime - + "Instant time is not of valid format", e); + throw new HoodieCommitException("Failed to complete commit " + config.getBasePath() + " at time " + commitTime + + "Instant time is not of valid format", e); } return true; } /** - * Savepoint a specific commit. Latest version of data files as of the passed in commitTime will - * be referenced in the savepoint and will never be cleaned. The savepointed commit will never be - * rolledback or archived. + * Savepoint a specific commit. Latest version of data files as of the passed in commitTime will be referenced in the + * savepoint and will never be cleaned. The savepointed commit will never be rolledback or archived. *

- * This gives an option to rollback the state to the savepoint anytime. Savepoint needs to be - * manually created and deleted. + * This gives an option to rollback the state to the savepoint anytime. Savepoint needs to be manually created and + * deleted. *

* Savepoint should be on a commit that could not have been cleaned. * @@ -589,8 +553,7 @@ public class HoodieWriteClient extends AbstractHo * @return true if the savepoint was created successfully */ public boolean savepoint(String user, String comment) { - HoodieTable table = HoodieTable.getHoodieTable( - createMetaClient(true), config, jsc); + HoodieTable table = HoodieTable.getHoodieTable(createMetaClient(true), config, jsc); if (table.getCompletedCommitsTimeline().empty()) { throw new HoodieSavepointException("Could not savepoint. Commit timeline is empty"); } @@ -604,12 +567,11 @@ public class HoodieWriteClient extends AbstractHo } /** - * Savepoint a specific commit. Latest version of data files as of the passed in commitTime will - * be referenced in the savepoint and will never be cleaned. The savepointed commit will never be - * rolledback or archived. + * Savepoint a specific commit. Latest version of data files as of the passed in commitTime will be referenced in the + * savepoint and will never be cleaned. The savepointed commit will never be rolledback or archived. *

- * This gives an option to rollback the state to the savepoint anytime. Savepoint needs to be - * manually created and deleted. + * This gives an option to rollback the state to the savepoint anytime. Savepoint needs to be manually created and + * deleted. *

* Savepoint should be on a commit that could not have been cleaned. * @@ -619,39 +581,35 @@ public class HoodieWriteClient extends AbstractHo * @return true if the savepoint was created successfully */ public boolean savepoint(String commitTime, String user, String comment) { - HoodieTable table = HoodieTable.getHoodieTable( - createMetaClient(true), config, jsc); + HoodieTable table = HoodieTable.getHoodieTable(createMetaClient(true), config, jsc); if (table.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) { throw new UnsupportedOperationException("Savepointing is not supported or MergeOnRead table types"); } Option cleanInstant = table.getCompletedCleanTimeline().lastInstant(); - HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, - commitTime); + HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime); if (!table.getCompletedCommitsTimeline().containsInstant(commitInstant)) { - throw new HoodieSavepointException( - "Could not savepoint non-existing commit " + commitInstant); + throw new HoodieSavepointException("Could not savepoint non-existing commit " + commitInstant); } try { // Check the last commit that was not cleaned and check if savepoint time is > that commit String lastCommitRetained; if (cleanInstant.isPresent()) { - HoodieCleanMetadata cleanMetadata = AvroUtils.deserializeHoodieCleanMetadata( - table.getActiveTimeline().getInstantDetails(cleanInstant.get()).get()); + HoodieCleanMetadata cleanMetadata = AvroUtils + .deserializeHoodieCleanMetadata(table.getActiveTimeline().getInstantDetails(cleanInstant.get()).get()); lastCommitRetained = cleanMetadata.getEarliestCommitToRetain(); } else { lastCommitRetained = table.getCompletedCommitsTimeline().firstInstant().get().getTimestamp(); } // Cannot allow savepoint time on a commit that could have been cleaned - Preconditions.checkArgument(HoodieTimeline - .compareTimestamps(commitTime, lastCommitRetained, HoodieTimeline.GREATER_OR_EQUAL), - "Could not savepoint commit " + commitTime + " as this is beyond the lookup window " - + lastCommitRetained); + Preconditions.checkArgument( + HoodieTimeline.compareTimestamps(commitTime, lastCommitRetained, HoodieTimeline.GREATER_OR_EQUAL), + "Could not savepoint commit " + commitTime + " as this is beyond the lookup window " + lastCommitRetained); - Map> latestFilesMap = jsc.parallelize(FSUtils - .getAllPartitionPaths(fs, table.getMetaClient().getBasePath(), + Map> latestFilesMap = jsc + .parallelize(FSUtils.getAllPartitionPaths(fs, table.getMetaClient().getBasePath(), config.shouldAssumeDatePartitioning())) .mapToPair((PairFunction>) partitionPath -> { // Scan all partitions files with this commit time @@ -662,12 +620,10 @@ public class HoodieWriteClient extends AbstractHo return new Tuple2<>(partitionPath, latestFiles); }).collectAsMap(); - HoodieSavepointMetadata metadata = AvroUtils - .convertSavepointMetadata(user, comment, latestFilesMap); + HoodieSavepointMetadata metadata = AvroUtils.convertSavepointMetadata(user, comment, latestFilesMap); // Nothing to save in the savepoint - table.getActiveTimeline() - .saveAsComplete(new HoodieInstant(true, HoodieTimeline.SAVEPOINT_ACTION, commitTime), - AvroUtils.serializeSavepointMetadata(metadata)); + table.getActiveTimeline().saveAsComplete(new HoodieInstant(true, HoodieTimeline.SAVEPOINT_ACTION, commitTime), + AvroUtils.serializeSavepointMetadata(metadata)); logger.info("Savepoint " + commitTime + " created"); return true; } catch (IOException e) { @@ -676,22 +632,20 @@ public class HoodieWriteClient extends AbstractHo } /** - * Delete a savepoint that was created. Once the savepoint is deleted, the commit can be - * rolledback and cleaner may clean up data files. + * Delete a savepoint that was created. Once the savepoint is deleted, the commit can be rolledback and cleaner may + * clean up data files. * * @param savepointTime - delete the savepoint * @return true if the savepoint was deleted successfully */ public void deleteSavepoint(String savepointTime) { - HoodieTable table = HoodieTable.getHoodieTable( - createMetaClient(true), config, jsc); + HoodieTable table = HoodieTable.getHoodieTable(createMetaClient(true), config, jsc); if (table.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) { throw new UnsupportedOperationException("Savepointing is not supported or MergeOnRead table types"); } HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); - HoodieInstant savePoint = new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION, - savepointTime); + HoodieInstant savePoint = new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION, savepointTime); boolean isSavepointPresent = table.getCompletedSavepointTimeline().containsInstant(savePoint); if (!isSavepointPresent) { logger.warn("No savepoint present " + savepointTime); @@ -699,31 +653,27 @@ public class HoodieWriteClient extends AbstractHo } activeTimeline.revertToInflight(savePoint); - activeTimeline - .deleteInflight(new HoodieInstant(true, HoodieTimeline.SAVEPOINT_ACTION, savepointTime)); + activeTimeline.deleteInflight(new HoodieInstant(true, HoodieTimeline.SAVEPOINT_ACTION, savepointTime)); logger.info("Savepoint " + savepointTime + " deleted"); } /** * Delete a compaction request that is pending. * - * NOTE - This is an Admin operation. - * With async compaction, this is expected to be called with async compaction and write shutdown. - * Otherwise, async compactor could fail with errors + * NOTE - This is an Admin operation. With async compaction, this is expected to be called with async compaction and + * write shutdown. Otherwise, async compactor could fail with errors * * @param compactionTime - delete the compaction time */ private void deleteRequestedCompaction(String compactionTime) { - HoodieTable table = HoodieTable.getHoodieTable( - createMetaClient(true), config, jsc); + HoodieTable table = HoodieTable.getHoodieTable(createMetaClient(true), config, jsc); HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); HoodieInstant compactionRequestedInstant = new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, compactionTime); - boolean isCompactionInstantInRequestedState = table.getActiveTimeline().filterPendingCompactionTimeline() - .containsInstant(compactionRequestedInstant); + boolean isCompactionInstantInRequestedState = + table.getActiveTimeline().filterPendingCompactionTimeline().containsInstant(compactionRequestedInstant); HoodieTimeline commitTimeline = table.getCompletedCommitTimeline(); - if (commitTimeline.empty() && !commitTimeline - .findInstantsAfter(compactionTime, Integer.MAX_VALUE).empty()) { + if (commitTimeline.empty() && !commitTimeline.findInstantsAfter(compactionTime, Integer.MAX_VALUE).empty()) { throw new HoodieRollbackException( "Found commits after time :" + compactionTime + ", please rollback greater commits first"); } @@ -736,15 +686,14 @@ public class HoodieWriteClient extends AbstractHo } /** - * Rollback the state to the savepoint. WARNING: This rollsback recent commits and deleted data - * files. Queries accessing the files will mostly fail. This should be done during a downtime. + * Rollback the state to the savepoint. WARNING: This rollsback recent commits and deleted data files. Queries + * accessing the files will mostly fail. This should be done during a downtime. * * @param savepointTime - savepoint time to rollback to * @return true if the savepoint was rollecback to successfully */ public boolean rollbackToSavepoint(String savepointTime) { - HoodieTable table = HoodieTable.getHoodieTable( - createMetaClient(true), config, jsc); + HoodieTable table = HoodieTable.getHoodieTable(createMetaClient(true), config, jsc); HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); // Rollback to savepoint is expected to be a manual operation and no concurrent write or compaction is expected @@ -753,34 +702,32 @@ public class HoodieWriteClient extends AbstractHo // file-slices that will be rolled-back as part of this operation HoodieTimeline commitTimeline = table.getMetaClient().getCommitsAndCompactionTimeline(); - HoodieInstant savePoint = new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION, - savepointTime); + HoodieInstant savePoint = new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION, savepointTime); boolean isSavepointPresent = table.getCompletedSavepointTimeline().containsInstant(savePoint); if (!isSavepointPresent) { throw new HoodieRollbackException("No savepoint for commitTime " + savepointTime); } - List commitsToRollback = commitTimeline - .findInstantsAfter(savepointTime, Integer.MAX_VALUE).getInstants() + List commitsToRollback = commitTimeline.findInstantsAfter(savepointTime, Integer.MAX_VALUE).getInstants() .map(HoodieInstant::getTimestamp).collect(Collectors.toList()); logger.info("Rolling back commits " + commitsToRollback); restoreToInstant(savepointTime); // Make sure the rollback was successful - Option lastInstant = activeTimeline.reload().getCommitsAndCompactionTimeline() - .filterCompletedAndCompactionInstants().lastInstant(); + Option lastInstant = + activeTimeline.reload().getCommitsAndCompactionTimeline().filterCompletedAndCompactionInstants().lastInstant(); Preconditions.checkArgument(lastInstant.isPresent()); Preconditions.checkArgument(lastInstant.get().getTimestamp().equals(savepointTime), - savepointTime + "is not the last commit after rolling back " + commitsToRollback - + ", last commit was " + lastInstant.get().getTimestamp()); + savepointTime + "is not the last commit after rolling back " + commitsToRollback + ", last commit was " + + lastInstant.get().getTimestamp()); return true; } /** - * Rollback the (inflight/committed) record changes with the given commit time. Three steps: (1) - * Atomically unpublish this commit (2) clean indexing data, (3) clean new generated parquet - * files. (4) Finally delete .commit or .inflight file, + * Rollback the (inflight/committed) record changes with the given commit time. Three steps: (1) Atomically unpublish + * this commit (2) clean indexing data, (3) clean new generated parquet files. (4) Finally delete .commit or .inflight + * file, */ public boolean rollback(final String commitTime) throws HoodieRollbackException { rollbackInternal(commitTime); @@ -788,17 +735,15 @@ public class HoodieWriteClient extends AbstractHo } /** - * NOTE : This action requires all writers (ingest and compact) to a dataset to be stopped before proceeding. - * Revert the (inflight/committed) record changes for all commits after the provided @param. - * Three steps: (1) Atomically unpublish this commit (2) clean indexing data, (3) clean new generated parquet/log - * files and/or append rollback to existing log files. (4) Finally delete .commit, .inflight, .compaction.inflight - * or .compaction.requested file + * NOTE : This action requires all writers (ingest and compact) to a dataset to be stopped before proceeding. Revert + * the (inflight/committed) record changes for all commits after the provided @param. Three steps: (1) Atomically + * unpublish this commit (2) clean indexing data, (3) clean new generated parquet/log files and/or append rollback to + * existing log files. (4) Finally delete .commit, .inflight, .compaction.inflight or .compaction.requested file */ public void restoreToInstant(final String instantTime) throws HoodieRollbackException { // Create a Hoodie table which encapsulated the commits and files visible - HoodieTable table = HoodieTable.getHoodieTable( - createMetaClient(true), config, jsc); + HoodieTable table = HoodieTable.getHoodieTable(createMetaClient(true), config, jsc); // Get all the commits on the timeline after the provided commit time List instantsToRollback = table.getActiveTimeline().getCommitsAndCompactionTimeline().getInstants() .filter(instant -> HoodieActiveTimeline.GREATER.test(instant.getTimestamp(), instantTime)) @@ -809,8 +754,7 @@ public class HoodieWriteClient extends AbstractHo String startRollbackInstant = startInstant(); // Start the timer final Timer.Context context = startContext(); - ImmutableMap.Builder> instantsToStats = - ImmutableMap.builder(); + ImmutableMap.Builder> instantsToStats = ImmutableMap.builder(); instantsToRollback.stream().forEach(instant -> { try { switch (instant.getAction()) { @@ -851,20 +795,17 @@ public class HoodieWriteClient extends AbstractHo return metrics.getRollbackCtx(); } - private List doRollbackAndGetStats(final String commitToRollback) throws - IOException { - HoodieTable table = HoodieTable.getHoodieTable( - createMetaClient(true), config, jsc); + private List doRollbackAndGetStats(final String commitToRollback) throws IOException { + HoodieTable table = HoodieTable.getHoodieTable(createMetaClient(true), config, jsc); HoodieTimeline inflightCommitTimeline = table.getInflightCommitTimeline(); HoodieTimeline commitTimeline = table.getCompletedCommitsTimeline(); // Check if any of the commits is a savepoint - do not allow rollback on those commits - List savepoints = table.getCompletedSavepointTimeline().getInstants() - .map(HoodieInstant::getTimestamp).collect(Collectors.toList()); + List savepoints = table.getCompletedSavepointTimeline().getInstants().map(HoodieInstant::getTimestamp) + .collect(Collectors.toList()); savepoints.stream().forEach(s -> { if (s.contains(commitToRollback)) { throw new HoodieRollbackException( - "Could not rollback a savepointed commit. Delete savepoint first before rolling back" - + s); + "Could not rollback a savepointed commit. Delete savepoint first before rolling back" + s); } }); @@ -877,17 +818,17 @@ public class HoodieWriteClient extends AbstractHo // If there is a commit in-between or after that is not rolled back, then abort String lastCommit = commitToRollback; - if ((lastCommit != null) && !commitTimeline.empty() && !commitTimeline - .findInstantsAfter(lastCommit, Integer.MAX_VALUE).empty()) { + if ((lastCommit != null) && !commitTimeline.empty() + && !commitTimeline.findInstantsAfter(lastCommit, Integer.MAX_VALUE).empty()) { throw new HoodieRollbackException( "Found commits after time :" + lastCommit + ", please rollback greater commits first"); } - List inflights = inflightCommitTimeline.getInstants().map(HoodieInstant::getTimestamp) - .collect(Collectors.toList()); + List inflights = + inflightCommitTimeline.getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList()); if ((lastCommit != null) && !inflights.isEmpty() && (inflights.indexOf(lastCommit) != inflights.size() - 1)) { - throw new HoodieRollbackException("Found in-flight commits after time :" + lastCommit - + ", please rollback greater commits first"); + throw new HoodieRollbackException( + "Found in-flight commits after time :" + lastCommit + ", please rollback greater commits first"); } List stats = table.rollback(jsc, commitToRollback, true); @@ -904,18 +845,16 @@ public class HoodieWriteClient extends AbstractHo private void finishRollback(final Timer.Context context, List rollbackStats, List commitsToRollback, final String startRollbackTime) throws IOException { - HoodieTable table = HoodieTable.getHoodieTable( - createMetaClient(true), config, jsc); + HoodieTable table = HoodieTable.getHoodieTable(createMetaClient(true), config, jsc); Option durationInMs = Option.empty(); Long numFilesDeleted = rollbackStats.stream().mapToLong(stat -> stat.getSuccessDeleteFiles().size()).sum(); if (context != null) { durationInMs = Option.of(metrics.getDurationInMs(context.stop())); metrics.updateRollbackMetrics(durationInMs.get(), numFilesDeleted); } - HoodieRollbackMetadata rollbackMetadata = AvroUtils - .convertRollbackMetadata(startRollbackTime, durationInMs, commitsToRollback, rollbackStats); - table.getActiveTimeline().saveAsComplete( - new HoodieInstant(true, HoodieTimeline.ROLLBACK_ACTION, startRollbackTime), + HoodieRollbackMetadata rollbackMetadata = + AvroUtils.convertRollbackMetadata(startRollbackTime, durationInMs, commitsToRollback, rollbackStats); + table.getActiveTimeline().saveAsComplete(new HoodieInstant(true, HoodieTimeline.ROLLBACK_ACTION, startRollbackTime), AvroUtils.serializeRollbackMetadata(rollbackMetadata)); logger.info("Commits " + commitsToRollback + " rollback is complete"); @@ -930,23 +869,20 @@ public class HoodieWriteClient extends AbstractHo private void finishRestore(final Timer.Context context, Map> commitToStats, List commitsToRollback, final String startRestoreTime, final String restoreToInstant) throws IOException { - HoodieTable table = HoodieTable.getHoodieTable( - createMetaClient(true), config, jsc); + HoodieTable table = HoodieTable.getHoodieTable(createMetaClient(true), config, jsc); Option durationInMs = Option.empty(); Long numFilesDeleted = 0L; for (Map.Entry> commitToStat : commitToStats.entrySet()) { List stats = commitToStat.getValue(); - numFilesDeleted = stats.stream().mapToLong(stat -> stat.getSuccessDeleteFiles().size()) - .sum(); + numFilesDeleted = stats.stream().mapToLong(stat -> stat.getSuccessDeleteFiles().size()).sum(); } if (context != null) { durationInMs = Option.of(metrics.getDurationInMs(context.stop())); metrics.updateRollbackMetrics(durationInMs.get(), numFilesDeleted); } - HoodieRestoreMetadata restoreMetadata = AvroUtils - .convertRestoreMetadata(startRestoreTime, durationInMs, commitsToRollback, commitToStats); - table.getActiveTimeline().saveAsComplete( - new HoodieInstant(true, HoodieTimeline.RESTORE_ACTION, startRestoreTime), + HoodieRestoreMetadata restoreMetadata = + AvroUtils.convertRestoreMetadata(startRestoreTime, durationInMs, commitsToRollback, commitToStats); + table.getActiveTimeline().saveAsComplete(new HoodieInstant(true, HoodieTimeline.RESTORE_ACTION, startRestoreTime), AvroUtils.serializeRestoreMetadata(restoreMetadata)); logger.info("Commits " + commitsToRollback + " rollback is complete. Restored dataset to " + restoreToInstant); @@ -972,8 +908,8 @@ public class HoodieWriteClient extends AbstractHo Map> statToCommit = new HashMap<>(); finishRollback(context, stats, Arrays.asList(commitToRollback), startRollbackTime); } catch (IOException e) { - throw new HoodieRollbackException( - "Failed to rollback " + config.getBasePath() + " commits " + commitToRollback, e); + throw new HoodieRollbackException("Failed to rollback " + config.getBasePath() + " commits " + commitToRollback, + e); } } @@ -989,9 +925,9 @@ public class HoodieWriteClient extends AbstractHo } /** - * Clean up any stale/old files/data lying around (either on file storage or index storage) based - * on the configurations and CleaningPolicy used. (typically files that no longer can be used by a - * running query can be cleaned) + * Clean up any stale/old files/data lying around (either on file storage or index storage) based on the + * configurations and CleaningPolicy used. (typically files that no longer can be used by a running query can be + * cleaned) */ public void clean() throws HoodieIOException { String startCleanTime = HoodieActiveTimeline.createNewCommitTime(); @@ -999,9 +935,9 @@ public class HoodieWriteClient extends AbstractHo } /** - * Clean up any stale/old files/data lying around (either on file storage or index storage) based - * on the configurations and CleaningPolicy used. (typically files that no longer can be used by a - * running query can be cleaned) + * Clean up any stale/old files/data lying around (either on file storage or index storage) based on the + * configurations and CleaningPolicy used. (typically files that no longer can be used by a running query can be + * cleaned) */ private void clean(String startCleanTime) throws HoodieIOException { try { @@ -1009,8 +945,7 @@ public class HoodieWriteClient extends AbstractHo final Timer.Context context = metrics.getCleanCtx(); // Create a Hoodie table which encapsulated the commits and files visible - HoodieTable table = HoodieTable.getHoodieTable( - createMetaClient(true), config, jsc); + HoodieTable table = HoodieTable.getHoodieTable(createMetaClient(true), config, jsc); List cleanStats = table.clean(jsc); if (cleanStats.isEmpty()) { @@ -1025,15 +960,12 @@ public class HoodieWriteClient extends AbstractHo } // Create the metadata and save it - HoodieCleanMetadata metadata = AvroUtils - .convertCleanMetadata(startCleanTime, durationInMs, cleanStats); + HoodieCleanMetadata metadata = AvroUtils.convertCleanMetadata(startCleanTime, durationInMs, cleanStats); logger.info("Cleaned " + metadata.getTotalFilesDeleted() + " files"); - metrics - .updateCleanMetrics(durationInMs.orElseGet(() -> -1L), metadata.getTotalFilesDeleted()); + metrics.updateCleanMetrics(durationInMs.orElseGet(() -> -1L), metadata.getTotalFilesDeleted()); - table.getActiveTimeline() - .saveAsComplete(new HoodieInstant(true, HoodieTimeline.CLEAN_ACTION, startCleanTime), - AvroUtils.serializeCleanMetadata(metadata)); + table.getActiveTimeline().saveAsComplete(new HoodieInstant(true, HoodieTimeline.CLEAN_ACTION, startCleanTime), + AvroUtils.serializeCleanMetadata(metadata)); logger.info("Marked clean started on " + startCleanTime + " as complete"); if (!table.getActiveTimeline().getCleanerTimeline().empty()) { @@ -1067,8 +999,8 @@ public class HoodieWriteClient extends AbstractHo metaClient.getActiveTimeline().filterPendingCompactionTimeline().lastInstant().ifPresent(latestPending -> { Preconditions.checkArgument( HoodieTimeline.compareTimestamps(latestPending.getTimestamp(), instantTime, HoodieTimeline.LESSER), - "Latest pending compaction instant time must be earlier " - + "than this instant time. Latest Compaction :" + latestPending + ", Ingesting at " + instantTime); + "Latest pending compaction instant time must be earlier " + "than this instant time. Latest Compaction :" + + latestPending + ", Ingesting at " + instantTime); }); HoodieTable table = HoodieTable.getHoodieTable(metaClient, config, jsc); HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); @@ -1100,14 +1032,14 @@ public class HoodieWriteClient extends AbstractHo metaClient.getCommitsTimeline().filterInflightsExcludingCompaction().firstInstant().ifPresent(earliestInflight -> { Preconditions.checkArgument( HoodieTimeline.compareTimestamps(earliestInflight.getTimestamp(), instantTime, HoodieTimeline.GREATER), - "Earliest write inflight instant time must be later " - + "than compaction time. Earliest :" + earliestInflight + ", Compaction scheduled at " + instantTime); + "Earliest write inflight instant time must be later " + "than compaction time. Earliest :" + earliestInflight + + ", Compaction scheduled at " + instantTime); }); // Committed and pending compaction instants should have strictly lower timestamps - List conflictingInstants = - metaClient.getActiveTimeline().getCommitsAndCompactionTimeline().getInstants().filter(instant -> - HoodieTimeline.compareTimestamps(instant.getTimestamp(), instantTime, - HoodieTimeline.GREATER_OR_EQUAL)).collect(Collectors.toList()); + List conflictingInstants = metaClient + .getActiveTimeline().getCommitsAndCompactionTimeline().getInstants().filter(instant -> HoodieTimeline + .compareTimestamps(instant.getTimestamp(), instantTime, HoodieTimeline.GREATER_OR_EQUAL)) + .collect(Collectors.toList()); Preconditions.checkArgument(conflictingInstants.isEmpty(), "Following instants have timestamps >= compactionInstant (" + instantTime + ") Instants :" + conflictingInstants); @@ -1160,35 +1092,30 @@ public class HoodieWriteClient extends AbstractHo /** * Deduplicate Hoodie records, using the given deduplication funciton. */ - JavaRDD> deduplicateRecords(JavaRDD> records, - int parallelism) { + JavaRDD> deduplicateRecords(JavaRDD> records, int parallelism) { boolean isIndexingGlobal = index.isGlobal(); - return records - .mapToPair(record -> { - HoodieKey hoodieKey = record.getKey(); - // If index used is global, then records are expected to differ in their partitionPath - Object key = isIndexingGlobal ? hoodieKey.getRecordKey() : hoodieKey; - return new Tuple2<>(key, record); - }) - .reduceByKey((rec1, rec2) -> { - @SuppressWarnings("unchecked") T reducedData = (T) rec1.getData() - .preCombine(rec2.getData()); - // we cannot allow the user to change the key or partitionPath, since that will affect - // everything - // so pick it from one of the records. - return new HoodieRecord(rec1.getKey(), reducedData); - }, parallelism).map(Tuple2::_2); + return records.mapToPair(record -> { + HoodieKey hoodieKey = record.getKey(); + // If index used is global, then records are expected to differ in their partitionPath + Object key = isIndexingGlobal ? hoodieKey.getRecordKey() : hoodieKey; + return new Tuple2<>(key, record); + }).reduceByKey((rec1, rec2) -> { + @SuppressWarnings("unchecked") + T reducedData = (T) rec1.getData().preCombine(rec2.getData()); + // we cannot allow the user to change the key or partitionPath, since that will affect + // everything + // so pick it from one of the records. + return new HoodieRecord(rec1.getKey(), reducedData); + }, parallelism).map(Tuple2::_2); } /** * Cleanup all inflight commits */ private void rollbackInflightCommits() { - HoodieTable table = HoodieTable.getHoodieTable( - createMetaClient(true), config, jsc); + HoodieTable table = HoodieTable.getHoodieTable(createMetaClient(true), config, jsc); HoodieTimeline inflightTimeline = table.getMetaClient().getCommitsTimeline().filterInflightsExcludingCompaction(); - List commits = inflightTimeline.getInstants().map(HoodieInstant::getTimestamp) - .collect(Collectors.toList()); + List commits = inflightTimeline.getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList()); Collections.reverse(commits); for (String commit : commits) { rollback(commit); @@ -1222,7 +1149,7 @@ public class HoodieWriteClient extends AbstractHo HoodieTimeline pendingCompactionTimeline = metaClient.getActiveTimeline().filterPendingCompactionTimeline(); HoodieInstant inflightInstant = HoodieTimeline.getCompactionInflightInstant(compactionInstantTime); if (pendingCompactionTimeline.containsInstant(inflightInstant)) { - //inflight compaction - Needs to rollback first deleting new parquet files before we run compaction. + // inflight compaction - Needs to rollback first deleting new parquet files before we run compaction. rollbackInflightCompaction(inflightInstant, table); // refresh table metaClient = createMetaClient(true); @@ -1234,8 +1161,8 @@ public class HoodieWriteClient extends AbstractHo if (pendingCompactionTimeline.containsInstant(instant)) { return runCompaction(instant, metaClient.getActiveTimeline(), autoCommit); } else { - throw new IllegalStateException("No Compaction request available at " + compactionInstantTime - + " to run compaction"); + throw new IllegalStateException( + "No Compaction request available at " + compactionInstantTime + " to run compaction"); } } @@ -1247,10 +1174,10 @@ public class HoodieWriteClient extends AbstractHo * @param autoCommit Commit after compaction * @return RDD of Write Status */ - private JavaRDD runCompaction( - HoodieInstant compactionInstant, HoodieActiveTimeline activeTimeline, boolean autoCommit) throws IOException { - HoodieCompactionPlan compactionPlan = AvroUtils.deserializeCompactionPlan( - activeTimeline.getInstantAuxiliaryDetails(compactionInstant).get()); + private JavaRDD runCompaction(HoodieInstant compactionInstant, HoodieActiveTimeline activeTimeline, + boolean autoCommit) throws IOException { + HoodieCompactionPlan compactionPlan = + AvroUtils.deserializeCompactionPlan(activeTimeline.getInstantAuxiliaryDetails(compactionInstant).get()); // Mark instant as compaction inflight activeTimeline.transitionCompactionRequestedToInflight(compactionInstant); compactionTimer = metrics.getCompactionCtx(); @@ -1278,17 +1205,15 @@ public class HoodieWriteClient extends AbstractHo protected void commitCompaction(JavaRDD compactedStatuses, HoodieTable table, String compactionCommitTime, boolean autoCommit, Option> extraMetadata) { if (autoCommit) { - HoodieCommitMetadata metadata = - doCompactionCommit(table, compactedStatuses, compactionCommitTime, extraMetadata); + HoodieCommitMetadata metadata = doCompactionCommit(table, compactedStatuses, compactionCommitTime, extraMetadata); if (compactionTimer != null) { long durationInMs = metrics.getDurationInMs(compactionTimer.stop()); try { metrics.updateCommitMetrics(HoodieActiveTimeline.COMMIT_FORMATTER.parse(compactionCommitTime).getTime(), durationInMs, metadata, HoodieActiveTimeline.COMPACTION_ACTION); } catch (ParseException e) { - throw new HoodieCommitException( - "Commit time is not of valid format.Failed to commit compaction " + config.getBasePath() - + " at time " + compactionCommitTime, e); + throw new HoodieCommitException("Commit time is not of valid format.Failed to commit compaction " + + config.getBasePath() + " at time " + compactionCommitTime, e); } } logger.info("Compacted successfully on commit " + compactionCommitTime); @@ -1309,8 +1234,7 @@ public class HoodieWriteClient extends AbstractHo }); } } catch (HoodieIOException ioe) { - throw new HoodieCommitException( - "Failed to complete commit " + instantTime + " due to finalize errors.", ioe); + throw new HoodieCommitException("Failed to complete commit " + instantTime + " due to finalize errors.", ioe); } } @@ -1330,8 +1254,7 @@ public class HoodieWriteClient extends AbstractHo private HoodieCommitMetadata doCompactionCommit(HoodieTable table, JavaRDD writeStatuses, String compactionCommitTime, Option> extraMetadata) { HoodieTableMetaClient metaClient = table.getMetaClient(); - List updateStatusMap = writeStatuses.map(WriteStatus::getStat) - .collect(); + List updateStatusMap = writeStatuses.map(WriteStatus::getStat).collect(); HoodieCommitMetadata metadata = new HoodieCommitMetadata(true); for (HoodieWriteStat stat : updateStatusMap) { @@ -1383,8 +1306,7 @@ public class HoodieWriteClient extends AbstractHo // TODO : make sure we cannot rollback / archive last commit file try { // Create a Hoodie table which encapsulated the commits and files visible - HoodieTable table = HoodieTable.getHoodieTable( - createMetaClient(true), config, jsc); + HoodieTable table = HoodieTable.getHoodieTable(createMetaClient(true), config, jsc); // 0. All of the rolling stat management is only done by the DELTA commit for MOR and COMMIT for COW other wise // there may be race conditions HoodieRollingStatMetadata rollingStatMetadata = new HoodieRollingStatMetadata(actionType); @@ -1395,22 +1317,21 @@ public class HoodieWriteClient extends AbstractHo for (HoodieWriteStat stat : writeStats) { String partitionPath = stat.getPartitionPath(); - //TODO: why is stat.getPartitionPath() null at times here. + // TODO: why is stat.getPartitionPath() null at times here. metadata.addWriteStat(partitionPath, stat); HoodieRollingStat hoodieRollingStat = new HoodieRollingStat(stat.getFileId(), - stat.getNumWrites() - (stat.getNumUpdateWrites() - stat.getNumDeletes()), - stat.getNumUpdateWrites(), stat.getNumDeletes(), stat.getTotalWriteBytes()); + stat.getNumWrites() - (stat.getNumUpdateWrites() - stat.getNumDeletes()), stat.getNumUpdateWrites(), + stat.getNumDeletes(), stat.getTotalWriteBytes()); rollingStatMetadata.addRollingStat(partitionPath, hoodieRollingStat); } // The last rolling stat should be present in the completed timeline - Option lastInstant = table.getActiveTimeline().getCommitsTimeline().filterCompletedInstants() - .lastInstant(); + Option lastInstant = + table.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().lastInstant(); if (lastInstant.isPresent()) { - HoodieCommitMetadata commitMetadata = HoodieCommitMetadata - .fromBytes(table.getActiveTimeline().getInstantDetails(lastInstant - .get()).get(), HoodieCommitMetadata.class); - Option lastRollingStat = Option.ofNullable(commitMetadata.getExtraMetadata() - .get(HoodieRollingStatMetadata.ROLLING_STAT_METADATA_KEY)); + HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes( + table.getActiveTimeline().getInstantDetails(lastInstant.get()).get(), HoodieCommitMetadata.class); + Option lastRollingStat = Option + .ofNullable(commitMetadata.getExtraMetadata().get(HoodieRollingStatMetadata.ROLLING_STAT_METADATA_KEY)); if (lastRollingStat.isPresent()) { rollingStatMetadata = rollingStatMetadata .merge(HoodieCommitMetadata.fromBytes(lastRollingStat.get().getBytes(), HoodieRollingStatMetadata.class)); @@ -1422,4 +1343,4 @@ public class HoodieWriteClient extends AbstractHo } } -} \ No newline at end of file +} diff --git a/hudi-client/src/main/java/org/apache/hudi/WriteStatus.java b/hudi-client/src/main/java/org/apache/hudi/WriteStatus.java index ad488bb4a..039c4ec65 100644 --- a/hudi-client/src/main/java/org/apache/hudi/WriteStatus.java +++ b/hudi-client/src/main/java/org/apache/hudi/WriteStatus.java @@ -64,14 +64,11 @@ public class WriteStatus implements Serializable { } /** - * Mark write as success, optionally using given parameters for the purpose of calculating some - * aggregate metrics. This method is not meant to cache passed arguments, since WriteStatus - * objects are collected in Spark Driver. + * Mark write as success, optionally using given parameters for the purpose of calculating some aggregate metrics. + * This method is not meant to cache passed arguments, since WriteStatus objects are collected in Spark Driver. * - * @param record deflated {@code HoodieRecord} containing information that uniquely identifies - * it. - * @param optionalRecordMetadata optional metadata related to data contained in {@link - * HoodieRecord} before deflation. + * @param record deflated {@code HoodieRecord} containing information that uniquely identifies it. + * @param optionalRecordMetadata optional metadata related to data contained in {@link HoodieRecord} before deflation. */ public void markSuccess(HoodieRecord record, Option> optionalRecordMetadata) { if (trackSuccessRecords) { @@ -81,14 +78,11 @@ public class WriteStatus implements Serializable { } /** - * Mark write as failed, optionally using given parameters for the purpose of calculating some - * aggregate metrics. This method is not meant to cache passed arguments, since WriteStatus - * objects are collected in Spark Driver. + * Mark write as failed, optionally using given parameters for the purpose of calculating some aggregate metrics. This + * method is not meant to cache passed arguments, since WriteStatus objects are collected in Spark Driver. * - * @param record deflated {@code HoodieRecord} containing information that uniquely identifies - * it. - * @param optionalRecordMetadata optional metadata related to data contained in {@link - * HoodieRecord} before deflation. + * @param record deflated {@code HoodieRecord} containing information that uniquely identifies it. + * @param optionalRecordMetadata optional metadata related to data contained in {@link HoodieRecord} before deflation. */ public void markFailure(HoodieRecord record, Throwable t, Option> optionalRecordMetadata) { if (failedRecords.isEmpty() || (random.nextDouble() <= failureFraction)) { diff --git a/hudi-client/src/main/java/org/apache/hudi/config/HoodieCompactionConfig.java b/hudi-client/src/main/java/org/apache/hudi/config/HoodieCompactionConfig.java index 0f7a5331a..19e25ab2c 100644 --- a/hudi-client/src/main/java/org/apache/hudi/config/HoodieCompactionConfig.java +++ b/hudi-client/src/main/java/org/apache/hudi/config/HoodieCompactionConfig.java @@ -40,10 +40,8 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig { // Turn on inline compaction - after fw delta commits a inline compaction will be run public static final String INLINE_COMPACT_PROP = "hoodie.compact.inline"; // Run a compaction every N delta commits - public static final String INLINE_COMPACT_NUM_DELTA_COMMITS_PROP = - "hoodie.compact.inline.max" + ".delta.commits"; - public static final String CLEANER_FILE_VERSIONS_RETAINED_PROP = - "hoodie.cleaner.fileversions" + ".retained"; + public static final String INLINE_COMPACT_NUM_DELTA_COMMITS_PROP = "hoodie.compact.inline.max" + ".delta.commits"; + public static final String CLEANER_FILE_VERSIONS_RETAINED_PROP = "hoodie.cleaner.fileversions" + ".retained"; public static final String CLEANER_COMMITS_RETAINED_PROP = "hoodie.cleaner.commits.retained"; public static final String MAX_COMMITS_TO_KEEP_PROP = "hoodie.keep.max.commits"; public static final String MIN_COMMITS_TO_KEEP_PROP = "hoodie.keep.min.commits"; @@ -56,25 +54,21 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig { * Configs related to specific table types **/ // Number of inserts, that will be put each partition/bucket for writing - public static final String COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE = - "hoodie.copyonwrite.insert" + ".split.size"; + public static final String COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE = "hoodie.copyonwrite.insert" + ".split.size"; // The rationale to pick the insert parallelism is the following. Writing out 100MB files, // with atleast 1kb records, means 100K records per file. we just overprovision to 500K public static final String DEFAULT_COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE = String.valueOf(500000); // Config to control whether we control insert split sizes automatically based on average // record sizes - public static final String COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS = - "hoodie.copyonwrite.insert" + ".auto.split"; + public static final String COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS = "hoodie.copyonwrite.insert" + ".auto.split"; // its off by default public static final String DEFAULT_COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS = String.valueOf(true); // This value is used as a guessimate for the record size, if we can't determine this from // previous commits - public static final String COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = - "hoodie.copyonwrite" + ".record.size.estimate"; + public static final String COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = "hoodie.copyonwrite" + ".record.size.estimate"; // Used to determine how much more can be packed into a small file, before it exceeds the size // limit. - public static final String DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = String - .valueOf(1024); + public static final String DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = String.valueOf(1024); public static final String CLEANER_PARALLELISM = "hoodie.cleaner.parallelism"; public static final String DEFAULT_CLEANER_PARALLELISM = String.valueOf(200); public static final String TARGET_IO_PER_COMPACTION_IN_MB_PROP = "hoodie.compaction.target.io"; @@ -82,8 +76,7 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig { public static final String DEFAULT_TARGET_IO_PER_COMPACTION_IN_MB = String.valueOf(500 * 1024); public static final String COMPACTION_STRATEGY_PROP = "hoodie.compaction.strategy"; // 200GB of target IO per compaction - public static final String DEFAULT_COMPACTION_STRATEGY = LogFileSizeBasedCompactionStrategy.class - .getName(); + public static final String DEFAULT_COMPACTION_STRATEGY = LogFileSizeBasedCompactionStrategy.class.getName(); // used to merge records written to log file public static final String DEFAULT_PAYLOAD_CLASS = HoodieAvroPayload.class.getName(); public static final String PAYLOAD_CLASS_PROP = "hoodie.compaction.payload.class"; @@ -91,15 +84,12 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig { // used to choose a trade off between IO vs Memory when performing compaction process // Depending on outputfile_size and memory provided, choose true to avoid OOM for large file // size + small memory - public static final String COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP = - "hoodie.compaction.lazy" + ".block.read"; + public static final String COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP = "hoodie.compaction.lazy" + ".block.read"; public static final String DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED = "false"; // used to choose whether to enable reverse log reading (reverse log traversal) - public static final String COMPACTION_REVERSE_LOG_READ_ENABLED_PROP = - "hoodie.compaction" + ".reverse.log.read"; + public static final String COMPACTION_REVERSE_LOG_READ_ENABLED_PROP = "hoodie.compaction" + ".reverse.log.read"; public static final String DEFAULT_COMPACTION_REVERSE_LOG_READ_ENABLED = "false"; - private static final String DEFAULT_CLEANER_POLICY = HoodieCleaningPolicy.KEEP_LATEST_COMMITS - .name(); + private static final String DEFAULT_CLEANER_POLICY = HoodieCleaningPolicy.KEEP_LATEST_COMMITS.name(); private static final String DEFAULT_AUTO_CLEAN = "true"; private static final String DEFAULT_INLINE_COMPACT = "false"; private static final String DEFAULT_INLINE_COMPACT_NUM_DELTA_COMMITS = "1"; @@ -108,8 +98,8 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig { private static final String DEFAULT_MAX_COMMITS_TO_KEEP = "30"; private static final String DEFAULT_MIN_COMMITS_TO_KEEP = "20"; private static final String DEFAULT_COMMITS_ARCHIVAL_BATCH_SIZE = String.valueOf(10); - public static final String TARGET_PARTITIONS_PER_DAYBASED_COMPACTION_PROP = "hoodie.compaction.daybased.target" - + ".partitions"; + public static final String TARGET_PARTITIONS_PER_DAYBASED_COMPACTION_PROP = + "hoodie.compaction.daybased.target" + ".partitions"; // 500GB of target IO per compaction (both read and write) public static final String DEFAULT_TARGET_PARTITIONS_PER_DAYBASED_COMPACTION = String.valueOf(10); @@ -188,14 +178,12 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig { } public Builder autoTuneInsertSplits(boolean autoTuneInsertSplits) { - props.setProperty(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS, - String.valueOf(autoTuneInsertSplits)); + props.setProperty(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS, String.valueOf(autoTuneInsertSplits)); return this; } public Builder approxRecordSize(int recordSizeEstimate) { - props.setProperty(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE, - String.valueOf(recordSizeEstimate)); + props.setProperty(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE, String.valueOf(recordSizeEstimate)); return this; } @@ -215,32 +203,27 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig { } public Builder withTargetIOPerCompactionInMB(long targetIOPerCompactionInMB) { - props.setProperty(TARGET_IO_PER_COMPACTION_IN_MB_PROP, - String.valueOf(targetIOPerCompactionInMB)); + props.setProperty(TARGET_IO_PER_COMPACTION_IN_MB_PROP, String.valueOf(targetIOPerCompactionInMB)); return this; } public Builder withMaxNumDeltaCommitsBeforeCompaction(int maxNumDeltaCommitsBeforeCompaction) { - props.setProperty(INLINE_COMPACT_NUM_DELTA_COMMITS_PROP, - String.valueOf(maxNumDeltaCommitsBeforeCompaction)); + props.setProperty(INLINE_COMPACT_NUM_DELTA_COMMITS_PROP, String.valueOf(maxNumDeltaCommitsBeforeCompaction)); return this; } public Builder withCompactionLazyBlockReadEnabled(Boolean compactionLazyBlockReadEnabled) { - props.setProperty(COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP, - String.valueOf(compactionLazyBlockReadEnabled)); + props.setProperty(COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP, String.valueOf(compactionLazyBlockReadEnabled)); return this; } public Builder withCompactionReverseLogReadEnabled(Boolean compactionReverseLogReadEnabled) { - props.setProperty(COMPACTION_REVERSE_LOG_READ_ENABLED_PROP, - String.valueOf(compactionReverseLogReadEnabled)); + props.setProperty(COMPACTION_REVERSE_LOG_READ_ENABLED_PROP, String.valueOf(compactionReverseLogReadEnabled)); return this; } public Builder withTargetPartitionsPerDayBasedCompaction(int targetPartitionsPerCompaction) { - props.setProperty(TARGET_PARTITIONS_PER_DAYBASED_COMPACTION_PROP, - String.valueOf(targetPartitionsPerCompaction)); + props.setProperty(TARGET_PARTITIONS_PER_DAYBASED_COMPACTION_PROP, String.valueOf(targetPartitionsPerCompaction)); return this; } @@ -251,8 +234,7 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig { public HoodieCompactionConfig build() { HoodieCompactionConfig config = new HoodieCompactionConfig(props); - setDefaultOnCondition(props, !props.containsKey(AUTO_CLEAN_PROP), AUTO_CLEAN_PROP, - DEFAULT_AUTO_CLEAN); + setDefaultOnCondition(props, !props.containsKey(AUTO_CLEAN_PROP), AUTO_CLEAN_PROP, DEFAULT_AUTO_CLEAN); setDefaultOnCondition(props, !props.containsKey(INLINE_COMPACT_PROP), INLINE_COMPACT_PROP, DEFAULT_INLINE_COMPACT); setDefaultOnCondition(props, !props.containsKey(INLINE_COMPACT_NUM_DELTA_COMMITS_PROP), @@ -261,27 +243,25 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig { DEFAULT_CLEANER_POLICY); setDefaultOnCondition(props, !props.containsKey(CLEANER_FILE_VERSIONS_RETAINED_PROP), CLEANER_FILE_VERSIONS_RETAINED_PROP, DEFAULT_CLEANER_FILE_VERSIONS_RETAINED); - setDefaultOnCondition(props, !props.containsKey(CLEANER_COMMITS_RETAINED_PROP), - CLEANER_COMMITS_RETAINED_PROP, DEFAULT_CLEANER_COMMITS_RETAINED); + setDefaultOnCondition(props, !props.containsKey(CLEANER_COMMITS_RETAINED_PROP), CLEANER_COMMITS_RETAINED_PROP, + DEFAULT_CLEANER_COMMITS_RETAINED); setDefaultOnCondition(props, !props.containsKey(MAX_COMMITS_TO_KEEP_PROP), MAX_COMMITS_TO_KEEP_PROP, DEFAULT_MAX_COMMITS_TO_KEEP); setDefaultOnCondition(props, !props.containsKey(MIN_COMMITS_TO_KEEP_PROP), MIN_COMMITS_TO_KEEP_PROP, DEFAULT_MIN_COMMITS_TO_KEEP); - setDefaultOnCondition(props, !props.containsKey(PARQUET_SMALL_FILE_LIMIT_BYTES), - PARQUET_SMALL_FILE_LIMIT_BYTES, DEFAULT_PARQUET_SMALL_FILE_LIMIT_BYTES); + setDefaultOnCondition(props, !props.containsKey(PARQUET_SMALL_FILE_LIMIT_BYTES), PARQUET_SMALL_FILE_LIMIT_BYTES, + DEFAULT_PARQUET_SMALL_FILE_LIMIT_BYTES); setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE), COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE, DEFAULT_COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE); setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS), COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS, DEFAULT_COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS); setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE), - COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE, - DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE); + COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE, DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE); setDefaultOnCondition(props, !props.containsKey(CLEANER_PARALLELISM), CLEANER_PARALLELISM, DEFAULT_CLEANER_PARALLELISM); - setDefaultOnCondition(props, !props.containsKey(COMPACTION_STRATEGY_PROP), - COMPACTION_STRATEGY_PROP, DEFAULT_COMPACTION_STRATEGY); - setDefaultOnCondition(props, !props.containsKey(PAYLOAD_CLASS_PROP), - PAYLOAD_CLASS_PROP, DEFAULT_PAYLOAD_CLASS); + setDefaultOnCondition(props, !props.containsKey(COMPACTION_STRATEGY_PROP), COMPACTION_STRATEGY_PROP, + DEFAULT_COMPACTION_STRATEGY); + setDefaultOnCondition(props, !props.containsKey(PAYLOAD_CLASS_PROP), PAYLOAD_CLASS_PROP, DEFAULT_PAYLOAD_CLASS); setDefaultOnCondition(props, !props.containsKey(TARGET_IO_PER_COMPACTION_IN_MB_PROP), TARGET_IO_PER_COMPACTION_IN_MB_PROP, DEFAULT_TARGET_IO_PER_COMPACTION_IN_MB); setDefaultOnCondition(props, !props.containsKey(COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP), @@ -299,13 +279,15 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig { // commit instant on timeline, that still has not been cleaned. Could miss some data via incr pull int minInstantsToKeep = Integer.parseInt(props.getProperty(HoodieCompactionConfig.MIN_COMMITS_TO_KEEP_PROP)); int maxInstantsToKeep = Integer.parseInt(props.getProperty(HoodieCompactionConfig.MAX_COMMITS_TO_KEEP_PROP)); - int cleanerCommitsRetained = Integer - .parseInt(props.getProperty(HoodieCompactionConfig.CLEANER_COMMITS_RETAINED_PROP)); + int cleanerCommitsRetained = + Integer.parseInt(props.getProperty(HoodieCompactionConfig.CLEANER_COMMITS_RETAINED_PROP)); Preconditions.checkArgument(maxInstantsToKeep > minInstantsToKeep); Preconditions.checkArgument(minInstantsToKeep > cleanerCommitsRetained, - String.format("Increase %s=%d to be greater than %s=%d. Otherwise, there is risk of incremental pull " - + "missing data from few instants.", HoodieCompactionConfig.MIN_COMMITS_TO_KEEP_PROP, - minInstantsToKeep, HoodieCompactionConfig.CLEANER_COMMITS_RETAINED_PROP, cleanerCommitsRetained)); + String.format( + "Increase %s=%d to be greater than %s=%d. Otherwise, there is risk of incremental pull " + + "missing data from few instants.", + HoodieCompactionConfig.MIN_COMMITS_TO_KEEP_PROP, minInstantsToKeep, + HoodieCompactionConfig.CLEANER_COMMITS_RETAINED_PROP, cleanerCommitsRetained)); return config; } } diff --git a/hudi-client/src/main/java/org/apache/hudi/config/HoodieHBaseIndexConfig.java b/hudi-client/src/main/java/org/apache/hudi/config/HoodieHBaseIndexConfig.java index 256bba920..1b4306448 100644 --- a/hudi-client/src/main/java/org/apache/hudi/config/HoodieHBaseIndexConfig.java +++ b/hudi-client/src/main/java/org/apache/hudi/config/HoodieHBaseIndexConfig.java @@ -32,8 +32,8 @@ public class HoodieHBaseIndexConfig extends DefaultHoodieConfig { public static final String HBASE_GET_BATCH_SIZE_PROP = "hoodie.index.hbase.get.batch.size"; public static final String HBASE_ZK_ZNODEPARENT = "hoodie.index.hbase.zknode.path"; /** - * Note that if HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE_PROP is set to true, this batch size will not - * be honored for HBase Puts + * Note that if HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE_PROP is set to true, this batch size will not be honored for HBase + * Puts */ public static final String HBASE_PUT_BATCH_SIZE_PROP = "hoodie.index.hbase.put.batch.size"; @@ -48,18 +48,16 @@ public class HoodieHBaseIndexConfig extends DefaultHoodieConfig { public static final String HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE_PROP = "hoodie.index.hbase.put.batch.size.autocompute"; public static final String DEFAULT_HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE = "false"; /** - * Property to set the fraction of the global share of QPS that should be allocated to this job. - * Let's say there are 3 jobs which have input size in terms of number of rows required for - * HbaseIndexing as x, 2x, 3x respectively. Then this fraction for the jobs would be (0.17) 1/6, - * 0.33 (2/6) and 0.5 (3/6) respectively. + * Property to set the fraction of the global share of QPS that should be allocated to this job. Let's say there are 3 + * jobs which have input size in terms of number of rows required for HbaseIndexing as x, 2x, 3x respectively. Then + * this fraction for the jobs would be (0.17) 1/6, 0.33 (2/6) and 0.5 (3/6) respectively. */ public static final String HBASE_QPS_FRACTION_PROP = "hoodie.index.hbase.qps.fraction"; /** - * Property to set maximum QPS allowed per Region Server. This should be same across various - * jobs. This is intended to limit the aggregate QPS generated across various jobs to an Hbase - * Region Server. It is recommended to set this value based on global indexing throughput needs - * and most importantly, how much the HBase installation in use is able to tolerate without - * Region Servers going down. + * Property to set maximum QPS allowed per Region Server. This should be same across various jobs. This is intended to + * limit the aggregate QPS generated across various jobs to an Hbase Region Server. It is recommended to set this + * value based on global indexing throughput needs and most importantly, how much the HBase installation in use is + * able to tolerate without Region Servers going down. */ public static String HBASE_MAX_QPS_PER_REGION_SERVER_PROP = "hoodie.index.hbase.max.qps.per.region.server"; /** @@ -71,18 +69,17 @@ public class HoodieHBaseIndexConfig extends DefaultHoodieConfig { */ public static final int DEFAULT_HBASE_MAX_QPS_PER_REGION_SERVER = 1000; /** - * Default is 50%, which means a total of 2 jobs can run using HbaseIndex without overwhelming - * Region Servers + * Default is 50%, which means a total of 2 jobs can run using HbaseIndex without overwhelming Region Servers */ public static final float DEFAULT_HBASE_QPS_FRACTION = 0.5f; /** - * Property to decide if HBASE_QPS_FRACTION_PROP is dynamically calculated based on volume + * Property to decide if HBASE_QPS_FRACTION_PROP is dynamically calculated based on volume */ public static final String HOODIE_INDEX_COMPUTE_QPS_DYNAMICALLY = "hoodie.index.hbase.dynamic_qps"; public static final boolean DEFAULT_HOODIE_INDEX_COMPUTE_QPS_DYNAMICALLY = false; /** - * Min and Max for HBASE_QPS_FRACTION_PROP to stabilize skewed volume workloads + * Min and Max for HBASE_QPS_FRACTION_PROP to stabilize skewed volume workloads */ public static final String HBASE_MIN_QPS_FRACTION_PROP = "hoodie.index.hbase.min.qps.fraction"; public static final String DEFAULT_HBASE_MIN_QPS_FRACTION_PROP = "0.002"; @@ -90,7 +87,7 @@ public class HoodieHBaseIndexConfig extends DefaultHoodieConfig { public static final String HBASE_MAX_QPS_FRACTION_PROP = "hoodie.index.hbase.max.qps.fraction"; public static final String DEFAULT_HBASE_MAX_QPS_FRACTION_PROP = "0.06"; /** - * Hoodie index desired puts operation time in seconds + * Hoodie index desired puts operation time in seconds */ public static final String HOODIE_INDEX_DESIRED_PUTS_TIME_IN_SECS = "hoodie.index.hbase.desired_puts_time_in_secs"; public static final int DEFAULT_HOODIE_INDEX_DESIRED_PUTS_TIME_IN_SECS = 600; @@ -105,7 +102,7 @@ public class HoodieHBaseIndexConfig extends DefaultHoodieConfig { public static final String DEFAULT_HBASE_ZK_PATH_QPS_ROOT = "/QPS_ROOT"; public HoodieHBaseIndexConfig(final Properties props) { - super(props); + super(props); } public static HoodieHBaseIndexConfig.Builder newBuilder() { @@ -218,18 +215,15 @@ public class HoodieHBaseIndexConfig extends DefaultHoodieConfig { /** *

- * Method to set maximum QPS allowed per Region Server. This should be same across various - * jobs. This is intended to limit the aggregate QPS generated across various jobs to an - * Hbase Region Server. + * Method to set maximum QPS allowed per Region Server. This should be same across various jobs. This is intended to + * limit the aggregate QPS generated across various jobs to an Hbase Region Server. *

*

- * It is recommended to set this value based on your global indexing throughput needs and - * most importantly, how much your HBase installation is able to tolerate without Region - * Servers going down. + * It is recommended to set this value based on your global indexing throughput needs and most importantly, how much + * your HBase installation is able to tolerate without Region Servers going down. *

*/ - public HoodieHBaseIndexConfig.Builder hbaseIndexMaxQPSPerRegionServer( - int maxQPSPerRegionServer) { + public HoodieHBaseIndexConfig.Builder hbaseIndexMaxQPSPerRegionServer(int maxQPSPerRegionServer) { // This should be same across various jobs props.setProperty(HoodieHBaseIndexConfig.HBASE_MAX_QPS_PER_REGION_SERVER_PROP, String.valueOf(maxQPSPerRegionServer)); @@ -238,30 +232,30 @@ public class HoodieHBaseIndexConfig extends DefaultHoodieConfig { public HoodieHBaseIndexConfig build() { HoodieHBaseIndexConfig config = new HoodieHBaseIndexConfig(props); - setDefaultOnCondition(props, !props.containsKey(HBASE_GET_BATCH_SIZE_PROP), - HBASE_GET_BATCH_SIZE_PROP, String.valueOf(DEFAULT_HBASE_BATCH_SIZE)); - setDefaultOnCondition(props, !props.containsKey(HBASE_PUT_BATCH_SIZE_PROP), - HBASE_PUT_BATCH_SIZE_PROP, String.valueOf(DEFAULT_HBASE_BATCH_SIZE)); + setDefaultOnCondition(props, !props.containsKey(HBASE_GET_BATCH_SIZE_PROP), HBASE_GET_BATCH_SIZE_PROP, + String.valueOf(DEFAULT_HBASE_BATCH_SIZE)); + setDefaultOnCondition(props, !props.containsKey(HBASE_PUT_BATCH_SIZE_PROP), HBASE_PUT_BATCH_SIZE_PROP, + String.valueOf(DEFAULT_HBASE_BATCH_SIZE)); setDefaultOnCondition(props, !props.containsKey(HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE_PROP), HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE_PROP, String.valueOf(DEFAULT_HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE)); - setDefaultOnCondition(props, !props.containsKey(HBASE_QPS_FRACTION_PROP), - HBASE_QPS_FRACTION_PROP, String.valueOf(DEFAULT_HBASE_QPS_FRACTION)); + setDefaultOnCondition(props, !props.containsKey(HBASE_QPS_FRACTION_PROP), HBASE_QPS_FRACTION_PROP, + String.valueOf(DEFAULT_HBASE_QPS_FRACTION)); setDefaultOnCondition(props, !props.containsKey(HBASE_MAX_QPS_PER_REGION_SERVER_PROP), HBASE_MAX_QPS_PER_REGION_SERVER_PROP, String.valueOf(DEFAULT_HBASE_MAX_QPS_PER_REGION_SERVER)); setDefaultOnCondition(props, !props.containsKey(HOODIE_INDEX_COMPUTE_QPS_DYNAMICALLY), HOODIE_INDEX_COMPUTE_QPS_DYNAMICALLY, String.valueOf(DEFAULT_HOODIE_INDEX_COMPUTE_QPS_DYNAMICALLY)); - setDefaultOnCondition(props, !props.containsKey(HBASE_INDEX_QPS_ALLOCATOR_CLASS), - HBASE_INDEX_QPS_ALLOCATOR_CLASS, String.valueOf(DEFAULT_HBASE_INDEX_QPS_ALLOCATOR_CLASS)); + setDefaultOnCondition(props, !props.containsKey(HBASE_INDEX_QPS_ALLOCATOR_CLASS), HBASE_INDEX_QPS_ALLOCATOR_CLASS, + String.valueOf(DEFAULT_HBASE_INDEX_QPS_ALLOCATOR_CLASS)); setDefaultOnCondition(props, !props.containsKey(HOODIE_INDEX_DESIRED_PUTS_TIME_IN_SECS), HOODIE_INDEX_DESIRED_PUTS_TIME_IN_SECS, String.valueOf(DEFAULT_HOODIE_INDEX_DESIRED_PUTS_TIME_IN_SECS)); - setDefaultOnCondition(props, !props.containsKey(HBASE_ZK_PATH_QPS_ROOT), - HBASE_ZK_PATH_QPS_ROOT, String.valueOf(DEFAULT_HBASE_ZK_PATH_QPS_ROOT)); + setDefaultOnCondition(props, !props.containsKey(HBASE_ZK_PATH_QPS_ROOT), HBASE_ZK_PATH_QPS_ROOT, + String.valueOf(DEFAULT_HBASE_ZK_PATH_QPS_ROOT)); setDefaultOnCondition(props, !props.containsKey(HOODIE_INDEX_HBASE_ZK_SESSION_TIMEOUT_MS), HOODIE_INDEX_HBASE_ZK_SESSION_TIMEOUT_MS, String.valueOf(DEFAULT_ZK_SESSION_TIMEOUT_MS)); setDefaultOnCondition(props, !props.containsKey(HOODIE_INDEX_HBASE_ZK_CONNECTION_TIMEOUT_MS), HOODIE_INDEX_HBASE_ZK_CONNECTION_TIMEOUT_MS, String.valueOf(DEFAULT_ZK_CONNECTION_TIMEOUT_MS)); - setDefaultOnCondition(props, !props.containsKey(HBASE_INDEX_QPS_ALLOCATOR_CLASS), - HBASE_INDEX_QPS_ALLOCATOR_CLASS, String.valueOf(DEFAULT_HBASE_INDEX_QPS_ALLOCATOR_CLASS)); + setDefaultOnCondition(props, !props.containsKey(HBASE_INDEX_QPS_ALLOCATOR_CLASS), HBASE_INDEX_QPS_ALLOCATOR_CLASS, + String.valueOf(DEFAULT_HBASE_INDEX_QPS_ALLOCATOR_CLASS)); return config; } diff --git a/hudi-client/src/main/java/org/apache/hudi/config/HoodieIndexConfig.java b/hudi-client/src/main/java/org/apache/hudi/config/HoodieIndexConfig.java index 85786782f..24b98f6e5 100644 --- a/hudi-client/src/main/java/org/apache/hudi/config/HoodieIndexConfig.java +++ b/hudi-client/src/main/java/org/apache/hudi/config/HoodieIndexConfig.java @@ -34,7 +34,7 @@ public class HoodieIndexConfig extends DefaultHoodieConfig { public static final String INDEX_TYPE_PROP = "hoodie.index.type"; public static final String DEFAULT_INDEX_TYPE = HoodieIndex.IndexType.BLOOM.name(); - // ***** Bloom Index configs ***** + // ***** Bloom Index configs ***** public static final String BLOOM_FILTER_NUM_ENTRIES = "hoodie.index.bloom.num_entries"; public static final String DEFAULT_BLOOM_FILTER_NUM_ENTRIES = "60000"; public static final String BLOOM_FILTER_FPP = "hoodie.index.bloom.fpp"; @@ -42,8 +42,7 @@ public class HoodieIndexConfig extends DefaultHoodieConfig { public static final String BLOOM_INDEX_PARALLELISM_PROP = "hoodie.bloom.index.parallelism"; // Disable explicit bloom index parallelism setting by default - hoodie auto computes public static final String DEFAULT_BLOOM_INDEX_PARALLELISM = "0"; - public static final String BLOOM_INDEX_PRUNE_BY_RANGES_PROP = - "hoodie.bloom.index.prune.by" + ".ranges"; + public static final String BLOOM_INDEX_PRUNE_BY_RANGES_PROP = "hoodie.bloom.index.prune.by" + ".ranges"; public static final String DEFAULT_BLOOM_INDEX_PRUNE_BY_RANGES = "true"; public static final String BLOOM_INDEX_USE_CACHING_PROP = "hoodie.bloom.index.use.caching"; public static final String DEFAULT_BLOOM_INDEX_USE_CACHING = "true"; @@ -67,8 +66,7 @@ public class HoodieIndexConfig extends DefaultHoodieConfig { public static final String DEFAULT_HBASE_BATCH_SIZE = "100"; - public static final String BLOOM_INDEX_INPUT_STORAGE_LEVEL = - "hoodie.bloom.index.input.storage" + ".level"; + public static final String BLOOM_INDEX_INPUT_STORAGE_LEVEL = "hoodie.bloom.index.input.storage" + ".level"; public static final String DEFAULT_BLOOM_INDEX_INPUT_STORAGE_LEVEL = "MEMORY_AND_DISK_SER"; private HoodieIndexConfig(Properties props) { @@ -175,20 +173,18 @@ public class HoodieIndexConfig extends DefaultHoodieConfig { public HoodieIndexConfig build() { HoodieIndexConfig config = new HoodieIndexConfig(props); - setDefaultOnCondition(props, !props.containsKey(INDEX_TYPE_PROP), INDEX_TYPE_PROP, - DEFAULT_INDEX_TYPE); - setDefaultOnCondition(props, !props.containsKey(BLOOM_FILTER_NUM_ENTRIES), - BLOOM_FILTER_NUM_ENTRIES, DEFAULT_BLOOM_FILTER_NUM_ENTRIES); - setDefaultOnCondition(props, !props.containsKey(BLOOM_FILTER_FPP), BLOOM_FILTER_FPP, - DEFAULT_BLOOM_FILTER_FPP); - setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_PARALLELISM_PROP), - BLOOM_INDEX_PARALLELISM_PROP, DEFAULT_BLOOM_INDEX_PARALLELISM); + setDefaultOnCondition(props, !props.containsKey(INDEX_TYPE_PROP), INDEX_TYPE_PROP, DEFAULT_INDEX_TYPE); + setDefaultOnCondition(props, !props.containsKey(BLOOM_FILTER_NUM_ENTRIES), BLOOM_FILTER_NUM_ENTRIES, + DEFAULT_BLOOM_FILTER_NUM_ENTRIES); + setDefaultOnCondition(props, !props.containsKey(BLOOM_FILTER_FPP), BLOOM_FILTER_FPP, DEFAULT_BLOOM_FILTER_FPP); + setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_PARALLELISM_PROP), BLOOM_INDEX_PARALLELISM_PROP, + DEFAULT_BLOOM_INDEX_PARALLELISM); setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_PRUNE_BY_RANGES_PROP), BLOOM_INDEX_PRUNE_BY_RANGES_PROP, DEFAULT_BLOOM_INDEX_PRUNE_BY_RANGES); - setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_USE_CACHING_PROP), - BLOOM_INDEX_USE_CACHING_PROP, DEFAULT_BLOOM_INDEX_USE_CACHING); - setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_INPUT_STORAGE_LEVEL), - BLOOM_INDEX_INPUT_STORAGE_LEVEL, DEFAULT_BLOOM_INDEX_INPUT_STORAGE_LEVEL); + setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_USE_CACHING_PROP), BLOOM_INDEX_USE_CACHING_PROP, + DEFAULT_BLOOM_INDEX_USE_CACHING); + setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_INPUT_STORAGE_LEVEL), BLOOM_INDEX_INPUT_STORAGE_LEVEL, + DEFAULT_BLOOM_INDEX_INPUT_STORAGE_LEVEL); setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_TREE_BASED_FILTER_PROP), BLOOM_INDEX_TREE_BASED_FILTER_PROP, DEFAULT_BLOOM_INDEX_TREE_BASED_FILTER); setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_BUCKETIZED_CHECKING_PROP), diff --git a/hudi-client/src/main/java/org/apache/hudi/config/HoodieMemoryConfig.java b/hudi-client/src/main/java/org/apache/hudi/config/HoodieMemoryConfig.java index 7a41c5745..88f07b91c 100644 --- a/hudi-client/src/main/java/org/apache/hudi/config/HoodieMemoryConfig.java +++ b/hudi-client/src/main/java/org/apache/hudi/config/HoodieMemoryConfig.java @@ -41,8 +41,7 @@ public class HoodieMemoryConfig extends DefaultHoodieConfig { // Default max memory fraction during compaction, excess spills to disk public static final String DEFAULT_MAX_MEMORY_FRACTION_FOR_COMPACTION = String.valueOf(0.6); // Default memory size per compaction (used if SparkEnv is absent), excess spills to disk - public static final long DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES = - 1024 * 1024 * 1024L; // 1GB + public static final long DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES = 1024 * 1024 * 1024L; // 1GB // Property to set the max memory for merge public static final String MAX_MEMORY_FOR_MERGE_PROP = "hoodie.memory.merge.max.size"; // Property to set the max memory for compaction @@ -88,20 +87,17 @@ public class HoodieMemoryConfig extends DefaultHoodieConfig { } public Builder withMaxMemoryFractionPerPartitionMerge(double maxMemoryFractionPerPartitionMerge) { - props.setProperty(MAX_MEMORY_FRACTION_FOR_MERGE_PROP, - String.valueOf(maxMemoryFractionPerPartitionMerge)); + props.setProperty(MAX_MEMORY_FRACTION_FOR_MERGE_PROP, String.valueOf(maxMemoryFractionPerPartitionMerge)); return this; } public Builder withMaxMemoryFractionPerCompaction(double maxMemoryFractionPerCompaction) { - props.setProperty(MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP, - String.valueOf(maxMemoryFractionPerCompaction)); + props.setProperty(MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP, String.valueOf(maxMemoryFractionPerCompaction)); return this; } public Builder withMaxDFSStreamBufferSize(int maxStreamBufferSize) { - props.setProperty(MAX_DFS_STREAM_BUFFER_SIZE_PROP, - String.valueOf(maxStreamBufferSize)); + props.setProperty(MAX_DFS_STREAM_BUFFER_SIZE_PROP, String.valueOf(maxStreamBufferSize)); return this; } @@ -130,19 +126,16 @@ public class HoodieMemoryConfig extends DefaultHoodieConfig { if (SparkEnv.get() != null) { // 1 GB is the default conf used by Spark, look at SparkContext.scala - long executorMemoryInBytes = Utils.memoryStringToMb(SparkEnv.get().conf().get(SPARK_EXECUTOR_MEMORY_PROP, - DEFAULT_SPARK_EXECUTOR_MEMORY_MB)) * 1024 - * 1024L; + long executorMemoryInBytes = Utils.memoryStringToMb( + SparkEnv.get().conf().get(SPARK_EXECUTOR_MEMORY_PROP, DEFAULT_SPARK_EXECUTOR_MEMORY_MB)) * 1024 * 1024L; // 0.6 is the default value used by Spark, // look at {@link // https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/SparkConf.scala#L507} - double memoryFraction = Double - .valueOf(SparkEnv.get().conf().get(SPARK_EXECUTOR_MEMORY_FRACTION_PROP, - DEFAULT_SPARK_EXECUTOR_MEMORY_FRACTION)); + double memoryFraction = Double.valueOf( + SparkEnv.get().conf().get(SPARK_EXECUTOR_MEMORY_FRACTION_PROP, DEFAULT_SPARK_EXECUTOR_MEMORY_FRACTION)); double maxMemoryFractionForMerge = Double.valueOf(maxMemoryFraction); double userAvailableMemory = executorMemoryInBytes * (1 - memoryFraction); - long maxMemoryForMerge = (long) Math - .floor(userAvailableMemory * maxMemoryFractionForMerge); + long maxMemoryForMerge = (long) Math.floor(userAvailableMemory * maxMemoryFractionForMerge); return maxMemoryForMerge; } else { return DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES; @@ -151,29 +144,19 @@ public class HoodieMemoryConfig extends DefaultHoodieConfig { public HoodieMemoryConfig build() { HoodieMemoryConfig config = new HoodieMemoryConfig(props); - setDefaultOnCondition(props, - !props.containsKey(MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP), - MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP, - DEFAULT_MAX_MEMORY_FRACTION_FOR_COMPACTION); - setDefaultOnCondition(props, - !props.containsKey(MAX_MEMORY_FRACTION_FOR_MERGE_PROP), + setDefaultOnCondition(props, !props.containsKey(MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP), + MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP, DEFAULT_MAX_MEMORY_FRACTION_FOR_COMPACTION); + setDefaultOnCondition(props, !props.containsKey(MAX_MEMORY_FRACTION_FOR_MERGE_PROP), MAX_MEMORY_FRACTION_FOR_MERGE_PROP, DEFAULT_MAX_MEMORY_FRACTION_FOR_MERGE); - setDefaultOnCondition(props, - !props.containsKey(MAX_MEMORY_FOR_MERGE_PROP), - MAX_MEMORY_FOR_MERGE_PROP, String.valueOf( - getMaxMemoryAllowedForMerge(props.getProperty(MAX_MEMORY_FRACTION_FOR_MERGE_PROP)))); - setDefaultOnCondition(props, - !props.containsKey(MAX_MEMORY_FOR_COMPACTION_PROP), - MAX_MEMORY_FOR_COMPACTION_PROP, String.valueOf( - getMaxMemoryAllowedForMerge(props.getProperty(MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP)))); - setDefaultOnCondition(props, - !props.containsKey(MAX_DFS_STREAM_BUFFER_SIZE_PROP), - MAX_DFS_STREAM_BUFFER_SIZE_PROP, String.valueOf(DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE)); - setDefaultOnCondition(props, - !props.containsKey(SPILLABLE_MAP_BASE_PATH_PROP), - SPILLABLE_MAP_BASE_PATH_PROP, DEFAULT_SPILLABLE_MAP_BASE_PATH); - setDefaultOnCondition(props, - !props.containsKey(WRITESTATUS_FAILURE_FRACTION_PROP), + setDefaultOnCondition(props, !props.containsKey(MAX_MEMORY_FOR_MERGE_PROP), MAX_MEMORY_FOR_MERGE_PROP, + String.valueOf(getMaxMemoryAllowedForMerge(props.getProperty(MAX_MEMORY_FRACTION_FOR_MERGE_PROP)))); + setDefaultOnCondition(props, !props.containsKey(MAX_MEMORY_FOR_COMPACTION_PROP), MAX_MEMORY_FOR_COMPACTION_PROP, + String.valueOf(getMaxMemoryAllowedForMerge(props.getProperty(MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP)))); + setDefaultOnCondition(props, !props.containsKey(MAX_DFS_STREAM_BUFFER_SIZE_PROP), MAX_DFS_STREAM_BUFFER_SIZE_PROP, + String.valueOf(DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE)); + setDefaultOnCondition(props, !props.containsKey(SPILLABLE_MAP_BASE_PATH_PROP), SPILLABLE_MAP_BASE_PATH_PROP, + DEFAULT_SPILLABLE_MAP_BASE_PATH); + setDefaultOnCondition(props, !props.containsKey(WRITESTATUS_FAILURE_FRACTION_PROP), WRITESTATUS_FAILURE_FRACTION_PROP, String.valueOf(DEFAULT_WRITESTATUS_FAILURE_FRACTION)); return config; } diff --git a/hudi-client/src/main/java/org/apache/hudi/config/HoodieMetricsConfig.java b/hudi-client/src/main/java/org/apache/hudi/config/HoodieMetricsConfig.java index e21e00288..0074c7253 100644 --- a/hudi-client/src/main/java/org/apache/hudi/config/HoodieMetricsConfig.java +++ b/hudi-client/src/main/java/org/apache/hudi/config/HoodieMetricsConfig.java @@ -35,8 +35,7 @@ public class HoodieMetricsConfig extends DefaultHoodieConfig { public static final String METRICS_ON = METRIC_PREFIX + ".on"; public static final boolean DEFAULT_METRICS_ON = false; public static final String METRICS_REPORTER_TYPE = METRIC_PREFIX + ".reporter.type"; - public static final MetricsReporterType DEFAULT_METRICS_REPORTER_TYPE = MetricsReporterType - .GRAPHITE; + public static final MetricsReporterType DEFAULT_METRICS_REPORTER_TYPE = MetricsReporterType.GRAPHITE; // Graphite public static final String GRAPHITE_PREFIX = METRIC_PREFIX + ".graphite"; @@ -103,8 +102,7 @@ public class HoodieMetricsConfig extends DefaultHoodieConfig { public HoodieMetricsConfig build() { HoodieMetricsConfig config = new HoodieMetricsConfig(props); - setDefaultOnCondition(props, !props.containsKey(METRICS_ON), METRICS_ON, - String.valueOf(DEFAULT_METRICS_ON)); + setDefaultOnCondition(props, !props.containsKey(METRICS_ON), METRICS_ON, String.valueOf(DEFAULT_METRICS_ON)); setDefaultOnCondition(props, !props.containsKey(METRICS_REPORTER_TYPE), METRICS_REPORTER_TYPE, DEFAULT_METRICS_REPORTER_TYPE.name()); setDefaultOnCondition(props, !props.containsKey(GRAPHITE_SERVER_HOST), GRAPHITE_SERVER_HOST, diff --git a/hudi-client/src/main/java/org/apache/hudi/config/HoodieStorageConfig.java b/hudi-client/src/main/java/org/apache/hudi/config/HoodieStorageConfig.java index 27060d5e4..90fdb6cae 100644 --- a/hudi-client/src/main/java/org/apache/hudi/config/HoodieStorageConfig.java +++ b/hudi-client/src/main/java/org/apache/hudi/config/HoodieStorageConfig.java @@ -38,8 +38,7 @@ public class HoodieStorageConfig extends DefaultHoodieConfig { public static final String DEFAULT_PARQUET_PAGE_SIZE_BYTES = String.valueOf(1 * 1024 * 1024); // used to size log files public static final String LOGFILE_SIZE_MAX_BYTES = "hoodie.logfile.max.size"; - public static final String DEFAULT_LOGFILE_SIZE_MAX_BYTES = String - .valueOf(1024 * 1024 * 1024); // 1 GB + public static final String DEFAULT_LOGFILE_SIZE_MAX_BYTES = String.valueOf(1024 * 1024 * 1024); // 1 GB // used to size data blocks in log file public static final String LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES = "hoodie.logfile.data.block.max.size"; public static final String DEFAULT_LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES = String.valueOf(256 * 1024 * 1024); // 256 MB @@ -122,20 +121,20 @@ public class HoodieStorageConfig extends DefaultHoodieConfig { public HoodieStorageConfig build() { HoodieStorageConfig config = new HoodieStorageConfig(props); - setDefaultOnCondition(props, !props.containsKey(PARQUET_FILE_MAX_BYTES), - PARQUET_FILE_MAX_BYTES, DEFAULT_PARQUET_FILE_MAX_BYTES); - setDefaultOnCondition(props, !props.containsKey(PARQUET_BLOCK_SIZE_BYTES), - PARQUET_BLOCK_SIZE_BYTES, DEFAULT_PARQUET_BLOCK_SIZE_BYTES); - setDefaultOnCondition(props, !props.containsKey(PARQUET_PAGE_SIZE_BYTES), - PARQUET_PAGE_SIZE_BYTES, DEFAULT_PARQUET_PAGE_SIZE_BYTES); + setDefaultOnCondition(props, !props.containsKey(PARQUET_FILE_MAX_BYTES), PARQUET_FILE_MAX_BYTES, + DEFAULT_PARQUET_FILE_MAX_BYTES); + setDefaultOnCondition(props, !props.containsKey(PARQUET_BLOCK_SIZE_BYTES), PARQUET_BLOCK_SIZE_BYTES, + DEFAULT_PARQUET_BLOCK_SIZE_BYTES); + setDefaultOnCondition(props, !props.containsKey(PARQUET_PAGE_SIZE_BYTES), PARQUET_PAGE_SIZE_BYTES, + DEFAULT_PARQUET_PAGE_SIZE_BYTES); setDefaultOnCondition(props, !props.containsKey(LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES), LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES, DEFAULT_LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES); - setDefaultOnCondition(props, !props.containsKey(LOGFILE_SIZE_MAX_BYTES), - LOGFILE_SIZE_MAX_BYTES, DEFAULT_LOGFILE_SIZE_MAX_BYTES); - setDefaultOnCondition(props, !props.containsKey(PARQUET_COMPRESSION_RATIO), - PARQUET_COMPRESSION_RATIO, DEFAULT_STREAM_COMPRESSION_RATIO); - setDefaultOnCondition(props, !props.containsKey(PARQUET_COMPRESSION_CODEC), - PARQUET_COMPRESSION_CODEC, DEFAULT_PARQUET_COMPRESSION_CODEC); + setDefaultOnCondition(props, !props.containsKey(LOGFILE_SIZE_MAX_BYTES), LOGFILE_SIZE_MAX_BYTES, + DEFAULT_LOGFILE_SIZE_MAX_BYTES); + setDefaultOnCondition(props, !props.containsKey(PARQUET_COMPRESSION_RATIO), PARQUET_COMPRESSION_RATIO, + DEFAULT_STREAM_COMPRESSION_RATIO); + setDefaultOnCondition(props, !props.containsKey(PARQUET_COMPRESSION_CODEC), PARQUET_COMPRESSION_CODEC, + DEFAULT_PARQUET_COMPRESSION_CODEC); setDefaultOnCondition(props, !props.containsKey(LOGFILE_TO_PARQUET_COMPRESSION_RATIO), LOGFILE_TO_PARQUET_COMPRESSION_RATIO, DEFAULT_LOGFILE_TO_PARQUET_COMPRESSION_RATIO); return config; diff --git a/hudi-client/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java b/hudi-client/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java index 82a8f9055..35975565f 100644 --- a/hudi-client/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java +++ b/hudi-client/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java @@ -61,8 +61,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig { private static final String DEFAULT_WRITE_STATUS_STORAGE_LEVEL = "MEMORY_AND_DISK_SER"; private static final String HOODIE_AUTO_COMMIT_PROP = "hoodie.auto.commit"; private static final String DEFAULT_HOODIE_AUTO_COMMIT = "true"; - private static final String HOODIE_ASSUME_DATE_PARTITIONING_PROP = - "hoodie.assume.date" + ".partitioning"; + private static final String HOODIE_ASSUME_DATE_PARTITIONING_PROP = "hoodie.assume.date" + ".partitioning"; private static final String DEFAULT_ASSUME_DATE_PARTITIONING = "false"; private static final String HOODIE_WRITE_STATUS_CLASS_PROP = "hoodie.writestatus.class"; private static final String DEFAULT_HOODIE_WRITE_STATUS_CLASS = WriteStatus.class.getName(); @@ -143,8 +142,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig { } public int getWriteBufferLimitBytes() { - return Integer - .parseInt(props.getProperty(WRITE_BUFFER_LIMIT_BYTES, DEFAULT_WRITE_BUFFER_LIMIT_BYTES)); + return Integer.parseInt(props.getProperty(WRITE_BUFFER_LIMIT_BYTES, DEFAULT_WRITE_BUFFER_LIMIT_BYTES)); } public boolean shouldCombineBeforeInsert() { @@ -191,18 +189,15 @@ public class HoodieWriteConfig extends DefaultHoodieConfig { * compaction properties **/ public HoodieCleaningPolicy getCleanerPolicy() { - return HoodieCleaningPolicy - .valueOf(props.getProperty(HoodieCompactionConfig.CLEANER_POLICY_PROP)); + return HoodieCleaningPolicy.valueOf(props.getProperty(HoodieCompactionConfig.CLEANER_POLICY_PROP)); } public int getCleanerFileVersionsRetained() { - return Integer - .parseInt(props.getProperty(HoodieCompactionConfig.CLEANER_FILE_VERSIONS_RETAINED_PROP)); + return Integer.parseInt(props.getProperty(HoodieCompactionConfig.CLEANER_FILE_VERSIONS_RETAINED_PROP)); } public int getCleanerCommitsRetained() { - return Integer - .parseInt(props.getProperty(HoodieCompactionConfig.CLEANER_COMMITS_RETAINED_PROP)); + return Integer.parseInt(props.getProperty(HoodieCompactionConfig.CLEANER_COMMITS_RETAINED_PROP)); } public int getMaxCommitsToKeep() { @@ -214,23 +209,19 @@ public class HoodieWriteConfig extends DefaultHoodieConfig { } public int getParquetSmallFileLimit() { - return Integer - .parseInt(props.getProperty(HoodieCompactionConfig.PARQUET_SMALL_FILE_LIMIT_BYTES)); + return Integer.parseInt(props.getProperty(HoodieCompactionConfig.PARQUET_SMALL_FILE_LIMIT_BYTES)); } public int getCopyOnWriteInsertSplitSize() { - return Integer - .parseInt(props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE)); + return Integer.parseInt(props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE)); } public int getCopyOnWriteRecordSizeEstimate() { - return Integer.parseInt( - props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE)); + return Integer.parseInt(props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE)); } public boolean shouldAutoTuneInsertSplits() { - return Boolean.parseBoolean( - props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS)); + return Boolean.parseBoolean(props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS)); } public int getCleanerParallelism() { @@ -246,28 +237,23 @@ public class HoodieWriteConfig extends DefaultHoodieConfig { } public int getInlineCompactDeltaCommitMax() { - return Integer - .parseInt(props.getProperty(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS_PROP)); + return Integer.parseInt(props.getProperty(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS_PROP)); } public CompactionStrategy getCompactionStrategy() { - return ReflectionUtils - .loadClass(props.getProperty(HoodieCompactionConfig.COMPACTION_STRATEGY_PROP)); + return ReflectionUtils.loadClass(props.getProperty(HoodieCompactionConfig.COMPACTION_STRATEGY_PROP)); } public Long getTargetIOPerCompactionInMB() { - return Long - .parseLong(props.getProperty(HoodieCompactionConfig.TARGET_IO_PER_COMPACTION_IN_MB_PROP)); + return Long.parseLong(props.getProperty(HoodieCompactionConfig.TARGET_IO_PER_COMPACTION_IN_MB_PROP)); } public Boolean getCompactionLazyBlockReadEnabled() { - return Boolean - .valueOf(props.getProperty(HoodieCompactionConfig.COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP)); + return Boolean.valueOf(props.getProperty(HoodieCompactionConfig.COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP)); } public Boolean getCompactionReverseLogReadEnabled() { - return Boolean.valueOf( - props.getProperty(HoodieCompactionConfig.COMPACTION_REVERSE_LOG_READ_ENABLED_PROP)); + return Boolean.valueOf(props.getProperty(HoodieCompactionConfig.COMPACTION_REVERSE_LOG_READ_ENABLED_PROP)); } public String getPayloadClass() { @@ -275,13 +261,11 @@ public class HoodieWriteConfig extends DefaultHoodieConfig { } public int getTargetPartitionsPerDayBasedCompaction() { - return Integer - .parseInt(props.getProperty(HoodieCompactionConfig.TARGET_PARTITIONS_PER_DAYBASED_COMPACTION_PROP)); + return Integer.parseInt(props.getProperty(HoodieCompactionConfig.TARGET_PARTITIONS_PER_DAYBASED_COMPACTION_PROP)); } public int getCommitArchivalBatchSize() { - return Integer - .parseInt(props.getProperty(HoodieCompactionConfig.COMMITS_ARCHIVAL_BATCH_SIZE_PROP)); + return Integer.parseInt(props.getProperty(HoodieCompactionConfig.COMMITS_ARCHIVAL_BATCH_SIZE_PROP)); } /** @@ -352,9 +336,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig { } /** - * Fraction of the global share of QPS that should be allocated to this job. - * Let's say there are 3 jobs which have input size in terms of number of rows - * required for HbaseIndexing as x, 2x, 3x respectively. Then this fraction for + * Fraction of the global share of QPS that should be allocated to this job. Let's say there are 3 jobs which have + * input size in terms of number of rows required for HbaseIndexing as x, 2x, 3x respectively. Then this fraction for * the jobs would be (0.17) 1/6, 0.33 (2/6) and 0.5 (3/6) respectively. */ public float getHbaseIndexQPSFraction() { @@ -370,8 +353,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig { } /** - * This should be same across various jobs. This is intended to limit the aggregate - * QPS generated across various Hoodie jobs to an Hbase Region Server + * This should be same across various jobs. This is intended to limit the aggregate QPS generated across various + * Hoodie jobs to an Hbase Region Server */ public int getHbaseIndexMaxQPSPerRegionServer() { return Integer.parseInt(props.getProperty(HoodieHBaseIndexConfig.HBASE_MAX_QPS_PER_REGION_SERVER_PROP)); @@ -382,8 +365,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig { } public boolean getBloomIndexPruneByRanges() { - return Boolean - .parseBoolean(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_PRUNE_BY_RANGES_PROP)); + return Boolean.parseBoolean(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_PRUNE_BY_RANGES_PROP)); } public boolean getBloomIndexUseCaching() { @@ -403,8 +385,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig { } public StorageLevel getBloomIndexInputStorageLevel() { - return StorageLevel - .fromString(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_INPUT_STORAGE_LEVEL)); + return StorageLevel.fromString(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_INPUT_STORAGE_LEVEL)); } /** @@ -423,8 +404,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig { } public int getLogFileDataBlockMaxSize() { - return Integer - .parseInt(props.getProperty(HoodieStorageConfig.LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES)); + return Integer.parseInt(props.getProperty(HoodieStorageConfig.LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES)); } public int getLogFileMaxSize() { @@ -451,8 +431,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig { } public MetricsReporterType getMetricsReporterType() { - return MetricsReporterType - .valueOf(props.getProperty(HoodieMetricsConfig.METRICS_REPORTER_TYPE)); + return MetricsReporterType.valueOf(props.getProperty(HoodieMetricsConfig.METRICS_REPORTER_TYPE)); } public String getGraphiteServerHost() { @@ -475,9 +454,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig { } public Double getMaxMemoryFractionPerCompaction() { - return Double - .valueOf( - props.getProperty(HoodieMemoryConfig.MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP)); + return Double.valueOf(props.getProperty(HoodieMemoryConfig.MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP)); } public Long getMaxMemoryPerPartitionMerge() { @@ -637,8 +614,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig { } public Builder withAssumeDatePartitioning(boolean assumeDatePartitioning) { - props.setProperty(HOODIE_ASSUME_DATE_PARTITIONING_PROP, - String.valueOf(assumeDatePartitioning)); + props.setProperty(HOODIE_ASSUME_DATE_PARTITIONING_PROP, String.valueOf(assumeDatePartitioning)); return this; } @@ -671,48 +647,42 @@ public class HoodieWriteConfig extends DefaultHoodieConfig { public HoodieWriteConfig build() { // Check for mandatory properties - setDefaultOnCondition(props, !props.containsKey(INSERT_PARALLELISM), INSERT_PARALLELISM, + setDefaultOnCondition(props, !props.containsKey(INSERT_PARALLELISM), INSERT_PARALLELISM, DEFAULT_PARALLELISM); + setDefaultOnCondition(props, !props.containsKey(BULKINSERT_PARALLELISM), BULKINSERT_PARALLELISM, DEFAULT_PARALLELISM); - setDefaultOnCondition(props, !props.containsKey(BULKINSERT_PARALLELISM), - BULKINSERT_PARALLELISM, DEFAULT_PARALLELISM); - setDefaultOnCondition(props, !props.containsKey(UPSERT_PARALLELISM), UPSERT_PARALLELISM, - DEFAULT_PARALLELISM); - setDefaultOnCondition(props, !props.containsKey(COMBINE_BEFORE_INSERT_PROP), - COMBINE_BEFORE_INSERT_PROP, DEFAULT_COMBINE_BEFORE_INSERT); - setDefaultOnCondition(props, !props.containsKey(COMBINE_BEFORE_UPSERT_PROP), - COMBINE_BEFORE_UPSERT_PROP, DEFAULT_COMBINE_BEFORE_UPSERT); - setDefaultOnCondition(props, !props.containsKey(WRITE_STATUS_STORAGE_LEVEL), - WRITE_STATUS_STORAGE_LEVEL, DEFAULT_WRITE_STATUS_STORAGE_LEVEL); - setDefaultOnCondition(props, !props.containsKey(HOODIE_AUTO_COMMIT_PROP), - HOODIE_AUTO_COMMIT_PROP, DEFAULT_HOODIE_AUTO_COMMIT); + setDefaultOnCondition(props, !props.containsKey(UPSERT_PARALLELISM), UPSERT_PARALLELISM, DEFAULT_PARALLELISM); + setDefaultOnCondition(props, !props.containsKey(COMBINE_BEFORE_INSERT_PROP), COMBINE_BEFORE_INSERT_PROP, + DEFAULT_COMBINE_BEFORE_INSERT); + setDefaultOnCondition(props, !props.containsKey(COMBINE_BEFORE_UPSERT_PROP), COMBINE_BEFORE_UPSERT_PROP, + DEFAULT_COMBINE_BEFORE_UPSERT); + setDefaultOnCondition(props, !props.containsKey(WRITE_STATUS_STORAGE_LEVEL), WRITE_STATUS_STORAGE_LEVEL, + DEFAULT_WRITE_STATUS_STORAGE_LEVEL); + setDefaultOnCondition(props, !props.containsKey(HOODIE_AUTO_COMMIT_PROP), HOODIE_AUTO_COMMIT_PROP, + DEFAULT_HOODIE_AUTO_COMMIT); setDefaultOnCondition(props, !props.containsKey(HOODIE_ASSUME_DATE_PARTITIONING_PROP), HOODIE_ASSUME_DATE_PARTITIONING_PROP, DEFAULT_ASSUME_DATE_PARTITIONING); - setDefaultOnCondition(props, !props.containsKey(HOODIE_WRITE_STATUS_CLASS_PROP), - HOODIE_WRITE_STATUS_CLASS_PROP, DEFAULT_HOODIE_WRITE_STATUS_CLASS); - setDefaultOnCondition(props, !props.containsKey(FINALIZE_WRITE_PARALLELISM), - FINALIZE_WRITE_PARALLELISM, DEFAULT_FINALIZE_WRITE_PARALLELISM); + setDefaultOnCondition(props, !props.containsKey(HOODIE_WRITE_STATUS_CLASS_PROP), HOODIE_WRITE_STATUS_CLASS_PROP, + DEFAULT_HOODIE_WRITE_STATUS_CLASS); + setDefaultOnCondition(props, !props.containsKey(FINALIZE_WRITE_PARALLELISM), FINALIZE_WRITE_PARALLELISM, + DEFAULT_FINALIZE_WRITE_PARALLELISM); setDefaultOnCondition(props, !props.containsKey(EMBEDDED_TIMELINE_SERVER_ENABLED), EMBEDDED_TIMELINE_SERVER_ENABLED, DEFAULT_EMBEDDED_TIMELINE_SERVER_ENABLED); setDefaultOnCondition(props, !props.containsKey(INITIAL_CONSISTENCY_CHECK_INTERVAL_MS_PROP), INITIAL_CONSISTENCY_CHECK_INTERVAL_MS_PROP, String.valueOf(DEFAULT_INITIAL_CONSISTENCY_CHECK_INTERVAL_MS)); setDefaultOnCondition(props, !props.containsKey(MAX_CONSISTENCY_CHECK_INTERVAL_MS_PROP), MAX_CONSISTENCY_CHECK_INTERVAL_MS_PROP, String.valueOf(DEFAULT_MAX_CONSISTENCY_CHECK_INTERVAL_MS)); - setDefaultOnCondition(props, !props.containsKey(MAX_CONSISTENCY_CHECKS_PROP), - MAX_CONSISTENCY_CHECKS_PROP, String.valueOf(DEFAULT_MAX_CONSISTENCY_CHECKS)); + setDefaultOnCondition(props, !props.containsKey(MAX_CONSISTENCY_CHECKS_PROP), MAX_CONSISTENCY_CHECKS_PROP, + String.valueOf(DEFAULT_MAX_CONSISTENCY_CHECKS)); setDefaultOnCondition(props, !props.containsKey(FAIL_ON_TIMELINE_ARCHIVING_ENABLED_PROP), - FAIL_ON_TIMELINE_ARCHIVING_ENABLED_PROP, DEFAULT_FAIL_ON_TIMELINE_ARCHIVING_ENABLED); + FAIL_ON_TIMELINE_ARCHIVING_ENABLED_PROP, DEFAULT_FAIL_ON_TIMELINE_ARCHIVING_ENABLED); // Make sure the props is propagated - setDefaultOnCondition(props, !isIndexConfigSet, - HoodieIndexConfig.newBuilder().fromProperties(props).build()); - setDefaultOnCondition(props, !isStorageConfigSet, - HoodieStorageConfig.newBuilder().fromProperties(props).build()); + setDefaultOnCondition(props, !isIndexConfigSet, HoodieIndexConfig.newBuilder().fromProperties(props).build()); + setDefaultOnCondition(props, !isStorageConfigSet, HoodieStorageConfig.newBuilder().fromProperties(props).build()); setDefaultOnCondition(props, !isCompactionConfigSet, HoodieCompactionConfig.newBuilder().fromProperties(props).build()); - setDefaultOnCondition(props, !isMetricsConfigSet, - HoodieMetricsConfig.newBuilder().fromProperties(props).build()); - setDefaultOnCondition(props, !isMemoryConfigSet, - HoodieMemoryConfig.newBuilder().fromProperties(props).build()); + setDefaultOnCondition(props, !isMetricsConfigSet, HoodieMetricsConfig.newBuilder().fromProperties(props).build()); + setDefaultOnCondition(props, !isMemoryConfigSet, HoodieMemoryConfig.newBuilder().fromProperties(props).build()); setDefaultOnCondition(props, !isViewConfigSet, FileSystemViewStorageConfig.newBuilder().fromProperties(props).build()); setDefaultOnCondition(props, !isConsistencyGuardSet, diff --git a/hudi-client/src/main/java/org/apache/hudi/exception/HoodieAppendException.java b/hudi-client/src/main/java/org/apache/hudi/exception/HoodieAppendException.java index b2bb38eda..e6035699a 100644 --- a/hudi-client/src/main/java/org/apache/hudi/exception/HoodieAppendException.java +++ b/hudi-client/src/main/java/org/apache/hudi/exception/HoodieAppendException.java @@ -19,8 +19,9 @@ package org.apache.hudi.exception; /** - *

Exception thrown for any higher level errors when HoodieClient is doing a delta - * commit

+ *

+ * Exception thrown for any higher level errors when HoodieClient is doing a delta commit + *

*/ public class HoodieAppendException extends HoodieException { diff --git a/hudi-client/src/main/java/org/apache/hudi/exception/HoodieCommitException.java b/hudi-client/src/main/java/org/apache/hudi/exception/HoodieCommitException.java index 46e573b80..3fb15a634 100644 --- a/hudi-client/src/main/java/org/apache/hudi/exception/HoodieCommitException.java +++ b/hudi-client/src/main/java/org/apache/hudi/exception/HoodieCommitException.java @@ -19,7 +19,8 @@ package org.apache.hudi.exception; /** - *

Exception thrown for any higher level errors when HoodieClient is doing a Commit + *

+ * Exception thrown for any higher level errors when HoodieClient is doing a Commit *

*/ public class HoodieCommitException extends HoodieException { diff --git a/hudi-client/src/main/java/org/apache/hudi/exception/HoodieDependentSystemUnavailableException.java b/hudi-client/src/main/java/org/apache/hudi/exception/HoodieDependentSystemUnavailableException.java index ffb125533..4530817d6 100644 --- a/hudi-client/src/main/java/org/apache/hudi/exception/HoodieDependentSystemUnavailableException.java +++ b/hudi-client/src/main/java/org/apache/hudi/exception/HoodieDependentSystemUnavailableException.java @@ -20,7 +20,9 @@ package org.apache.hudi.exception; /** - *

Exception thrown when dependent system is not available

+ *

+ * Exception thrown when dependent system is not available + *

*/ public class HoodieDependentSystemUnavailableException extends HoodieException { diff --git a/hudi-client/src/main/java/org/apache/hudi/exception/HoodieInsertException.java b/hudi-client/src/main/java/org/apache/hudi/exception/HoodieInsertException.java index 3de9fe6b1..37995bf7d 100644 --- a/hudi-client/src/main/java/org/apache/hudi/exception/HoodieInsertException.java +++ b/hudi-client/src/main/java/org/apache/hudi/exception/HoodieInsertException.java @@ -19,8 +19,9 @@ package org.apache.hudi.exception; /** - *

Exception thrown for any higher level errors when HoodieClient is doing a bulk - * insert

+ *

+ * Exception thrown for any higher level errors when HoodieClient is doing a bulk insert + *

*/ public class HoodieInsertException extends HoodieException { diff --git a/hudi-client/src/main/java/org/apache/hudi/exception/HoodieUpsertException.java b/hudi-client/src/main/java/org/apache/hudi/exception/HoodieUpsertException.java index c57dd697c..062ef67f7 100644 --- a/hudi-client/src/main/java/org/apache/hudi/exception/HoodieUpsertException.java +++ b/hudi-client/src/main/java/org/apache/hudi/exception/HoodieUpsertException.java @@ -19,8 +19,9 @@ package org.apache.hudi.exception; /** - *

Exception thrown for any higher level errors when HoodieClient is doing a - * incremental upsert

+ *

+ * Exception thrown for any higher level errors when HoodieClient is doing a incremental upsert + *

*/ public class HoodieUpsertException extends HoodieException { diff --git a/hudi-client/src/main/java/org/apache/hudi/func/BulkInsertMapFunction.java b/hudi-client/src/main/java/org/apache/hudi/func/BulkInsertMapFunction.java index 86a2a9fb7..417574eda 100644 --- a/hudi-client/src/main/java/org/apache/hudi/func/BulkInsertMapFunction.java +++ b/hudi-client/src/main/java/org/apache/hudi/func/BulkInsertMapFunction.java @@ -31,16 +31,16 @@ import org.apache.spark.api.java.function.Function2; /** * Map function that handles a sorted stream of HoodieRecords */ -public class BulkInsertMapFunction implements - Function2>, Iterator>> { +public class BulkInsertMapFunction + implements Function2>, Iterator>> { private String commitTime; private HoodieWriteConfig config; private HoodieTable hoodieTable; private List fileIDPrefixes; - public BulkInsertMapFunction(String commitTime, HoodieWriteConfig config, - HoodieTable hoodieTable, List fileIDPrefixes) { + public BulkInsertMapFunction(String commitTime, HoodieWriteConfig config, HoodieTable hoodieTable, + List fileIDPrefixes) { this.commitTime = commitTime; this.config = config; this.hoodieTable = hoodieTable; diff --git a/hudi-client/src/main/java/org/apache/hudi/func/CopyOnWriteLazyInsertIterable.java b/hudi-client/src/main/java/org/apache/hudi/func/CopyOnWriteLazyInsertIterable.java index a772ae6c0..13915a0ca 100644 --- a/hudi-client/src/main/java/org/apache/hudi/func/CopyOnWriteLazyInsertIterable.java +++ b/hudi-client/src/main/java/org/apache/hudi/func/CopyOnWriteLazyInsertIterable.java @@ -37,11 +37,10 @@ import org.apache.hudi.io.HoodieWriteHandle; import org.apache.hudi.table.HoodieTable; /** - * Lazy Iterable, that writes a stream of HoodieRecords sorted by the partitionPath, into new - * files. + * Lazy Iterable, that writes a stream of HoodieRecords sorted by the partitionPath, into new files. */ -public class CopyOnWriteLazyInsertIterable extends - LazyIterableIterator, List> { +public class CopyOnWriteLazyInsertIterable + extends LazyIterableIterator, List> { protected final HoodieWriteConfig hoodieConfig; protected final String commitTime; @@ -80,25 +79,23 @@ public class CopyOnWriteLazyInsertIterable extend * Transformer function to help transform a HoodieRecord. This transformer is used by BufferedIterator to offload some * expensive operations of transformation to the reader thread. */ - static Function, - HoodieInsertValueGenResult> getTransformFunction(Schema schema) { + static Function, HoodieInsertValueGenResult> getTransformFunction( + Schema schema) { return hoodieRecord -> new HoodieInsertValueGenResult(hoodieRecord, schema); } @Override - protected void start() { - } + protected void start() {} @Override protected List computeNext() { // Executor service used for launching writer thread. - BoundedInMemoryExecutor, - HoodieInsertValueGenResult, List> bufferedIteratorExecutor = null; + BoundedInMemoryExecutor, HoodieInsertValueGenResult, List> bufferedIteratorExecutor = + null; try { final Schema schema = new Schema.Parser().parse(hoodieConfig.getSchema()); bufferedIteratorExecutor = - new SparkBoundedInMemoryExecutor<>(hoodieConfig, inputItr, - getInsertHandler(), getTransformFunction(schema)); + new SparkBoundedInMemoryExecutor<>(hoodieConfig, inputItr, getInsertHandler(), getTransformFunction(schema)); final List result = bufferedIteratorExecutor.execute(); assert result != null && !result.isEmpty() && !bufferedIteratorExecutor.isRemaining(); return result; @@ -112,8 +109,7 @@ public class CopyOnWriteLazyInsertIterable extend } @Override - protected void end() { - } + protected void end() {} protected String getNextFileId(String idPfx) { return String.format("%s-%d", idPfx, numFilesWritten++); @@ -124,11 +120,10 @@ public class CopyOnWriteLazyInsertIterable extend } /** - * Consumes stream of hoodie records from in-memory queue and - * writes to one or more create-handles + * Consumes stream of hoodie records from in-memory queue and writes to one or more create-handles */ - protected class CopyOnWriteInsertHandler extends - BoundedInMemoryQueueConsumer, List> { + protected class CopyOnWriteInsertHandler + extends BoundedInMemoryQueueConsumer, List> { protected final List statuses = new ArrayList<>(); protected HoodieWriteHandle handle; diff --git a/hudi-client/src/main/java/org/apache/hudi/func/LazyIterableIterator.java b/hudi-client/src/main/java/org/apache/hudi/func/LazyIterableIterator.java index a66fd3553..3f0f4a1eb 100644 --- a/hudi-client/src/main/java/org/apache/hudi/func/LazyIterableIterator.java +++ b/hudi-client/src/main/java/org/apache/hudi/func/LazyIterableIterator.java @@ -21,16 +21,15 @@ package org.apache.hudi.func; import java.util.Iterator; /** - * (NOTE: Adapted from Apache SystemML) This class is a generic base class for lazy, single pass - * inputItr classes in order to simplify the implementation of lazy iterators for mapPartitions use - * cases. Note [SPARK-3369], which gives the reasons for backwards compatibility with regard to the - * iterable API despite Spark's single pass nature. + * (NOTE: Adapted from Apache SystemML) This class is a generic base class for lazy, single pass inputItr classes in + * order to simplify the implementation of lazy iterators for mapPartitions use cases. Note [SPARK-3369], which gives + * the reasons for backwards compatibility with regard to the iterable API despite Spark's single pass nature. *

- * Provide a way to obtain a inputItr of type O (output), out of an inputItr of type I (input) + * Provide a way to obtain a inputItr of type O (output), out of an inputItr of type I (input) *

- * Things to remember: - Assumes Spark calls hasNext() to check for elements, before calling next() - * to obtain them - Assumes hasNext() gets called atleast once. - Concrete Implementation is - * responsible for calling inputIterator.next() and doing the processing in computeNext() + * Things to remember: - Assumes Spark calls hasNext() to check for elements, before calling next() to obtain them - + * Assumes hasNext() gets called atleast once. - Concrete Implementation is responsible for calling inputIterator.next() + * and doing the processing in computeNext() */ public abstract class LazyIterableIterator implements Iterable, Iterator { @@ -88,13 +87,13 @@ public abstract class LazyIterableIterator implements Iterable, Iterato @Override public Iterator iterator() { - //check for consumed inputItr + // check for consumed inputItr if (consumed) { throw new RuntimeException("Invalid repeated inputItr consumption."); } - //hand out self as inputItr exactly once (note: do not hand out the input - //inputItr since it is consumed by the self inputItr implementation) + // hand out self as inputItr exactly once (note: do not hand out the input + // inputItr since it is consumed by the self inputItr implementation) consumed = true; return this; } diff --git a/hudi-client/src/main/java/org/apache/hudi/func/MergeOnReadLazyInsertIterable.java b/hudi-client/src/main/java/org/apache/hudi/func/MergeOnReadLazyInsertIterable.java index db44bca90..384502082 100644 --- a/hudi-client/src/main/java/org/apache/hudi/func/MergeOnReadLazyInsertIterable.java +++ b/hudi-client/src/main/java/org/apache/hudi/func/MergeOnReadLazyInsertIterable.java @@ -29,11 +29,9 @@ import org.apache.hudi.io.HoodieAppendHandle; import org.apache.hudi.table.HoodieTable; /** - * Lazy Iterable, that writes a stream of HoodieRecords sorted by the partitionPath, into new - * log files. + * Lazy Iterable, that writes a stream of HoodieRecords sorted by the partitionPath, into new log files. */ -public class MergeOnReadLazyInsertIterable extends - CopyOnWriteLazyInsertIterable { +public class MergeOnReadLazyInsertIterable extends CopyOnWriteLazyInsertIterable { public MergeOnReadLazyInsertIterable(Iterator> sortedRecordItr, HoodieWriteConfig config, String commitTime, HoodieTable hoodieTable, String idPfx) { diff --git a/hudi-client/src/main/java/org/apache/hudi/func/OperationResult.java b/hudi-client/src/main/java/org/apache/hudi/func/OperationResult.java index 980d555b6..fba7f581b 100644 --- a/hudi-client/src/main/java/org/apache/hudi/func/OperationResult.java +++ b/hudi-client/src/main/java/org/apache/hudi/func/OperationResult.java @@ -32,8 +32,7 @@ public class OperationResult implements Serializable { private boolean success; private Option exception; - public OperationResult() { - } + public OperationResult() {} public OperationResult(T operation, boolean success, Option exception) { this.operation = operation; @@ -67,11 +66,7 @@ public class OperationResult implements Serializable { @Override public String toString() { - return "OperationResult{" - + "operation=" + operation - + ", executed=" + executed - + ", success=" + success - + ", exception=" + exception - + '}'; + return "OperationResult{" + "operation=" + operation + ", executed=" + executed + ", success=" + success + + ", exception=" + exception + '}'; } } diff --git a/hudi-client/src/main/java/org/apache/hudi/func/ParquetReaderIterator.java b/hudi-client/src/main/java/org/apache/hudi/func/ParquetReaderIterator.java index 00c8636e1..ce5067283 100644 --- a/hudi-client/src/main/java/org/apache/hudi/func/ParquetReaderIterator.java +++ b/hudi-client/src/main/java/org/apache/hudi/func/ParquetReaderIterator.java @@ -25,8 +25,8 @@ import org.apache.hudi.exception.HoodieIOException; import org.apache.parquet.hadoop.ParquetReader; /** - * This class wraps a parquet reader and provides an iterator based api to - * read from a parquet file. This is used in {@link BoundedInMemoryQueue} + * This class wraps a parquet reader and provides an iterator based api to read from a parquet file. This is used in + * {@link BoundedInMemoryQueue} */ public class ParquetReaderIterator implements Iterator { diff --git a/hudi-client/src/main/java/org/apache/hudi/func/SparkBoundedInMemoryExecutor.java b/hudi-client/src/main/java/org/apache/hudi/func/SparkBoundedInMemoryExecutor.java index 3924fa2c1..2f57d17bb 100644 --- a/hudi-client/src/main/java/org/apache/hudi/func/SparkBoundedInMemoryExecutor.java +++ b/hudi-client/src/main/java/org/apache/hudi/func/SparkBoundedInMemoryExecutor.java @@ -36,17 +36,13 @@ public class SparkBoundedInMemoryExecutor extends BoundedInMemoryExecut final TaskContext sparkThreadTaskContext; public SparkBoundedInMemoryExecutor(final HoodieWriteConfig hoodieConfig, final Iterator inputItr, - BoundedInMemoryQueueConsumer consumer, - Function bufferedIteratorTransform) { + BoundedInMemoryQueueConsumer consumer, Function bufferedIteratorTransform) { this(hoodieConfig, new IteratorBasedQueueProducer<>(inputItr), consumer, bufferedIteratorTransform); } - public SparkBoundedInMemoryExecutor(final HoodieWriteConfig hoodieConfig, - BoundedInMemoryQueueProducer producer, - BoundedInMemoryQueueConsumer consumer, - Function bufferedIteratorTransform) { - super(hoodieConfig.getWriteBufferLimitBytes(), producer, - Option.of(consumer), bufferedIteratorTransform); + public SparkBoundedInMemoryExecutor(final HoodieWriteConfig hoodieConfig, BoundedInMemoryQueueProducer producer, + BoundedInMemoryQueueConsumer consumer, Function bufferedIteratorTransform) { + super(hoodieConfig.getWriteBufferLimitBytes(), producer, Option.of(consumer), bufferedIteratorTransform); this.sparkThreadTaskContext = TaskContext.get(); } diff --git a/hudi-client/src/main/java/org/apache/hudi/index/HoodieIndex.java b/hudi-client/src/main/java/org/apache/hudi/index/HoodieIndex.java index 76b51e8ea..9eb721abd 100644 --- a/hudi-client/src/main/java/org/apache/hudi/index/HoodieIndex.java +++ b/hudi-client/src/main/java/org/apache/hudi/index/HoodieIndex.java @@ -65,18 +65,18 @@ public abstract class HoodieIndex implements Seri } /** - * Checks if the given [Keys] exists in the hoodie table and returns [Key, Option[partitionPath, fileID]] - * If the optional is empty, then the key is not found. + * Checks if the given [Keys] exists in the hoodie table and returns [Key, Option[partitionPath, fileID]] If the + * optional is empty, then the key is not found. */ public abstract JavaPairRDD>> fetchRecordLocation( JavaRDD hoodieKeys, final JavaSparkContext jsc, HoodieTable hoodieTable); /** - * Looks up the index and tags each incoming record with a location of a file that contains the - * row (if it is actually present) + * Looks up the index and tags each incoming record with a location of a file that contains the row (if it is actually + * present) */ - public abstract JavaRDD> tagLocation(JavaRDD> recordRDD, - JavaSparkContext jsc, HoodieTable hoodieTable) throws HoodieIndexException; + public abstract JavaRDD> tagLocation(JavaRDD> recordRDD, JavaSparkContext jsc, + HoodieTable hoodieTable) throws HoodieIndexException; /** * Extracts the location of written records, and updates the index. @@ -84,8 +84,7 @@ public abstract class HoodieIndex implements Seri * TODO(vc): We may need to propagate the record as well in a WriteStatus class */ public abstract JavaRDD updateLocation(JavaRDD writeStatusRDD, JavaSparkContext jsc, - HoodieTable hoodieTable) - throws HoodieIndexException; + HoodieTable hoodieTable) throws HoodieIndexException; /** * Rollback the efffects of the commit made at commitTime. @@ -93,17 +92,17 @@ public abstract class HoodieIndex implements Seri public abstract boolean rollbackCommit(String commitTime); /** - * An index is `global` if {@link HoodieKey} to fileID mapping, does not depend on the - * `partitionPath`. Such an implementation is able to obtain the same mapping, for two hoodie keys - * with same `recordKey` but different `partitionPath` + * An index is `global` if {@link HoodieKey} to fileID mapping, does not depend on the `partitionPath`. Such an + * implementation is able to obtain the same mapping, for two hoodie keys with same `recordKey` but different + * `partitionPath` * * @return whether or not, the index implementation is global in nature */ public abstract boolean isGlobal(); /** - * This is used by storage to determine, if its safe to send inserts, straight to the log, i.e - * having a {@link FileSlice}, with no data file. + * This is used by storage to determine, if its safe to send inserts, straight to the log, i.e having a + * {@link FileSlice}, with no data file. * * @return Returns true/false depending on whether the impl has this capability */ @@ -111,8 +110,8 @@ public abstract class HoodieIndex implements Seri /** - * An index is "implicit" with respect to storage, if just writing new data to a file slice, - * updates the index as well. This is used by storage, to save memory footprint in certain cases. + * An index is "implicit" with respect to storage, if just writing new data to a file slice, updates the index as + * well. This is used by storage, to save memory footprint in certain cases. */ public abstract boolean isImplicitWithStorage(); diff --git a/hudi-client/src/main/java/org/apache/hudi/index/InMemoryHashIndex.java b/hudi-client/src/main/java/org/apache/hudi/index/InMemoryHashIndex.java index fdf2cbf48..506b8a958 100644 --- a/hudi-client/src/main/java/org/apache/hudi/index/InMemoryHashIndex.java +++ b/hudi-client/src/main/java/org/apache/hudi/index/InMemoryHashIndex.java @@ -40,7 +40,9 @@ import org.apache.spark.api.java.function.Function2; /** - * Hoodie Index implementation backed by an in-memory Hash map.

ONLY USE FOR LOCAL TESTING + * Hoodie Index implementation backed by an in-memory Hash map. + *

+ * ONLY USE FOR LOCAL TESTING */ public class InMemoryHashIndex extends HoodieIndex { @@ -80,7 +82,7 @@ public class InMemoryHashIndex extends HoodieInde if (newLocation.isPresent()) { recordLocationMap.put(key, newLocation.get()); } else { - //Delete existing index for a deleted record + // Delete existing index for a deleted record recordLocationMap.remove(key); } } @@ -122,12 +124,10 @@ public class InMemoryHashIndex extends HoodieInde /** * Function that tags each HoodieRecord with an existing location, if known. */ - class LocationTagFunction implements - Function2>, Iterator>> { + class LocationTagFunction implements Function2>, Iterator>> { @Override - public Iterator> call(Integer partitionNum, - Iterator> hoodieRecordIterator) { + public Iterator> call(Integer partitionNum, Iterator> hoodieRecordIterator) { List> taggedRecords = new ArrayList<>(); while (hoodieRecordIterator.hasNext()) { HoodieRecord rec = hoodieRecordIterator.next(); diff --git a/hudi-client/src/main/java/org/apache/hudi/index/bloom/BucketizedBloomCheckPartitioner.java b/hudi-client/src/main/java/org/apache/hudi/index/bloom/BucketizedBloomCheckPartitioner.java index f1150f046..abceb1a2c 100644 --- a/hudi-client/src/main/java/org/apache/hudi/index/bloom/BucketizedBloomCheckPartitioner.java +++ b/hudi-client/src/main/java/org/apache/hudi/index/bloom/BucketizedBloomCheckPartitioner.java @@ -35,6 +35,7 @@ import org.apache.spark.Partitioner; * Partitions bloom filter checks by spreading out comparisons across buckets of work. * * Each bucket incurs the following cost + * *

  *   1) Read bloom filter from file footer
  *   2) Check keys against bloom filter
@@ -47,6 +48,7 @@ import org.apache.spark.Partitioner;
  * could bound the amount of skew to std_dev(numberOfBucketsPerPartition) * cost of (3), lower than sort partitioning.
  *
  * Approach has two goals :
+ * 
  * 
  *   1) Pack as many buckets from same file group into same partition, to amortize cost of (1) and (2) further
  *   2) Spread buckets across partitions evenly to achieve skew reduction
@@ -76,8 +78,7 @@ public class BucketizedBloomCheckPartitioner extends Partitioner {
 
     Map bucketsPerFileGroup = new HashMap<>();
     // Compute the buckets needed per file group, using simple uniform distribution
-    fileGroupToComparisons.forEach((f, c) ->
-        bucketsPerFileGroup.put(f, (int) Math.ceil((c * 1.0) / keysPerBucket)));
+    fileGroupToComparisons.forEach((f, c) -> bucketsPerFileGroup.put(f, (int) Math.ceil((c * 1.0) / keysPerBucket)));
     int totalBuckets = bucketsPerFileGroup.values().stream().mapToInt(i -> i).sum();
     // If totalBuckets > targetPartitions, no need to have extra partitions
     this.partitions = Math.min(targetPartitions, totalBuckets);
diff --git a/hudi-client/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndex.java b/hudi-client/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndex.java
index 75016c6bd..d43bf1e38 100644
--- a/hudi-client/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndex.java
+++ b/hudi-client/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndex.java
@@ -78,12 +78,12 @@ public class HoodieBloomIndex extends HoodieIndex
     }
 
     // Step 1: Extract out thinner JavaPairRDD of (partitionPath, recordKey)
-    JavaPairRDD partitionRecordKeyPairRDD = recordRDD
-        .mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey()));
+    JavaPairRDD partitionRecordKeyPairRDD =
+        recordRDD.mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey()));
 
     // Lookup indexes for all the partition/recordkey pair
-    JavaPairRDD keyFilenamePairRDD = lookupIndex(partitionRecordKeyPairRDD, jsc,
-        hoodieTable);
+    JavaPairRDD keyFilenamePairRDD =
+        lookupIndex(partitionRecordKeyPairRDD, jsc, hoodieTable);
 
     // Cache the result, for subsequent stages.
     if (config.getBloomIndexUseCaching()) {
@@ -96,8 +96,7 @@ public class HoodieBloomIndex extends HoodieIndex
 
     // Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys
     // Cost: 4 sec.
-    JavaRDD> taggedRecordRDD = tagLocationBacktoRecords(keyFilenamePairRDD,
-        recordRDD);
+    JavaRDD> taggedRecordRDD = tagLocationBacktoRecords(keyFilenamePairRDD, recordRDD);
 
     if (config.getBloomIndexUseCaching()) {
       recordRDD.unpersist(); // unpersist the input Record RDD
@@ -108,8 +107,8 @@ public class HoodieBloomIndex extends HoodieIndex
   }
 
   /**
-   * Returns an RDD mapping each HoodieKey with a partitionPath/fileID which contains it. Option.Empty if the key is
-   * not found.
+   * Returns an RDD mapping each HoodieKey with a partitionPath/fileID which contains it. Option.Empty if the key is not
+   * found.
    *
    * @param hoodieKeys keys to lookup
    * @param jsc spark context
@@ -118,12 +117,12 @@ public class HoodieBloomIndex extends HoodieIndex
   @Override
   public JavaPairRDD>> fetchRecordLocation(JavaRDD hoodieKeys,
       JavaSparkContext jsc, HoodieTable hoodieTable) {
-    JavaPairRDD partitionRecordKeyPairRDD = hoodieKeys
-        .mapToPair(key -> new Tuple2<>(key.getPartitionPath(), key.getRecordKey()));
+    JavaPairRDD partitionRecordKeyPairRDD =
+        hoodieKeys.mapToPair(key -> new Tuple2<>(key.getPartitionPath(), key.getRecordKey()));
 
     // Lookup indexes for all the partition/recordkey pair
-    JavaPairRDD recordKeyLocationRDD = lookupIndex(partitionRecordKeyPairRDD, jsc,
-        hoodieTable);
+    JavaPairRDD recordKeyLocationRDD =
+        lookupIndex(partitionRecordKeyPairRDD, jsc, hoodieTable);
     JavaPairRDD keyHoodieKeyPairRDD = hoodieKeys.mapToPair(key -> new Tuple2<>(key, null));
 
     return keyHoodieKeyPairRDD.leftOuterJoin(recordKeyLocationRDD).mapToPair(keyLoc -> {
@@ -149,19 +148,19 @@ public class HoodieBloomIndex extends HoodieIndex
     List affectedPartitionPathList = new ArrayList<>(recordsPerPartition.keySet());
 
     // Step 2: Load all involved files as  pairs
-    List> fileInfoList = loadInvolvedFiles(affectedPartitionPathList, jsc,
-        hoodieTable);
-    final Map> partitionToFileInfo = fileInfoList.stream()
-        .collect(groupingBy(Tuple2::_1, mapping(Tuple2::_2, toList())));
+    List> fileInfoList =
+        loadInvolvedFiles(affectedPartitionPathList, jsc, hoodieTable);
+    final Map> partitionToFileInfo =
+        fileInfoList.stream().collect(groupingBy(Tuple2::_1, mapping(Tuple2::_2, toList())));
 
     // Step 3: Obtain a RDD, for each incoming record, that already exists, with the file id,
     // that contains it.
-    Map comparisonsPerFileGroup = computeComparisonsPerFileGroup(recordsPerPartition, partitionToFileInfo,
-        partitionRecordKeyPairRDD);
+    Map comparisonsPerFileGroup =
+        computeComparisonsPerFileGroup(recordsPerPartition, partitionToFileInfo, partitionRecordKeyPairRDD);
     int safeParallelism = computeSafeParallelism(recordsPerPartition, comparisonsPerFileGroup);
     int joinParallelism = determineParallelism(partitionRecordKeyPairRDD.partitions().size(), safeParallelism);
-    return findMatchingFilesForRecordKeys(partitionToFileInfo, partitionRecordKeyPairRDD, joinParallelism,
-        hoodieTable, comparisonsPerFileGroup);
+    return findMatchingFilesForRecordKeys(partitionToFileInfo, partitionRecordKeyPairRDD, joinParallelism, hoodieTable,
+        comparisonsPerFileGroup);
   }
 
   /**
@@ -175,13 +174,13 @@ public class HoodieBloomIndex extends HoodieIndex
     if (config.getBloomIndexPruneByRanges()) {
       // we will just try exploding the input and then count to determine comparisons
       // FIX(vc): Only do sampling here and extrapolate?
-      fileToComparisons = explodeRecordRDDWithFileComparisons(partitionToFileInfo,
-          partitionRecordKeyPairRDD).mapToPair(t -> t).countByKey();
+      fileToComparisons = explodeRecordRDDWithFileComparisons(partitionToFileInfo, partitionRecordKeyPairRDD)
+          .mapToPair(t -> t).countByKey();
     } else {
       fileToComparisons = new HashMap<>();
       partitionToFileInfo.entrySet().stream().forEach(e -> {
         for (BloomIndexFileInfo fileInfo : e.getValue()) {
-          //each file needs to be compared against all the records coming into the partition
+          // each file needs to be compared against all the records coming into the partition
           fileToComparisons.put(fileInfo.getFileId(), recordsPerPartition.get(e.getKey()));
         }
       });
@@ -191,34 +190,41 @@ public class HoodieBloomIndex extends HoodieIndex
 
   /**
    * Compute the minimum parallelism needed to play well with the spark 2GB limitation.. The index lookup can be skewed
-   * in three dimensions : #files, #partitions, #records 

To be able to smoothly handle skews, we need to compute how - * to split each partitions into subpartitions. We do it here, in a way that keeps the amount of each Spark join - * partition to < 2GB.

If {@link HoodieIndexConfig#BLOOM_INDEX_PARALLELISM_PROP} is - * specified as a NON-zero number, then that is used explicitly. + * in three dimensions : #files, #partitions, #records + *

+ * To be able to smoothly handle skews, we need to compute how to split each partitions into subpartitions. We do it + * here, in a way that keeps the amount of each Spark join partition to < 2GB. + *

+ * If {@link HoodieIndexConfig#BLOOM_INDEX_PARALLELISM_PROP} is specified as a NON-zero number, then that is used + * explicitly. */ int computeSafeParallelism(Map recordsPerPartition, Map comparisonsPerFileGroup) { long totalComparisons = comparisonsPerFileGroup.values().stream().mapToLong(Long::longValue).sum(); long totalFiles = comparisonsPerFileGroup.size(); long totalRecords = recordsPerPartition.values().stream().mapToLong(Long::longValue).sum(); int parallelism = (int) (totalComparisons / MAX_ITEMS_PER_SHUFFLE_PARTITION + 1); - logger.info(String.format("TotalRecords %d, TotalFiles %d, TotalAffectedPartitions %d, TotalComparisons %d, " - + "SafeParallelism %d", totalRecords, totalFiles, recordsPerPartition.size(), totalComparisons, parallelism)); + logger.info(String.format( + "TotalRecords %d, TotalFiles %d, TotalAffectedPartitions %d, TotalComparisons %d, " + "SafeParallelism %d", + totalRecords, totalFiles, recordsPerPartition.size(), totalComparisons, parallelism)); return parallelism; } /** - * Its crucial to pick the right parallelism.

totalSubPartitions : this is deemed safe limit, to be nice with - * Spark. inputParallelism : typically number of input file splits

We pick the max such that, we are always safe, - * but go higher if say a there are a lot of input files. (otherwise, we will fallback to number of partitions in - * input and end up with slow performance) + * Its crucial to pick the right parallelism. + *

+ * totalSubPartitions : this is deemed safe limit, to be nice with Spark. inputParallelism : typically number of input + * file splits + *

+ * We pick the max such that, we are always safe, but go higher if say a there are a lot of input files. (otherwise, + * we will fallback to number of partitions in input and end up with slow performance) */ private int determineParallelism(int inputParallelism, int totalSubPartitions) { // If bloom index parallelism is set, use it to to check against the input parallelism and // take the max int indexParallelism = Math.max(inputParallelism, config.getBloomIndexParallelism()); int joinParallelism = Math.max(totalSubPartitions, indexParallelism); - logger.info("InputParallelism: ${" + inputParallelism + "}, " + "IndexParallelism: ${" + config - .getBloomIndexParallelism() + "}, " + "TotalSubParts: ${" + totalSubPartitions + "}, " + logger.info("InputParallelism: ${" + inputParallelism + "}, " + "IndexParallelism: ${" + + config.getBloomIndexParallelism() + "}, " + "TotalSubParts: ${" + totalSubPartitions + "}, " + "Join Parallelism set to : " + joinParallelism); return joinParallelism; } @@ -231,11 +237,10 @@ public class HoodieBloomIndex extends HoodieIndex final HoodieTable hoodieTable) { // Obtain the latest data files from all the partitions. - List> partitionPathFileIDList = jsc - .parallelize(partitions, Math.max(partitions.size(), 1)) - .flatMap(partitionPath -> { - Option latestCommitTime = hoodieTable.getMetaClient().getCommitsTimeline() - .filterCompletedInstants().lastInstant(); + List> partitionPathFileIDList = + jsc.parallelize(partitions, Math.max(partitions.size(), 1)).flatMap(partitionPath -> { + Option latestCommitTime = + hoodieTable.getMetaClient().getCommitsTimeline().filterCompletedInstants().lastInstant(); List> filteredFiles = new ArrayList<>(); if (latestCommitTime.isPresent()) { filteredFiles = hoodieTable.getROFileSystemView() @@ -259,8 +264,7 @@ public class HoodieBloomIndex extends HoodieIndex }).collect(); } else { return partitionPathFileIDList.stream() - .map(pf -> new Tuple2<>(pf.getKey(), new BloomIndexFileInfo(pf.getValue()))) - .collect(toList()); + .map(pf -> new Tuple2<>(pf.getKey(), new BloomIndexFileInfo(pf.getValue()))).collect(toList()); } } @@ -307,9 +311,9 @@ public class HoodieBloomIndex extends HoodieIndex JavaRDD> explodeRecordRDDWithFileComparisons( final Map> partitionToFileIndexInfo, JavaPairRDD partitionRecordKeyPairRDD) { - IndexFileFilter indexFileFilter = config.useBloomIndexTreebasedFilter() - ? new IntervalTreeBasedIndexFileFilter(partitionToFileIndexInfo) - : new ListBasedIndexFileFilter(partitionToFileIndexInfo); + IndexFileFilter indexFileFilter = + config.useBloomIndexTreebasedFilter() ? new IntervalTreeBasedIndexFileFilter(partitionToFileIndexInfo) + : new ListBasedIndexFileFilter(partitionToFileIndexInfo); return partitionRecordKeyPairRDD.map(partitionRecordKeyPair -> { String recordKey = partitionRecordKeyPair._2(); @@ -322,10 +326,12 @@ public class HoodieBloomIndex extends HoodieIndex } /** - * Find out pair. All workload grouped by file-level.

Join PairRDD(PartitionPath, RecordKey) - * and PairRDD(PartitionPath, File) & then repartition such that each RDD partition is a file, then for each file, we - * do (1) load bloom filter, (2) load rowKeys, (3) Tag rowKey

Make sure the parallelism is atleast the groupby - * parallelism for tagging location + * Find out pair. All workload grouped by file-level. + *

+ * Join PairRDD(PartitionPath, RecordKey) and PairRDD(PartitionPath, File) & then repartition such that each RDD + * partition is a file, then for each file, we do (1) load bloom filter, (2) load rowKeys, (3) Tag rowKey + *

+ * Make sure the parallelism is atleast the groupby parallelism for tagging location */ @VisibleForTesting JavaPairRDD findMatchingFilesForRecordKeys( @@ -336,33 +342,24 @@ public class HoodieBloomIndex extends HoodieIndex explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD); if (config.useBloomIndexBucketizedChecking()) { - Partitioner partitioner = new BucketizedBloomCheckPartitioner( - shuffleParallelism, - fileGroupToComparisons, - config.getBloomIndexKeysPerBucket() - ); + Partitioner partitioner = new BucketizedBloomCheckPartitioner(shuffleParallelism, fileGroupToComparisons, + config.getBloomIndexKeysPerBucket()); - fileComparisonsRDD = fileComparisonsRDD - .mapToPair(t -> new Tuple2<>(Pair.of(t._1, t._2.getRecordKey()), t)) - .repartitionAndSortWithinPartitions(partitioner) - .map(Tuple2::_2); + fileComparisonsRDD = fileComparisonsRDD.mapToPair(t -> new Tuple2<>(Pair.of(t._1, t._2.getRecordKey()), t)) + .repartitionAndSortWithinPartitions(partitioner).map(Tuple2::_2); } else { fileComparisonsRDD = fileComparisonsRDD.sortBy(Tuple2::_1, true, shuffleParallelism); } - return fileComparisonsRDD - .mapPartitionsWithIndex(new HoodieBloomIndexCheckFunction(hoodieTable, config), true) - .flatMap(List::iterator) - .filter(lr -> lr.getMatchingRecordKeys().size() > 0) + return fileComparisonsRDD.mapPartitionsWithIndex(new HoodieBloomIndexCheckFunction(hoodieTable, config), true) + .flatMap(List::iterator).filter(lr -> lr.getMatchingRecordKeys().size() > 0) .flatMapToPair(lookupResult -> lookupResult.getMatchingRecordKeys().stream() .map(recordKey -> new Tuple2<>(new HoodieKey(recordKey, lookupResult.getPartitionPath()), new HoodieRecordLocation(lookupResult.getBaseInstantTime(), lookupResult.getFileId()))) - .collect(Collectors.toList()) - .iterator()); + .collect(Collectors.toList()).iterator()); } - HoodieRecord getTaggedRecord(HoodieRecord inputRecord, - Option location) { + HoodieRecord getTaggedRecord(HoodieRecord inputRecord, Option location) { HoodieRecord record = inputRecord; if (location.isPresent()) { // When you have a record in multiple files in the same partition, then rowKeyRecordPairRDD @@ -383,12 +380,12 @@ public class HoodieBloomIndex extends HoodieIndex */ protected JavaRDD> tagLocationBacktoRecords( JavaPairRDD keyFilenamePairRDD, JavaRDD> recordRDD) { - JavaPairRDD> keyRecordPairRDD = recordRDD - .mapToPair(record -> new Tuple2<>(record.getKey(), record)); + JavaPairRDD> keyRecordPairRDD = + recordRDD.mapToPair(record -> new Tuple2<>(record.getKey(), record)); // Here as the recordRDD might have more data than rowKeyRDD (some rowKeys' fileId is null), // so we do left outer join. - return keyRecordPairRDD.leftOuterJoin(keyFilenamePairRDD).values().map( - v1 -> getTaggedRecord(v1._1, Option.ofNullable(v1._2.orNull()))); + return keyRecordPairRDD.leftOuterJoin(keyFilenamePairRDD).values() + .map(v1 -> getTaggedRecord(v1._1, Option.ofNullable(v1._2.orNull()))); } @Override diff --git a/hudi-client/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndexCheckFunction.java b/hudi-client/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndexCheckFunction.java index 8fa2103e1..6da33da6f 100644 --- a/hudi-client/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndexCheckFunction.java +++ b/hudi-client/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndexCheckFunction.java @@ -34,11 +34,10 @@ import org.apache.spark.api.java.function.Function2; import scala.Tuple2; /** - * Function performing actual checking of RDD partition containing (fileId, hoodieKeys) against the - * actual files + * Function performing actual checking of RDD partition containing (fileId, hoodieKeys) against the actual files */ -public class HoodieBloomIndexCheckFunction implements - Function2>, Iterator>> { +public class HoodieBloomIndexCheckFunction + implements Function2>, Iterator>> { private final HoodieTable hoodieTable; @@ -59,14 +58,12 @@ public class HoodieBloomIndexCheckFunction implements private HoodieKeyLookupHandle keyLookupHandle; - LazyKeyCheckIterator( - Iterator> filePartitionRecordKeyTripletItr) { + LazyKeyCheckIterator(Iterator> filePartitionRecordKeyTripletItr) { super(filePartitionRecordKeyTripletItr); } @Override - protected void start() { - } + protected void start() {} @Override protected List computeNext() { @@ -113,7 +110,6 @@ public class HoodieBloomIndexCheckFunction implements } @Override - protected void end() { - } + protected void end() {} } } diff --git a/hudi-client/src/main/java/org/apache/hudi/index/bloom/HoodieGlobalBloomIndex.java b/hudi-client/src/main/java/org/apache/hudi/index/bloom/HoodieGlobalBloomIndex.java index 6dc91562c..a23143ba8 100644 --- a/hudi-client/src/main/java/org/apache/hudi/index/bloom/HoodieGlobalBloomIndex.java +++ b/hudi-client/src/main/java/org/apache/hudi/index/bloom/HoodieGlobalBloomIndex.java @@ -59,9 +59,8 @@ public class HoodieGlobalBloomIndex extends Hoodi final HoodieTable hoodieTable) { HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); try { - List allPartitionPaths = FSUtils - .getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(), - config.shouldAssumeDatePartitioning()); + List allPartitionPaths = FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(), + config.shouldAssumeDatePartitioning()); return super.loadInvolvedFiles(allPartitionPaths, jsc, hoodieTable); } catch (IOException e) { throw new HoodieIOException("Failed to load all partitions", e); @@ -88,9 +87,9 @@ public class HoodieGlobalBloomIndex extends Hoodi entry.getValue().forEach(indexFile -> indexToPartitionMap.put(indexFile.getFileId(), entry.getKey())); } - IndexFileFilter indexFileFilter = config.getBloomIndexPruneByRanges() - ? new IntervalTreeBasedGlobalIndexFileFilter(partitionToFileIndexInfo) - : new ListBasedGlobalIndexFileFilter(partitionToFileIndexInfo); + IndexFileFilter indexFileFilter = + config.getBloomIndexPruneByRanges() ? new IntervalTreeBasedGlobalIndexFileFilter(partitionToFileIndexInfo) + : new ListBasedGlobalIndexFileFilter(partitionToFileIndexInfo); return partitionRecordKeyPairRDD.map(partitionRecordKeyPair -> { String recordKey = partitionRecordKeyPair._2(); @@ -109,8 +108,8 @@ public class HoodieGlobalBloomIndex extends Hoodi @Override protected JavaRDD> tagLocationBacktoRecords( JavaPairRDD keyFilenamePairRDD, JavaRDD> recordRDD) { - JavaPairRDD> rowKeyRecordPairRDD = recordRDD - .mapToPair(record -> new Tuple2<>(record.getRecordKey(), record)); + JavaPairRDD> rowKeyRecordPairRDD = + recordRDD.mapToPair(record -> new Tuple2<>(record.getRecordKey(), record)); // Here as the recordRDD might have more data than rowKeyRDD (some rowKeys' fileId is null), // so we do left outer join. diff --git a/hudi-client/src/main/java/org/apache/hudi/index/bloom/IntervalTreeBasedGlobalIndexFileFilter.java b/hudi-client/src/main/java/org/apache/hudi/index/bloom/IntervalTreeBasedGlobalIndexFileFilter.java index 65bb5f3c3..4e269d745 100644 --- a/hudi-client/src/main/java/org/apache/hudi/index/bloom/IntervalTreeBasedGlobalIndexFileFilter.java +++ b/hudi-client/src/main/java/org/apache/hudi/index/bloom/IntervalTreeBasedGlobalIndexFileFilter.java @@ -28,7 +28,7 @@ import java.util.stream.Collectors; /** * Interval Tree based index look up for Global Index. Builds an {@link KeyRangeLookupTree} for all index files (across - * all partitions) and uses it to search for matching index files for any given recordKey that needs to be looked up. + * all partitions) and uses it to search for matching index files for any given recordKey that needs to be looked up. */ class IntervalTreeBasedGlobalIndexFileFilter implements IndexFileFilter { @@ -41,16 +41,16 @@ class IntervalTreeBasedGlobalIndexFileFilter implements IndexFileFilter { * @param partitionToFileIndexInfo Map of partition to List of {@link BloomIndexFileInfo}s */ IntervalTreeBasedGlobalIndexFileFilter(final Map> partitionToFileIndexInfo) { - List allIndexFiles = partitionToFileIndexInfo.values().stream().flatMap(Collection::stream) - .collect(Collectors.toList()); + List allIndexFiles = + partitionToFileIndexInfo.values().stream().flatMap(Collection::stream).collect(Collectors.toList()); // Note that the interval tree implementation doesn't have auto-balancing to ensure logN search time. // So, we are shuffling the input here hoping the tree will not have any skewness. If not, the tree could be skewed // which could result in N search time instead of NlogN. Collections.shuffle(allIndexFiles); allIndexFiles.forEach(indexFile -> { if (indexFile.hasKeyRanges()) { - indexLookUpTree.insert(new KeyRangeNode(indexFile.getMinRecordKey(), - indexFile.getMaxRecordKey(), indexFile.getFileId())); + indexLookUpTree + .insert(new KeyRangeNode(indexFile.getMinRecordKey(), indexFile.getMaxRecordKey(), indexFile.getFileId())); } else { filesWithNoRanges.add(indexFile.getFileId()); } diff --git a/hudi-client/src/main/java/org/apache/hudi/index/bloom/IntervalTreeBasedIndexFileFilter.java b/hudi-client/src/main/java/org/apache/hudi/index/bloom/IntervalTreeBasedIndexFileFilter.java index 29d5566c6..9737772b7 100644 --- a/hudi-client/src/main/java/org/apache/hudi/index/bloom/IntervalTreeBasedIndexFileFilter.java +++ b/hudi-client/src/main/java/org/apache/hudi/index/bloom/IntervalTreeBasedIndexFileFilter.java @@ -48,8 +48,8 @@ class IntervalTreeBasedIndexFileFilter implements IndexFileFilter { KeyRangeLookupTree lookUpTree = new KeyRangeLookupTree(); bloomIndexFiles.forEach(indexFileInfo -> { if (indexFileInfo.hasKeyRanges()) { - lookUpTree.insert(new KeyRangeNode(indexFileInfo.getMinRecordKey(), - indexFileInfo.getMaxRecordKey(), indexFileInfo.getFileId())); + lookUpTree.insert(new KeyRangeNode(indexFileInfo.getMinRecordKey(), indexFileInfo.getMaxRecordKey(), + indexFileInfo.getFileId())); } else { if (!partitionToFilesWithNoRanges.containsKey(partition)) { partitionToFilesWithNoRanges.put(partition, new HashSet<>()); diff --git a/hudi-client/src/main/java/org/apache/hudi/index/bloom/KeyRangeLookupTree.java b/hudi-client/src/main/java/org/apache/hudi/index/bloom/KeyRangeLookupTree.java index 4bf69125f..8f27a838f 100644 --- a/hudi-client/src/main/java/org/apache/hudi/index/bloom/KeyRangeLookupTree.java +++ b/hudi-client/src/main/java/org/apache/hudi/index/bloom/KeyRangeLookupTree.java @@ -50,25 +50,16 @@ class KeyRangeLookupTree implements Serializable { * * If no root exists, make {@code newNode} as the root and return the new root. * - * If current root and newNode matches with min record key and max record key, - * merge two nodes. In other words, add files from {@code newNode} to current root. - * Return current root. + * If current root and newNode matches with min record key and max record key, merge two nodes. In other words, add + * files from {@code newNode} to current root. Return current root. * - * If current root is < newNode - * if current root has no right sub tree - * update current root's right sub tree max and min - * set newNode as right sub tree - * else - * update root's right sub tree min and max with newNode's min and max record key as applicable - * recursively call insert() with root's right subtree as new root + * If current root is < newNode if current root has no right sub tree update current root's right sub tree max and min + * set newNode as right sub tree else update root's right sub tree min and max with newNode's min and max record key + * as applicable recursively call insert() with root's right subtree as new root * - * else // current root is >= newNode - * if current root has no left sub tree - * update current root's left sub tree max and min - * set newNode as left sub tree - * else - * update root's left sub tree min and max with newNode's min and max record key as applicable - * recursively call insert() with root's left subtree as new root + * else // current root is >= newNode if current root has no left sub tree update current root's left sub tree max and + * min set newNode as left sub tree else update root's left sub tree min and max with newNode's min and max record key + * as applicable recursively call insert() with root's left subtree as new root * * @param root refers to the current root of the look up tree * @param newNode newNode the new {@link KeyRangeNode} to be inserted diff --git a/hudi-client/src/main/java/org/apache/hudi/index/bloom/KeyRangeNode.java b/hudi-client/src/main/java/org/apache/hudi/index/bloom/KeyRangeNode.java index a96bae463..659498121 100644 --- a/hudi-client/src/main/java/org/apache/hudi/index/bloom/KeyRangeNode.java +++ b/hudi-client/src/main/java/org/apache/hudi/index/bloom/KeyRangeNode.java @@ -62,15 +62,10 @@ class KeyRangeNode implements Comparable, Serializable { @Override public String toString() { - return "KeyRangeNode{" - + "minRecordKey='" + minRecordKey + '\'' - + ", maxRecordKey='" + maxRecordKey + '\'' - + ", fileNameList=" + fileNameList - + ", rightSubTreeMax='" + rightSubTreeMax + '\'' - + ", leftSubTreeMax='" + leftSubTreeMax + '\'' - + ", rightSubTreeMin='" + rightSubTreeMin + '\'' - + ", leftSubTreeMin='" + leftSubTreeMin + '\'' - + '}'; + return "KeyRangeNode{" + "minRecordKey='" + minRecordKey + '\'' + ", maxRecordKey='" + maxRecordKey + '\'' + + ", fileNameList=" + fileNameList + ", rightSubTreeMax='" + rightSubTreeMax + '\'' + ", leftSubTreeMax='" + + leftSubTreeMax + '\'' + ", rightSubTreeMin='" + rightSubTreeMin + '\'' + ", leftSubTreeMin='" + leftSubTreeMin + + '\'' + '}'; } /** @@ -78,8 +73,8 @@ class KeyRangeNode implements Comparable, Serializable { * * @param that the {@link KeyRangeNode} to be compared with * @return the result of comparison. 0 if both min and max are equal in both. 1 if this {@link KeyRangeNode} is - * greater than the {@code that} keyRangeNode. -1 if {@code that} keyRangeNode is greater than this {@link - * KeyRangeNode} + * greater than the {@code that} keyRangeNode. -1 if {@code that} keyRangeNode is greater than this + * {@link KeyRangeNode} */ @Override public int compareTo(KeyRangeNode that) { diff --git a/hudi-client/src/main/java/org/apache/hudi/index/bloom/ListBasedGlobalIndexFileFilter.java b/hudi-client/src/main/java/org/apache/hudi/index/bloom/ListBasedGlobalIndexFileFilter.java index bccb03f36..d5fe4f6d2 100644 --- a/hudi-client/src/main/java/org/apache/hudi/index/bloom/ListBasedGlobalIndexFileFilter.java +++ b/hudi-client/src/main/java/org/apache/hudi/index/bloom/ListBasedGlobalIndexFileFilter.java @@ -30,8 +30,7 @@ class ListBasedGlobalIndexFileFilter extends ListBasedIndexFileFilter { * * @param partitionToFileIndexInfo Map of partition to List of {@link BloomIndexFileInfo} */ - ListBasedGlobalIndexFileFilter( - Map> partitionToFileIndexInfo) { + ListBasedGlobalIndexFileFilter(Map> partitionToFileIndexInfo) { super(partitionToFileIndexInfo); } diff --git a/hudi-client/src/main/java/org/apache/hudi/index/hbase/HBaseIndex.java b/hudi-client/src/main/java/org/apache/hudi/index/hbase/HBaseIndex.java index 111d23112..c40d8356c 100644 --- a/hudi-client/src/main/java/org/apache/hudi/index/hbase/HBaseIndex.java +++ b/hudi-client/src/main/java/org/apache/hudi/index/hbase/HBaseIndex.java @@ -68,10 +68,8 @@ import scala.Tuple2; */ public class HBaseIndex extends HoodieIndex { - public static final String DEFAULT_SPARK_EXECUTOR_INSTANCES_CONFIG_NAME = - "spark.executor.instances"; - public static final String DEFAULT_SPARK_DYNAMIC_ALLOCATION_ENABLED_CONFIG_NAME = - "spark.dynamicAllocation.enabled"; + public static final String DEFAULT_SPARK_EXECUTOR_INSTANCES_CONFIG_NAME = "spark.executor.instances"; + public static final String DEFAULT_SPARK_DYNAMIC_ALLOCATION_ENABLED_CONFIG_NAME = "spark.dynamicAllocation.enabled"; public static final String DEFAULT_SPARK_DYNAMIC_ALLOCATION_MAX_EXECUTORS_CONFIG_NAME = "spark.dynamicAllocation.maxExecutors"; @@ -114,9 +112,8 @@ public class HBaseIndex extends HoodieIndex { public HBaseIndexQPSResourceAllocator createQPSResourceAllocator(HoodieWriteConfig config) { try { logger.info("createQPSResourceAllocator :" + config.getHBaseQPSResourceAllocatorClass()); - final HBaseIndexQPSResourceAllocator resourceAllocator = - (HBaseIndexQPSResourceAllocator) ReflectionUtils.loadClass( - config.getHBaseQPSResourceAllocatorClass(), config); + final HBaseIndexQPSResourceAllocator resourceAllocator = (HBaseIndexQPSResourceAllocator) ReflectionUtils + .loadClass(config.getHBaseQPSResourceAllocatorClass(), config); return resourceAllocator; } catch (Exception e) { logger.warn("error while instantiating HBaseIndexQPSResourceAllocator", e); @@ -143,14 +140,14 @@ public class HBaseIndex extends HoodieIndex { try { return ConnectionFactory.createConnection(hbaseConfig); } catch (IOException e) { - throw new HoodieDependentSystemUnavailableException( - HoodieDependentSystemUnavailableException.HBASE, quorum + ":" + port); + throw new HoodieDependentSystemUnavailableException(HoodieDependentSystemUnavailableException.HBASE, + quorum + ":" + port); } } /** - * Since we are sharing the HbaseConnection across tasks in a JVM, make sure the HbaseConnectio is - * closed when JVM exits + * Since we are sharing the HbaseConnection across tasks in a JVM, make sure the HbaseConnectio is closed when JVM + * exits */ private void addShutDownHook() { Runtime.getRuntime().addShutdownHook(new Thread() { @@ -172,103 +169,95 @@ public class HBaseIndex extends HoodieIndex { } private Get generateStatement(String key) throws IOException { - return new Get(Bytes.toBytes(key)).setMaxVersions(1) - .addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN) - .addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN) - .addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN); + return new Get(Bytes.toBytes(key)).setMaxVersions(1).addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN) + .addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN).addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN); } private boolean checkIfValidCommit(HoodieTableMetaClient metaClient, String commitTs) { HoodieTimeline commitTimeline = metaClient.getActiveTimeline().filterCompletedInstants(); // Check if the last commit ts for this row is 1) present in the timeline or // 2) is less than the first commit ts in the timeline - return !commitTimeline.empty() && (commitTimeline - .containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTs)) - || HoodieTimeline - .compareTimestamps(commitTimeline.firstInstant().get().getTimestamp(), commitTs, - HoodieTimeline.GREATER)); + return !commitTimeline.empty() + && (commitTimeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTs)) + || HoodieTimeline.compareTimestamps(commitTimeline.firstInstant().get().getTimestamp(), commitTs, + HoodieTimeline.GREATER)); } /** * Function that tags each HoodieRecord with an existing location, if known. */ - private Function2>, - Iterator>> locationTagFunction(HoodieTableMetaClient metaClient) { + private Function2>, Iterator>> locationTagFunction( + HoodieTableMetaClient metaClient) { - return (Function2>, Iterator>>) - (partitionNum, hoodieRecordIterator) -> { + return (Function2>, Iterator>>) (partitionNum, + hoodieRecordIterator) -> { - Integer multiGetBatchSize = config.getHbaseIndexGetBatchSize(); + Integer multiGetBatchSize = config.getHbaseIndexGetBatchSize(); - // Grab the global HBase connection - synchronized (HBaseIndex.class) { - if (hbaseConnection == null || hbaseConnection.isClosed()) { - hbaseConnection = getHBaseConnection(); - } - } - List> taggedRecords = new ArrayList<>(); - HTable hTable = null; - try { - hTable = (HTable) hbaseConnection.getTable(TableName.valueOf(tableName)); - List statements = new ArrayList<>(); - List currentBatchOfRecords = new LinkedList<>(); - // Do the tagging. - while (hoodieRecordIterator.hasNext()) { - HoodieRecord rec = hoodieRecordIterator.next(); - statements.add(generateStatement(rec.getRecordKey())); - currentBatchOfRecords.add(rec); - // iterator till we reach batch size - if (statements.size() >= multiGetBatchSize || !hoodieRecordIterator.hasNext()) { - // get results for batch from Hbase - Result[] results = doGet(hTable, statements); - // clear statements to be GC'd - statements.clear(); - for (Result result : results) { - // first, attempt to grab location from HBase - HoodieRecord currentRecord = currentBatchOfRecords.remove(0); - if (result.getRow() != null) { - String keyFromResult = Bytes.toString(result.getRow()); - String commitTs = Bytes - .toString(result.getValue(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN)); - String fileId = Bytes - .toString(result.getValue(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN)); - String partitionPath = Bytes - .toString(result.getValue(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN)); + // Grab the global HBase connection + synchronized (HBaseIndex.class) { + if (hbaseConnection == null || hbaseConnection.isClosed()) { + hbaseConnection = getHBaseConnection(); + } + } + List> taggedRecords = new ArrayList<>(); + HTable hTable = null; + try { + hTable = (HTable) hbaseConnection.getTable(TableName.valueOf(tableName)); + List statements = new ArrayList<>(); + List currentBatchOfRecords = new LinkedList<>(); + // Do the tagging. + while (hoodieRecordIterator.hasNext()) { + HoodieRecord rec = hoodieRecordIterator.next(); + statements.add(generateStatement(rec.getRecordKey())); + currentBatchOfRecords.add(rec); + // iterator till we reach batch size + if (statements.size() >= multiGetBatchSize || !hoodieRecordIterator.hasNext()) { + // get results for batch from Hbase + Result[] results = doGet(hTable, statements); + // clear statements to be GC'd + statements.clear(); + for (Result result : results) { + // first, attempt to grab location from HBase + HoodieRecord currentRecord = currentBatchOfRecords.remove(0); + if (result.getRow() != null) { + String keyFromResult = Bytes.toString(result.getRow()); + String commitTs = Bytes.toString(result.getValue(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN)); + String fileId = Bytes.toString(result.getValue(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN)); + String partitionPath = Bytes.toString(result.getValue(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN)); - if (checkIfValidCommit(metaClient, commitTs)) { - currentRecord = new HoodieRecord( - new HoodieKey(currentRecord.getRecordKey(), partitionPath), - currentRecord.getData()); - currentRecord.unseal(); - currentRecord.setCurrentLocation(new HoodieRecordLocation(commitTs, fileId)); - currentRecord.seal(); - taggedRecords.add(currentRecord); - // the key from Result and the key being processed should be same - assert (currentRecord.getRecordKey().contentEquals(keyFromResult)); - } else { //if commit is invalid, treat this as a new taggedRecord - taggedRecords.add(currentRecord); - } - } else { - taggedRecords.add(currentRecord); - } + if (checkIfValidCommit(metaClient, commitTs)) { + currentRecord = new HoodieRecord(new HoodieKey(currentRecord.getRecordKey(), partitionPath), + currentRecord.getData()); + currentRecord.unseal(); + currentRecord.setCurrentLocation(new HoodieRecordLocation(commitTs, fileId)); + currentRecord.seal(); + taggedRecords.add(currentRecord); + // the key from Result and the key being processed should be same + assert (currentRecord.getRecordKey().contentEquals(keyFromResult)); + } else { // if commit is invalid, treat this as a new taggedRecord + taggedRecords.add(currentRecord); } + } else { + taggedRecords.add(currentRecord); } } - } catch (IOException e) { - throw new HoodieIndexException( - "Failed to Tag indexed locations because of exception with HBase Client", e); - } finally { - if (hTable != null) { - try { - hTable.close(); - } catch (IOException e) { - // Ignore - } - } - } - return taggedRecords.iterator(); - }; + } + } catch (IOException e) { + throw new HoodieIndexException("Failed to Tag indexed locations because of exception with HBase Client", e); + } finally { + if (hTable != null) { + try { + hTable.close(); + } catch (IOException e) { + // Ignore + } + } + + } + return taggedRecords.iterator(); + }; } private Result[] doGet(HTable hTable, List keys) throws IOException { @@ -310,15 +299,12 @@ public class HBaseIndex extends HoodieIndex { continue; } Put put = new Put(Bytes.toBytes(rec.getRecordKey())); - put.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN, - Bytes.toBytes(loc.get().getInstantTime())); - put.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN, - Bytes.toBytes(loc.get().getFileId())); - put.addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN, - Bytes.toBytes(rec.getPartitionPath())); + put.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN, Bytes.toBytes(loc.get().getInstantTime())); + put.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN, Bytes.toBytes(loc.get().getFileId())); + put.addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN, Bytes.toBytes(rec.getPartitionPath())); puts.add(put); } else { - //Delete existing index for a deleted record + // Delete existing index for a deleted record Delete delete = new Delete(Bytes.toBytes(rec.getRecordKey())); deletes.add(delete); } @@ -328,7 +314,7 @@ public class HBaseIndex extends HoodieIndex { } doPutsAndDeletes(hTable, puts, deletes); } - //process remaining puts and deletes, if any + // process remaining puts and deletes, if any doPutsAndDeletes(hTable, puts, deletes); } catch (Exception e) { Exception we = new Exception("Error updating index for " + writeStatus, e); @@ -338,8 +324,7 @@ public class HBaseIndex extends HoodieIndex { writeStatusList.add(writeStatus); } } catch (IOException e) { - throw new HoodieIndexException( - "Failed to Update Index locations because of exception with HBase Client", e); + throw new HoodieIndexException("Failed to Update Index locations because of exception with HBase Client", e); } finally { if (hTable != null) { try { @@ -356,8 +341,7 @@ public class HBaseIndex extends HoodieIndex { /** * Helper method to facilitate performing puts and deletes in Hbase */ - private void doPutsAndDeletes(HTable hTable, List puts, List deletes) - throws IOException { + private void doPutsAndDeletes(HTable hTable, List puts, List deletes) throws IOException { if (puts.size() > 0) { hTable.put(puts); } @@ -385,58 +369,49 @@ public class HBaseIndex extends HoodieIndex { final HBaseIndexQPSResourceAllocator hBaseIndexQPSResourceAllocator = createQPSResourceAllocator(this.config); setPutBatchSize(writeStatusRDD, hBaseIndexQPSResourceAllocator, jsc); logger.info("multiPutBatchSize: before hbase puts" + multiPutBatchSize); - JavaRDD writeStatusJavaRDD = writeStatusRDD.mapPartitionsWithIndex( - updateLocationFunction(), true); + JavaRDD writeStatusJavaRDD = writeStatusRDD.mapPartitionsWithIndex(updateLocationFunction(), true); // caching the index updated status RDD writeStatusJavaRDD = writeStatusJavaRDD.persist(config.getWriteStatusStorageLevel()); return writeStatusJavaRDD; } private void setPutBatchSize(JavaRDD writeStatusRDD, - HBaseIndexQPSResourceAllocator hBaseIndexQPSResourceAllocator, - final JavaSparkContext jsc) { + HBaseIndexQPSResourceAllocator hBaseIndexQPSResourceAllocator, final JavaSparkContext jsc) { if (config.getHbaseIndexPutBatchSizeAutoCompute()) { SparkConf conf = jsc.getConf(); int maxExecutors = conf.getInt(DEFAULT_SPARK_EXECUTOR_INSTANCES_CONFIG_NAME, 1); if (conf.getBoolean(DEFAULT_SPARK_DYNAMIC_ALLOCATION_ENABLED_CONFIG_NAME, false)) { - maxExecutors = Math.max(maxExecutors, conf.getInt( - DEFAULT_SPARK_DYNAMIC_ALLOCATION_MAX_EXECUTORS_CONFIG_NAME, 1)); + maxExecutors = + Math.max(maxExecutors, conf.getInt(DEFAULT_SPARK_DYNAMIC_ALLOCATION_MAX_EXECUTORS_CONFIG_NAME, 1)); } /* - Each writeStatus represents status information from a write done in one of the IOHandles. - If a writeStatus has any insert, it implies that the corresponding task contacts HBase for - doing puts, since we only do puts for inserts from HBaseIndex. + * Each writeStatus represents status information from a write done in one of the IOHandles. If a writeStatus has + * any insert, it implies that the corresponding task contacts HBase for doing puts, since we only do puts for + * inserts from HBaseIndex. */ - final Tuple2 numPutsParallelismTuple = getHBasePutAccessParallelism(writeStatusRDD); + final Tuple2 numPutsParallelismTuple = getHBasePutAccessParallelism(writeStatusRDD); final long numPuts = numPutsParallelismTuple._1; final int hbasePutsParallelism = numPutsParallelismTuple._2; this.numRegionServersForTable = getNumRegionServersAliveForTable(); - final float desiredQPSFraction = hBaseIndexQPSResourceAllocator - .calculateQPSFractionForPutsTime(numPuts, this.numRegionServersForTable); + final float desiredQPSFraction = + hBaseIndexQPSResourceAllocator.calculateQPSFractionForPutsTime(numPuts, this.numRegionServersForTable); logger.info("Desired QPSFraction :" + desiredQPSFraction); logger.info("Number HBase puts :" + numPuts); logger.info("Hbase Puts Parallelism :" + hbasePutsParallelism); - final float availableQpsFraction = hBaseIndexQPSResourceAllocator - .acquireQPSResources(desiredQPSFraction, numPuts); + final float availableQpsFraction = + hBaseIndexQPSResourceAllocator.acquireQPSResources(desiredQPSFraction, numPuts); logger.info("Allocated QPS Fraction :" + availableQpsFraction); - multiPutBatchSize = putBatchSizeCalculator - .getBatchSize( - numRegionServersForTable, - maxQpsPerRegionServer, - hbasePutsParallelism, - maxExecutors, - SLEEP_TIME_MILLISECONDS, - availableQpsFraction); + multiPutBatchSize = putBatchSizeCalculator.getBatchSize(numRegionServersForTable, maxQpsPerRegionServer, + hbasePutsParallelism, maxExecutors, SLEEP_TIME_MILLISECONDS, availableQpsFraction); logger.info("multiPutBatchSize :" + multiPutBatchSize); } } @VisibleForTesting public Tuple2 getHBasePutAccessParallelism(final JavaRDD writeStatusRDD) { - final JavaPairRDD insertOnlyWriteStatusRDD = - writeStatusRDD.filter(w -> w.getStat().getNumInserts() > 0) - .mapToPair(w -> new Tuple2<>(w.getStat().getNumInserts(), 1)); + final JavaPairRDD insertOnlyWriteStatusRDD = writeStatusRDD + .filter(w -> w.getStat().getNumInserts() > 0).mapToPair(w -> new Tuple2<>(w.getStat().getNumInserts(), 1)); return insertOnlyWriteStatusRDD.fold(new Tuple2<>(0L, 0), (w, c) -> new Tuple2<>(w._1 + c._1, w._2 + c._2)); } @@ -460,21 +435,25 @@ public class HBaseIndex extends HoodieIndex { * 16000. We assume requests get distributed to Region Servers uniformly, so each RS gets 1600 requests which * happens to be 10% of 16667 (maxQPSPerRegionServer), as expected. *

- *

Assumptions made here

  • In a batch, writes get evenly distributed to each RS for that - * table. Since we do writes only in the case of inserts and not updates, for this assumption to fail, inserts would - * have to be skewed towards few RS, likelihood of which is less if Hbase table is pre-split and rowKeys are UUIDs - * (random strings). If this assumption fails, then it is possible for some RS to receive more than - * maxQpsPerRegionServer QPS, but for simplicity, we are going ahead with this model, since this is meant to be a - * lightweight distributed throttling mechanism without maintaining a global context. So if this assumption breaks, - * we are hoping the HBase Master relocates hot-spot regions to new Region Servers. + *

    + * Assumptions made here + *

  • In a batch, writes get evenly distributed to each RS for that table. Since we do writes only in the case of + * inserts and not updates, for this assumption to fail, inserts would have to be skewed towards few RS, likelihood + * of which is less if Hbase table is pre-split and rowKeys are UUIDs (random strings). If this assumption fails, + * then it is possible for some RS to receive more than maxQpsPerRegionServer QPS, but for simplicity, we are going + * ahead with this model, since this is meant to be a lightweight distributed throttling mechanism without + * maintaining a global context. So if this assumption breaks, we are hoping the HBase Master relocates hot-spot + * regions to new Region Servers. * - *
  • For Region Server stability, throttling at a second level granularity is fine. - * Although, within a second, the sum of queries might be within maxQpsPerRegionServer, there could be peaks at some - * sub second intervals. So, the assumption is that these peaks are tolerated by the Region Server (which at max can - * be maxQpsPerRegionServer).
  • + * + *
  • For Region Server stability, throttling at a second level granularity is fine. Although, within a second, the + * sum of queries might be within maxQpsPerRegionServer, there could be peaks at some sub second intervals. So, the + * assumption is that these peaks are tolerated by the Region Server (which at max can be maxQpsPerRegionServer). + *
  • + *

    */ - public int getBatchSize(int numRegionServersForTable, int maxQpsPerRegionServer, - int numTasksDuringPut, int maxExecutors, int sleepTimeMs, float qpsFraction) { + public int getBatchSize(int numRegionServersForTable, int maxQpsPerRegionServer, int numTasksDuringPut, + int maxExecutors, int sleepTimeMs, float qpsFraction) { int numRSAlive = numRegionServersForTable; int maxReqPerSec = (int) (qpsFraction * numRSAlive * maxQpsPerRegionServer); int numTasks = numTasksDuringPut; @@ -499,11 +478,9 @@ public class HBaseIndex extends HoodieIndex { // from the driver, so ok to use a local connection variable. if (numRegionServersForTable == null) { try (Connection conn = getHBaseConnection()) { - RegionLocator regionLocator = conn - .getRegionLocator(TableName.valueOf(tableName)); - numRegionServersForTable = Math.toIntExact( - regionLocator.getAllRegionLocations().stream().map(e -> e.getServerName()).distinct() - .count()); + RegionLocator regionLocator = conn.getRegionLocator(TableName.valueOf(tableName)); + numRegionServersForTable = Math + .toIntExact(regionLocator.getAllRegionLocations().stream().map(e -> e.getServerName()).distinct().count()); return numRegionServersForTable; } catch (IOException e) { logger.error(e); diff --git a/hudi-client/src/main/java/org/apache/hudi/index/hbase/HBaseIndexQPSResourceAllocator.java b/hudi-client/src/main/java/org/apache/hudi/index/hbase/HBaseIndexQPSResourceAllocator.java index 0adc24738..bccdd5bcc 100644 --- a/hudi-client/src/main/java/org/apache/hudi/index/hbase/HBaseIndexQPSResourceAllocator.java +++ b/hudi-client/src/main/java/org/apache/hudi/index/hbase/HBaseIndexQPSResourceAllocator.java @@ -26,11 +26,11 @@ import java.io.Serializable; public interface HBaseIndexQPSResourceAllocator extends Serializable { /** - * This method returns the QPS Fraction value that needs to be acquired such that the respective - * HBase index operation can be completed in desiredPutsTime. + * This method returns the QPS Fraction value that needs to be acquired such that the respective HBase index operation + * can be completed in desiredPutsTime. * - * @param numPuts Number of inserts to be written to HBase index - * @param desiredPutsTimeInSecs Total expected time for the HBase inserts operation + * @param numPuts Number of inserts to be written to HBase index + * @param desiredPutsTimeInSecs Total expected time for the HBase inserts operation * @return QPS fraction that needs to be acquired. */ float calculateQPSFractionForPutsTime(final long numPuts, final int desiredPutsTimeInSecs); @@ -38,8 +38,8 @@ public interface HBaseIndexQPSResourceAllocator extends Serializable { /** * This method acquires the requested QPS Fraction against HBase cluster for index operation. * - * @param desiredQPSFraction QPS fraction that needs to be requested and acquired - * @param numPuts Number of inserts to be written to HBase index + * @param desiredQPSFraction QPS fraction that needs to be requested and acquired + * @param numPuts Number of inserts to be written to HBase index * @return value of the acquired QPS Fraction. */ float acquireQPSResources(final float desiredQPSFraction, final long numPuts); diff --git a/hudi-client/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java b/hudi-client/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java index 0aa413739..13ab1d0bf 100644 --- a/hudi-client/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java +++ b/hudi-client/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java @@ -96,8 +96,8 @@ public class HoodieAppendHandle extends HoodieWri // Total number of new records inserted into the delta file private long insertRecordsWritten = 0; - public HoodieAppendHandle(HoodieWriteConfig config, String commitTime, HoodieTable hoodieTable, - String fileId, Iterator> recordItr) { + public HoodieAppendHandle(HoodieWriteConfig config, String commitTime, HoodieTable hoodieTable, String fileId, + Iterator> recordItr) { super(config, commitTime, fileId, hoodieTable); writeStatus.setStat(new HoodieDeltaWriteStat()); this.fileId = fileId; @@ -137,10 +137,8 @@ public class HoodieAppendHandle extends HoodieWri } catch (Exception e) { logger.error("Error in update task at commit " + instantTime, e); writeStatus.setGlobalError(e); - throw new HoodieUpsertException( - "Failed to initialize HoodieAppendHandle for FileId: " + fileId + " on commit " - + instantTime + " on HDFS path " + hoodieTable.getMetaClient().getBasePath() - + partitionPath, e); + throw new HoodieUpsertException("Failed to initialize HoodieAppendHandle for FileId: " + fileId + " on commit " + + instantTime + " on HDFS path " + hoodieTable.getMetaClient().getBasePath() + partitionPath, e); } Path path = new Path(partitionPath, writer.getLogFile().getFileName()); writeStatus.getStat().setPath(path.toString()); @@ -155,13 +153,11 @@ public class HoodieAppendHandle extends HoodieWri if (avroRecord.isPresent()) { // Convert GenericRecord to GenericRecord with hoodie commit metadata in schema avroRecord = Option.of(rewriteRecord((GenericRecord) avroRecord.get())); - String seqId = HoodieRecord.generateSequenceId(instantTime, TaskContext.getPartitionId(), - recordIndex.getAndIncrement()); - HoodieAvroUtils - .addHoodieKeyToRecord((GenericRecord) avroRecord.get(), hoodieRecord.getRecordKey(), - hoodieRecord.getPartitionPath(), fileId); - HoodieAvroUtils - .addCommitMetadataToRecord((GenericRecord) avroRecord.get(), instantTime, seqId); + String seqId = + HoodieRecord.generateSequenceId(instantTime, TaskContext.getPartitionId(), recordIndex.getAndIncrement()); + HoodieAvroUtils.addHoodieKeyToRecord((GenericRecord) avroRecord.get(), hoodieRecord.getRecordKey(), + hoodieRecord.getPartitionPath(), fileId); + HoodieAvroUtils.addCommitMetadataToRecord((GenericRecord) avroRecord.get(), instantTime, seqId); // If currentLocation is present, then this is an update if (hoodieRecord.getCurrentLocation() != null) { updatedRecordsWritten++; @@ -208,20 +204,18 @@ public class HoodieAppendHandle extends HoodieWri recordList.clear(); } if (keysToDelete.size() > 0) { - writer = writer.appendBlock( - new HoodieDeleteBlock(keysToDelete.stream().toArray(HoodieKey[]::new), header)); + writer = writer.appendBlock(new HoodieDeleteBlock(keysToDelete.stream().toArray(HoodieKey[]::new), header)); keysToDelete.clear(); } } catch (Exception e) { - throw new HoodieAppendException( - "Failed while appending records to " + currentLogFile.getPath(), e); + throw new HoodieAppendException("Failed while appending records to " + currentLogFile.getPath(), e); } } @Override public boolean canWrite(HoodieRecord record) { - return config.getParquetMaxFileSize() >= estimatedNumberOfBytesWritten * config - .getLogFileToParquetCompressionRatio(); + return config.getParquetMaxFileSize() >= estimatedNumberOfBytesWritten + * config.getLogFileToParquetCompressionRatio(); } @Override @@ -262,8 +256,8 @@ public class HoodieAppendHandle extends HoodieWri runtimeStats.setTotalUpsertTime(timer.endTimer()); stat.setRuntimeStats(runtimeStats); - logger.info(String.format("AppendHandle for partitionPath %s fileID %s, took %d ms.", - stat.getPartitionPath(), stat.getFileId(), runtimeStats.getTotalUpsertTime())); + logger.info(String.format("AppendHandle for partitionPath %s fileID %s, took %d ms.", stat.getPartitionPath(), + stat.getFileId(), runtimeStats.getTotalUpsertTime())); return writeStatus; } catch (IOException e) { @@ -282,13 +276,11 @@ public class HoodieAppendHandle extends HoodieWri return HoodieLogFormat.newWriterBuilder() .onParentPath(FSUtils.getPartitionPath(hoodieTable.getMetaClient().getBasePath(), partitionPath)) - .withFileId(fileId).overBaseCommit(baseCommitTime).withLogVersion( - latestLogFile.map(HoodieLogFile::getLogVersion).orElse(HoodieLogFile.LOGFILE_BASE_VERSION)) + .withFileId(fileId).overBaseCommit(baseCommitTime) + .withLogVersion(latestLogFile.map(HoodieLogFile::getLogVersion).orElse(HoodieLogFile.LOGFILE_BASE_VERSION)) .withSizeThreshold(config.getLogFileMaxSize()).withFs(fs) - .withLogWriteToken( - latestLogFile.map(x -> FSUtils.getWriteTokenFromLogPath(x.getPath())).orElse(writeToken)) - .withRolloverLogWriteToken(writeToken) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).build(); + .withLogWriteToken(latestLogFile.map(x -> FSUtils.getWriteTokenFromLogPath(x.getPath())).orElse(writeToken)) + .withRolloverLogWriteToken(writeToken).withFileExtension(HoodieLogFile.DELTA_EXTENSION).build(); } private void writeToBuffer(HoodieRecord record) { diff --git a/hudi-client/src/main/java/org/apache/hudi/io/HoodieCleanHelper.java b/hudi-client/src/main/java/org/apache/hudi/io/HoodieCleanHelper.java index b63ea39ed..89ab363fc 100644 --- a/hudi-client/src/main/java/org/apache/hudi/io/HoodieCleanHelper.java +++ b/hudi-client/src/main/java/org/apache/hudi/io/HoodieCleanHelper.java @@ -45,9 +45,12 @@ import org.apache.log4j.Logger; /** * Cleaner is responsible for garbage collecting older files in a given partition path, such that - *

    1) It provides sufficient time for existing queries running on older versions, to close

    - * 2) It bounds the growth of the files in the file system

    TODO: Should all cleaning be done - * based on {@link HoodieCommitMetadata} + *

    + * 1) It provides sufficient time for existing queries running on older versions, to close + *

    + * 2) It bounds the growth of the files in the file system + *

    + * TODO: Should all cleaning be done based on {@link HoodieCommitMetadata} */ public class HoodieCleanHelper> { @@ -65,23 +68,22 @@ public class HoodieCleanHelper> { this.commitTimeline = hoodieTable.getCompletedCommitTimeline(); this.config = config; this.fgIdToPendingCompactionOperations = - ((SyncableFileSystemView)hoodieTable.getRTFileSystemView()).getPendingCompactionOperations() - .map(entry -> Pair.of(new HoodieFileGroupId(entry.getValue().getPartitionPath(), - entry.getValue().getFileId()), entry.getValue())) + ((SyncableFileSystemView) hoodieTable.getRTFileSystemView()).getPendingCompactionOperations() + .map(entry -> Pair.of( + new HoodieFileGroupId(entry.getValue().getPartitionPath(), entry.getValue().getFileId()), + entry.getValue())) .collect(Collectors.toMap(Pair::getKey, Pair::getValue)); } /** - * Selects the older versions of files for cleaning, such that it bounds the number of versions of - * each file. This policy is useful, if you are simply interested in querying the table, and you - * don't want too many versions for a single file (i.e run it with versionsRetained = 1) + * Selects the older versions of files for cleaning, such that it bounds the number of versions of each file. This + * policy is useful, if you are simply interested in querying the table, and you don't want too many versions for a + * single file (i.e run it with versionsRetained = 1) */ - private List getFilesToCleanKeepingLatestVersions(String partitionPath) - throws IOException { - logger.info("Cleaning " + partitionPath + ", retaining latest " + config - .getCleanerFileVersionsRetained() + " file versions. "); - List fileGroups = fileSystemView.getAllFileGroups(partitionPath) - .collect(Collectors.toList()); + private List getFilesToCleanKeepingLatestVersions(String partitionPath) throws IOException { + logger.info("Cleaning " + partitionPath + ", retaining latest " + config.getCleanerFileVersionsRetained() + + " file versions. "); + List fileGroups = fileSystemView.getAllFileGroups(partitionPath).collect(Collectors.toList()); List deletePaths = new ArrayList<>(); // Collect all the datafiles savepointed by all the savepoints List savepointedFiles = hoodieTable.getSavepoints().stream() @@ -90,8 +92,8 @@ public class HoodieCleanHelper> { for (HoodieFileGroup fileGroup : fileGroups) { int keepVersions = config.getCleanerFileVersionsRetained(); // do not cleanup slice required for pending compaction - Iterator fileSliceIterator = fileGroup.getAllFileSlices() - .filter(fs -> !isFileSliceNeededForPendingCompaction(fs)).iterator(); + Iterator fileSliceIterator = + fileGroup.getAllFileSlices().filter(fs -> !isFileSliceNeededForPendingCompaction(fs)).iterator(); if (isFileGroupInPendingCompaction(fileGroup)) { // We have already saved the last version of file-groups for pending compaction Id keepVersions--; @@ -116,8 +118,8 @@ public class HoodieCleanHelper> { } if (hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) { // If merge on read, then clean the log files for the commits as well - deletePaths.addAll(nextSlice.getLogFiles().map(file -> file.getPath().toString()) - .collect(Collectors.toList())); + deletePaths + .addAll(nextSlice.getLogFiles().map(file -> file.getPath().toString()).collect(Collectors.toList())); } } } @@ -126,21 +128,21 @@ public class HoodieCleanHelper> { /** - * Selects the versions for file for cleaning, such that it

    - Leaves the latest version of the - * file untouched - For older versions, - It leaves all the commits untouched which has occured in - * last config.getCleanerCommitsRetained() commits - It leaves ONE commit before this - * window. We assume that the max(query execution time) == commit_batch_time * - * config.getCleanerCommitsRetained(). This is 12 hours by default. This is essential to leave the - * file used by the query thats running for the max time.

    This provides the effect of having - * lookback into all changes that happened in the last X commits. (eg: if you retain 24 commits, - * and commit batch time is 30 mins, then you have 12 hrs of lookback)

    This policy is the - * default. + * Selects the versions for file for cleaning, such that it + *

    + * - Leaves the latest version of the file untouched - For older versions, - It leaves all the commits untouched which + * has occured in last config.getCleanerCommitsRetained() commits - It leaves ONE commit before this + * window. We assume that the max(query execution time) == commit_batch_time * config.getCleanerCommitsRetained(). + * This is 12 hours by default. This is essential to leave the file used by the query thats running for the max time. + *

    + * This provides the effect of having lookback into all changes that happened in the last X commits. (eg: if you + * retain 24 commits, and commit batch time is 30 mins, then you have 12 hrs of lookback) + *

    + * This policy is the default. */ - private List getFilesToCleanKeepingLatestCommits(String partitionPath) - throws IOException { + private List getFilesToCleanKeepingLatestCommits(String partitionPath) throws IOException { int commitsRetained = config.getCleanerCommitsRetained(); - logger - .info("Cleaning " + partitionPath + ", retaining latest " + commitsRetained + " commits. "); + logger.info("Cleaning " + partitionPath + ", retaining latest " + commitsRetained + " commits. "); List deletePaths = new ArrayList<>(); // Collect all the datafiles savepointed by all the savepoints @@ -150,8 +152,7 @@ public class HoodieCleanHelper> { // determine if we have enough commits, to start cleaning. if (commitTimeline.countInstants() > commitsRetained) { HoodieInstant earliestCommitToRetain = getEarliestCommitToRetain().get(); - List fileGroups = fileSystemView.getAllFileGroups(partitionPath) - .collect(Collectors.toList()); + List fileGroups = fileSystemView.getAllFileGroups(partitionPath).collect(Collectors.toList()); for (HoodieFileGroup fileGroup : fileGroups) { List fileSliceList = fileGroup.getAllFileSlices().collect(Collectors.toList()); @@ -160,8 +161,8 @@ public class HoodieCleanHelper> { } String lastVersion = fileSliceList.get(0).getBaseInstantTime(); - String lastVersionBeforeEarliestCommitToRetain = getLatestVersionBeforeCommit(fileSliceList, - earliestCommitToRetain); + String lastVersionBeforeEarliestCommitToRetain = + getLatestVersionBeforeCommit(fileSliceList, earliestCommitToRetain); // Ensure there are more than 1 version of the file (we only clean old files from updates) // i.e always spare the last commit. @@ -183,16 +184,14 @@ public class HoodieCleanHelper> { } // Always keep the last commit - if (!isFileSliceNeededForPendingCompaction(aSlice) - && HoodieTimeline - .compareTimestamps(earliestCommitToRetain.getTimestamp(), fileCommitTime, - HoodieTimeline.GREATER)) { + if (!isFileSliceNeededForPendingCompaction(aSlice) && HoodieTimeline + .compareTimestamps(earliestCommitToRetain.getTimestamp(), fileCommitTime, HoodieTimeline.GREATER)) { // this is a commit, that should be cleaned. aFile.ifPresent(hoodieDataFile -> deletePaths.add(hoodieDataFile.getPath())); if (hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) { // If merge on read, then clean the log files for the commits as well - deletePaths.addAll(aSlice.getLogFiles().map(file -> file.getPath().toString()) - .collect(Collectors.toList())); + deletePaths + .addAll(aSlice.getLogFiles().map(file -> file.getPath().toString()).collect(Collectors.toList())); } } } @@ -205,12 +204,10 @@ public class HoodieCleanHelper> { /** * Gets the latest version < commitTime. This version file could still be used by queries. */ - private String getLatestVersionBeforeCommit(List fileSliceList, - HoodieInstant commitTime) { + private String getLatestVersionBeforeCommit(List fileSliceList, HoodieInstant commitTime) { for (FileSlice file : fileSliceList) { String fileCommitTime = file.getBaseInstantTime(); - if (HoodieTimeline - .compareTimestamps(commitTime.getTimestamp(), fileCommitTime, HoodieTimeline.GREATER)) { + if (HoodieTimeline.compareTimestamps(commitTime.getTimestamp(), fileCommitTime, HoodieTimeline.GREATER)) { // fileList is sorted on the reverse, so the first commit we find <= commitTime is the // one we want return fileCommitTime; @@ -246,14 +243,14 @@ public class HoodieCleanHelper> { int commitsRetained = config.getCleanerCommitsRetained(); if (config.getCleanerPolicy() == HoodieCleaningPolicy.KEEP_LATEST_COMMITS && commitTimeline.countInstants() > commitsRetained) { - earliestCommitToRetain = commitTimeline - .nthInstant(commitTimeline.countInstants() - commitsRetained); + earliestCommitToRetain = commitTimeline.nthInstant(commitTimeline.countInstants() - commitsRetained); } return earliestCommitToRetain; } /** * Determine if file slice needed to be preserved for pending compaction + * * @param fileSlice File Slice * @return true if file slice needs to be preserved, false otherwise. */ diff --git a/hudi-client/src/main/java/org/apache/hudi/io/HoodieCommitArchiveLog.java b/hudi-client/src/main/java/org/apache/hudi/io/HoodieCommitArchiveLog.java index bc8b4fe8b..2d2f1d4a1 100644 --- a/hudi-client/src/main/java/org/apache/hudi/io/HoodieCommitArchiveLog.java +++ b/hudi-client/src/main/java/org/apache/hudi/io/HoodieCommitArchiveLog.java @@ -83,9 +83,8 @@ public class HoodieCommitArchiveLog { try { if (this.writer == null) { return HoodieLogFormat.newWriterBuilder().onParentPath(archiveFilePath.getParent()) - .withFileId(archiveFilePath.getName()) - .withFileExtension(HoodieArchivedLogFile.ARCHIVE_EXTENSION).withFs(metaClient.getFs()) - .overBaseCommit("").build(); + .withFileId(archiveFilePath.getName()).withFileExtension(HoodieArchivedLogFile.ARCHIVE_EXTENSION) + .withFs(metaClient.getFs()).overBaseCommit("").build(); } else { return this.writer; } @@ -137,8 +136,7 @@ public class HoodieCommitArchiveLog { // TODO: Handle ROLLBACK_ACTION in future // ROLLBACK_ACTION is currently not defined in HoodieActiveTimeline HoodieTimeline cleanAndRollbackTimeline = table.getActiveTimeline() - .getTimelineOfActions(Sets.newHashSet(HoodieTimeline.CLEAN_ACTION)) - .filterCompletedInstants(); + .getTimelineOfActions(Sets.newHashSet(HoodieTimeline.CLEAN_ACTION)).filterCompletedInstants(); Stream instants = cleanAndRollbackTimeline.getInstants() .collect(Collectors.groupingBy(s -> s.getAction())).entrySet().stream().map(i -> { if (i.getValue().size() > maxCommitsToKeep) { @@ -148,7 +146,7 @@ public class HoodieCommitArchiveLog { } }).flatMap(i -> i.stream()); - //TODO (na) : Add a way to return actions associated with a timeline and then merge/unify + // TODO (na) : Add a way to return actions associated with a timeline and then merge/unify // with logic above to avoid Stream.concats HoodieTimeline commitTimeline = table.getCompletedCommitsTimeline(); Option oldestPendingCompactionInstant = @@ -159,20 +157,16 @@ public class HoodieCommitArchiveLog { Option firstSavepoint = table.getCompletedSavepointTimeline().firstInstant(); if (!commitTimeline.empty() && commitTimeline.countInstants() > maxCommitsToKeep) { // Actually do the commits - instants = Stream.concat(instants, commitTimeline.getInstants() - .filter(s -> { - // if no savepoint present, then dont filter - return !(firstSavepoint.isPresent() && HoodieTimeline - .compareTimestamps(firstSavepoint.get().getTimestamp(), s.getTimestamp(), - HoodieTimeline.LESSER_OR_EQUAL)); - }) - .filter(s -> { - // Ensure commits >= oldest pending compaction commit is retained - return oldestPendingCompactionInstant.map(instant -> { - return HoodieTimeline.compareTimestamps(instant.getTimestamp(), s.getTimestamp(), HoodieTimeline.GREATER); - }).orElse(true); - }) - .limit(commitTimeline.countInstants() - minCommitsToKeep)); + instants = Stream.concat(instants, commitTimeline.getInstants().filter(s -> { + // if no savepoint present, then dont filter + return !(firstSavepoint.isPresent() && HoodieTimeline.compareTimestamps(firstSavepoint.get().getTimestamp(), + s.getTimestamp(), HoodieTimeline.LESSER_OR_EQUAL)); + }).filter(s -> { + // Ensure commits >= oldest pending compaction commit is retained + return oldestPendingCompactionInstant.map(instant -> { + return HoodieTimeline.compareTimestamps(instant.getTimestamp(), s.getTimestamp(), HoodieTimeline.GREATER); + }).orElse(true); + }).limit(commitTimeline.countInstants() - minCommitsToKeep)); } return instants; @@ -194,13 +188,10 @@ public class HoodieCommitArchiveLog { } // Remove older meta-data from auxiliary path too - Option latestCommitted = - Option.fromJavaOptional(archivedInstants.stream() - .filter(i -> { - return i.isCompleted() - && (i.getAction().equals(HoodieTimeline.COMMIT_ACTION) || (i.getAction().equals( - HoodieTimeline.DELTA_COMMIT_ACTION))); - }).max(Comparator.comparing(HoodieInstant::getTimestamp))); + Option latestCommitted = Option.fromJavaOptional(archivedInstants.stream().filter(i -> { + return i.isCompleted() && (i.getAction().equals(HoodieTimeline.COMMIT_ACTION) + || (i.getAction().equals(HoodieTimeline.DELTA_COMMIT_ACTION))); + }).max(Comparator.comparing(HoodieInstant::getTimestamp))); if (latestCommitted.isPresent()) { success &= deleteAllInstantsOlderorEqualsInAuxMetaFolder(latestCommitted.get()); } @@ -214,12 +205,9 @@ public class HoodieCommitArchiveLog { * @return success if all eligible file deleted successfully * @throws IOException in case of error */ - private boolean deleteAllInstantsOlderorEqualsInAuxMetaFolder(HoodieInstant thresholdInstant) - throws IOException { - List instants = - HoodieTableMetaClient.scanHoodieInstantsFromFileSystem(metaClient.getFs(), - new Path(metaClient.getMetaAuxiliaryPath()), - HoodieActiveTimeline.VALID_EXTENSIONS_IN_ACTIVE_TIMELINE); + private boolean deleteAllInstantsOlderorEqualsInAuxMetaFolder(HoodieInstant thresholdInstant) throws IOException { + List instants = HoodieTableMetaClient.scanHoodieInstantsFromFileSystem(metaClient.getFs(), + new Path(metaClient.getMetaAuxiliaryPath()), HoodieActiveTimeline.VALID_EXTENSIONS_IN_ACTIVE_TIMELINE); List instantsToBeDeleted = instants.stream().filter(instant1 -> HoodieTimeline.compareTimestamps(instant1.getTimestamp(), @@ -239,8 +227,7 @@ public class HoodieCommitArchiveLog { public void archive(List instants) throws HoodieCommitException { try { - HoodieTimeline commitTimeline = metaClient.getActiveTimeline().getAllCommitsTimeline() - .filterCompletedInstants(); + HoodieTimeline commitTimeline = metaClient.getActiveTimeline().getAllCommitsTimeline().filterCompletedInstants(); Schema wrapperSchema = HoodieArchivedMetaEntry.getClassSchema(); log.info("Wrapper schema " + wrapperSchema.toString()); List records = new ArrayList<>(); @@ -277,15 +264,14 @@ public class HoodieCommitArchiveLog { } } - private IndexedRecord convertToAvroRecord(HoodieTimeline commitTimeline, - HoodieInstant hoodieInstant) throws IOException { + private IndexedRecord convertToAvroRecord(HoodieTimeline commitTimeline, HoodieInstant hoodieInstant) + throws IOException { HoodieArchivedMetaEntry archivedMetaWrapper = new HoodieArchivedMetaEntry(); archivedMetaWrapper.setCommitTime(hoodieInstant.getTimestamp()); switch (hoodieInstant.getAction()) { case HoodieTimeline.CLEAN_ACTION: { archivedMetaWrapper.setHoodieCleanMetadata(AvroUtils - .deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(), - HoodieCleanMetadata.class)); + .deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(), HoodieCleanMetadata.class)); archivedMetaWrapper.setActionType(ActionType.clean.name()); break; } @@ -297,16 +283,14 @@ public class HoodieCommitArchiveLog { break; } case HoodieTimeline.ROLLBACK_ACTION: { - archivedMetaWrapper.setHoodieRollbackMetadata(AvroUtils - .deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(), - HoodieRollbackMetadata.class)); + archivedMetaWrapper.setHoodieRollbackMetadata(AvroUtils.deserializeAvroMetadata( + commitTimeline.getInstantDetails(hoodieInstant).get(), HoodieRollbackMetadata.class)); archivedMetaWrapper.setActionType(ActionType.rollback.name()); break; } case HoodieTimeline.SAVEPOINT_ACTION: { - archivedMetaWrapper.setHoodieSavePointMetadata(AvroUtils - .deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(), - HoodieSavepointMetadata.class)); + archivedMetaWrapper.setHoodieSavePointMetadata(AvroUtils.deserializeAvroMetadata( + commitTimeline.getInstantDetails(hoodieInstant).get(), HoodieSavepointMetadata.class)); archivedMetaWrapper.setActionType(ActionType.savepoint.name()); break; } @@ -326,10 +310,10 @@ public class HoodieCommitArchiveLog { private org.apache.hudi.avro.model.HoodieCommitMetadata commitMetadataConverter( HoodieCommitMetadata hoodieCommitMetadata) { ObjectMapper mapper = new ObjectMapper(); - //Need this to ignore other public get() methods + // Need this to ignore other public get() methods mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - org.apache.hudi.avro.model.HoodieCommitMetadata avroMetaData = mapper - .convertValue(hoodieCommitMetadata, org.apache.hudi.avro.model.HoodieCommitMetadata.class); + org.apache.hudi.avro.model.HoodieCommitMetadata avroMetaData = + mapper.convertValue(hoodieCommitMetadata, org.apache.hudi.avro.model.HoodieCommitMetadata.class); // Do not archive Rolling Stats, cannot set to null since AVRO will throw null pointer avroMetaData.getExtraMetadata().put(HoodieRollingStatMetadata.ROLLING_STAT_METADATA_KEY, ""); return avroMetaData; diff --git a/hudi-client/src/main/java/org/apache/hudi/io/HoodieCreateHandle.java b/hudi-client/src/main/java/org/apache/hudi/io/HoodieCreateHandle.java index afb8f851b..9484ffa01 100644 --- a/hudi-client/src/main/java/org/apache/hudi/io/HoodieCreateHandle.java +++ b/hudi-client/src/main/java/org/apache/hudi/io/HoodieCreateHandle.java @@ -66,11 +66,10 @@ public class HoodieCreateHandle extends HoodieWri new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath)); partitionMetadata.trySave(TaskContext.getPartitionId()); createMarkerFile(partitionPath); - this.storageWriter = HoodieStorageWriterFactory - .getStorageWriter(commitTime, path, hoodieTable, config, writerSchema); + this.storageWriter = + HoodieStorageWriterFactory.getStorageWriter(commitTime, path, hoodieTable, config, writerSchema); } catch (IOException e) { - throw new HoodieInsertException( - "Failed to initialize HoodieStorageWriter for path " + path, e); + throw new HoodieInsertException("Failed to initialize HoodieStorageWriter for path " + path, e); } logger.info("New CreateHandle for partition :" + partitionPath + " with fileId " + fileId); } @@ -136,8 +135,7 @@ public class HoodieCreateHandle extends HoodieWri } } } catch (IOException io) { - throw new HoodieInsertException( - "Failed to insert records for path " + path, io); + throw new HoodieInsertException("Failed to insert records for path " + path, io); } } @@ -151,8 +149,8 @@ public class HoodieCreateHandle extends HoodieWri */ @Override public WriteStatus close() { - logger.info("Closing the file " + writeStatus.getFileId() + " as we are done with all the records " - + recordsWritten); + logger + .info("Closing the file " + writeStatus.getFileId() + " as we are done with all the records " + recordsWritten); try { storageWriter.close(); @@ -174,8 +172,8 @@ public class HoodieCreateHandle extends HoodieWri stat.setRuntimeStats(runtimeStats); writeStatus.setStat(stat); - logger.info(String.format("CreateHandle for partitionPath %s fileID %s, took %d ms.", - stat.getPartitionPath(), stat.getFileId(), runtimeStats.getTotalCreateTime())); + logger.info(String.format("CreateHandle for partitionPath %s fileID %s, took %d ms.", stat.getPartitionPath(), + stat.getFileId(), runtimeStats.getTotalCreateTime())); return writeStatus; } catch (IOException e) { diff --git a/hudi-client/src/main/java/org/apache/hudi/io/HoodieKeyLookupHandle.java b/hudi-client/src/main/java/org/apache/hudi/io/HoodieKeyLookupHandle.java index dec488449..20ab3bc06 100644 --- a/hudi-client/src/main/java/org/apache/hudi/io/HoodieKeyLookupHandle.java +++ b/hudi-client/src/main/java/org/apache/hudi/io/HoodieKeyLookupHandle.java @@ -67,15 +67,15 @@ public class HoodieKeyLookupHandle extends Hoodie /** * Given a list of row keys and one file, return only row keys existing in that file. */ - public static List checkCandidatesAgainstFile(Configuration configuration, - List candidateRecordKeys, Path filePath) throws HoodieIndexException { + public static List checkCandidatesAgainstFile(Configuration configuration, List candidateRecordKeys, + Path filePath) throws HoodieIndexException { List foundRecordKeys = new ArrayList<>(); try { // Load all rowKeys from the file, to double-confirm if (!candidateRecordKeys.isEmpty()) { HoodieTimer timer = new HoodieTimer().startTimer(); - Set fileRowKeys = ParquetUtils.filterParquetRowKeys(configuration, filePath, - new HashSet<>(candidateRecordKeys)); + Set fileRowKeys = + ParquetUtils.filterParquetRowKeys(configuration, filePath, new HashSet<>(candidateRecordKeys)); foundRecordKeys.addAll(fileRowKeys); logger.info(String.format("Checked keys against file %s, in %d ms. #candidates (%d) #found (%d)", filePath, timer.endTimer(), candidateRecordKeys.size(), foundRecordKeys.size())); @@ -112,11 +112,11 @@ public class HoodieKeyLookupHandle extends Hoodie } HoodieDataFile dataFile = getLatestDataFile(); - List matchingKeys = checkCandidatesAgainstFile(hoodieTable.getHadoopConf(), candidateRecordKeys, - new Path(dataFile.getPath())); - logger.info(String.format("Total records (%d), bloom filter candidates (%d)/fp(%d), actual matches (%d)", - totalKeysChecked, candidateRecordKeys.size(), candidateRecordKeys.size() - matchingKeys.size(), - matchingKeys.size())); + List matchingKeys = + checkCandidatesAgainstFile(hoodieTable.getHadoopConf(), candidateRecordKeys, new Path(dataFile.getPath())); + logger.info( + String.format("Total records (%d), bloom filter candidates (%d)/fp(%d), actual matches (%d)", totalKeysChecked, + candidateRecordKeys.size(), candidateRecordKeys.size() - matchingKeys.size(), matchingKeys.size())); return new KeyLookupResult(partitionPathFilePair.getRight(), partitionPathFilePair.getLeft(), dataFile.getCommitTime(), matchingKeys); } diff --git a/hudi-client/src/main/java/org/apache/hudi/io/HoodieMergeHandle.java b/hudi-client/src/main/java/org/apache/hudi/io/HoodieMergeHandle.java index a819cf7ed..4826c89d6 100644 --- a/hudi-client/src/main/java/org/apache/hudi/io/HoodieMergeHandle.java +++ b/hudi-client/src/main/java/org/apache/hudi/io/HoodieMergeHandle.java @@ -71,8 +71,7 @@ public class HoodieMergeHandle extends HoodieWrit Iterator> recordItr, String fileId) { super(config, commitTime, fileId, hoodieTable); String partitionPath = init(fileId, recordItr); - init(fileId, partitionPath, - hoodieTable.getROFileSystemView().getLatestDataFile(partitionPath, fileId).get()); + init(fileId, partitionPath, hoodieTable.getROFileSystemView().getLatestDataFile(partitionPath, fileId).get()); } /** @@ -83,8 +82,8 @@ public class HoodieMergeHandle extends HoodieWrit super(config, commitTime, fileId, hoodieTable); this.keyToNewRecords = keyToNewRecords; this.useWriterSchema = true; - init(fileId, keyToNewRecords.get(keyToNewRecords.keySet().stream().findFirst().get()) - .getPartitionPath(), dataFileToBeMerged); + init(fileId, keyToNewRecords.get(keyToNewRecords.keySet().stream().findFirst().get()).getPartitionPath(), + dataFileToBeMerged); } @@ -160,15 +159,13 @@ public class HoodieMergeHandle extends HoodieWrit new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath)); partitionMetadata.trySave(TaskContext.getPartitionId()); - oldFilePath = new Path( - config.getBasePath() + "/" + partitionPath + "/" + latestValidFilePath); + oldFilePath = new Path(config.getBasePath() + "/" + partitionPath + "/" + latestValidFilePath); String relativePath = new Path((partitionPath.isEmpty() ? "" : partitionPath + "/") + FSUtils.makeDataFileName(instantTime, writeToken, fileId)).toString(); newFilePath = new Path(config.getBasePath(), relativePath); - logger.info(String - .format("Merging new data into oldPath %s, as newPath %s", oldFilePath.toString(), - newFilePath.toString())); + logger.info(String.format("Merging new data into oldPath %s, as newPath %s", oldFilePath.toString(), + newFilePath.toString())); // file name is same for all records, in this bunch writeStatus.setFileId(fileId); writeStatus.setPartitionPath(partitionPath); @@ -180,14 +177,13 @@ public class HoodieMergeHandle extends HoodieWrit createMarkerFile(partitionPath); // Create the writer for writing the new version file - storageWriter = HoodieStorageWriterFactory - .getStorageWriter(instantTime, newFilePath, hoodieTable, config, writerSchema); + storageWriter = + HoodieStorageWriterFactory.getStorageWriter(instantTime, newFilePath, hoodieTable, config, writerSchema); } catch (IOException io) { logger.error("Error in update task at commit " + instantTime, io); writeStatus.setGlobalError(io); - throw new HoodieUpsertException( - "Failed to initialize HoodieUpdateHandle for FileId: " + fileId + " on commit " - + instantTime + " on path " + hoodieTable.getMetaClient().getBasePath(), io); + throw new HoodieUpsertException("Failed to initialize HoodieUpdateHandle for FileId: " + fileId + " on commit " + + instantTime + " on path " + hoodieTable.getMetaClient().getBasePath(), io); } } @@ -211,16 +207,14 @@ public class HoodieMergeHandle extends HoodieWrit record.unseal(); record.setNewLocation(new HoodieRecordLocation(instantTime, fileId)); record.seal(); - //NOTE: Once Records are added to map (spillable-map), DO NOT change it as they won't persist + // NOTE: Once Records are added to map (spillable-map), DO NOT change it as they won't persist keyToNewRecords.put(record.getRecordKey(), record); } logger.info("Number of entries in MemoryBasedMap => " + ((ExternalSpillableMap) keyToNewRecords).getInMemoryMapNumEntries() + "Total size in bytes of MemoryBasedMap => " - + ((ExternalSpillableMap) keyToNewRecords).getCurrentInMemoryMapSize() - + "Number of entries in DiskBasedMap => " - + ((ExternalSpillableMap) keyToNewRecords).getDiskBasedMapNumEntries() - + "Size of file spilled to disk => " + + ((ExternalSpillableMap) keyToNewRecords).getCurrentInMemoryMapSize() + "Number of entries in DiskBasedMap => " + + ((ExternalSpillableMap) keyToNewRecords).getDiskBasedMapNumEntries() + "Size of file spilled to disk => " + ((ExternalSpillableMap) keyToNewRecords).getSizeOfFileOnDiskInBytes()); return partitionPath; } @@ -258,8 +252,7 @@ public class HoodieMergeHandle extends HoodieWrit } /** - * Go through an old record. Here if we detect a newer version shows up, we write the new one to - * the file. + * Go through an old record. Here if we detect a newer version shows up, we write the new one to the file. */ public void write(GenericRecord oldRecord) { String key = oldRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); @@ -269,12 +262,12 @@ public class HoodieMergeHandle extends HoodieWrit // writing the first record. So make a copy of the record to be merged HoodieRecord hoodieRecord = new HoodieRecord<>(keyToNewRecords.get(key)); try { - Option combinedAvroRecord = hoodieRecord.getData() - .combineAndGetUpdateValue(oldRecord, useWriterSchema ? writerSchema : originalSchema); + Option combinedAvroRecord = + hoodieRecord.getData().combineAndGetUpdateValue(oldRecord, useWriterSchema ? writerSchema : originalSchema); if (writeUpdateRecord(hoodieRecord, combinedAvroRecord)) { - /* ONLY WHEN - * 1) we have an update for this key AND - * 2) We are able to successfully write the the combined new value + /* + * ONLY WHEN 1) we have an update for this key AND 2) We are able to successfully write the the combined new + * value * * We no longer need to copy the old record over. */ @@ -282,26 +275,24 @@ public class HoodieMergeHandle extends HoodieWrit } writtenRecordKeys.add(key); } catch (Exception e) { - throw new HoodieUpsertException( - "Failed to combine/merge new record with old value in storage, for new record {" - + keyToNewRecords.get(key) + "}, old value {" + oldRecord + "}", e); + throw new HoodieUpsertException("Failed to combine/merge new record with old value in storage, for new record {" + + keyToNewRecords.get(key) + "}, old value {" + oldRecord + "}", e); } } if (copyOldRecord) { // this should work as it is, since this is an existing record - String errMsg = "Failed to merge old record into new file for key " + key + " from old file " - + getOldFilePath() + " to new file " + newFilePath; + String errMsg = "Failed to merge old record into new file for key " + key + " from old file " + getOldFilePath() + + " to new file " + newFilePath; try { storageWriter.writeAvro(key, oldRecord); } catch (ClassCastException e) { - logger.error("Schema mismatch when rewriting old record " + oldRecord + " from file " - + getOldFilePath() + " to file " + newFilePath + " with writerSchema " + writerSchema - .toString(true)); + logger.error("Schema mismatch when rewriting old record " + oldRecord + " from file " + getOldFilePath() + + " to file " + newFilePath + " with writerSchema " + writerSchema.toString(true)); throw new HoodieUpsertException(errMsg, e); } catch (IOException e) { - logger.error("Failed to merge old record into new file for key " + key + " from old file " - + getOldFilePath() + " to new file " + newFilePath, e); + logger.error("Failed to merge old record into new file for key " + key + " from old file " + getOldFilePath() + + " to new file " + newFilePath, e); throw new HoodieUpsertException(errMsg, e); } recordsWritten++; @@ -344,8 +335,8 @@ public class HoodieMergeHandle extends HoodieWrit runtimeStats.setTotalUpsertTime(timer.endTimer()); stat.setRuntimeStats(runtimeStats); - logger.info(String.format("MergeHandle for partitionPath %s fileID %s, took %d ms.", - stat.getPartitionPath(), stat.getFileId(), runtimeStats.getTotalUpsertTime())); + logger.info(String.format("MergeHandle for partitionPath %s fileID %s, took %d ms.", stat.getPartitionPath(), + stat.getFileId(), runtimeStats.getTotalUpsertTime())); return writeStatus; } catch (IOException e) { diff --git a/hudi-client/src/main/java/org/apache/hudi/io/HoodieWriteHandle.java b/hudi-client/src/main/java/org/apache/hudi/io/HoodieWriteHandle.java index 4cfd941f0..02c629703 100644 --- a/hudi-client/src/main/java/org/apache/hudi/io/HoodieWriteHandle.java +++ b/hudi-client/src/main/java/org/apache/hudi/io/HoodieWriteHandle.java @@ -61,8 +61,7 @@ public abstract class HoodieWriteHandle extends H this.writerSchema = createHoodieWriteSchema(originalSchema); this.timer = new HoodieTimer().startTimer(); this.writeStatus = (WriteStatus) ReflectionUtils.loadClass(config.getWriteStatusClassName(), - !hoodieTable.getIndex().isImplicitWithStorage(), - config.getWriteStatusFailureFraction()); + !hoodieTable.getIndex().isImplicitWithStorage(), config.getWriteStatusFailureFraction()); } /** @@ -104,7 +103,7 @@ public abstract class HoodieWriteHandle extends H } /** - * THe marker path will be /.hoodie/.temp//2019/04/25/filename + * THe marker path will be /.hoodie/.temp//2019/04/25/filename */ private Path makeNewMarkerPath(String partitionPath) { Path markerRootPath = new Path(hoodieTable.getMetaClient().getMarkerFolderPath(instantTime)); diff --git a/hudi-client/src/main/java/org/apache/hudi/io/compact/HoodieCompactor.java b/hudi-client/src/main/java/org/apache/hudi/io/compact/HoodieCompactor.java index f57758f89..a674c04b7 100644 --- a/hudi-client/src/main/java/org/apache/hudi/io/compact/HoodieCompactor.java +++ b/hudi-client/src/main/java/org/apache/hudi/io/compact/HoodieCompactor.java @@ -37,23 +37,20 @@ public interface HoodieCompactor extends Serializable { /** * Generate a new compaction plan for scheduling * - * @param jsc Spark Context - * @param hoodieTable Hoodie Table - * @param config Hoodie Write Configuration + * @param jsc Spark Context + * @param hoodieTable Hoodie Table + * @param config Hoodie Write Configuration * @param compactionCommitTime scheduled compaction commit time * @param fgIdsInPendingCompactions partition-fileId pairs for which compaction is pending * @return Compaction Plan * @throws IOException when encountering errors */ - HoodieCompactionPlan generateCompactionPlan(JavaSparkContext jsc, - HoodieTable hoodieTable, HoodieWriteConfig config, String compactionCommitTime, - Set fgIdsInPendingCompactions) - throws IOException; + HoodieCompactionPlan generateCompactionPlan(JavaSparkContext jsc, HoodieTable hoodieTable, HoodieWriteConfig config, + String compactionCommitTime, Set fgIdsInPendingCompactions) throws IOException; /** * Execute compaction operations and report back status */ - JavaRDD compact(JavaSparkContext jsc, - HoodieCompactionPlan compactionPlan, HoodieTable hoodieTable, HoodieWriteConfig config, - String compactionInstantTime) throws IOException; + JavaRDD compact(JavaSparkContext jsc, HoodieCompactionPlan compactionPlan, HoodieTable hoodieTable, + HoodieWriteConfig config, String compactionInstantTime) throws IOException; } diff --git a/hudi-client/src/main/java/org/apache/hudi/io/compact/HoodieRealtimeTableCompactor.java b/hudi-client/src/main/java/org/apache/hudi/io/compact/HoodieRealtimeTableCompactor.java index 7963857d9..095b210b7 100644 --- a/hudi-client/src/main/java/org/apache/hudi/io/compact/HoodieRealtimeTableCompactor.java +++ b/hudi-client/src/main/java/org/apache/hudi/io/compact/HoodieRealtimeTableCompactor.java @@ -63,9 +63,9 @@ import org.apache.spark.util.AccumulatorV2; import org.apache.spark.util.LongAccumulator; /** - * HoodieRealtimeTableCompactor compacts a hoodie table with merge on read storage. Computes all - * possible compactions, passes it through a CompactionFilter and executes all the compactions and - * writes a new version of base files and make a normal commit + * HoodieRealtimeTableCompactor compacts a hoodie table with merge on read storage. Computes all possible compactions, + * passes it through a CompactionFilter and executes all the compactions and writes a new version of base files and make + * a normal commit * * @see HoodieCompactor */ @@ -78,9 +78,8 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor { private AccumulatorV2 totalFileSlices; @Override - public JavaRDD compact(JavaSparkContext jsc, - HoodieCompactionPlan compactionPlan, HoodieTable hoodieTable, HoodieWriteConfig config, - String compactionInstantTime) throws IOException { + public JavaRDD compact(JavaSparkContext jsc, HoodieCompactionPlan compactionPlan, + HoodieTable hoodieTable, HoodieWriteConfig config, String compactionInstantTime) throws IOException { if (compactionPlan == null || (compactionPlan.getOperations() == null) || (compactionPlan.getOperations().isEmpty())) { return jsc.emptyRDD(); @@ -88,41 +87,36 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor { HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); // Compacting is very similar to applying updates to existing file HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc); - List operations = compactionPlan.getOperations().stream().map( - CompactionOperation::convertFromAvroRecordInstance).collect(toList()); + List operations = compactionPlan.getOperations().stream() + .map(CompactionOperation::convertFromAvroRecordInstance).collect(toList()); log.info("Compactor compacting " + operations + " files"); return jsc.parallelize(operations, operations.size()) - .map(s -> compact(table, metaClient, config, s, compactionInstantTime)) - .flatMap(List::iterator); + .map(s -> compact(table, metaClient, config, s, compactionInstantTime)).flatMap(List::iterator); } private List compact(HoodieCopyOnWriteTable hoodieCopyOnWriteTable, HoodieTableMetaClient metaClient, - HoodieWriteConfig config, - CompactionOperation operation, String commitTime) throws IOException { + HoodieWriteConfig config, CompactionOperation operation, String commitTime) throws IOException { FileSystem fs = metaClient.getFs(); - Schema readerSchema = HoodieAvroUtils - .addMetadataFields(new Schema.Parser().parse(config.getSchema())); + Schema readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema())); - log.info("Compacting base " + operation.getDataFilePath() + " with delta files " + operation - .getDeltaFilePaths() + " for commit " + commitTime); + log.info("Compacting base " + operation.getDataFilePath() + " with delta files " + operation.getDeltaFilePaths() + + " for commit " + commitTime); // TODO - FIX THIS // Reads the entire avro file. Always only specific blocks should be read from the avro file // (failure recover). // Load all the delta commits since the last compaction commit and get all the blocks to be // loaded and load it using CompositeAvroLogReader // Since a DeltaCommit is not defined yet, reading all the records. revisit this soon. - String maxInstantTime = metaClient.getActiveTimeline() - .getTimelineOfActions( - Sets.newHashSet(HoodieTimeline.COMMIT_ACTION, HoodieTimeline.ROLLBACK_ACTION, - HoodieTimeline.DELTA_COMMIT_ACTION)) + String maxInstantTime = metaClient + .getActiveTimeline().getTimelineOfActions(Sets.newHashSet(HoodieTimeline.COMMIT_ACTION, + HoodieTimeline.ROLLBACK_ACTION, HoodieTimeline.DELTA_COMMIT_ACTION)) .filterCompletedInstants().lastInstant().get().getTimestamp(); log.info("MaxMemoryPerCompaction => " + config.getMaxMemoryPerCompaction()); - HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, - metaClient.getBasePath(), operation.getDeltaFilePaths(), readerSchema, maxInstantTime, - config.getMaxMemoryPerCompaction(), config.getCompactionLazyBlockReadEnabled(), - config.getCompactionReverseLogReadEnabled(), config.getMaxDFSStreamBufferSize(), - config.getSpillableMapBasePath()); + HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, metaClient.getBasePath(), + operation.getDeltaFilePaths(), readerSchema, maxInstantTime, config.getMaxMemoryPerCompaction(), + config.getCompactionLazyBlockReadEnabled(), config.getCompactionReverseLogReadEnabled(), + config.getMaxDFSStreamBufferSize(), config.getSpillableMapBasePath()); if (!scanner.iterator().hasNext()) { return Lists.newArrayList(); } @@ -134,53 +128,49 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor { // If the dataFile is present, there is a base parquet file present, perform updates else perform inserts into a // new base parquet file. if (oldDataFileOpt.isPresent()) { - result = hoodieCopyOnWriteTable - .handleUpdate(commitTime, operation.getFileId(), scanner.getRecords(), oldDataFileOpt.get()); + result = hoodieCopyOnWriteTable.handleUpdate(commitTime, operation.getFileId(), scanner.getRecords(), + oldDataFileOpt.get()); } else { - result = hoodieCopyOnWriteTable - .handleInsert(commitTime, operation.getPartitionPath(), operation.getFileId(), scanner.iterator()); + result = hoodieCopyOnWriteTable.handleInsert(commitTime, operation.getPartitionPath(), operation.getFileId(), + scanner.iterator()); } Iterable> resultIterable = () -> result; - return StreamSupport.stream(resultIterable.spliterator(), false).flatMap(Collection::stream) - .peek(s -> { - s.getStat().setTotalUpdatedRecordsCompacted(scanner.getNumMergedRecordsInLog()); - s.getStat().setTotalLogFilesCompacted(scanner.getTotalLogFiles()); - s.getStat().setTotalLogRecords(scanner.getTotalLogRecords()); - s.getStat().setPartitionPath(operation.getPartitionPath()); - s.getStat().setTotalLogSizeCompacted(operation.getMetrics().get( - CompactionStrategy.TOTAL_LOG_FILE_SIZE).longValue()); - s.getStat().setTotalLogBlocks(scanner.getTotalLogBlocks()); - s.getStat().setTotalCorruptLogBlock(scanner.getTotalCorruptBlocks()); - s.getStat().setTotalRollbackBlocks(scanner.getTotalRollbacks()); - RuntimeStats runtimeStats = new RuntimeStats(); - runtimeStats.setTotalScanTime(scanner.getTotalTimeTakenToReadAndMergeBlocks()); - s.getStat().setRuntimeStats(runtimeStats); - }).collect(toList()); + return StreamSupport.stream(resultIterable.spliterator(), false).flatMap(Collection::stream).peek(s -> { + s.getStat().setTotalUpdatedRecordsCompacted(scanner.getNumMergedRecordsInLog()); + s.getStat().setTotalLogFilesCompacted(scanner.getTotalLogFiles()); + s.getStat().setTotalLogRecords(scanner.getTotalLogRecords()); + s.getStat().setPartitionPath(operation.getPartitionPath()); + s.getStat() + .setTotalLogSizeCompacted(operation.getMetrics().get(CompactionStrategy.TOTAL_LOG_FILE_SIZE).longValue()); + s.getStat().setTotalLogBlocks(scanner.getTotalLogBlocks()); + s.getStat().setTotalCorruptLogBlock(scanner.getTotalCorruptBlocks()); + s.getStat().setTotalRollbackBlocks(scanner.getTotalRollbacks()); + RuntimeStats runtimeStats = new RuntimeStats(); + runtimeStats.setTotalScanTime(scanner.getTotalTimeTakenToReadAndMergeBlocks()); + s.getStat().setRuntimeStats(runtimeStats); + }).collect(toList()); } @Override - public HoodieCompactionPlan generateCompactionPlan(JavaSparkContext jsc, - HoodieTable hoodieTable, HoodieWriteConfig config, String compactionCommitTime, - Set fgIdsInPendingCompactions) throws IOException { + public HoodieCompactionPlan generateCompactionPlan(JavaSparkContext jsc, HoodieTable hoodieTable, + HoodieWriteConfig config, String compactionCommitTime, Set fgIdsInPendingCompactions) + throws IOException { totalLogFiles = new LongAccumulator(); totalFileSlices = new LongAccumulator(); jsc.sc().register(totalLogFiles); jsc.sc().register(totalFileSlices); - Preconditions - .checkArgument(hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ, - "HoodieRealtimeTableCompactor can only compact table of type " - + HoodieTableType.MERGE_ON_READ + " and not " + hoodieTable.getMetaClient() - .getTableType().name()); + Preconditions.checkArgument(hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ, + "HoodieRealtimeTableCompactor can only compact table of type " + HoodieTableType.MERGE_ON_READ + " and not " + + hoodieTable.getMetaClient().getTableType().name()); - //TODO : check if maxMemory is not greater than JVM or spark.executor memory + // TODO : check if maxMemory is not greater than JVM or spark.executor memory // TODO - rollback any compactions in flight HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); log.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommitTime); - List partitionPaths = FSUtils - .getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(), - config.shouldAssumeDatePartitioning()); + List partitionPaths = FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(), + config.shouldAssumeDatePartitioning()); // filter the partition paths if needed to reduce list status partitionPaths = config.getCompactionStrategy().filterPartitionPaths(config, partitionPaths); @@ -192,28 +182,22 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor { RealtimeView fileSystemView = hoodieTable.getRTFileSystemView(); log.info("Compaction looking for files to compact in " + partitionPaths + " partitions"); - List operations = - jsc.parallelize(partitionPaths, partitionPaths.size()) - .flatMap((FlatMapFunction) partitionPath -> fileSystemView - .getLatestFileSlices(partitionPath) - .filter(slice -> - !fgIdsInPendingCompactions.contains(slice.getFileGroupId())) - .map( - s -> { - List logFiles = s.getLogFiles().sorted(HoodieLogFile - .getLogFileComparator()).collect(Collectors.toList()); - totalLogFiles.add((long) logFiles.size()); - totalFileSlices.add(1L); - // Avro generated classes are not inheriting Serializable. Using CompactionOperation POJO - // for spark Map operations and collecting them finally in Avro generated classes for storing - // into meta files. - Option dataFile = s.getDataFile(); - return new CompactionOperation(dataFile, partitionPath, logFiles, - config.getCompactionStrategy().captureMetrics(config, dataFile, partitionPath, logFiles)); - }) - .filter(c -> !c.getDeltaFilePaths().isEmpty()) - .collect(toList()).iterator()).collect().stream().map(CompactionUtils::buildHoodieCompactionOperation) - .collect(toList()); + List operations = jsc.parallelize(partitionPaths, partitionPaths.size()) + .flatMap((FlatMapFunction) partitionPath -> fileSystemView + .getLatestFileSlices(partitionPath) + .filter(slice -> !fgIdsInPendingCompactions.contains(slice.getFileGroupId())).map(s -> { + List logFiles = + s.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList()); + totalLogFiles.add((long) logFiles.size()); + totalFileSlices.add(1L); + // Avro generated classes are not inheriting Serializable. Using CompactionOperation POJO + // for spark Map operations and collecting them finally in Avro generated classes for storing + // into meta files. + Option dataFile = s.getDataFile(); + return new CompactionOperation(dataFile, partitionPath, logFiles, + config.getCompactionStrategy().captureMetrics(config, dataFile, partitionPath, logFiles)); + }).filter(c -> !c.getDeltaFilePaths().isEmpty()).collect(toList()).iterator()) + .collect().stream().map(CompactionUtils::buildHoodieCompactionOperation).collect(toList()); log.info("Total of " + operations.size() + " compactions are retrieved"); log.info("Total number of latest files slices " + totalFileSlices.value()); log.info("Total number of log files " + totalLogFiles.value()); @@ -222,11 +206,11 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor { // compactions only HoodieCompactionPlan compactionPlan = config.getCompactionStrategy().generateCompactionPlan(config, operations, CompactionUtils.getAllPendingCompactionPlans(metaClient).stream().map(Pair::getValue).collect(toList())); - Preconditions.checkArgument(compactionPlan.getOperations().stream().noneMatch( - op -> fgIdsInPendingCompactions.contains(new HoodieFileGroupId(op.getPartitionPath(), op.getFileId()))), + Preconditions.checkArgument( + compactionPlan.getOperations().stream().noneMatch( + op -> fgIdsInPendingCompactions.contains(new HoodieFileGroupId(op.getPartitionPath(), op.getFileId()))), "Bad Compaction Plan. FileId MUST NOT have multiple pending compactions. " - + "Please fix your strategy implementation." - + "FileIdsWithPendingCompactions :" + fgIdsInPendingCompactions + + "Please fix your strategy implementation." + "FileIdsWithPendingCompactions :" + fgIdsInPendingCompactions + ", Selected workload :" + compactionPlan); if (compactionPlan.getOperations().isEmpty()) { log.warn("After filtering, Nothing to compact for " + metaClient.getBasePath()); diff --git a/hudi-client/src/main/java/org/apache/hudi/io/compact/strategy/BoundedIOCompactionStrategy.java b/hudi-client/src/main/java/org/apache/hudi/io/compact/strategy/BoundedIOCompactionStrategy.java index ad5451005..0e179773b 100644 --- a/hudi-client/src/main/java/org/apache/hudi/io/compact/strategy/BoundedIOCompactionStrategy.java +++ b/hudi-client/src/main/java/org/apache/hudi/io/compact/strategy/BoundedIOCompactionStrategy.java @@ -25,8 +25,8 @@ import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.config.HoodieWriteConfig; /** - * CompactionStrategy which looks at total IO to be done for the compaction (read + write) and - * limits the list of compactions to be under a configured limit on the IO + * CompactionStrategy which looks at total IO to be done for the compaction (read + write) and limits the list of + * compactions to be under a configured limit on the IO * * @see CompactionStrategy */ diff --git a/hudi-client/src/main/java/org/apache/hudi/io/compact/strategy/BoundedPartitionAwareCompactionStrategy.java b/hudi-client/src/main/java/org/apache/hudi/io/compact/strategy/BoundedPartitionAwareCompactionStrategy.java index 7528730f0..49b887ec9 100644 --- a/hudi-client/src/main/java/org/apache/hudi/io/compact/strategy/BoundedPartitionAwareCompactionStrategy.java +++ b/hudi-client/src/main/java/org/apache/hudi/io/compact/strategy/BoundedPartitionAwareCompactionStrategy.java @@ -32,11 +32,10 @@ import org.apache.hudi.config.HoodieWriteConfig; /** * This strategy ensures that the last N partitions are picked up even if there are later partitions created for the - * dataset. lastNPartitions is defined as the N partitions before the currentDate. - * currentDay = 2018/01/01 - * The dataset has partitions for 2018/02/02 and 2018/03/03 beyond the currentDay - * This strategy will pick up the following partitions for compaction : - * (2018/01/01, allPartitionsInRange[(2018/01/01 - lastNPartitions) to 2018/01/01), 2018/02/02, 2018/03/03) + * dataset. lastNPartitions is defined as the N partitions before the currentDate. currentDay = 2018/01/01 The dataset + * has partitions for 2018/02/02 and 2018/03/03 beyond the currentDay This strategy will pick up the following + * partitions for compaction : (2018/01/01, allPartitionsInRange[(2018/01/01 - lastNPartitions) to 2018/01/01), + * 2018/02/02, 2018/03/03) */ public class BoundedPartitionAwareCompactionStrategy extends DayBasedCompactionStrategy { @@ -46,15 +45,14 @@ public class BoundedPartitionAwareCompactionStrategy extends DayBasedCompactionS public List orderAndFilter(HoodieWriteConfig writeConfig, List operations, List pendingCompactionPlans) { // The earliest partition to compact - current day minus the target partitions limit - String earliestPartitionPathToCompact = dateFormat.format( - getDateAtOffsetFromToday(-1 * writeConfig.getTargetPartitionsPerDayBasedCompaction())); + String earliestPartitionPathToCompact = + dateFormat.format(getDateAtOffsetFromToday(-1 * writeConfig.getTargetPartitionsPerDayBasedCompaction())); // Filter out all partitions greater than earliestPartitionPathToCompact - List eligibleCompactionOperations = operations.stream() - .collect(Collectors.groupingBy(HoodieCompactionOperation::getPartitionPath)).entrySet().stream() - .sorted(Map.Entry.comparingByKey(comparator)) - .filter(e -> comparator.compare(earliestPartitionPathToCompact, e.getKey()) >= 0) - .flatMap(e -> e.getValue().stream()) - .collect(Collectors.toList()); + List eligibleCompactionOperations = + operations.stream().collect(Collectors.groupingBy(HoodieCompactionOperation::getPartitionPath)).entrySet() + .stream().sorted(Map.Entry.comparingByKey(comparator)) + .filter(e -> comparator.compare(earliestPartitionPathToCompact, e.getKey()) >= 0) + .flatMap(e -> e.getValue().stream()).collect(Collectors.toList()); return eligibleCompactionOperations; } @@ -62,13 +60,12 @@ public class BoundedPartitionAwareCompactionStrategy extends DayBasedCompactionS @Override public List filterPartitionPaths(HoodieWriteConfig writeConfig, List partitionPaths) { // The earliest partition to compact - current day minus the target partitions limit - String earliestPartitionPathToCompact = dateFormat.format( - getDateAtOffsetFromToday(-1 * writeConfig.getTargetPartitionsPerDayBasedCompaction())); + String earliestPartitionPathToCompact = + dateFormat.format(getDateAtOffsetFromToday(-1 * writeConfig.getTargetPartitionsPerDayBasedCompaction())); // Get all partitions and sort them List filteredPartitionPaths = partitionPaths.stream().map(partition -> partition.replace("/", "-")) .sorted(Comparator.reverseOrder()).map(partitionPath -> partitionPath.replace("-", "/")) - .filter(e -> comparator.compare(earliestPartitionPathToCompact, e) >= 0) - .collect(Collectors.toList()); + .filter(e -> comparator.compare(earliestPartitionPathToCompact, e) >= 0).collect(Collectors.toList()); return filteredPartitionPaths; } diff --git a/hudi-client/src/main/java/org/apache/hudi/io/compact/strategy/CompactionStrategy.java b/hudi-client/src/main/java/org/apache/hudi/io/compact/strategy/CompactionStrategy.java index b7faf074e..7fc1634ca 100644 --- a/hudi-client/src/main/java/org/apache/hudi/io/compact/strategy/CompactionStrategy.java +++ b/hudi-client/src/main/java/org/apache/hudi/io/compact/strategy/CompactionStrategy.java @@ -32,11 +32,10 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.io.compact.HoodieRealtimeTableCompactor; /** - * Strategy for compaction. Pluggable implementation to define how compaction should be done. The - * over-ridden implementations of this abstract class can capture the relevant metrics to order - * and filter the final list of compaction operation to run in a single compaction. - * Implementation of CompactionStrategy cannot hold any state. Difference instantiations can be - * passed in every time + * Strategy for compaction. Pluggable implementation to define how compaction should be done. The over-ridden + * implementations of this abstract class can capture the relevant metrics to order and filter the final list of + * compaction operation to run in a single compaction. Implementation of CompactionStrategy cannot hold any state. + * Difference instantiations can be passed in every time * * @see HoodieRealtimeTableCompactor */ @@ -49,8 +48,8 @@ public abstract class CompactionStrategy implements Serializable { public static final String TOTAL_LOG_FILES = "TOTAL_LOG_FILES"; /** - * Callback hook when a HoodieCompactionOperation is created. Individual strategies can capture the - * metrics they need to decide on the priority. + * Callback hook when a HoodieCompactionOperation is created. Individual strategies can capture the metrics they need + * to decide on the priority. * * @param dataFile - Base file to compact * @param partitionPath - Partition path @@ -58,18 +57,18 @@ public abstract class CompactionStrategy implements Serializable { * @return Map[String, Object] - metrics captured */ public Map captureMetrics(HoodieWriteConfig writeConfig, Option dataFile, - String partitionPath, List logFiles) { + String partitionPath, List logFiles) { Map metrics = Maps.newHashMap(); Long defaultMaxParquetFileSize = writeConfig.getParquetMaxFileSize(); // Total size of all the log files Long totalLogFileSize = logFiles.stream().map(HoodieLogFile::getFileSize).filter(size -> size >= 0) .reduce((size1, size2) -> size1 + size2).orElse(0L); // Total read will be the base file + all the log files - Long totalIORead = FSUtils.getSizeInMB((dataFile.isPresent() ? dataFile.get().getFileSize() : 0L) - + totalLogFileSize); + Long totalIORead = + FSUtils.getSizeInMB((dataFile.isPresent() ? dataFile.get().getFileSize() : 0L) + totalLogFileSize); // Total write will be similar to the size of the base file - Long totalIOWrite = FSUtils - .getSizeInMB(dataFile.isPresent() ? dataFile.get().getFileSize() : defaultMaxParquetFileSize); + Long totalIOWrite = + FSUtils.getSizeInMB(dataFile.isPresent() ? dataFile.get().getFileSize() : defaultMaxParquetFileSize); // Total IO will the the IO for read + write Long totalIO = totalIORead + totalIOWrite; // Save these metrics and we will use during the filter @@ -86,8 +85,8 @@ public abstract class CompactionStrategy implements Serializable { * implementation takes care of setting compactor Id from configuration allowing subclasses to only worry about * ordering and filtering compaction operations * - * @param writeConfig Hoodie Write Config - * @param operations Compaction Operations to be ordered and filtered + * @param writeConfig Hoodie Write Config + * @param operations Compaction Operations to be ordered and filtered * @param pendingCompactionPlans Pending Compaction Plans for strategy to schedule next compaction plan * @return Compaction plan to be scheduled. */ @@ -95,27 +94,26 @@ public abstract class CompactionStrategy implements Serializable { List operations, List pendingCompactionPlans) { // Strategy implementation can overload this method to set specific compactor-id return HoodieCompactionPlan.newBuilder() - .setOperations(orderAndFilter(writeConfig, operations, pendingCompactionPlans)) - .build(); + .setOperations(orderAndFilter(writeConfig, operations, pendingCompactionPlans)).build(); } /** * Order and Filter the list of compactions. Use the metrics captured with the captureMetrics to order and filter out * compactions * - * @param writeConfig config for this compaction is passed in - * @param operations list of compactions collected + * @param writeConfig config for this compaction is passed in + * @param operations list of compactions collected * @param pendingCompactionPlans Pending Compaction Plans for strategy to schedule next compaction plan * @return list of compactions to perform in this run */ public List orderAndFilter(HoodieWriteConfig writeConfig, - List operations, - List pendingCompactionPlans) { + List operations, List pendingCompactionPlans) { return operations; } /** * Filter the partition paths based on compaction strategy + * * @param writeConfig * @param allPartitionPaths * @return diff --git a/hudi-client/src/main/java/org/apache/hudi/io/compact/strategy/DayBasedCompactionStrategy.java b/hudi-client/src/main/java/org/apache/hudi/io/compact/strategy/DayBasedCompactionStrategy.java index b04574bdc..79fc3470a 100644 --- a/hudi-client/src/main/java/org/apache/hudi/io/compact/strategy/DayBasedCompactionStrategy.java +++ b/hudi-client/src/main/java/org/apache/hudi/io/compact/strategy/DayBasedCompactionStrategy.java @@ -34,21 +34,18 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; /** - * This strategy orders compactions in reverse order of creation of Hive Partitions. It helps to - * compact data in latest partitions first and then older capped at the Total_IO allowed. + * This strategy orders compactions in reverse order of creation of Hive Partitions. It helps to compact data in latest + * partitions first and then older capped at the Total_IO allowed. */ public class DayBasedCompactionStrategy extends CompactionStrategy { // For now, use SimpleDateFormat as default partition format protected static String datePartitionFormat = "yyyy/MM/dd"; // Sorts compaction in LastInFirstCompacted order - protected static Comparator comparator = (String leftPartition, - String rightPartition) -> { + protected static Comparator comparator = (String leftPartition, String rightPartition) -> { try { - Date left = new SimpleDateFormat(datePartitionFormat, Locale.ENGLISH) - .parse(leftPartition); - Date right = new SimpleDateFormat(datePartitionFormat, Locale.ENGLISH) - .parse(rightPartition); + Date left = new SimpleDateFormat(datePartitionFormat, Locale.ENGLISH).parse(leftPartition); + Date right = new SimpleDateFormat(datePartitionFormat, Locale.ENGLISH).parse(rightPartition); return left.after(right) ? -1 : right.after(left) ? 1 : 0; } catch (ParseException e) { throw new HoodieException("Invalid Partition Date Format", e); @@ -68,8 +65,7 @@ public class DayBasedCompactionStrategy extends CompactionStrategy { List filteredList = operations.stream() .collect(Collectors.groupingBy(HoodieCompactionOperation::getPartitionPath)).entrySet().stream() .sorted(Map.Entry.comparingByKey(comparator)).limit(writeConfig.getTargetPartitionsPerDayBasedCompaction()) - .flatMap(e -> e.getValue().stream()) - .collect(Collectors.toList()); + .flatMap(e -> e.getValue().stream()).collect(Collectors.toList()); return filteredList; } diff --git a/hudi-client/src/main/java/org/apache/hudi/io/compact/strategy/LogFileSizeBasedCompactionStrategy.java b/hudi-client/src/main/java/org/apache/hudi/io/compact/strategy/LogFileSizeBasedCompactionStrategy.java index 829112260..42caf011a 100644 --- a/hudi-client/src/main/java/org/apache/hudi/io/compact/strategy/LogFileSizeBasedCompactionStrategy.java +++ b/hudi-client/src/main/java/org/apache/hudi/io/compact/strategy/LogFileSizeBasedCompactionStrategy.java @@ -30,14 +30,14 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; /** - * LogFileSizeBasedCompactionStrategy orders the compactions based on the total log files size and - * limits the compactions within a configured IO bound + * LogFileSizeBasedCompactionStrategy orders the compactions based on the total log files size and limits the + * compactions within a configured IO bound * * @see BoundedIOCompactionStrategy * @see CompactionStrategy */ -public class LogFileSizeBasedCompactionStrategy extends BoundedIOCompactionStrategy implements - Comparator { +public class LogFileSizeBasedCompactionStrategy extends BoundedIOCompactionStrategy + implements Comparator { private static final String TOTAL_LOG_FILE_SIZE = "TOTAL_LOG_FILE_SIZE"; @@ -47,9 +47,8 @@ public class LogFileSizeBasedCompactionStrategy extends BoundedIOCompactionStrat Map metrics = super.captureMetrics(config, dataFile, partitionPath, logFiles); // Total size of all the log files - Long totalLogFileSize = logFiles.stream().map(HoodieLogFile::getFileSize) - .filter(size -> size >= 0).reduce((size1, size2) -> size1 + size2) - .orElse(0L); + Long totalLogFileSize = logFiles.stream().map(HoodieLogFile::getFileSize).filter(size -> size >= 0) + .reduce((size1, size2) -> size1 + size2).orElse(0L); // save the metrics needed during the order metrics.put(TOTAL_LOG_FILE_SIZE, totalLogFileSize.doubleValue()); return metrics; @@ -59,9 +58,8 @@ public class LogFileSizeBasedCompactionStrategy extends BoundedIOCompactionStrat public List orderAndFilter(HoodieWriteConfig writeConfig, List operations, List pendingCompactionPlans) { // Order the operations based on the reverse size of the logs and limit them by the IO - return super - .orderAndFilter(writeConfig, - operations.stream().sorted(this).collect(Collectors.toList()), pendingCompactionPlans); + return super.orderAndFilter(writeConfig, operations.stream().sorted(this).collect(Collectors.toList()), + pendingCompactionPlans); } @Override diff --git a/hudi-client/src/main/java/org/apache/hudi/io/compact/strategy/UnBoundedCompactionStrategy.java b/hudi-client/src/main/java/org/apache/hudi/io/compact/strategy/UnBoundedCompactionStrategy.java index 0edc6a61a..73b13bde8 100644 --- a/hudi-client/src/main/java/org/apache/hudi/io/compact/strategy/UnBoundedCompactionStrategy.java +++ b/hudi-client/src/main/java/org/apache/hudi/io/compact/strategy/UnBoundedCompactionStrategy.java @@ -24,9 +24,8 @@ import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.config.HoodieWriteConfig; /** - * UnBoundedCompactionStrategy will not change ordering or filter any compaction. It is a - * pass-through and will compact all the base files which has a log file. This usually means - * no-intelligence on compaction. + * UnBoundedCompactionStrategy will not change ordering or filter any compaction. It is a pass-through and will compact + * all the base files which has a log file. This usually means no-intelligence on compaction. * * @see CompactionStrategy */ diff --git a/hudi-client/src/main/java/org/apache/hudi/io/compact/strategy/UnBoundedPartitionAwareCompactionStrategy.java b/hudi-client/src/main/java/org/apache/hudi/io/compact/strategy/UnBoundedPartitionAwareCompactionStrategy.java index 220de5d27..1650b4f4e 100644 --- a/hudi-client/src/main/java/org/apache/hudi/io/compact/strategy/UnBoundedPartitionAwareCompactionStrategy.java +++ b/hudi-client/src/main/java/org/apache/hudi/io/compact/strategy/UnBoundedPartitionAwareCompactionStrategy.java @@ -27,12 +27,11 @@ import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.config.HoodieWriteConfig; /** - * UnBoundedPartitionAwareCompactionStrategy is a custom UnBounded Strategy. - * This will filter all the partitions that are eligible to be compacted by a - * {@link BoundedPartitionAwareCompactionStrategy} and return the result. - * This is done so that a long running UnBoundedPartitionAwareCompactionStrategy does not step over partitions - * in a shorter running BoundedPartitionAwareCompactionStrategy. Essentially, this is an inverse of the - * partitions chosen in BoundedPartitionAwareCompactionStrategy + * UnBoundedPartitionAwareCompactionStrategy is a custom UnBounded Strategy. This will filter all the partitions that + * are eligible to be compacted by a {@link BoundedPartitionAwareCompactionStrategy} and return the result. This is done + * so that a long running UnBoundedPartitionAwareCompactionStrategy does not step over partitions in a shorter running + * BoundedPartitionAwareCompactionStrategy. Essentially, this is an inverse of the partitions chosen in + * BoundedPartitionAwareCompactionStrategy * * @see CompactionStrategy */ @@ -41,10 +40,10 @@ public class UnBoundedPartitionAwareCompactionStrategy extends CompactionStrateg @Override public List orderAndFilter(HoodieWriteConfig config, final List operations, final List pendingCompactionWorkloads) { - BoundedPartitionAwareCompactionStrategy boundedPartitionAwareCompactionStrategy - = new BoundedPartitionAwareCompactionStrategy(); - List operationsToExclude = boundedPartitionAwareCompactionStrategy - .orderAndFilter(config, operations, pendingCompactionWorkloads); + BoundedPartitionAwareCompactionStrategy boundedPartitionAwareCompactionStrategy = + new BoundedPartitionAwareCompactionStrategy(); + List operationsToExclude = + boundedPartitionAwareCompactionStrategy.orderAndFilter(config, operations, pendingCompactionWorkloads); List allOperations = new ArrayList<>(operations); allOperations.removeAll(operationsToExclude); return allOperations; @@ -52,13 +51,13 @@ public class UnBoundedPartitionAwareCompactionStrategy extends CompactionStrateg @Override public List filterPartitionPaths(HoodieWriteConfig writeConfig, List partitionPaths) { - List allPartitionPaths = partitionPaths.stream().map(partition -> partition.replace("/", "-")) - .sorted(Comparator.reverseOrder()).map(partitionPath -> partitionPath.replace("-", "/")) - .collect(Collectors.toList()); - BoundedPartitionAwareCompactionStrategy boundedPartitionAwareCompactionStrategy - = new BoundedPartitionAwareCompactionStrategy(); - List partitionsToExclude = boundedPartitionAwareCompactionStrategy.filterPartitionPaths(writeConfig, - partitionPaths); + List allPartitionPaths = + partitionPaths.stream().map(partition -> partition.replace("/", "-")).sorted(Comparator.reverseOrder()) + .map(partitionPath -> partitionPath.replace("-", "/")).collect(Collectors.toList()); + BoundedPartitionAwareCompactionStrategy boundedPartitionAwareCompactionStrategy = + new BoundedPartitionAwareCompactionStrategy(); + List partitionsToExclude = + boundedPartitionAwareCompactionStrategy.filterPartitionPaths(writeConfig, partitionPaths); allPartitionPaths.removeAll(partitionsToExclude); return allPartitionPaths; } diff --git a/hudi-client/src/main/java/org/apache/hudi/io/storage/HoodieParquetConfig.java b/hudi-client/src/main/java/org/apache/hudi/io/storage/HoodieParquetConfig.java index 93a5182e3..41f79b0b5 100644 --- a/hudi-client/src/main/java/org/apache/hudi/io/storage/HoodieParquetConfig.java +++ b/hudi-client/src/main/java/org/apache/hudi/io/storage/HoodieParquetConfig.java @@ -32,9 +32,8 @@ public class HoodieParquetConfig { private Configuration hadoopConf; private double compressionRatio; - public HoodieParquetConfig(HoodieAvroWriteSupport writeSupport, - CompressionCodecName compressionCodecName, int blockSize, int pageSize, long maxFileSize, - Configuration hadoopConf, double compressionRatio) { + public HoodieParquetConfig(HoodieAvroWriteSupport writeSupport, CompressionCodecName compressionCodecName, + int blockSize, int pageSize, long maxFileSize, Configuration hadoopConf, double compressionRatio) { this.writeSupport = writeSupport; this.compressionCodecName = compressionCodecName; this.blockSize = blockSize; diff --git a/hudi-client/src/main/java/org/apache/hudi/io/storage/HoodieParquetWriter.java b/hudi-client/src/main/java/org/apache/hudi/io/storage/HoodieParquetWriter.java index d1d5d9f43..eba4d9717 100644 --- a/hudi-client/src/main/java/org/apache/hudi/io/storage/HoodieParquetWriter.java +++ b/hudi-client/src/main/java/org/apache/hudi/io/storage/HoodieParquetWriter.java @@ -36,11 +36,11 @@ import org.apache.parquet.hadoop.ParquetWriter; import org.apache.spark.TaskContext; /** - * HoodieParquetWriter extends the ParquetWriter to help limit the size of underlying file. Provides - * a way to check if the current file can take more records with the canWrite() + * HoodieParquetWriter extends the ParquetWriter to help limit the size of underlying file. Provides a way to check if + * the current file can take more records with the canWrite() */ -public class HoodieParquetWriter extends - ParquetWriter implements HoodieStorageWriter { +public class HoodieParquetWriter + extends ParquetWriter implements HoodieStorageWriter { private static AtomicLong recordIndex = new AtomicLong(1); @@ -52,24 +52,22 @@ public class HoodieParquetWriter HoodieStorageWriter getStorageWriter( - String commitTime, Path path, HoodieTable hoodieTable, - HoodieWriteConfig config, Schema schema) throws IOException { + String commitTime, Path path, HoodieTable hoodieTable, HoodieWriteConfig config, Schema schema) + throws IOException { final String name = path.getName(); final String extension = FSUtils.isLogFile(path) ? HOODIE_LOG.getFileExtension() : FSUtils.getFileExtension(name); if (PARQUET.getFileExtension().equals(extension)) { @@ -46,19 +46,16 @@ public class HoodieStorageWriterFactory { throw new UnsupportedOperationException(extension + " format not supported yet."); } - private static HoodieStorageWriter newParquetStorageWriter(String commitTime, Path path, - HoodieWriteConfig config, Schema schema, HoodieTable hoodieTable) throws IOException { - BloomFilter filter = new BloomFilter(config.getBloomFilterNumEntries(), - config.getBloomFilterFPP()); - HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport( - new AvroSchemaConverter().convert(schema), schema, filter); + private static HoodieStorageWriter newParquetStorageWriter( + String commitTime, Path path, HoodieWriteConfig config, Schema schema, HoodieTable hoodieTable) + throws IOException { + BloomFilter filter = new BloomFilter(config.getBloomFilterNumEntries(), config.getBloomFilterFPP()); + HoodieAvroWriteSupport writeSupport = + new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter); - HoodieParquetConfig parquetConfig = - new HoodieParquetConfig(writeSupport, config.getParquetCompressionCodec(), - config.getParquetBlockSize(), config.getParquetPageSize(), - config.getParquetMaxFileSize(), hoodieTable.getHadoopConf(), - config.getParquetCompressionRatio()); + HoodieParquetConfig parquetConfig = new HoodieParquetConfig(writeSupport, config.getParquetCompressionCodec(), + config.getParquetBlockSize(), config.getParquetPageSize(), config.getParquetMaxFileSize(), + hoodieTable.getHadoopConf(), config.getParquetCompressionRatio()); return new HoodieParquetWriter<>(commitTime, path, parquetConfig, schema); } diff --git a/hudi-client/src/main/java/org/apache/hudi/metrics/HoodieMetrics.java b/hudi-client/src/main/java/org/apache/hudi/metrics/HoodieMetrics.java index c462e2b04..916998560 100644 --- a/hudi-client/src/main/java/org/apache/hudi/metrics/HoodieMetrics.java +++ b/hudi-client/src/main/java/org/apache/hudi/metrics/HoodieMetrics.java @@ -118,8 +118,8 @@ public class HoodieMetrics { return indexTimer == null ? null : indexTimer.time(); } - public void updateCommitMetrics(long commitEpochTimeInMs, long durationInMs, - HoodieCommitMetadata metadata, String actionType) { + public void updateCommitMetrics(long commitEpochTimeInMs, long durationInMs, HoodieCommitMetadata metadata, + String actionType) { if (config.isMetricsOn()) { long totalPartitionsWritten = metadata.fetchTotalPartitionsWritten(); long totalFilesInsert = metadata.fetchTotalFilesInsert(); @@ -154,9 +154,8 @@ public class HoodieMetrics { public void updateRollbackMetrics(long durationInMs, long numFilesDeleted) { if (config.isMetricsOn()) { - logger.info(String - .format("Sending rollback metrics (duration=%d, numFilesDeleted=%d)", durationInMs, - numFilesDeleted)); + logger.info( + String.format("Sending rollback metrics (duration=%d, numFilesDeleted=%d)", durationInMs, numFilesDeleted)); Metrics.registerGauge(getMetricsName("rollback", "duration"), durationInMs); Metrics.registerGauge(getMetricsName("rollback", "numFilesDeleted"), numFilesDeleted); } @@ -164,9 +163,8 @@ public class HoodieMetrics { public void updateCleanMetrics(long durationInMs, int numFilesDeleted) { if (config.isMetricsOn()) { - logger.info(String - .format("Sending clean metrics (duration=%d, numFilesDeleted=%d)", durationInMs, - numFilesDeleted)); + logger.info( + String.format("Sending clean metrics (duration=%d, numFilesDeleted=%d)", durationInMs, numFilesDeleted)); Metrics.registerGauge(getMetricsName("clean", "duration"), durationInMs); Metrics.registerGauge(getMetricsName("clean", "numFilesDeleted"), numFilesDeleted); } @@ -174,20 +172,17 @@ public class HoodieMetrics { public void updateFinalizeWriteMetrics(long durationInMs, long numFilesFinalized) { if (config.isMetricsOn()) { - logger.info(String - .format("Sending finalize write metrics (duration=%d, numFilesFinalized=%d)", - durationInMs, numFilesFinalized)); + logger.info(String.format("Sending finalize write metrics (duration=%d, numFilesFinalized=%d)", durationInMs, + numFilesFinalized)); Metrics.registerGauge(getMetricsName("finalize", "duration"), durationInMs); Metrics.registerGauge(getMetricsName("finalize", "numFilesFinalized"), numFilesFinalized); } } - public void updateIndexMetrics(final String action,final long durationInMs) { + public void updateIndexMetrics(final String action, final long durationInMs) { if (config.isMetricsOn()) { - logger.info(String - .format("Sending index metrics (%s.duration, %d)",action, durationInMs)); - Metrics.registerGauge(getMetricsName("index", String.format("%s.duration", action)), - durationInMs); + logger.info(String.format("Sending index metrics (%s.duration, %d)", action, durationInMs)); + Metrics.registerGauge(getMetricsName("index", String.format("%s.duration", action)), durationInMs); } } @@ -202,4 +197,4 @@ public class HoodieMetrics { public long getDurationInMs(long ctxDuration) { return ctxDuration / 1000000; } -} \ No newline at end of file +} diff --git a/hudi-client/src/main/java/org/apache/hudi/metrics/InMemoryMetricsReporter.java b/hudi-client/src/main/java/org/apache/hudi/metrics/InMemoryMetricsReporter.java index 91de8dfe3..a0221c868 100644 --- a/hudi-client/src/main/java/org/apache/hudi/metrics/InMemoryMetricsReporter.java +++ b/hudi-client/src/main/java/org/apache/hudi/metrics/InMemoryMetricsReporter.java @@ -26,12 +26,10 @@ import java.io.Closeable; public class InMemoryMetricsReporter extends MetricsReporter { @Override - public void start() { - } + public void start() {} @Override - public void report() { - } + public void report() {} @Override public Closeable getReporter() { diff --git a/hudi-client/src/main/java/org/apache/hudi/metrics/MetricsGraphiteReporter.java b/hudi-client/src/main/java/org/apache/hudi/metrics/MetricsGraphiteReporter.java index 096545c65..f0793b1b0 100644 --- a/hudi-client/src/main/java/org/apache/hudi/metrics/MetricsGraphiteReporter.java +++ b/hudi-client/src/main/java/org/apache/hudi/metrics/MetricsGraphiteReporter.java @@ -30,8 +30,7 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; /** - * Implementation of Graphite reporter, which connects to the Graphite server, and send metrics to - * that server. + * Implementation of Graphite reporter, which connects to the Graphite server, and send metrics to that server. */ public class MetricsGraphiteReporter extends MetricsReporter { @@ -50,9 +49,8 @@ public class MetricsGraphiteReporter extends MetricsReporter { this.serverHost = config.getGraphiteServerHost(); this.serverPort = config.getGraphiteServerPort(); if (serverHost == null || serverPort == 0) { - throw new RuntimeException(String - .format("Graphite cannot be initialized with serverHost[%s] and serverPort[%s].", - serverHost, serverPort)); + throw new RuntimeException(String.format("Graphite cannot be initialized with serverHost[%s] and serverPort[%s].", + serverHost, serverPort)); } this.graphiteReporter = createGraphiteReport(); @@ -84,8 +82,7 @@ public class MetricsGraphiteReporter extends MetricsReporter { private GraphiteReporter createGraphiteReport() { Graphite graphite = new Graphite(new InetSocketAddress(serverHost, serverPort)); String reporterPrefix = config.getGraphiteMetricPrefix(); - return GraphiteReporter.forRegistry(registry).prefixedWith(reporterPrefix) - .convertRatesTo(TimeUnit.SECONDS).convertDurationsTo(TimeUnit.MILLISECONDS) - .filter(MetricFilter.ALL).build(graphite); + return GraphiteReporter.forRegistry(registry).prefixedWith(reporterPrefix).convertRatesTo(TimeUnit.SECONDS) + .convertDurationsTo(TimeUnit.MILLISECONDS).filter(MetricFilter.ALL).build(graphite); } } diff --git a/hudi-client/src/main/java/org/apache/hudi/metrics/MetricsReporterType.java b/hudi-client/src/main/java/org/apache/hudi/metrics/MetricsReporterType.java index 83b64bb23..6d7ccd3cb 100644 --- a/hudi-client/src/main/java/org/apache/hudi/metrics/MetricsReporterType.java +++ b/hudi-client/src/main/java/org/apache/hudi/metrics/MetricsReporterType.java @@ -19,8 +19,7 @@ package org.apache.hudi.metrics; /** - * Types of the reporter. Right now we only support Graphite. We can include JMX and CSV in the - * future. + * Types of the reporter. Right now we only support Graphite. We can include JMX and CSV in the future. */ public enum MetricsReporterType { GRAPHITE, INMEMORY diff --git a/hudi-client/src/main/java/org/apache/hudi/table/HoodieCopyOnWriteTable.java b/hudi-client/src/main/java/org/apache/hudi/table/HoodieCopyOnWriteTable.java index 00657eaa2..6644f142c 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/HoodieCopyOnWriteTable.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/HoodieCopyOnWriteTable.java @@ -82,8 +82,7 @@ import scala.Tuple2; /** * Implementation of a very heavily read-optimized Hoodie Table where *

    - * INSERTS - Produce new files, block aligned to desired size (or) Merge with the smallest existing - * file, to expand it + * INSERTS - Produce new files, block aligned to desired size (or) Merge with the smallest existing file, to expand it *

    * UPDATES - Produce a new version of the file, just replacing the updated records with new values */ @@ -95,31 +94,28 @@ public class HoodieCopyOnWriteTable extends Hoodi super(config, jsc); } - private static PairFlatMapFunction>, String, - PartitionCleanStat> deleteFilesFunc( + private static PairFlatMapFunction>, String, PartitionCleanStat> deleteFilesFunc( HoodieTable table) { - return (PairFlatMapFunction>, String, PartitionCleanStat>) - iter -> { - Map partitionCleanStatMap = new HashMap<>(); + return (PairFlatMapFunction>, String, PartitionCleanStat>) iter -> { + Map partitionCleanStatMap = new HashMap<>(); - FileSystem fs = table.getMetaClient().getFs(); - while (iter.hasNext()) { - Tuple2 partitionDelFileTuple = iter.next(); - String partitionPath = partitionDelFileTuple._1(); - String deletePathStr = partitionDelFileTuple._2(); - Boolean deletedFileResult = deleteFileAndGetResult(fs, deletePathStr); - if (!partitionCleanStatMap.containsKey(partitionPath)) { - partitionCleanStatMap.put(partitionPath, new PartitionCleanStat(partitionPath)); - } - PartitionCleanStat partitionCleanStat = partitionCleanStatMap.get(partitionPath); - partitionCleanStat.addDeleteFilePatterns(deletePathStr); - partitionCleanStat.addDeletedFileResult(deletePathStr, deletedFileResult); - } + FileSystem fs = table.getMetaClient().getFs(); + while (iter.hasNext()) { + Tuple2 partitionDelFileTuple = iter.next(); + String partitionPath = partitionDelFileTuple._1(); + String deletePathStr = partitionDelFileTuple._2(); + Boolean deletedFileResult = deleteFileAndGetResult(fs, deletePathStr); + if (!partitionCleanStatMap.containsKey(partitionPath)) { + partitionCleanStatMap.put(partitionPath, new PartitionCleanStat(partitionPath)); + } + PartitionCleanStat partitionCleanStat = partitionCleanStatMap.get(partitionPath); + partitionCleanStat.addDeleteFilePatterns(deletePathStr); + partitionCleanStat.addDeletedFileResult(deletePathStr, deletedFileResult); + } - return partitionCleanStatMap.entrySet().stream() - .map(e -> new Tuple2<>(e.getKey(), e.getValue())) - .collect(Collectors.toList()).iterator(); - }; + return partitionCleanStatMap.entrySet().stream().map(e -> new Tuple2<>(e.getKey(), e.getValue())) + .collect(Collectors.toList()).iterator(); + }; } private static PairFlatMapFunction getFilesToDeleteFunc(HoodieTable table, @@ -131,8 +127,7 @@ public class HoodieCopyOnWriteTable extends Hoodi }; } - private static Boolean deleteFileAndGetResult(FileSystem fs, String deletePathStr) - throws IOException { + private static Boolean deleteFileAndGetResult(FileSystem fs, String deletePathStr) throws IOException { Path deletePath = new Path(deletePathStr); logger.debug("Working on delete path :" + deletePath); boolean deleteResult = fs.delete(deletePath, false); @@ -171,8 +166,8 @@ public class HoodieCopyOnWriteTable extends Hoodi throw new HoodieNotSupportedException("Compaction is not supported from a CopyOnWrite table"); } - public Iterator> handleUpdate(String commitTime, String fileId, - Iterator> recordItr) throws IOException { + public Iterator> handleUpdate(String commitTime, String fileId, Iterator> recordItr) + throws IOException { // This is needed since sometimes some buckets are never picked in getPartition() and end up with 0 records if (!recordItr.hasNext()) { logger.info("Empty partition with fileId => " + fileId); @@ -190,17 +185,16 @@ public class HoodieCopyOnWriteTable extends Hoodi return handleUpdateInternal(upsertHandle, commitTime, fileId); } - protected Iterator> handleUpdateInternal(HoodieMergeHandle upsertHandle, - String commitTime, String fileId) - throws IOException { + protected Iterator> handleUpdateInternal(HoodieMergeHandle upsertHandle, String commitTime, + String fileId) throws IOException { if (upsertHandle.getOldFilePath() == null) { throw new HoodieUpsertException( "Error in finding the old file path at commit " + commitTime + " for fileId: " + fileId); } else { AvroReadSupport.setAvroReadSchema(getHadoopConf(), upsertHandle.getWriterSchema()); BoundedInMemoryExecutor wrapper = null; - try (ParquetReader reader = AvroParquetReader.builder(upsertHandle.getOldFilePath()) - .withConf(getHadoopConf()).build()) { + try (ParquetReader reader = + AvroParquetReader.builder(upsertHandle.getOldFilePath()).withConf(getHadoopConf()).build()) { wrapper = new SparkBoundedInMemoryExecutor(config, new ParquetReaderIterator(reader), new UpdateHandler(upsertHandle), x -> x); wrapper.execute(); @@ -214,17 +208,15 @@ public class HoodieCopyOnWriteTable extends Hoodi } } - //TODO(vc): This needs to be revisited + // TODO(vc): This needs to be revisited if (upsertHandle.getWriteStatus().getPartitionPath() == null) { - logger.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath() - + ", " + upsertHandle.getWriteStatus()); + logger.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath() + ", " + + upsertHandle.getWriteStatus()); } - return Collections.singletonList(Collections.singletonList(upsertHandle.getWriteStatus())) - .iterator(); + return Collections.singletonList(Collections.singletonList(upsertHandle.getWriteStatus())).iterator(); } - protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileId, - Iterator> recordItr) { + protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileId, Iterator> recordItr) { return new HoodieMergeHandle<>(config, commitTime, this, recordItr, fileId); } @@ -233,8 +225,8 @@ public class HoodieCopyOnWriteTable extends Hoodi return new HoodieMergeHandle<>(config, commitTime, this, keyToNewRecords, fileId, dataFileToBeMerged); } - public Iterator> handleInsert(String commitTime, String idPfx, - Iterator> recordItr) throws Exception { + public Iterator> handleInsert(String commitTime, String idPfx, Iterator> recordItr) + throws Exception { // This is needed since sometimes some buckets are never picked in getPartition() and end up with 0 records if (!recordItr.hasNext()) { logger.info("Empty partition"); @@ -245,16 +237,16 @@ public class HoodieCopyOnWriteTable extends Hoodi public Iterator> handleInsert(String commitTime, String partitionPath, String fileId, Iterator> recordItr) { - HoodieCreateHandle createHandle = new HoodieCreateHandle(config, commitTime, this, partitionPath, fileId, - recordItr); + HoodieCreateHandle createHandle = + new HoodieCreateHandle(config, commitTime, this, partitionPath, fileId, recordItr); createHandle.write(); return Collections.singletonList(Collections.singletonList(createHandle.close())).iterator(); } @SuppressWarnings("unchecked") @Override - public Iterator> handleUpsertPartition(String commitTime, Integer partition, - Iterator recordItr, Partitioner partitioner) { + public Iterator> handleUpsertPartition(String commitTime, Integer partition, Iterator recordItr, + Partitioner partitioner) { UpsertPartitioner upsertPartitioner = (UpsertPartitioner) partitioner; BucketInfo binfo = upsertPartitioner.getBucketInfo(partition); BucketType btype = binfo.bucketType; @@ -264,8 +256,7 @@ public class HoodieCopyOnWriteTable extends Hoodi } else if (btype.equals(BucketType.UPDATE)) { return handleUpdate(commitTime, binfo.fileIdPrefix, recordItr); } else { - throw new HoodieUpsertException( - "Unknown bucketType " + btype + " for partition :" + partition); + throw new HoodieUpsertException("Unknown bucketType " + btype + " for partition :" + partition); } } catch (Throwable t) { String msg = "Error upserting bucketType " + btype + " for partition :" + partition; @@ -275,15 +266,14 @@ public class HoodieCopyOnWriteTable extends Hoodi } @Override - public Iterator> handleInsertPartition(String commitTime, Integer partition, - Iterator recordItr, Partitioner partitioner) { + public Iterator> handleInsertPartition(String commitTime, Integer partition, Iterator recordItr, + Partitioner partitioner) { return handleUpsertPartition(commitTime, partition, recordItr, partitioner); } /** - * Performs cleaning of partition paths according to cleaning policy and returns the number of - * files cleaned. Handles skews in partitions to clean by making files to clean as the unit of - * task distribution. + * Performs cleaning of partition paths according to cleaning policy and returns the number of files cleaned. Handles + * skews in partitions to clean by making files to clean as the unit of task distribution. * * @throws IllegalArgumentException if unknown cleaning policy is provided */ @@ -291,11 +281,9 @@ public class HoodieCopyOnWriteTable extends Hoodi public List clean(JavaSparkContext jsc) { try { FileSystem fs = getMetaClient().getFs(); - List partitionsToClean = FSUtils - .getAllPartitionPaths(fs, getMetaClient().getBasePath(), - config.shouldAssumeDatePartitioning()); - logger.info("Partitions to clean up : " + partitionsToClean + ", with policy " + config - .getCleanerPolicy()); + List partitionsToClean = + FSUtils.getAllPartitionPaths(fs, getMetaClient().getBasePath(), config.shouldAssumeDatePartitioning()); + logger.info("Partitions to clean up : " + partitionsToClean + ", with policy " + config.getCleanerPolicy()); if (partitionsToClean.isEmpty()) { logger.info("Nothing to clean here mom. It is already clean"); return Collections.emptyList(); @@ -307,12 +295,10 @@ public class HoodieCopyOnWriteTable extends Hoodi } /** - * Common method used for cleaning out parquet files under a partition path during rollback of a - * set of commits + * Common method used for cleaning out parquet files under a partition path during rollback of a set of commits */ protected Map deleteCleanedFiles(Map results, String partitionPath, - PathFilter filter) - throws IOException { + PathFilter filter) throws IOException { logger.info("Cleaning path " + partitionPath); FileSystem fs = getMetaClient().getFs(); FileStatus[] toBeDeleted = fs.listStatus(FSUtils.getPartitionPath(config.getBasePath(), partitionPath), filter); @@ -325,12 +311,10 @@ public class HoodieCopyOnWriteTable extends Hoodi } /** - * Common method used for cleaning out parquet files under a partition path during rollback of a - * set of commits + * Common method used for cleaning out parquet files under a partition path during rollback of a set of commits */ - protected Map deleteCleanedFiles(Map results, String commit, String - partitionPath) - throws IOException { + protected Map deleteCleanedFiles(Map results, String commit, + String partitionPath) throws IOException { logger.info("Cleaning path " + partitionPath); FileSystem fs = getMetaClient().getFs(); PathFilter filter = (path) -> { @@ -354,8 +338,8 @@ public class HoodieCopyOnWriteTable extends Hoodi throws IOException { String actionType = metaClient.getCommitActionType(); HoodieActiveTimeline activeTimeline = this.getActiveTimeline(); - List inflights = this.getInflightCommitTimeline().getInstants() - .map(HoodieInstant::getTimestamp).collect(Collectors.toList()); + List inflights = + this.getInflightCommitTimeline().getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList()); // Atomically unpublish the commits if (!inflights.contains(commit)) { activeTimeline.revertToInflight(new HoodieInstant(false, actionType, commit)); @@ -364,27 +348,26 @@ public class HoodieCopyOnWriteTable extends Hoodi // delete all the data files for this commit logger.info("Clean out all parquet files generated for commit: " + commit); - List stats = jsc.parallelize(FSUtils - .getAllPartitionPaths(metaClient.getFs(), getMetaClient().getBasePath(), - config.shouldAssumeDatePartitioning())) - .map((Function) partitionPath -> { - // Scan all partitions files with this commit time - final Map filesToDeletedStatus = new HashMap<>(); - deleteCleanedFiles(filesToDeletedStatus, commit, partitionPath); - return HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath) - .withDeletedFileResults(filesToDeletedStatus).build(); - }).collect(); + List stats = + jsc.parallelize(FSUtils.getAllPartitionPaths(metaClient.getFs(), getMetaClient().getBasePath(), + config.shouldAssumeDatePartitioning())).map((Function) partitionPath -> { + // Scan all partitions files with this commit time + final Map filesToDeletedStatus = new HashMap<>(); + deleteCleanedFiles(filesToDeletedStatus, commit, partitionPath); + return HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath) + .withDeletedFileResults(filesToDeletedStatus).build(); + }).collect(); // Delete Inflight instant if enabled - deleteInflightInstant(deleteInstants, activeTimeline, - new HoodieInstant(true, actionType, commit)); + deleteInflightInstant(deleteInstants, activeTimeline, new HoodieInstant(true, actionType, commit)); return stats; } /** * Delete Inflight instant if enabled + * * @param deleteInstant Enable Deletion of Inflight instant - * @param activeTimeline Hoodie active timeline + * @param activeTimeline Hoodie active timeline * @param instantToBeDeleted Instant to be deleted */ protected void deleteInflightInstant(boolean deleteInstant, HoodieActiveTimeline activeTimeline, @@ -401,30 +384,27 @@ public class HoodieCopyOnWriteTable extends Hoodi } } - private List cleanPartitionPaths(List partitionsToClean, - JavaSparkContext jsc) { + private List cleanPartitionPaths(List partitionsToClean, JavaSparkContext jsc) { int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism()); logger.info("Using cleanerParallelism: " + cleanerParallelism); List> partitionCleanStats = jsc - .parallelize(partitionsToClean, cleanerParallelism) - .flatMapToPair(getFilesToDeleteFunc(this, config)) - .repartition(cleanerParallelism) // repartition to remove skews + .parallelize(partitionsToClean, cleanerParallelism).flatMapToPair(getFilesToDeleteFunc(this, config)) + .repartition(cleanerParallelism) // repartition to remove skews .mapPartitionsToPair(deleteFilesFunc(this)).reduceByKey( // merge partition level clean stats below - (Function2) (e1, e2) -> e1 - .merge(e2)).collect(); + (Function2) (e1, e2) -> e1.merge(e2)) + .collect(); - Map partitionCleanStatsMap = partitionCleanStats.stream() - .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2)); + Map partitionCleanStatsMap = + partitionCleanStats.stream().collect(Collectors.toMap(Tuple2::_1, Tuple2::_2)); HoodieCleanHelper cleaner = new HoodieCleanHelper(this, config); // Return PartitionCleanStat for each partition passed. return partitionsToClean.stream().map(partitionPath -> { PartitionCleanStat partitionCleanStat = - (partitionCleanStatsMap.containsKey(partitionPath)) ? partitionCleanStatsMap - .get(partitionPath) : new PartitionCleanStat(partitionPath); - return HoodieCleanStat.newBuilder().withPolicy(config.getCleanerPolicy()) - .withPartitionPath(partitionPath) + (partitionCleanStatsMap.containsKey(partitionPath)) ? partitionCleanStatsMap.get(partitionPath) + : new PartitionCleanStat(partitionPath); + return HoodieCleanStat.newBuilder().withPolicy(config.getCleanerPolicy()).withPartitionPath(partitionPath) .withEarliestCommitRetained(cleaner.getEarliestCommitToRetain()) .withDeletePathPattern(partitionCleanStat.deletePathPatterns) .withSuccessfulDeletes(partitionCleanStat.successDeleteFiles) @@ -453,8 +433,7 @@ public class HoodieCopyOnWriteTable extends Hoodi } @Override - protected void finish() { - } + protected void finish() {} @Override protected Void getResult() { @@ -487,8 +466,8 @@ public class HoodieCopyOnWriteTable extends Hoodi private PartitionCleanStat merge(PartitionCleanStat other) { if (!this.partitionPath.equals(other.partitionPath)) { - throw new RuntimeException(String - .format("partitionPath is not a match: (%s, %s)", partitionPath, other.partitionPath)); + throw new RuntimeException( + String.format("partitionPath is not a match: (%s, %s)", partitionPath, other.partitionPath)); } successDeleteFiles.addAll(other.successDeleteFiles); deletePathPatterns.addAll(other.deletePathPatterns); @@ -516,8 +495,8 @@ public class HoodieCopyOnWriteTable extends Hoodi } /** - * Helper class for an insert bucket along with the weight [0.0, 0.1] that defines the amount of - * incoming inserts that should be allocated to the bucket + * Helper class for an insert bucket along with the weight [0.0, 0.1] that defines the amount of incoming inserts that + * should be allocated to the bucket */ class InsertBucket implements Serializable { @@ -563,8 +542,7 @@ public class HoodieCopyOnWriteTable extends Hoodi */ List smallFiles = new ArrayList(); /** - * Total number of RDD partitions, is determined by total buckets we want to pack the incoming - * workload into + * Total number of RDD partitions, is determined by total buckets we want to pack the incoming workload into */ private int totalBuckets = 0; /** @@ -599,17 +577,15 @@ public class HoodieCopyOnWriteTable extends Hoodi assignUpdates(profile); assignInserts(profile); - logger.info( - "Total Buckets :" + totalBuckets + ", " + "buckets info => " + bucketInfoMap + ", \n" - + "Partition to insert buckets => " + partitionPathToInsertBuckets + ", \n" - + "UpdateLocations mapped to buckets =>" + updateLocationToBucket); + logger.info("Total Buckets :" + totalBuckets + ", " + "buckets info => " + bucketInfoMap + ", \n" + + "Partition to insert buckets => " + partitionPathToInsertBuckets + ", \n" + + "UpdateLocations mapped to buckets =>" + updateLocationToBucket); } private void assignUpdates(WorkloadProfile profile) { // each update location gets a partition WorkloadStat gStat = profile.getGlobalStat(); - for (Map.Entry> updateLocEntry : gStat.getUpdateLocationToCount() - .entrySet()) { + for (Map.Entry> updateLocEntry : gStat.getUpdateLocationToCount().entrySet()) { addUpdateBucket(updateLocEntry.getKey()); } } @@ -628,8 +604,9 @@ public class HoodieCopyOnWriteTable extends Hoodi private void assignInserts(WorkloadProfile profile) { // for new inserts, compute buckets depending on how many records we have for each partition Set partitionPaths = profile.getPartitionPaths(); - long averageRecordSize = averageBytesPerRecord(metaClient.getActiveTimeline().getCommitTimeline() - .filterCompletedInstants(), config.getCopyOnWriteRecordSizeEstimate()); + long averageRecordSize = + averageBytesPerRecord(metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(), + config.getCopyOnWriteRecordSizeEstimate()); logger.info("AvgRecordSize => " + averageRecordSize); for (String partitionPath : partitionPaths) { WorkloadStat pStat = profile.getWorkloadStat(partitionPath); @@ -644,20 +621,17 @@ public class HoodieCopyOnWriteTable extends Hoodi // first try packing this into one of the smallFiles for (SmallFile smallFile : smallFiles) { - long recordsToAppend = Math - .min((config.getParquetMaxFileSize() - smallFile.sizeBytes) / averageRecordSize, - totalUnassignedInserts); + long recordsToAppend = Math.min((config.getParquetMaxFileSize() - smallFile.sizeBytes) / averageRecordSize, + totalUnassignedInserts); if (recordsToAppend > 0 && totalUnassignedInserts > 0) { // create a new bucket or re-use an existing bucket int bucket; if (updateLocationToBucket.containsKey(smallFile.location.getFileId())) { bucket = updateLocationToBucket.get(smallFile.location.getFileId()); - logger.info("Assigning " + recordsToAppend + " inserts to existing update bucket " - + bucket); + logger.info("Assigning " + recordsToAppend + " inserts to existing update bucket " + bucket); } else { bucket = addUpdateBucket(smallFile.location.getFileId()); - logger.info( - "Assigning " + recordsToAppend + " inserts to new update bucket " + bucket); + logger.info("Assigning " + recordsToAppend + " inserts to new update bucket " + bucket); } bucketNumbers.add(bucket); recordsPerBucket.add(recordsToAppend); @@ -673,10 +647,8 @@ public class HoodieCopyOnWriteTable extends Hoodi } int insertBuckets = (int) Math.ceil((1.0 * totalUnassignedInserts) / insertRecordsPerBucket); - logger.info( - "After small file assignment: unassignedInserts => " + totalUnassignedInserts - + ", totalInsertBuckets => " + insertBuckets + ", recordsPerBucket => " - + insertRecordsPerBucket); + logger.info("After small file assignment: unassignedInserts => " + totalUnassignedInserts + + ", totalInsertBuckets => " + insertBuckets + ", recordsPerBucket => " + insertRecordsPerBucket); for (int b = 0; b < insertBuckets; b++) { bucketNumbers.add(totalBuckets); recordsPerBucket.add(totalUnassignedInserts / insertBuckets); @@ -696,15 +668,14 @@ public class HoodieCopyOnWriteTable extends Hoodi bkt.weight = (1.0 * recordsPerBucket.get(i)) / pStat.getNumInserts(); insertBuckets.add(bkt); } - logger.info( - "Total insert buckets for partition path " + partitionPath + " => " + insertBuckets); + logger.info("Total insert buckets for partition path " + partitionPath + " => " + insertBuckets); partitionPathToInsertBuckets.put(partitionPath, insertBuckets); } } } /** - * Returns a list of small files in the given partition path + * Returns a list of small files in the given partition path */ protected List getSmallFiles(String partitionPath) { @@ -716,15 +687,13 @@ public class HoodieCopyOnWriteTable extends Hoodi if (!commitTimeline.empty()) { // if we have some commits HoodieInstant latestCommitTime = commitTimeline.lastInstant().get(); List allFiles = getROFileSystemView() - .getLatestDataFilesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp()) - .collect(Collectors.toList()); + .getLatestDataFilesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp()).collect(Collectors.toList()); for (HoodieDataFile file : allFiles) { if (file.getFileSize() < config.getParquetSmallFileLimit()) { String filename = file.getFileName(); SmallFile sf = new SmallFile(); - sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename), - FSUtils.getFileId(filename)); + sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename)); sf.sizeBytes = file.getFileSize(); smallFileLocations.add(sf); // Update the global small files list @@ -751,19 +720,18 @@ public class HoodieCopyOnWriteTable extends Hoodi @Override public int getPartition(Object key) { - Tuple2> keyLocation = (Tuple2>) key; + Tuple2> keyLocation = + (Tuple2>) key; if (keyLocation._2().isPresent()) { HoodieRecordLocation location = keyLocation._2().get(); return updateLocationToBucket.get(location.getFileId()); } else { - List targetBuckets = partitionPathToInsertBuckets - .get(keyLocation._1().getPartitionPath()); + List targetBuckets = partitionPathToInsertBuckets.get(keyLocation._1().getPartitionPath()); // pick the target bucket to use based on the weights. double totalWeight = 0.0; final long totalInserts = Math.max(1, globalStat.getNumInserts()); - final long hashOfKey = Hashing.md5() - .hashString(keyLocation._1().getRecordKey(), StandardCharsets.UTF_8).asLong(); + final long hashOfKey = + Hashing.md5().hashString(keyLocation._1().getRecordKey(), StandardCharsets.UTF_8).asLong(); final double r = 1.0 * Math.floorMod(hashOfKey, totalInserts) / totalInserts; for (InsertBucket insertBucket : targetBuckets) { totalWeight += insertBucket.weight; @@ -782,8 +750,8 @@ public class HoodieCopyOnWriteTable extends Hoodi } /** - * Obtains the average record size based on records written during previous commits. Used for - * estimating how many records pack into one file. + * Obtains the average record size based on records written during previous commits. Used for estimating how many + * records pack into one file. */ protected static long averageBytesPerRecord(HoodieTimeline commitTimeline, int defaultRecordSizeEstimate) { long avgSize = defaultRecordSizeEstimate; diff --git a/hudi-client/src/main/java/org/apache/hudi/table/HoodieMergeOnReadTable.java b/hudi-client/src/main/java/org/apache/hudi/table/HoodieMergeOnReadTable.java index e34a12454..f9b414124 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/HoodieMergeOnReadTable.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/HoodieMergeOnReadTable.java @@ -73,15 +73,21 @@ import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function; /** - * Implementation of a more real-time read-optimized Hoodie Table where

    INSERTS - Same as - * HoodieCopyOnWriteTable - Produce new files, block aligned to desired size (or) Merge with the - * smallest existing file, to expand it

    UPDATES - Appends the changes to a rolling log file - * maintained per file Id. Compaction merges the log file into the base file.

    WARNING - MOR - * table type does not support nested rollbacks, every rollback must be followed by an attempted - * commit action

    + * Implementation of a more real-time read-optimized Hoodie Table where + *

    + * INSERTS - Same as HoodieCopyOnWriteTable - Produce new files, block aligned to desired size (or) Merge with the + * smallest existing file, to expand it + *

    + *

    + * UPDATES - Appends the changes to a rolling log file maintained per file Id. Compaction merges the log file into the + * base file. + *

    + *

    + * WARNING - MOR table type does not support nested rollbacks, every rollback must be followed by an attempted commit + * action + *

    */ -public class HoodieMergeOnReadTable extends - HoodieCopyOnWriteTable { +public class HoodieMergeOnReadTable extends HoodieCopyOnWriteTable { private static Logger logger = LogManager.getLogger(HoodieMergeOnReadTable.class); @@ -102,27 +108,24 @@ public class HoodieMergeOnReadTable extends } @Override - public Iterator> handleUpdate(String commitTime, String fileId, - Iterator> recordItr) throws IOException { + public Iterator> handleUpdate(String commitTime, String fileId, Iterator> recordItr) + throws IOException { logger.info("Merging updates for commit " + commitTime + " for file " + fileId); if (!index.canIndexLogFiles() && mergeOnReadUpsertPartitioner.getSmallFileIds().contains(fileId)) { - logger.info( - "Small file corrections for updates for commit " + commitTime + " for file " + fileId); + logger.info("Small file corrections for updates for commit " + commitTime + " for file " + fileId); return super.handleUpdate(commitTime, fileId, recordItr); } else { - HoodieAppendHandle appendHandle = new HoodieAppendHandle<>(config, commitTime, this, - fileId, recordItr); + HoodieAppendHandle appendHandle = new HoodieAppendHandle<>(config, commitTime, this, fileId, recordItr); appendHandle.doAppend(); appendHandle.close(); - return Collections.singletonList(Collections.singletonList(appendHandle.getWriteStatus())) - .iterator(); + return Collections.singletonList(Collections.singletonList(appendHandle.getWriteStatus())).iterator(); } } @Override - public Iterator> handleInsert(String commitTime, String idPfx, - Iterator> recordItr) throws Exception { + public Iterator> handleInsert(String commitTime, String idPfx, Iterator> recordItr) + throws Exception { // If canIndexLogFiles, write inserts to log files else write inserts to parquet files if (index.canIndexLogFiles()) { return new MergeOnReadLazyInsertIterable<>(recordItr, config, commitTime, this, idPfx); @@ -134,8 +137,8 @@ public class HoodieMergeOnReadTable extends @Override public HoodieCompactionPlan scheduleCompaction(JavaSparkContext jsc, String instantTime) { logger.info("Checking if compaction needs to be run on " + config.getBasePath()); - Option lastCompaction = getActiveTimeline().getCommitTimeline() - .filterCompletedInstants().lastInstant(); + Option lastCompaction = + getActiveTimeline().getCommitTimeline().filterCompletedInstants().lastInstant(); String deltaCommitsSinceTs = "0"; if (lastCompaction.isPresent()) { deltaCommitsSinceTs = lastCompaction.get().getTimestamp(); @@ -145,8 +148,8 @@ public class HoodieMergeOnReadTable extends .findInstantsAfter(deltaCommitsSinceTs, Integer.MAX_VALUE).countInstants(); if (config.getInlineCompactDeltaCommitMax() > deltaCommitsSinceLastCompaction) { logger.info("Not running compaction as only " + deltaCommitsSinceLastCompaction - + " delta commits was found since last compaction " + deltaCommitsSinceTs - + ". Waiting for " + config.getInlineCompactDeltaCommitMax()); + + " delta commits was found since last compaction " + deltaCommitsSinceTs + ". Waiting for " + + config.getInlineCompactDeltaCommitMax()); return new HoodieCompactionPlan(); } @@ -154,7 +157,7 @@ public class HoodieMergeOnReadTable extends HoodieRealtimeTableCompactor compactor = new HoodieRealtimeTableCompactor(); try { return compactor.generateCompactionPlan(jsc, this, config, instantTime, - ((SyncableFileSystemView)getRTFileSystemView()).getPendingCompactionOperations() + ((SyncableFileSystemView) getRTFileSystemView()).getPendingCompactionOperations() .map(instantTimeCompactionopPair -> instantTimeCompactionopPair.getValue().getFileGroupId()) .collect(Collectors.toSet())); @@ -186,9 +189,8 @@ public class HoodieMergeOnReadTable extends // Atomically un-publish all non-inflight commits Option commitOrCompactionOption = Option.fromJavaOptional(this.getActiveTimeline() .getTimelineOfActions(Sets.newHashSet(HoodieActiveTimeline.COMMIT_ACTION, - HoodieActiveTimeline.DELTA_COMMIT_ACTION, HoodieActiveTimeline.COMPACTION_ACTION)).getInstants() - .filter(i -> commit.equals(i.getTimestamp())) - .findFirst()); + HoodieActiveTimeline.DELTA_COMMIT_ACTION, HoodieActiveTimeline.COMPACTION_ACTION)) + .getInstants().filter(i -> commit.equals(i.getTimestamp())).findFirst()); HoodieInstant instantToRollback = commitOrCompactionOption.get(); // Atomically un-publish all non-inflight commits if (!instantToRollback.isInflight()) { @@ -196,128 +198,134 @@ public class HoodieMergeOnReadTable extends } logger.info("Unpublished " + commit); Long startTime = System.currentTimeMillis(); - List allRollbackStats = jsc.parallelize(FSUtils - .getAllPartitionPaths(this.metaClient.getFs(), this.getMetaClient().getBasePath(), - config.shouldAssumeDatePartitioning())) - .map((Function) partitionPath -> { - HoodieActiveTimeline activeTimeline = this.getActiveTimeline().reload(); - HoodieRollbackStat hoodieRollbackStats = null; - // Need to put the path filter here since Filter is not serializable - // PathFilter to get all parquet files and log files that need to be deleted - PathFilter filter = (path) -> { - if (path.toString().contains(".parquet")) { - String fileCommitTime = FSUtils.getCommitTime(path.getName()); - return commit.equals(fileCommitTime); - } else if (path.toString().contains(".log")) { - // Since the baseCommitTime is the only commit for new log files, it's okay here - String fileCommitTime = FSUtils.getBaseCommitTimeFromLogPath(path); - return commit.equals(fileCommitTime); - } - return false; - }; - - final Map filesToDeletedStatus = new HashMap<>(); - - switch (instantToRollback.getAction()) { - case HoodieTimeline.COMMIT_ACTION: - try { - // Rollback of a commit should delete the newly created parquet files along with any log - // files created with this as baseCommit. This is required to support multi-rollbacks in a MOR table. - super.deleteCleanedFiles(filesToDeletedStatus, partitionPath, filter); - hoodieRollbackStats = HoodieRollbackStat.newBuilder() - .withPartitionPath(partitionPath).withDeletedFileResults(filesToDeletedStatus).build(); - break; - } catch (IOException io) { - throw new UncheckedIOException("Failed to rollback for commit " + commit, io); - } - case HoodieTimeline.COMPACTION_ACTION: - try { - // If there is no delta commit present after the current commit (if compaction), no action, else we - // need to make sure that a compaction commit rollback also deletes any log files written as part of the - // succeeding deltacommit. - boolean higherDeltaCommits = !activeTimeline.getDeltaCommitTimeline() - .filterCompletedInstants().findInstantsAfter(commit, 1).empty(); - if (higherDeltaCommits) { - // Rollback of a compaction action with no higher deltacommit means that the compaction is scheduled - // and has not yet finished. In this scenario we should delete only the newly created parquet files - // and not corresponding base commit log files created with this as baseCommit since updates would - // have been written to the log files. - super.deleteCleanedFiles(filesToDeletedStatus, commit, partitionPath); - hoodieRollbackStats = HoodieRollbackStat.newBuilder() - .withPartitionPath(partitionPath).withDeletedFileResults(filesToDeletedStatus).build(); - } else { - // No deltacommits present after this compaction commit (inflight or requested). In this case, we - // can also delete any log files that were created with this compaction commit as base - // commit. - super.deleteCleanedFiles(filesToDeletedStatus, partitionPath, filter); - hoodieRollbackStats = HoodieRollbackStat.newBuilder() - .withPartitionPath(partitionPath).withDeletedFileResults(filesToDeletedStatus).build(); + List allRollbackStats = + jsc.parallelize(FSUtils.getAllPartitionPaths(this.metaClient.getFs(), this.getMetaClient().getBasePath(), + config.shouldAssumeDatePartitioning())).map((Function) partitionPath -> { + HoodieActiveTimeline activeTimeline = this.getActiveTimeline().reload(); + HoodieRollbackStat hoodieRollbackStats = null; + // Need to put the path filter here since Filter is not serializable + // PathFilter to get all parquet files and log files that need to be deleted + PathFilter filter = (path) -> { + if (path.toString().contains(".parquet")) { + String fileCommitTime = FSUtils.getCommitTime(path.getName()); + return commit.equals(fileCommitTime); + } else if (path.toString().contains(".log")) { + // Since the baseCommitTime is the only commit for new log files, it's okay here + String fileCommitTime = FSUtils.getBaseCommitTimeFromLogPath(path); + return commit.equals(fileCommitTime); } - break; - } catch (IOException io) { - throw new UncheckedIOException("Failed to rollback for commit " + commit, io); - } - case HoodieTimeline.DELTA_COMMIT_ACTION: - // -------------------------------------------------------------------------------------------------- - // (A) The following cases are possible if index.canIndexLogFiles and/or index.isGlobal - // -------------------------------------------------------------------------------------------------- - // (A.1) Failed first commit - Inserts were written to log files and HoodieWriteStat has no entries. In - // this scenario we would want to delete these log files. - // (A.2) Failed recurring commit - Inserts/Updates written to log files. In this scenario, - // HoodieWriteStat will have the baseCommitTime for the first log file written, add rollback blocks. - // (A.3) Rollback triggered for first commit - Inserts were written to the log files but the commit is - // being reverted. In this scenario, HoodieWriteStat will be `null` for the attribute prevCommitTime and - // and hence will end up deleting these log files. This is done so there are no orphan log files - // lying around. - // (A.4) Rollback triggered for recurring commits - Inserts/Updates are being rolled back, the actions - // taken in this scenario is a combination of (A.2) and (A.3) - // --------------------------------------------------------------------------------------------------- - // (B) The following cases are possible if !index.canIndexLogFiles and/or !index.isGlobal - // --------------------------------------------------------------------------------------------------- - // (B.1) Failed first commit - Inserts were written to parquet files and HoodieWriteStat has no entries. - // In this scenario, we delete all the parquet files written for the failed commit. - // (B.2) Failed recurring commits - Inserts were written to parquet files and updates to log files. In - // this scenario, perform (A.1) and for updates written to log files, write rollback blocks. - // (B.3) Rollback triggered for first commit - Same as (B.1) - // (B.4) Rollback triggered for recurring commits - Same as (B.2) plus we need to delete the log files - // as well if the base parquet file gets deleted. - try { - HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes( - metaClient.getCommitTimeline().getInstantDetails( - new HoodieInstant(true, instantToRollback.getAction(), instantToRollback.getTimestamp())) - .get(), HoodieCommitMetadata.class); + return false; + }; - // read commit file and (either append delete blocks or delete file) - Map filesToNumBlocksRollback = new HashMap<>(); + final Map filesToDeletedStatus = new HashMap<>(); - // In case all data was inserts and the commit failed, delete the file belonging to that commit - // We do not know fileIds for inserts (first inserts are either log files or parquet files), - // delete all files for the corresponding failed commit, if present (same as COW) - super.deleteCleanedFiles(filesToDeletedStatus, partitionPath, filter); - final Set deletedFiles = filesToDeletedStatus.entrySet().stream() - .map(entry -> { + switch (instantToRollback.getAction()) { + case HoodieTimeline.COMMIT_ACTION: + try { + // Rollback of a commit should delete the newly created parquet files along with any log + // files created with this as baseCommit. This is required to support multi-rollbacks in a MOR + // table. + super.deleteCleanedFiles(filesToDeletedStatus, partitionPath, filter); + hoodieRollbackStats = HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath) + .withDeletedFileResults(filesToDeletedStatus).build(); + break; + } catch (IOException io) { + throw new UncheckedIOException("Failed to rollback for commit " + commit, io); + } + case HoodieTimeline.COMPACTION_ACTION: + try { + // If there is no delta commit present after the current commit (if compaction), no action, else we + // need to make sure that a compaction commit rollback also deletes any log files written as part of + // the + // succeeding deltacommit. + boolean higherDeltaCommits = !activeTimeline.getDeltaCommitTimeline().filterCompletedInstants() + .findInstantsAfter(commit, 1).empty(); + if (higherDeltaCommits) { + // Rollback of a compaction action with no higher deltacommit means that the compaction is + // scheduled + // and has not yet finished. In this scenario we should delete only the newly created parquet + // files + // and not corresponding base commit log files created with this as baseCommit since updates would + // have been written to the log files. + super.deleteCleanedFiles(filesToDeletedStatus, commit, partitionPath); + hoodieRollbackStats = HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath) + .withDeletedFileResults(filesToDeletedStatus).build(); + } else { + // No deltacommits present after this compaction commit (inflight or requested). In this case, we + // can also delete any log files that were created with this compaction commit as base + // commit. + super.deleteCleanedFiles(filesToDeletedStatus, partitionPath, filter); + hoodieRollbackStats = HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath) + .withDeletedFileResults(filesToDeletedStatus).build(); + } + break; + } catch (IOException io) { + throw new UncheckedIOException("Failed to rollback for commit " + commit, io); + } + case HoodieTimeline.DELTA_COMMIT_ACTION: + // -------------------------------------------------------------------------------------------------- + // (A) The following cases are possible if index.canIndexLogFiles and/or index.isGlobal + // -------------------------------------------------------------------------------------------------- + // (A.1) Failed first commit - Inserts were written to log files and HoodieWriteStat has no entries. + // In + // this scenario we would want to delete these log files. + // (A.2) Failed recurring commit - Inserts/Updates written to log files. In this scenario, + // HoodieWriteStat will have the baseCommitTime for the first log file written, add rollback blocks. + // (A.3) Rollback triggered for first commit - Inserts were written to the log files but the commit is + // being reverted. In this scenario, HoodieWriteStat will be `null` for the attribute prevCommitTime + // and + // and hence will end up deleting these log files. This is done so there are no orphan log files + // lying around. + // (A.4) Rollback triggered for recurring commits - Inserts/Updates are being rolled back, the actions + // taken in this scenario is a combination of (A.2) and (A.3) + // --------------------------------------------------------------------------------------------------- + // (B) The following cases are possible if !index.canIndexLogFiles and/or !index.isGlobal + // --------------------------------------------------------------------------------------------------- + // (B.1) Failed first commit - Inserts were written to parquet files and HoodieWriteStat has no + // entries. + // In this scenario, we delete all the parquet files written for the failed commit. + // (B.2) Failed recurring commits - Inserts were written to parquet files and updates to log files. In + // this scenario, perform (A.1) and for updates written to log files, write rollback blocks. + // (B.3) Rollback triggered for first commit - Same as (B.1) + // (B.4) Rollback triggered for recurring commits - Same as (B.2) plus we need to delete the log files + // as well if the base parquet file gets deleted. + try { + HoodieCommitMetadata commitMetadata = + HoodieCommitMetadata.fromBytes( + metaClient.getCommitTimeline().getInstantDetails(new HoodieInstant(true, + instantToRollback.getAction(), instantToRollback.getTimestamp())).get(), + HoodieCommitMetadata.class); + + // read commit file and (either append delete blocks or delete file) + Map filesToNumBlocksRollback = new HashMap<>(); + + // In case all data was inserts and the commit failed, delete the file belonging to that commit + // We do not know fileIds for inserts (first inserts are either log files or parquet files), + // delete all files for the corresponding failed commit, if present (same as COW) + super.deleteCleanedFiles(filesToDeletedStatus, partitionPath, filter); + final Set deletedFiles = filesToDeletedStatus.entrySet().stream().map(entry -> { Path filePath = entry.getKey().getPath(); return FSUtils.getFileIdFromFilePath(filePath); }).collect(Collectors.toSet()); - // append rollback blocks for updates - if (commitMetadata.getPartitionToWriteStats().containsKey(partitionPath)) { - hoodieRollbackStats = rollback(index, partitionPath, commit, commitMetadata, filesToDeletedStatus, - filesToNumBlocksRollback, deletedFiles); - } - break; - } catch (IOException io) { - throw new UncheckedIOException("Failed to rollback for commit " + commit, io); + // append rollback blocks for updates + if (commitMetadata.getPartitionToWriteStats().containsKey(partitionPath)) { + hoodieRollbackStats = rollback(index, partitionPath, commit, commitMetadata, filesToDeletedStatus, + filesToNumBlocksRollback, deletedFiles); + } + break; + } catch (IOException io) { + throw new UncheckedIOException("Failed to rollback for commit " + commit, io); + } + default: + break; } - default: - break; - } - return hoodieRollbackStats; - }).filter(Objects::nonNull).collect(); + return hoodieRollbackStats; + }).filter(Objects::nonNull).collect(); // Delete Inflight instants if enabled - deleteInflightInstant(deleteInstants, this.getActiveTimeline(), new HoodieInstant(true, instantToRollback - .getAction(), instantToRollback.getTimestamp())); + deleteInflightInstant(deleteInstants, this.getActiveTimeline(), + new HoodieInstant(true, instantToRollback.getAction(), instantToRollback.getTimestamp())); logger.debug("Time(in ms) taken to finish rollback " + (System.currentTimeMillis() - startTime)); @@ -332,8 +340,8 @@ public class HoodieMergeOnReadTable extends } /** - * UpsertPartitioner for MergeOnRead table type, this allows auto correction of small parquet - * files to larger ones without the need for an index in the logFile. + * UpsertPartitioner for MergeOnRead table type, this allows auto correction of small parquet files to larger ones + * without the need for an index in the logFile. */ class MergeOnReadUpsertPartitioner extends HoodieCopyOnWriteTable.UpsertPartitioner { @@ -361,21 +369,23 @@ public class HoodieMergeOnReadTable extends // TODO : choose last N small files since there can be multiple small files written to a single partition // by different spark partitions in a single batch Option smallFileSlice = Option.fromJavaOptional(getRTFileSystemView() - .getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp(), false).filter( - fileSlice -> fileSlice.getLogFiles().count() < 1 - && fileSlice.getDataFile().get().getFileSize() < config - .getParquetSmallFileLimit()).sorted((FileSlice left, FileSlice right) -> - left.getDataFile().get().getFileSize() < right.getDataFile().get().getFileSize() - ? -1 : 1).findFirst()); + .getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp(), false) + .filter(fileSlice -> fileSlice.getLogFiles().count() < 1 + && fileSlice.getDataFile().get().getFileSize() < config.getParquetSmallFileLimit()) + .sorted((FileSlice left, + FileSlice right) -> left.getDataFile().get().getFileSize() < right.getDataFile().get().getFileSize() + ? -1 + : 1) + .findFirst()); if (smallFileSlice.isPresent()) { allSmallFileSlices.add(smallFileSlice.get()); } } else { // If we can index log files, we can add more inserts to log files for fileIds including those under // pending compaction. - List allFileSlices = getRTFileSystemView() - .getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp(), true) - .collect(Collectors.toList()); + List allFileSlices = + getRTFileSystemView().getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp(), true) + .collect(Collectors.toList()); for (FileSlice fileSlice : allFileSlices) { if (isSmallFile(partitionPath, fileSlice)) { allSmallFileSlices.add(fileSlice); @@ -408,8 +418,7 @@ public class HoodieMergeOnReadTable extends } public List getSmallFileIds() { - return (List) smallFiles.stream() - .map(smallFile -> ((SmallFile) smallFile).location.getFileId()) + return (List) smallFiles.stream().map(smallFile -> ((SmallFile) smallFile).location.getFileId()) .collect(Collectors.toList()); } @@ -417,8 +426,8 @@ public class HoodieMergeOnReadTable extends if (!fileSlice.getDataFile().isPresent()) { return convertLogFilesSizeToExpectedParquetSize(fileSlice.getLogFiles().collect(Collectors.toList())); } else { - return fileSlice.getDataFile().get().getFileSize() + convertLogFilesSizeToExpectedParquetSize(fileSlice - .getLogFiles().collect(Collectors.toList())); + return fileSlice.getDataFile().get().getFileSize() + + convertLogFilesSizeToExpectedParquetSize(fileSlice.getLogFiles().collect(Collectors.toList())); } } @@ -431,13 +440,12 @@ public class HoodieMergeOnReadTable extends @VisibleForTesting public long convertLogFilesSizeToExpectedParquetSize(List hoodieLogFiles) { long totalSizeOfLogFiles = hoodieLogFiles.stream().map(hoodieLogFile -> hoodieLogFile.getFileSize()) - .filter(size -> size > 0) - .reduce((a, b) -> (a + b)).orElse(0L); + .filter(size -> size > 0).reduce((a, b) -> (a + b)).orElse(0L); // Here we assume that if there is no base parquet file, all log files contain only inserts. // We can then just get the parquet equivalent size of these log files, compare that with // {@link config.getParquetMaxFileSize()} and decide if there is scope to insert more rows - long logFilesEquivalentParquetFileSize = (long) (totalSizeOfLogFiles * config - .getLogFileToParquetCompressionRatio()); + long logFilesEquivalentParquetFileSize = + (long) (totalSizeOfLogFiles * config.getLogFileToParquetCompressionRatio()); return logFilesEquivalentParquetFileSize; } } @@ -447,8 +455,8 @@ public class HoodieMergeOnReadTable extends Map header = Maps.newHashMap(); header.put(HeaderMetadataType.INSTANT_TIME, metaClient.getActiveTimeline().lastInstant().get().getTimestamp()); header.put(HeaderMetadataType.TARGET_INSTANT_TIME, commit); - header.put(HeaderMetadataType.COMMAND_BLOCK_TYPE, String.valueOf(HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK - .ordinal())); + header.put(HeaderMetadataType.COMMAND_BLOCK_TYPE, + String.valueOf(HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal())); return header; } @@ -461,52 +469,47 @@ public class HoodieMergeOnReadTable extends // But the index (global) might store the baseCommit of the parquet and not the requested, hence get the // baseCommit always by listing the file slice Map fileIdToBaseCommitTimeForLogMap = this.getRTFileSystemView().getLatestFileSlices(partitionPath) - .collect(Collectors.toMap(FileSlice::getFileId, FileSlice::getBaseInstantTime)); - commitMetadata.getPartitionToWriteStats().get(partitionPath).stream() - .filter(wStat -> { - // Filter out stats without prevCommit since they are all inserts - return wStat != null && wStat.getPrevCommit() != HoodieWriteStat.NULL_COMMIT && wStat.getPrevCommit() != null - && !deletedFiles.contains(wStat.getFileId()); - }).forEach(wStat -> { - Writer writer = null; - String baseCommitTime = fileIdToBaseCommitTimeForLogMap.get(wStat.getFileId()); - if (null != baseCommitTime) { - boolean success = false; - try { - writer = HoodieLogFormat.newWriterBuilder().onParentPath( - FSUtils.getPartitionPath(this.getMetaClient().getBasePath(), partitionPath)) - .withFileId(wStat.getFileId()).overBaseCommit(baseCommitTime) - .withFs(this.metaClient.getFs()) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).build(); - // generate metadata - Map header = generateHeader(commit); - // if update belongs to an existing log file - writer = writer.appendBlock(new HoodieCommandBlock(header)); - success = true; - } catch (IOException | InterruptedException io) { - throw new HoodieRollbackException( - "Failed to rollback for commit " + commit, io); - } finally { - try { - if (writer != null) { - writer.close(); - } - if (success) { - // This step is intentionally done after writer is closed. Guarantees that - // getFileStatus would reflect correct stats and FileNotFoundException is not thrown in - // cloud-storage : HUDI-168 - filesToNumBlocksRollback.put(this.getMetaClient().getFs() - .getFileStatus(writer.getLogFile().getPath()), 1L); - } - } catch (IOException io) { - throw new UncheckedIOException(io); - } + .collect(Collectors.toMap(FileSlice::getFileId, FileSlice::getBaseInstantTime)); + commitMetadata.getPartitionToWriteStats().get(partitionPath).stream().filter(wStat -> { + // Filter out stats without prevCommit since they are all inserts + return wStat != null && wStat.getPrevCommit() != HoodieWriteStat.NULL_COMMIT && wStat.getPrevCommit() != null + && !deletedFiles.contains(wStat.getFileId()); + }).forEach(wStat -> { + Writer writer = null; + String baseCommitTime = fileIdToBaseCommitTimeForLogMap.get(wStat.getFileId()); + if (null != baseCommitTime) { + boolean success = false; + try { + writer = HoodieLogFormat.newWriterBuilder() + .onParentPath(FSUtils.getPartitionPath(this.getMetaClient().getBasePath(), partitionPath)) + .withFileId(wStat.getFileId()).overBaseCommit(baseCommitTime).withFs(this.metaClient.getFs()) + .withFileExtension(HoodieLogFile.DELTA_EXTENSION).build(); + // generate metadata + Map header = generateHeader(commit); + // if update belongs to an existing log file + writer = writer.appendBlock(new HoodieCommandBlock(header)); + success = true; + } catch (IOException | InterruptedException io) { + throw new HoodieRollbackException("Failed to rollback for commit " + commit, io); + } finally { + try { + if (writer != null) { + writer.close(); } + if (success) { + // This step is intentionally done after writer is closed. Guarantees that + // getFileStatus would reflect correct stats and FileNotFoundException is not thrown in + // cloud-storage : HUDI-168 + filesToNumBlocksRollback.put(this.getMetaClient().getFs().getFileStatus(writer.getLogFile().getPath()), + 1L); + } + } catch (IOException io) { + throw new UncheckedIOException(io); } - }); - return HoodieRollbackStat.newBuilder() - .withPartitionPath(partitionPath) - .withDeletedFileResults(filesToDeletedStatus) + } + } + }); + return HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath).withDeletedFileResults(filesToDeletedStatus) .withRollbackBlockAppendResults(filesToNumBlocksRollback).build(); } diff --git a/hudi-client/src/main/java/org/apache/hudi/table/HoodieTable.java b/hudi-client/src/main/java/org/apache/hudi/table/HoodieTable.java index 0ac18fcd6..30f74def8 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/HoodieTable.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/HoodieTable.java @@ -82,22 +82,21 @@ public abstract class HoodieTable implements Seri protected HoodieTable(HoodieWriteConfig config, JavaSparkContext jsc) { this.config = config; this.hadoopConfiguration = new SerializableConfiguration(jsc.hadoopConfiguration()); - this.viewManager = FileSystemViewManager.createViewManager( - new SerializableConfiguration(jsc.hadoopConfiguration()), config.getViewStorageConfig()); + this.viewManager = FileSystemViewManager.createViewManager(new SerializableConfiguration(jsc.hadoopConfiguration()), + config.getViewStorageConfig()); this.metaClient = ClientUtils.createMetaClient(jsc, config, true); this.index = HoodieIndex.createIndex(config, jsc); } private synchronized FileSystemViewManager getViewManager() { if (null == viewManager) { - viewManager = FileSystemViewManager.createViewManager(hadoopConfiguration, - config.getViewStorageConfig()); + viewManager = FileSystemViewManager.createViewManager(hadoopConfiguration, config.getViewStorageConfig()); } return viewManager; } - public static HoodieTable getHoodieTable( - HoodieTableMetaClient metaClient, HoodieWriteConfig config, JavaSparkContext jsc) { + public static HoodieTable getHoodieTable(HoodieTableMetaClient metaClient, + HoodieWriteConfig config, JavaSparkContext jsc) { switch (metaClient.getTableType()) { case COPY_ON_WRITE: return new HoodieCopyOnWriteTable<>(config, jsc); @@ -202,8 +201,7 @@ public abstract class HoodieTable implements Seri * Get the list of savepoints in this table */ public List getSavepoints() { - return getCompletedSavepointTimeline().getInstants().map(HoodieInstant::getTimestamp) - .collect(Collectors.toList()); + return getCompletedSavepointTimeline().getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList()); } /** @@ -214,18 +212,14 @@ public abstract class HoodieTable implements Seri throw new HoodieSavepointException( "Could not get data files for savepoint " + savepointTime + ". No such savepoint."); } - HoodieInstant instant = new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION, - savepointTime); + HoodieInstant instant = new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION, savepointTime); HoodieSavepointMetadata metadata = null; try { - metadata = AvroUtils - .deserializeHoodieSavepointMetadata(getActiveTimeline().getInstantDetails(instant).get()); + metadata = AvroUtils.deserializeHoodieSavepointMetadata(getActiveTimeline().getInstantDetails(instant).get()); } catch (IOException e) { - throw new HoodieSavepointException( - "Could not get savepointed data files for savepoint " + savepointTime, e); + throw new HoodieSavepointException("Could not get savepointed data files for savepoint " + savepointTime, e); } - return metadata.getPartitionMetadata().values().stream() - .flatMap(s -> s.getSavepointDataFile().stream()); + return metadata.getPartitionMetadata().values().stream().flatMap(s -> s.getSavepointDataFile().stream()); } public HoodieActiveTimeline getActiveTimeline() { @@ -242,30 +236,30 @@ public abstract class HoodieTable implements Seri /** * Perform the ultimate IO for a given upserted (RDD) partition */ - public abstract Iterator> handleUpsertPartition(String commitTime, - Integer partition, Iterator> recordIterator, Partitioner partitioner); + public abstract Iterator> handleUpsertPartition(String commitTime, Integer partition, + Iterator> recordIterator, Partitioner partitioner); /** * Perform the ultimate IO for a given inserted (RDD) partition */ - public abstract Iterator> handleInsertPartition(String commitTime, - Integer partition, Iterator> recordIterator, Partitioner partitioner); + public abstract Iterator> handleInsertPartition(String commitTime, Integer partition, + Iterator> recordIterator, Partitioner partitioner); /** * Schedule compaction for the instant time - * @param jsc Spark Context + * + * @param jsc Spark Context * @param instantTime Instant Time for scheduling compaction * @return */ public abstract HoodieCompactionPlan scheduleCompaction(JavaSparkContext jsc, String instantTime); /** - * Run Compaction on the table. Compaction arranges the data so that it is optimized for data - * access + * Run Compaction on the table. Compaction arranges the data so that it is optimized for data access * - * @param jsc Spark Context + * @param jsc Spark Context * @param compactionInstantTime Instant Time - * @param compactionPlan Compaction Plan + * @param compactionPlan Compaction Plan */ public abstract JavaRDD compact(JavaSparkContext jsc, String compactionInstantTime, HoodieCompactionPlan compactionPlan); @@ -276,9 +270,9 @@ public abstract class HoodieTable implements Seri public abstract List clean(JavaSparkContext jsc); /** - * Rollback the (inflight/committed) record changes with the given commit time. Four steps: (1) - * Atomically unpublish this commit (2) clean indexing data (3) clean new generated parquet files - * / log blocks (4) Finally, delete ..commit or ..inflight file if deleteInstants = true + * Rollback the (inflight/committed) record changes with the given commit time. Four steps: (1) Atomically unpublish + * this commit (2) clean indexing data (3) clean new generated parquet files / log blocks (4) Finally, delete + * ..commit or ..inflight file if deleteInstants = true */ public abstract List rollback(JavaSparkContext jsc, String commit, boolean deleteInstants) throws IOException; @@ -297,6 +291,7 @@ public abstract class HoodieTable implements Seri /** * Delete Marker directory corresponding to an instant + * * @param instantTs Instant Time */ protected void deleteMarkerDir(String instantTs) { @@ -317,10 +312,10 @@ public abstract class HoodieTable implements Seri * Reconciles WriteStats and marker files to detect and safely delete duplicate data files created because of Spark * retries. * - * @param jsc Spark Context + * @param jsc Spark Context * @param instantTs Instant Timestamp - * @param stats Hoodie Write Stat - * @param consistencyCheckEnabled Consistency Check Enabled + * @param stats Hoodie Write Stat + * @param consistencyCheckEnabled Consistency Check Enabled * @throws HoodieIOException */ protected void cleanFailedWrites(JavaSparkContext jsc, String instantTs, List stats, @@ -343,13 +338,12 @@ public abstract class HoodieTable implements Seri // Contains list of partially created files. These needs to be cleaned up. invalidDataPaths.removeAll(validDataPaths); if (!invalidDataPaths.isEmpty()) { - logger.info("Removing duplicate data files created due to spark retries before committing. Paths=" - + invalidDataPaths); + logger.info( + "Removing duplicate data files created due to spark retries before committing. Paths=" + invalidDataPaths); } Map>> groupByPartition = invalidDataPaths.stream() - .map(dp -> Pair.of(new Path(dp).getParent().toString(), dp)) - .collect(Collectors.groupingBy(Pair::getKey)); + .map(dp -> Pair.of(new Path(dp).getParent().toString(), dp)).collect(Collectors.groupingBy(Pair::getKey)); if (!groupByPartition.isEmpty()) { // Ensure all files in delete list is actually present. This is mandatory for an eventually consistent FS. @@ -394,7 +388,8 @@ public abstract class HoodieTable implements Seri /** * Ensures all files passed either appear or disappear - * @param jsc JavaSparkContext + * + * @param jsc JavaSparkContext * @param groupByPartition Files grouped by partition * @param visibility Appear/Disappear */ diff --git a/hudi-client/src/main/java/org/apache/hudi/table/UserDefinedBulkInsertPartitioner.java b/hudi-client/src/main/java/org/apache/hudi/table/UserDefinedBulkInsertPartitioner.java index 6d6f18e9d..420f87994 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/UserDefinedBulkInsertPartitioner.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/UserDefinedBulkInsertPartitioner.java @@ -23,13 +23,11 @@ import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.spark.api.java.JavaRDD; /** - * Repartition input records into at least expected number of output spark partitions. It should - * give below guarantees - Output spark partition will have records from only one hoodie partition. - * - Average records per output spark partitions should be almost equal to (#inputRecords / - * #outputSparkPartitions) to avoid possible skews. + * Repartition input records into at least expected number of output spark partitions. It should give below guarantees - + * Output spark partition will have records from only one hoodie partition. - Average records per output spark + * partitions should be almost equal to (#inputRecords / #outputSparkPartitions) to avoid possible skews. */ public interface UserDefinedBulkInsertPartitioner { - JavaRDD> repartitionRecords(JavaRDD> records, - int outputSparkPartitions); + JavaRDD> repartitionRecords(JavaRDD> records, int outputSparkPartitions); } diff --git a/hudi-client/src/main/java/org/apache/hudi/table/WorkloadProfile.java b/hudi-client/src/main/java/org/apache/hudi/table/WorkloadProfile.java index 0ce4c62c3..650b71773 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/WorkloadProfile.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/WorkloadProfile.java @@ -30,8 +30,7 @@ import org.apache.spark.api.java.JavaRDD; import scala.Tuple2; /** - * Information about incoming records for upsert/insert obtained either via sampling or - * introspecting the data fully + * Information about incoming records for upsert/insert obtained either via sampling or introspecting the data fully *

    * TODO(vc): Think about obtaining this directly from index.tagLocation */ @@ -62,11 +61,10 @@ public class WorkloadProfile implements Serializa Map>, Long> partitionLocationCounts = taggedRecords .mapToPair(record -> new Tuple2<>( - new Tuple2<>(record.getPartitionPath(), Option.ofNullable(record.getCurrentLocation())), - record)).countByKey(); + new Tuple2<>(record.getPartitionPath(), Option.ofNullable(record.getCurrentLocation())), record)) + .countByKey(); - for (Map.Entry>, Long> e : partitionLocationCounts - .entrySet()) { + for (Map.Entry>, Long> e : partitionLocationCounts.entrySet()) { String partitionPath = e.getKey()._1(); Long count = e.getValue(); Option locOption = e.getKey()._2(); diff --git a/hudi-client/src/test/java/HoodieClientExample.java b/hudi-client/src/test/java/HoodieClientExample.java index 7e298332a..6406b6bf0 100644 --- a/hudi-client/src/test/java/HoodieClientExample.java +++ b/hudi-client/src/test/java/HoodieClientExample.java @@ -41,7 +41,8 @@ import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; /** - * Driver program that uses the Hoodie client with synthetic workload, and performs basic operations.

    + * Driver program that uses the Hoodie client with synthetic workload, and performs basic operations. + *

    */ public class HoodieClientExample { @@ -82,18 +83,15 @@ public class HoodieClientExample { Path path = new Path(tablePath); FileSystem fs = FSUtils.getFs(tablePath, jsc.hadoopConfiguration()); if (!fs.exists(path)) { - HoodieTableMetaClient - .initTableType(jsc.hadoopConfiguration(), tablePath, HoodieTableType.valueOf(tableType), tableName, - HoodieAvroPayload.class.getName()); + HoodieTableMetaClient.initTableType(jsc.hadoopConfiguration(), tablePath, HoodieTableType.valueOf(tableType), + tableName, HoodieAvroPayload.class.getName()); } // Create the write client to write some records in HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(tablePath) - .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) - .forTable(tableName) + .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2).forTable(tableName) .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(IndexType.BLOOM).build()) - .withCompactionConfig( - HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 3).build()).build(); + .withCompactionConfig(HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 3).build()).build(); HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); /** diff --git a/hudi-client/src/test/java/org/apache/hudi/HoodieClientTestHarness.java b/hudi-client/src/test/java/org/apache/hudi/HoodieClientTestHarness.java index 80cb70fa7..4b20608f6 100644 --- a/hudi-client/src/test/java/org/apache/hudi/HoodieClientTestHarness.java +++ b/hudi-client/src/test/java/org/apache/hudi/HoodieClientTestHarness.java @@ -53,7 +53,7 @@ public abstract class HoodieClientTestHarness extends HoodieCommonTestHarness im protected transient ExecutorService executorService; protected transient HoodieTableMetaClient metaClient; - //dfs + // dfs protected String dfsBasePath; protected transient HdfsTestService hdfsTestService; protected transient MiniDFSCluster dfsCluster; @@ -74,6 +74,7 @@ public abstract class HoodieClientTestHarness extends HoodieCommonTestHarness im /** * Cleanups resource group for the subclasses of {@link TestHoodieClientBase}. + * * @throws IOException */ public void cleanupResources() throws IOException { @@ -84,8 +85,7 @@ public abstract class HoodieClientTestHarness extends HoodieCommonTestHarness im } /** - * Initializes the Spark contexts ({@link JavaSparkContext} and {@link SQLContext}) - * with the given application name. + * Initializes the Spark contexts ({@link JavaSparkContext} and {@link SQLContext}) with the given application name. * * @param appName The specified application name. */ @@ -94,13 +94,13 @@ public abstract class HoodieClientTestHarness extends HoodieCommonTestHarness im jsc = new JavaSparkContext(HoodieClientTestUtils.getSparkConfForTest(appName)); jsc.setLogLevel("ERROR"); - //SQLContext stuff + // SQLContext stuff sqlContext = new SQLContext(jsc); } /** - * Initializes the Spark contexts ({@link JavaSparkContext} and {@link SQLContext}) - * with a default name TestHoodieClient. + * Initializes the Spark contexts ({@link JavaSparkContext} and {@link SQLContext}) with a default name + * TestHoodieClient. */ protected void initSparkContexts() { initSparkContexts("TestHoodieClient"); @@ -155,8 +155,8 @@ public abstract class HoodieClientTestHarness extends HoodieCommonTestHarness im } /** - * Initializes an instance of {@link HoodieTableMetaClient} with a special table type - * specified by {@code getTableType()}. + * Initializes an instance of {@link HoodieTableMetaClient} with a special table type specified by + * {@code getTableType()}. * * @throws IOException */ diff --git a/hudi-client/src/test/java/org/apache/hudi/TestAsyncCompaction.java b/hudi-client/src/test/java/org/apache/hudi/TestAsyncCompaction.java index e00afe4ce..fcfbd736e 100644 --- a/hudi-client/src/test/java/org/apache/hudi/TestAsyncCompaction.java +++ b/hudi-client/src/test/java/org/apache/hudi/TestAsyncCompaction.java @@ -73,15 +73,14 @@ public class TestAsyncCompaction extends TestHoodieClientBase { private HoodieWriteConfig.Builder getConfigBuilder(Boolean autoCommit) { return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) - .withAutoCommit(autoCommit).withAssumeDatePartitioning(true).withCompactionConfig( - HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024 * 1024).withInlineCompaction(false) - .withMaxNumDeltaCommitsBeforeCompaction(1).build()) + .withAutoCommit(autoCommit).withAssumeDatePartitioning(true) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024 * 1024) + .withInlineCompaction(false).withMaxNumDeltaCommitsBeforeCompaction(1).build()) .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024 * 1024).build()) .forTable("test-trip-table") .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) - .withEmbeddedTimelineServerEnabled(true).withFileSystemViewConfig( - FileSystemViewStorageConfig.newBuilder().withStorageType(FileSystemViewStorageType.EMBEDDED_KV_STORE) - .build()); + .withEmbeddedTimelineServerEnabled(true).withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() + .withStorageType(FileSystemViewStorageType.EMBEDDED_KV_STORE).build()); } @Test @@ -97,8 +96,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase { int numRecs = 2000; List records = dataGen.generateInserts(firstInstantTime, numRecs); - runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), - records, cfg, true, new ArrayList<>()); + runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true, + new ArrayList<>()); // Schedule compaction but do not run them scheduleCompaction(compactionInstantTime, client, cfg); @@ -158,8 +157,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase { try (HoodieWriteClient client = getHoodieWriteClient(cfg, true);) { List records = dataGen.generateInserts(firstInstantTime, numRecs); - records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), - records, cfg, true, new ArrayList<>()); + records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true, + new ArrayList<>()); // Schedule compaction but do not run them scheduleCompaction(compactionInstantTime, client, cfg); @@ -177,20 +176,18 @@ public class TestAsyncCompaction extends TestHoodieClientBase { assertTrue("inflight instant has expected instant time", inflightInstant.getTimestamp().equals(inflightInstantTime)); - //This should rollback + // This should rollback client.startCommitWithTime(nextInflightInstantTime); - //Validate + // Validate metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath()); - inflightInstant = - metaClient.getActiveTimeline().filterInflightsExcludingCompaction().firstInstant().get(); + inflightInstant = metaClient.getActiveTimeline().filterInflightsExcludingCompaction().firstInstant().get(); assertTrue("inflight instant has expected instant time", inflightInstant.getTimestamp().equals(nextInflightInstantTime)); assertTrue("Expect only one inflight instant", metaClient.getActiveTimeline().filterInflightsExcludingCompaction().getInstants().count() == 1); - //Expect pending Compaction to be present - pendingCompactionInstant = - metaClient.getActiveTimeline().filterPendingCompactionTimeline().firstInstant().get(); + // Expect pending Compaction to be present + pendingCompactionInstant = metaClient.getActiveTimeline().filterPendingCompactionTimeline().firstInstant().get(); assertTrue("Pending Compaction instant has expected instant time", pendingCompactionInstant.getTimestamp().equals(compactionInstantTime)); } @@ -211,8 +208,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase { int numRecs = 2000; List records = dataGen.generateInserts(firstInstantTime, numRecs); - records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), - records, cfg, true, new ArrayList<>()); + records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true, + new ArrayList<>()); // Schedule and mark compaction instant as inflight HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath()); @@ -221,8 +218,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase { moveCompactionFromRequestedToInflight(compactionInstantTime, client, cfg); // Complete ingestions - runNextDeltaCommits(client, Arrays.asList(thirdInstantTime, fourthInstantTime), - records, cfg, false, Arrays.asList(compactionInstantTime)); + runNextDeltaCommits(client, Arrays.asList(thirdInstantTime, fourthInstantTime), records, cfg, false, + Arrays.asList(compactionInstantTime)); // execute inflight compaction executeCompaction(compactionInstantTime, client, hoodieTable, cfg, numRecs, true); @@ -242,8 +239,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase { int numRecs = 2000; List records = dataGen.generateInserts(firstInstantTime, numRecs); - records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), - records, cfg, true, new ArrayList<>()); + records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true, + new ArrayList<>()); // Schedule compaction but do not run them HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath()); @@ -256,8 +253,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase { boolean gotException = false; try { - runNextDeltaCommits(client, Arrays.asList(failedInstantTime), - records, cfg, false, Arrays.asList(compactionInstantTime)); + runNextDeltaCommits(client, Arrays.asList(failedInstantTime), records, cfg, false, + Arrays.asList(compactionInstantTime)); } catch (IllegalArgumentException iex) { // Latest pending compaction instant time must be earlier than this instant time. Should fail here gotException = true; @@ -279,8 +276,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase { int numRecs = 2000; List records = dataGen.generateInserts(firstInstantTime, numRecs); - records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), - records, cfg, true, new ArrayList<>()); + records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true, + new ArrayList<>()); HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath()); createNextDeltaCommit(inflightInstantTime, records, client, metaClient, cfg, true); @@ -315,8 +312,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase { int numRecs = 2000; List records = dataGen.generateInserts(firstInstantTime, numRecs); - records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), - records, cfg, true, new ArrayList<>()); + records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true, + new ArrayList<>()); HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath()); boolean gotException = false; @@ -337,8 +334,7 @@ public class TestAsyncCompaction extends TestHoodieClientBase { } catch (IllegalArgumentException iex) { gotException = true; } - assertTrue("Compaction Instant to be scheduled cannot have same timestamp as committed instant", - gotException); + assertTrue("Compaction Instant to be scheduled cannot have same timestamp as committed instant", gotException); compactionInstantTime = "006"; scheduleCompaction(compactionInstantTime, client, cfg); @@ -349,8 +345,7 @@ public class TestAsyncCompaction extends TestHoodieClientBase { } catch (IllegalArgumentException iex) { gotException = true; } - assertTrue("Compaction Instant to be scheduled cannot have same timestamp as a pending compaction", - gotException); + assertTrue("Compaction Instant to be scheduled cannot have same timestamp as a pending compaction", gotException); } @Test @@ -365,8 +360,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase { int numRecs = 2000; List records = dataGen.generateInserts(firstInstantTime, numRecs); - records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), - records, cfg, true, new ArrayList<>()); + records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true, + new ArrayList<>()); HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath()); HoodieTable hoodieTable = getHoodieTable(metaClient, cfg); @@ -376,7 +371,7 @@ public class TestAsyncCompaction extends TestHoodieClientBase { @Test public void testInterleavedCompaction() throws Exception { - //Case: Two delta commits before and after compaction schedule + // Case: Two delta commits before and after compaction schedule HoodieWriteConfig cfg = getConfig(true); try (HoodieWriteClient client = getHoodieWriteClient(cfg, true);) { @@ -389,15 +384,15 @@ public class TestAsyncCompaction extends TestHoodieClientBase { int numRecs = 2000; List records = dataGen.generateInserts(firstInstantTime, numRecs); - records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), - records, cfg, true, new ArrayList<>()); + records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true, + new ArrayList<>()); HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath()); HoodieTable hoodieTable = getHoodieTable(metaClient, cfg); scheduleCompaction(compactionInstantTime, client, cfg); - runNextDeltaCommits(client, Arrays.asList(thirdInstantTime, fourthInstantTime), - records, cfg, false, Arrays.asList(compactionInstantTime)); + runNextDeltaCommits(client, Arrays.asList(thirdInstantTime, fourthInstantTime), records, cfg, false, + Arrays.asList(compactionInstantTime)); executeCompaction(compactionInstantTime, client, hoodieTable, cfg, numRecs, true); } } @@ -428,8 +423,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase { } private List runNextDeltaCommits(HoodieWriteClient client, List deltaInstants, - List records, HoodieWriteConfig cfg, boolean insertFirst, - List expPendingCompactionInstants) throws Exception { + List records, HoodieWriteConfig cfg, boolean insertFirst, List expPendingCompactionInstants) + throws Exception { HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath()); List> pendingCompactions = @@ -476,8 +471,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase { HoodieWriteConfig cfg) throws IOException { HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath()); HoodieInstant compactionInstant = HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime); - HoodieCompactionPlan workload = AvroUtils.deserializeCompactionPlan( - metaClient.getActiveTimeline().getInstantAuxiliaryDetails(compactionInstant).get()); + HoodieCompactionPlan workload = AvroUtils + .deserializeCompactionPlan(metaClient.getActiveTimeline().getInstantAuxiliaryDetails(compactionInstant).get()); metaClient.getActiveTimeline().transitionCompactionRequestedToInflight(compactionInstant); HoodieInstant instant = metaClient.getActiveTimeline().reload().filterPendingCompactionTimeline().getInstants() .filter(in -> in.getTimestamp().equals(compactionInstantTime)).findAny().get(); @@ -489,27 +484,23 @@ public class TestAsyncCompaction extends TestHoodieClientBase { client.scheduleCompactionAtInstant(compactionInstantTime, Option.empty()); HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath()); HoodieInstant instant = metaClient.getActiveTimeline().filterPendingCompactionTimeline().lastInstant().get(); - assertEquals("Last compaction instant must be the one set", - instant.getTimestamp(), compactionInstantTime); + assertEquals("Last compaction instant must be the one set", instant.getTimestamp(), compactionInstantTime); } - private void scheduleAndExecuteCompaction(String compactionInstantTime, - HoodieWriteClient client, HoodieTable table, HoodieWriteConfig cfg, int expectedNumRecs, - boolean hasDeltaCommitAfterPendingCompaction) throws IOException { + private void scheduleAndExecuteCompaction(String compactionInstantTime, HoodieWriteClient client, HoodieTable table, + HoodieWriteConfig cfg, int expectedNumRecs, boolean hasDeltaCommitAfterPendingCompaction) throws IOException { scheduleCompaction(compactionInstantTime, client, cfg); executeCompaction(compactionInstantTime, client, table, cfg, expectedNumRecs, hasDeltaCommitAfterPendingCompaction); } - private void executeCompaction(String compactionInstantTime, - HoodieWriteClient client, HoodieTable table, HoodieWriteConfig cfg, int expectedNumRecs, - boolean hasDeltaCommitAfterPendingCompaction) throws IOException { + private void executeCompaction(String compactionInstantTime, HoodieWriteClient client, HoodieTable table, + HoodieWriteConfig cfg, int expectedNumRecs, boolean hasDeltaCommitAfterPendingCompaction) throws IOException { client.compact(compactionInstantTime); List fileSliceList = getCurrentLatestFileSlices(table, cfg); assertTrue("Ensure latest file-slices are not empty", fileSliceList.stream().findAny().isPresent()); - assertFalse("Verify all file-slices have base-instant same as compaction instant", - fileSliceList.stream().filter(fs -> !fs.getBaseInstantTime().equals(compactionInstantTime)) - .findAny().isPresent()); + assertFalse("Verify all file-slices have base-instant same as compaction instant", fileSliceList.stream() + .filter(fs -> !fs.getBaseInstantTime().equals(compactionInstantTime)).findAny().isPresent()); assertFalse("Verify all file-slices have data-files", fileSliceList.stream().filter(fs -> !fs.getDataFile().isPresent()).findAny().isPresent()); @@ -522,12 +513,11 @@ public class TestAsyncCompaction extends TestHoodieClientBase { } // verify that there is a commit - table = getHoodieTable( - new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath(), true), cfg); + table = getHoodieTable(new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath(), true), cfg); HoodieTimeline timeline = table.getMetaClient().getCommitTimeline().filterCompletedInstants(); String latestCompactionCommitTime = timeline.lastInstant().get().getTimestamp(); - assertEquals("Expect compaction instant time to be the latest commit time", - latestCompactionCommitTime, compactionInstantTime); + assertEquals("Expect compaction instant time to be the latest commit time", latestCompactionCommitTime, + compactionInstantTime); Assert.assertEquals("Must contain expected records", expectedNumRecs, HoodieClientTestUtils.readSince(basePath, sqlContext, timeline, "000").count()); @@ -546,8 +536,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase { client.commit(instantTime, statuses); } - Option deltaCommit = metaClient.getActiveTimeline().reload().getDeltaCommitTimeline() - .filterCompletedInstants().lastInstant(); + Option deltaCommit = + metaClient.getActiveTimeline().reload().getDeltaCommitTimeline().filterCompletedInstants().lastInstant(); if (skipCommit && !cfg.shouldAutoCommit()) { assertTrue("Delta commit should not be latest instant", deltaCommit.get().getTimestamp().compareTo(instantTime) < 0); @@ -560,8 +550,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase { private List getCurrentLatestDataFiles(HoodieTable table, HoodieWriteConfig cfg) throws IOException { FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(table.getMetaClient().getFs(), cfg.getBasePath()); - HoodieTableFileSystemView - view = new HoodieTableFileSystemView(table.getMetaClient(), table.getCompletedCommitsTimeline(), allFiles); + HoodieTableFileSystemView view = + new HoodieTableFileSystemView(table.getMetaClient(), table.getCompletedCommitsTimeline(), allFiles); List dataFilesToRead = view.getLatestDataFiles().collect(Collectors.toList()); return dataFilesToRead; } @@ -569,9 +559,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase { private List getCurrentLatestFileSlices(HoodieTable table, HoodieWriteConfig cfg) throws IOException { HoodieTableFileSystemView view = new HoodieTableFileSystemView(table.getMetaClient(), table.getMetaClient().getActiveTimeline().reload().getCommitsAndCompactionTimeline()); - List fileSliceList = - Arrays.asList(HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS).stream().flatMap(partition -> - view.getLatestFileSlices(partition)).collect(Collectors.toList()); + List fileSliceList = Arrays.asList(HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS).stream() + .flatMap(partition -> view.getLatestFileSlices(partition)).collect(Collectors.toList()); return fileSliceList; } diff --git a/hudi-client/src/test/java/org/apache/hudi/TestCleaner.java b/hudi-client/src/test/java/org/apache/hudi/TestCleaner.java index fe2549193..10f160786 100644 --- a/hudi-client/src/test/java/org/apache/hudi/TestCleaner.java +++ b/hudi-client/src/test/java/org/apache/hudi/TestCleaner.java @@ -93,16 +93,13 @@ public class TestCleaner extends TestHoodieClientBase { * @param insertFn Insertion API for testing * @throws Exception in case of error */ - private String insertFirstBigBatchForClientCleanerTest( - HoodieWriteConfig cfg, - HoodieWriteClient client, + private String insertFirstBigBatchForClientCleanerTest(HoodieWriteConfig cfg, HoodieWriteClient client, Function2, String, Integer> recordGenFunction, Function3, HoodieWriteClient, JavaRDD, String> insertFn) throws Exception { /** - * do a big insert - * (this is basically same as insert part of upsert, just adding it here so we can - * catch breakages in insert(), if the implementation diverges.) + * do a big insert (this is basically same as insert part of upsert, just adding it here so we can catch breakages + * in insert(), if the implementation diverges.) */ String newCommitTime = client.startCommit(); @@ -145,8 +142,8 @@ public class TestCleaner extends TestHoodieClientBase { */ @Test public void testInsertPreppedAndCleanByVersions() throws Exception { - testInsertAndCleanByVersions(HoodieWriteClient::insertPreppedRecords, - HoodieWriteClient::upsertPreppedRecords, true); + testInsertAndCleanByVersions(HoodieWriteClient::insertPreppedRecords, HoodieWriteClient::upsertPreppedRecords, + true); } /** @@ -173,20 +170,18 @@ public class TestCleaner extends TestHoodieClientBase { * @param insertFn Insert API to be tested * @param upsertFn Upsert API to be tested * @param isPreppedAPI Flag to indicate if a prepped-version is used. If true, a wrapper function will be used during - * record generation to also tag the regards (de-dupe is implicit as we use uniq record-gen APIs) + * record generation to also tag the regards (de-dupe is implicit as we use uniq record-gen APIs) * @throws Exception in case of errors */ private void testInsertAndCleanByVersions( Function3, HoodieWriteClient, JavaRDD, String> insertFn, - Function3, HoodieWriteClient, JavaRDD, String> upsertFn, - boolean isPreppedAPI - ) throws Exception { + Function3, HoodieWriteClient, JavaRDD, String> upsertFn, boolean isPreppedAPI) + throws Exception { int maxVersions = 2; // keep upto 2 versions for each file - HoodieWriteConfig cfg = getConfigBuilder().withCompactionConfig( - HoodieCompactionConfig.newBuilder().withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS) - .retainFileVersions(maxVersions).build()) - .withParallelism(1, 1).withBulkInsertParallelism(1) - .withFinalizeWriteParallelism(1) + HoodieWriteConfig cfg = getConfigBuilder() + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(maxVersions).build()) + .withParallelism(1, 1).withBulkInsertParallelism(1).withFinalizeWriteParallelism(1) .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build()) .build(); try (HoodieWriteClient client = getHoodieWriteClient(cfg);) { @@ -204,11 +199,10 @@ public class TestCleaner extends TestHoodieClientBase { HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig(), jsc); for (String partitionPath : dataGen.getPartitionPaths()) { TableFileSystemView fsView = table.getFileSystemView(); - Option added = Option.fromJavaOptional(fsView.getAllFileGroups(partitionPath).findFirst() - .map(fg -> { - fg.getLatestFileSlice().map(fs -> compactionFileIdToLatestFileSlice.put(fg.getFileGroupId(), fs)); - return true; - })); + Option added = Option.fromJavaOptional(fsView.getAllFileGroups(partitionPath).findFirst().map(fg -> { + fg.getLatestFileSlice().map(fs -> compactionFileIdToLatestFileSlice.put(fg.getFileGroupId(), fs)); + return true; + })); if (added.isPresent()) { // Select only one file-group for compaction break; @@ -234,8 +228,7 @@ public class TestCleaner extends TestHoodieClientBase { client.startCommitWithTime(newInstantTime); List records = recordUpsertGenWrappedFunction.apply(newInstantTime, 100); - List statuses = - upsertFn.apply(client, jsc.parallelize(records, 1), newInstantTime).collect(); + List statuses = upsertFn.apply(client, jsc.parallelize(records, 1), newInstantTime).collect(); // Verify there are no errors assertNoWriteErrors(statuses); @@ -249,8 +242,8 @@ public class TestCleaner extends TestHoodieClientBase { // compute all the versions of all files, from time 0 HashMap> fileIdToVersions = new HashMap<>(); for (HoodieInstant entry : timeline.getInstants().collect(Collectors.toList())) { - HoodieCommitMetadata commitMetadata = HoodieCommitMetadata - .fromBytes(timeline.getInstantDetails(entry).get(), HoodieCommitMetadata.class); + HoodieCommitMetadata commitMetadata = + HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(entry).get(), HoodieCommitMetadata.class); for (HoodieWriteStat wstat : commitMetadata.getWriteStats(partitionPath)) { if (!fileIdToVersions.containsKey(wstat.getFileId())) { @@ -267,8 +260,8 @@ public class TestCleaner extends TestHoodieClientBase { // Ensure latest file-slice selected for compaction is retained Option dataFileForCompactionPresent = Option.fromJavaOptional(fileGroup.getAllDataFiles().filter(df -> { - return compactionFileIdToLatestFileSlice.get(fileGroup.getFileGroupId()) - .getBaseInstantTime().equals(df.getCommitTime()); + return compactionFileIdToLatestFileSlice.get(fileGroup.getFileGroupId()).getBaseInstantTime() + .equals(df.getCommitTime()); }).findAny()); Assert.assertTrue("Data File selected for compaction is retained", dataFileForCompactionPresent.isPresent()); @@ -310,8 +303,7 @@ public class TestCleaner extends TestHoodieClientBase { */ @Test public void testInsertPreppedAndCleanByCommits() throws Exception { - testInsertAndCleanByCommits(HoodieWriteClient::insertPreppedRecords, - HoodieWriteClient::upsertPreppedRecords, true); + testInsertAndCleanByCommits(HoodieWriteClient::insertPreppedRecords, HoodieWriteClient::upsertPreppedRecords, true); } /** @@ -338,20 +330,18 @@ public class TestCleaner extends TestHoodieClientBase { * @param insertFn Insert API to be tested * @param upsertFn Upsert API to be tested * @param isPreppedAPI Flag to indicate if a prepped-version is used. If true, a wrapper function will be used during - * record generation to also tag the regards (de-dupe is implicit as we use uniq record-gen APIs) + * record generation to also tag the regards (de-dupe is implicit as we use uniq record-gen APIs) * @throws Exception in case of errors */ private void testInsertAndCleanByCommits( Function3, HoodieWriteClient, JavaRDD, String> insertFn, - Function3, HoodieWriteClient, JavaRDD, String> upsertFn, - boolean isPreppedAPI - ) throws Exception { + Function3, HoodieWriteClient, JavaRDD, String> upsertFn, boolean isPreppedAPI) + throws Exception { int maxCommits = 3; // keep upto 3 commits from the past - HoodieWriteConfig cfg = getConfigBuilder().withCompactionConfig( - HoodieCompactionConfig.newBuilder() + HoodieWriteConfig cfg = getConfigBuilder() + .withCompactionConfig(HoodieCompactionConfig.newBuilder() .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainCommits(maxCommits).build()) - .withParallelism(1, 1).withBulkInsertParallelism(1) - .withFinalizeWriteParallelism(1) + .withParallelism(1, 1).withBulkInsertParallelism(1).withFinalizeWriteParallelism(1) .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build()) .build(); HoodieWriteClient client = getHoodieWriteClient(cfg); @@ -370,8 +360,7 @@ public class TestCleaner extends TestHoodieClientBase { client.startCommitWithTime(newCommitTime); List records = recordUpsertGenWrappedFunction.apply(newCommitTime, 100); - List statuses = - upsertFn.apply(client, jsc.parallelize(records, 1), newCommitTime).collect(); + List statuses = upsertFn.apply(client, jsc.parallelize(records, 1), newCommitTime).collect(); // Verify there are no errors assertNoWriteErrors(statuses); @@ -381,9 +370,9 @@ public class TestCleaner extends TestHoodieClientBase { Option earliestRetainedCommit = activeTimeline.nthFromLastInstant(maxCommits - 1); Set acceptableCommits = activeTimeline.getInstants().collect(Collectors.toSet()); if (earliestRetainedCommit.isPresent()) { - acceptableCommits.removeAll( - activeTimeline.findInstantsInRange("000", earliestRetainedCommit.get().getTimestamp()).getInstants() - .collect(Collectors.toSet())); + acceptableCommits + .removeAll(activeTimeline.findInstantsInRange("000", earliestRetainedCommit.get().getTimestamp()) + .getInstants().collect(Collectors.toSet())); acceptableCommits.add(earliestRetainedCommit.get()); } @@ -412,18 +401,19 @@ public class TestCleaner extends TestHoodieClientBase { */ @Test public void testKeepLatestFileVersions() throws IOException { - HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true) - .withCompactionConfig(HoodieCompactionConfig.newBuilder().withCleanerPolicy( - HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(1).build()) - .build(); + HoodieWriteConfig config = + HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true) + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(1).build()) + .build(); // make 1 commit, with 1 file per partition HoodieTestUtils.createCommitFiles(basePath, "000"); - String file1P0C0 = HoodieTestUtils - .createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000"); - String file1P1C0 = HoodieTestUtils - .createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000"); + String file1P0C0 = + HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000"); + String file1P1C0 = + HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000"); metaClient = HoodieTableMetaClient.reload(metaClient); HoodieTable table = HoodieTable.getHoodieTable(metaClient, config, jsc); @@ -434,24 +424,22 @@ public class TestCleaner extends TestHoodieClientBase { assertEquals("Must not clean any files", 0, getCleanStat(hoodieCleanStatsOne, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles() .size()); - assertTrue(HoodieTestUtils - .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0C0)); - assertTrue(HoodieTestUtils - .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000", file1P1C0)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", + file1P0C0)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000", + file1P1C0)); // make next commit, with 1 insert & 1 update per partition HoodieTestUtils.createCommitFiles(basePath, "001"); metaClient = HoodieTableMetaClient.reload(metaClient); table = HoodieTable.getHoodieTable(metaClient, config, jsc); - String file2P0C1 = HoodieTestUtils - .createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001"); // insert - String file2P1C1 = HoodieTestUtils - .createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001"); // insert - HoodieTestUtils - .createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0C0); // update - HoodieTestUtils - .createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001", file1P1C0); // update + String file2P0C1 = + HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001"); // insert + String file2P1C1 = + HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001"); // insert + HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0C0); // update + HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001", file1P1C0); // update List hoodieCleanStatsTwo = table.clean(jsc); assertEquals("Must clean 1 file", 1, @@ -460,47 +448,44 @@ public class TestCleaner extends TestHoodieClientBase { assertEquals("Must clean 1 file", 1, getCleanStat(hoodieCleanStatsTwo, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles() .size()); - assertTrue(HoodieTestUtils - .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file2P0C1)); - assertTrue(HoodieTestUtils - .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001", file2P1C1)); - assertFalse(HoodieTestUtils - .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0C0)); - assertFalse(HoodieTestUtils - .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000", file1P1C0)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", + file2P0C1)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001", + file2P1C1)); + assertFalse(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", + file1P0C0)); + assertFalse(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, + "000", file1P1C0)); // make next commit, with 2 updates to existing files, and 1 insert HoodieTestUtils.createCommitFiles(basePath, "002"); metaClient = HoodieTableMetaClient.reload(metaClient); table = HoodieTable.getHoodieTable(metaClient, config, jsc); - HoodieTestUtils - .createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file1P0C0); // update - HoodieTestUtils - .createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file2P0C1); // update - String file3P0C2 = HoodieTestUtils - .createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002"); + HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file1P0C0); // update + HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file2P0C1); // update + String file3P0C2 = + HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002"); List hoodieCleanStatsThree = table.clean(jsc); assertEquals("Must clean two files", 2, getCleanStat(hoodieCleanStatsThree, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH) .getSuccessDeleteFiles().size()); - assertFalse(HoodieTestUtils - .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0C0)); - assertFalse(HoodieTestUtils - .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file2P0C1)); - assertTrue(HoodieTestUtils - .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file3P0C2)); + assertFalse(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", + file1P0C0)); + assertFalse(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", + file2P0C1)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", + file3P0C2)); // No cleaning on partially written file, with no commit. - HoodieTestUtils - .createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003", file3P0C2); // update + HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003", file3P0C2); // update List hoodieCleanStatsFour = table.clean(jsc); assertEquals("Must not clean any files", 0, getCleanStat(hoodieCleanStatsFour, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH).getSuccessDeleteFiles() .size()); - assertTrue(HoodieTestUtils - .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file3P0C2)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", + file3P0C2)); } /** @@ -509,37 +494,33 @@ public class TestCleaner extends TestHoodieClientBase { @Test public void testKeepLatestFileVersionsMOR() throws IOException { - HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true) - .withCompactionConfig(HoodieCompactionConfig.newBuilder().withCleanerPolicy( - HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(1).build()) - .build(); + HoodieWriteConfig config = + HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true) + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(1).build()) + .build(); - HoodieTableMetaClient metaClient = HoodieTestUtils.init(jsc.hadoopConfiguration(), basePath, - HoodieTableType.MERGE_ON_READ); + HoodieTableMetaClient metaClient = + HoodieTestUtils.init(jsc.hadoopConfiguration(), basePath, HoodieTableType.MERGE_ON_READ); // Make 3 files, one base file and 2 log files associated with base file - String file1P0 = HoodieTestUtils - .createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000"); - String file2P0L0 = HoodieTestUtils - .createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0, - Option.empty()); - String file2P0L1 = HoodieTestUtils - .createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0, - Option.of(2)); + String file1P0 = + HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000"); + String file2P0L0 = HoodieTestUtils.createNewLogFile(fs, basePath, + HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0, Option.empty()); + String file2P0L1 = HoodieTestUtils.createNewLogFile(fs, basePath, + HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0, Option.of(2)); // make 1 compaction commit HoodieTestUtils.createCompactionCommitFiles(fs, basePath, "000"); // Make 4 files, one base file and 3 log files associated with base file HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0); - file2P0L0 = HoodieTestUtils - .createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0, - Option.empty()); - file2P0L0 = HoodieTestUtils - .createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0, - Option.of(2)); - file2P0L0 = HoodieTestUtils - .createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0, - Option.of(3)); + file2P0L0 = HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, + "001", file1P0, Option.empty()); + file2P0L0 = HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, + "001", file1P0, Option.of(2)); + file2P0L0 = HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, + "001", file1P0, Option.of(3)); // make 1 compaction commit HoodieTestUtils.createCompactionCommitFiles(fs, basePath, "001"); @@ -548,16 +529,12 @@ public class TestCleaner extends TestHoodieClientBase { assertEquals("Must clean three files, one parquet and 2 log files", 3, getCleanStat(hoodieCleanStats, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH).getSuccessDeleteFiles() .size()); - assertFalse(HoodieTestUtils - .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0)); - assertFalse( - HoodieTestUtils - .doesLogFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file2P0L0, - Option.empty())); - assertFalse( - HoodieTestUtils - .doesLogFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file2P0L0, - Option.of(2))); + assertFalse(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", + file1P0)); + assertFalse(HoodieTestUtils.doesLogFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", + file2P0L0, Option.empty())); + assertFalse(HoodieTestUtils.doesLogFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", + file2P0L0, Option.of(2))); } /** @@ -566,16 +543,17 @@ public class TestCleaner extends TestHoodieClientBase { @Test public void testKeepLatestCommits() throws IOException { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true) - .withCompactionConfig(HoodieCompactionConfig.newBuilder().withCleanerPolicy( - HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build()).build(); + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build()) + .build(); // make 1 commit, with 1 file per partition HoodieTestUtils.createCommitFiles(basePath, "000"); - String file1P0C0 = HoodieTestUtils - .createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000"); - String file1P1C0 = HoodieTestUtils - .createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000"); + String file1P0C0 = + HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000"); + String file1P1C0 = + HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000"); metaClient = HoodieTableMetaClient.reload(metaClient); HoodieTable table = HoodieTable.getHoodieTable(metaClient, config, jsc); @@ -587,24 +565,22 @@ public class TestCleaner extends TestHoodieClientBase { assertEquals("Must not clean any files", 0, getCleanStat(hoodieCleanStatsOne, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles() .size()); - assertTrue(HoodieTestUtils - .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0C0)); - assertTrue(HoodieTestUtils - .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000", file1P1C0)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", + file1P0C0)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000", + file1P1C0)); // make next commit, with 1 insert & 1 update per partition HoodieTestUtils.createCommitFiles(basePath, "001"); metaClient = HoodieTableMetaClient.reload(metaClient); table = HoodieTable.getHoodieTable(metaClient, config, jsc); - String file2P0C1 = HoodieTestUtils - .createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001"); // insert - String file2P1C1 = HoodieTestUtils - .createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001"); // insert - HoodieTestUtils - .createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0C0); // update - HoodieTestUtils - .createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001", file1P1C0); // update + String file2P0C1 = + HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001"); // insert + String file2P1C1 = + HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001"); // insert + HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0C0); // update + HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001", file1P1C0); // update List hoodieCleanStatsTwo = table.clean(jsc); assertEquals("Must not clean any files", 0, @@ -613,78 +589,73 @@ public class TestCleaner extends TestHoodieClientBase { assertEquals("Must not clean any files", 0, getCleanStat(hoodieCleanStatsTwo, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles() .size()); - assertTrue(HoodieTestUtils - .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file2P0C1)); - assertTrue(HoodieTestUtils - .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001", file2P1C1)); - assertTrue(HoodieTestUtils - .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0C0)); - assertTrue(HoodieTestUtils - .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000", file1P1C0)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", + file2P0C1)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001", + file2P1C1)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", + file1P0C0)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000", + file1P1C0)); // make next commit, with 2 updates to existing files, and 1 insert HoodieTestUtils.createCommitFiles(basePath, "002"); metaClient = HoodieTableMetaClient.reload(metaClient); table = HoodieTable.getHoodieTable(metaClient, config, jsc); - HoodieTestUtils - .createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file1P0C0); // update - HoodieTestUtils - .createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file2P0C1); // update - String file3P0C2 = HoodieTestUtils - .createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002"); + HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file1P0C0); // update + HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file2P0C1); // update + String file3P0C2 = + HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002"); List hoodieCleanStatsThree = table.clean(jsc); assertEquals("Must not clean any file. We have to keep 1 version before the latest commit time to keep", 0, getCleanStat(hoodieCleanStatsThree, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH) .getSuccessDeleteFiles().size()); - assertTrue(HoodieTestUtils - .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0C0)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", + file1P0C0)); // make next commit, with 2 updates to existing files, and 1 insert HoodieTestUtils.createCommitFiles(basePath, "003"); metaClient = HoodieTableMetaClient.reload(metaClient); table = HoodieTable.getHoodieTable(metaClient, config, jsc); - HoodieTestUtils - .createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003", file1P0C0); // update - HoodieTestUtils - .createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003", file2P0C1); // update - String file4P0C3 = HoodieTestUtils - .createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003"); + HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003", file1P0C0); // update + HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003", file2P0C1); // update + String file4P0C3 = + HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003"); List hoodieCleanStatsFour = table.clean(jsc); assertEquals("Must not clean one old file", 1, getCleanStat(hoodieCleanStatsFour, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH).getSuccessDeleteFiles() .size()); - assertFalse(HoodieTestUtils - .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0C0)); - assertTrue(HoodieTestUtils - .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0C0)); - assertTrue(HoodieTestUtils - .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file1P0C0)); - assertTrue(HoodieTestUtils - .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file2P0C1)); - assertTrue(HoodieTestUtils - .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file2P0C1)); - assertTrue(HoodieTestUtils - .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file3P0C2)); - assertTrue(HoodieTestUtils - .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003", file4P0C3)); + assertFalse(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", + file1P0C0)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", + file1P0C0)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", + file1P0C0)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", + file2P0C1)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", + file2P0C1)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", + file3P0C2)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003", + file4P0C3)); // No cleaning on partially written file, with no commit. - HoodieTestUtils - .createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "004", file3P0C2); // update + HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "004", file3P0C2); // update List hoodieCleanStatsFive = table.clean(jsc); assertEquals("Must not clean any files", 0, getCleanStat(hoodieCleanStatsFive, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH).getSuccessDeleteFiles() .size()); - assertTrue(HoodieTestUtils - .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0C0)); - assertTrue(HoodieTestUtils - .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file2P0C1)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", + file1P0C0)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", + file2P0C1)); } /** @@ -711,8 +682,9 @@ public class TestCleaner extends TestHoodieClientBase { @Test public void testCleaningWithZeroPartitonPaths() throws IOException { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true) - .withCompactionConfig(HoodieCompactionConfig.newBuilder().withCleanerPolicy( - HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build()).build(); + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build()) + .build(); // Make a commit, although there are no partitionPaths. // Example use-case of this is when a client wants to create a table @@ -732,8 +704,9 @@ public class TestCleaner extends TestHoodieClientBase { @Test public void testCleaningSkewedPartitons() throws IOException { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true) - .withCompactionConfig(HoodieCompactionConfig.newBuilder().withCleanerPolicy( - HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build()).build(); + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build()) + .build(); Map stageOneShuffleReadTaskRecordsCountMap = new HashMap<>(); // Since clean involves repartition in order to uniformly distribute data, @@ -783,22 +756,20 @@ public class TestCleaner extends TestHoodieClientBase { HoodieTable table = HoodieTable.getHoodieTable(metaClient, config, jsc); List hoodieCleanStats = table.clean(jsc); - assertEquals(100, - getCleanStat(hoodieCleanStats, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH).getSuccessDeleteFiles() - .size()); - assertEquals(10, - getCleanStat(hoodieCleanStats, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles() - .size()); - assertEquals(10, - getCleanStat(hoodieCleanStats, HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH).getSuccessDeleteFiles() - .size()); + assertEquals(100, getCleanStat(hoodieCleanStats, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH) + .getSuccessDeleteFiles().size()); + assertEquals(10, getCleanStat(hoodieCleanStats, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH) + .getSuccessDeleteFiles().size()); + assertEquals(10, getCleanStat(hoodieCleanStats, HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH) + .getSuccessDeleteFiles().size()); // 3 tasks are expected since the number of partitions is 3 assertEquals(3, stageOneShuffleReadTaskRecordsCountMap.keySet().size()); // Sum of all records processed = total number of files to clean assertEquals(120, stageOneShuffleReadTaskRecordsCountMap.values().stream().reduce((a, b) -> a + b).get().intValue()); - assertTrue("The skew in handling files to clean is not removed. " + assertTrue( + "The skew in handling files to clean is not removed. " + "Each task should handle more records than the partitionPath with least files " + "and less records than the partitionPath with most files.", stageOneShuffleReadTaskRecordsCountMap.values().stream().filter(a -> a > 10 && a < 100).count() == 3); @@ -811,17 +782,18 @@ public class TestCleaner extends TestHoodieClientBase { @Test public void testKeepLatestCommitsWithPendingCompactions() throws IOException { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true) - .withCompactionConfig(HoodieCompactionConfig.newBuilder().withCleanerPolicy( - HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build()).build(); + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build()) + .build(); // Deletions: - // . FileId Parquet Logs Total Retained Commits - // FileId7 5 10 15 009, 011 - // FileId6 5 10 15 009 - // FileId5 3 6 9 005 - // FileId4 2 4 6 003 - // FileId3 1 2 3 001 - // FileId2 0 0 0 000 - // FileId1 0 0 0 000 + // . FileId Parquet Logs Total Retained Commits + // FileId7 5 10 15 009, 011 + // FileId6 5 10 15 009 + // FileId5 3 6 9 005 + // FileId4 2 4 6 003 + // FileId3 1 2 3 001 + // FileId2 0 0 0 000 + // FileId1 0 0 0 000 testPendingCompactions(config, 48, 18); } @@ -830,18 +802,20 @@ public class TestCleaner extends TestHoodieClientBase { */ @Test public void testKeepLatestVersionsWithPendingCompactions() throws IOException { - HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true) - .withCompactionConfig(HoodieCompactionConfig.newBuilder().withCleanerPolicy( - HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(2).build()).build(); + HoodieWriteConfig config = + HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true) + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(2).build()) + .build(); // Deletions: - // . FileId Parquet Logs Total Retained Commits - // FileId7 5 10 15 009, 011 - // FileId6 4 8 12 007, 009 - // FileId5 2 4 6 003 005 - // FileId4 1 2 3 001, 003 - // FileId3 0 0 0 000, 001 - // FileId2 0 0 0 000 - // FileId1 0 0 0 000 + // . FileId Parquet Logs Total Retained Commits + // FileId7 5 10 15 009, 011 + // FileId6 4 8 12 007, 009 + // FileId5 2 4 6 003 005 + // FileId4 1 2 3 001, 003 + // FileId3 0 0 0 000, 001 + // FileId2 0 0 0 000 + // FileId1 0 0 0 000 testPendingCompactions(config, 36, 9); } @@ -853,10 +827,10 @@ public class TestCleaner extends TestHoodieClientBase { */ public void testPendingCompactions(HoodieWriteConfig config, int expNumFilesDeleted, int expNumFilesUnderCompactionDeleted) throws IOException { - HoodieTableMetaClient metaClient = HoodieTestUtils.init(jsc.hadoopConfiguration(), basePath, - HoodieTableType.MERGE_ON_READ); - String[] instants = new String[]{"000", "001", "003", "005", "007", "009", "011", "013"}; - String[] compactionInstants = new String[]{"002", "004", "006", "008", "010"}; + HoodieTableMetaClient metaClient = + HoodieTestUtils.init(jsc.hadoopConfiguration(), basePath, HoodieTableType.MERGE_ON_READ); + String[] instants = new String[] {"000", "001", "003", "005", "007", "009", "011", "013"}; + String[] compactionInstants = new String[] {"002", "004", "006", "008", "010"}; Map expFileIdToPendingCompaction = new HashMap<>(); Map fileIdToLatestInstantBeforeCompaction = new HashMap<>(); Map> compactionInstantsToFileSlices = new HashMap<>(); @@ -870,13 +844,11 @@ public class TestCleaner extends TestHoodieClientBase { // compactions // FileIds 2-5 will be under compaction int maxNumFileIds = 7; - String[] fileIds = new String[] - {"fileId1", "fileId2", "fileId3", "fileId4", "fileId5", "fileId6", "fileId7"}; + String[] fileIds = new String[] {"fileId1", "fileId2", "fileId3", "fileId4", "fileId5", "fileId6", "fileId7"}; int maxNumFileIdsForCompaction = 4; for (int i = 0; i < maxNumFileIds; i++) { - final String fileId = HoodieTestUtils - .createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, instants[0], - fileIds[i]); + final String fileId = HoodieTestUtils.createDataFile(basePath, + HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, instants[0], fileIds[i]); HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, instants[0], fileId, Option.empty()); HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, instants[0], @@ -887,9 +859,9 @@ public class TestCleaner extends TestHoodieClientBase { expFileIdToPendingCompaction.put(fileId, compactionInstants[j]); metaClient = HoodieTableMetaClient.reload(metaClient); HoodieTable table = HoodieTable.getHoodieTable(metaClient, config, jsc); - FileSlice slice = table.getRTFileSystemView().getLatestFileSlices( - HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH) - .filter(fs -> fs.getFileId().equals(fileId)).findFirst().get(); + FileSlice slice = + table.getRTFileSystemView().getLatestFileSlices(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH) + .filter(fs -> fs.getFileId().equals(fileId)).findFirst().get(); List slices = new ArrayList<>(); if (compactionInstantsToFileSlices.containsKey(compactionInstants[j])) { slices = compactionInstantsToFileSlices.get(compactionInstants[j]); @@ -898,20 +870,16 @@ public class TestCleaner extends TestHoodieClientBase { compactionInstantsToFileSlices.put(compactionInstants[j], slices); // Add log-files to simulate delta-commits after pending compaction HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, - compactionInstants[j], - fileId, Option.empty()); + compactionInstants[j], fileId, Option.empty()); HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, - compactionInstants[j], - fileId, Option.of(2)); + compactionInstants[j], fileId, Option.of(2)); } else { - HoodieTestUtils - .createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, instants[j], fileId); - HoodieTestUtils - .createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, instants[j], fileId, - Option.empty()); - HoodieTestUtils - .createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, instants[j], fileId, - Option.of(2)); + HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, instants[j], + fileId); + HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, + instants[j], fileId, Option.empty()); + HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, + instants[j], fileId, Option.of(2)); fileIdToLatestInstantBeforeCompaction.put(fileId, instants[j]); } } @@ -921,9 +889,8 @@ public class TestCleaner extends TestHoodieClientBase { for (String instant : compactionInstants) { List fileSliceList = compactionInstantsToFileSlices.get(instant); if (null != fileSliceList) { - HoodieTestUtils.createCompactionRequest(metaClient, instant, - fileSliceList.stream().map(fs -> Pair.of(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, fs)) - .collect(Collectors.toList())); + HoodieTestUtils.createCompactionRequest(metaClient, instant, fileSliceList.stream() + .map(fs -> Pair.of(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, fs)).collect(Collectors.toList())); } } @@ -939,38 +906,35 @@ public class TestCleaner extends TestHoodieClientBase { expFileIdToPendingCompaction.entrySet().stream().forEach(entry -> { String fileId = entry.getKey(); String baseInstantForCompaction = fileIdToLatestInstantBeforeCompaction.get(fileId); - Option fileSliceForCompaction = - Option.fromJavaOptional( - hoodieTable.getRTFileSystemView().getLatestFileSlicesBeforeOrOn( - HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, - baseInstantForCompaction, true).filter(fs -> fs.getFileId().equals(fileId)).findFirst()); + Option fileSliceForCompaction = Option.fromJavaOptional(hoodieTable.getRTFileSystemView() + .getLatestFileSlicesBeforeOrOn(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, baseInstantForCompaction, + true) + .filter(fs -> fs.getFileId().equals(fileId)).findFirst()); Assert.assertTrue("Base Instant for Compaction must be preserved", fileSliceForCompaction.isPresent()); Assert.assertTrue("FileSlice has data-file", fileSliceForCompaction.get().getDataFile().isPresent()); - Assert.assertEquals("FileSlice has log-files", 2, - fileSliceForCompaction.get().getLogFiles().count()); + Assert.assertEquals("FileSlice has log-files", 2, fileSliceForCompaction.get().getLogFiles().count()); }); // Test for progress (Did we clean some files ?) - long numFilesUnderCompactionDeleted = - hoodieCleanStats.stream().flatMap(cleanStat -> { - return convertPathToFileIdWithCommitTime(newMetaClient, cleanStat.getDeletePathPatterns()).map( - fileIdWithCommitTime -> { - if (expFileIdToPendingCompaction.containsKey(fileIdWithCommitTime.getKey())) { - Assert.assertTrue("Deleted instant time must be less than pending compaction", - HoodieTimeline.compareTimestamps( - fileIdToLatestInstantBeforeCompaction.get(fileIdWithCommitTime.getKey()), - fileIdWithCommitTime.getValue(), HoodieTimeline.GREATER)); - return true; - } - return false; - }); - }).filter(x -> x).count(); - long numDeleted = hoodieCleanStats.stream() - .flatMap(cleanStat -> cleanStat.getDeletePathPatterns().stream()).count(); + long numFilesUnderCompactionDeleted = hoodieCleanStats.stream().flatMap(cleanStat -> { + return convertPathToFileIdWithCommitTime(newMetaClient, cleanStat.getDeletePathPatterns()) + .map(fileIdWithCommitTime -> { + if (expFileIdToPendingCompaction.containsKey(fileIdWithCommitTime.getKey())) { + Assert.assertTrue("Deleted instant time must be less than pending compaction", + HoodieTimeline.compareTimestamps( + fileIdToLatestInstantBeforeCompaction.get(fileIdWithCommitTime.getKey()), + fileIdWithCommitTime.getValue(), HoodieTimeline.GREATER)); + return true; + } + return false; + }); + }).filter(x -> x).count(); + long numDeleted = + hoodieCleanStats.stream().flatMap(cleanStat -> cleanStat.getDeletePathPatterns().stream()).count(); // Tighter check for regression Assert.assertEquals("Correct number of files deleted", expNumFilesDeleted, numDeleted); - Assert.assertEquals("Correct number of files under compaction deleted", - expNumFilesUnderCompactionDeleted, numFilesUnderCompactionDeleted); + Assert.assertEquals("Correct number of files under compaction deleted", expNumFilesUnderCompactionDeleted, + numFilesUnderCompactionDeleted); } /** @@ -991,6 +955,7 @@ public class TestCleaner extends TestHoodieClientBase { /*** * Helper method to return temporary files count + * * @return Number of temporary files found * @throws IOException in case of error */ @@ -1004,22 +969,20 @@ public class TestCleaner extends TestHoodieClientBase { return count; } - private Stream> convertPathToFileIdWithCommitTime( - final HoodieTableMetaClient metaClient, List paths) { - Predicate roFilePredicate = path -> - path.contains(metaClient.getTableConfig().getROFileFormat().getFileExtension()); - Predicate rtFilePredicate = path -> - path.contains(metaClient.getTableConfig().getRTFileFormat().getFileExtension()); - Stream> stream1 = paths.stream().filter(roFilePredicate) - .map(fullPath -> { - String fileName = Paths.get(fullPath).getFileName().toString(); - return Pair.of(FSUtils.getFileId(fileName), FSUtils.getCommitTime(fileName)); - }); - Stream> stream2 = paths.stream().filter(rtFilePredicate) - .map(path -> { - return Pair.of(FSUtils.getFileIdFromLogPath(new Path(path)), - FSUtils.getBaseCommitTimeFromLogPath(new Path(path))); - }); + private Stream> convertPathToFileIdWithCommitTime(final HoodieTableMetaClient metaClient, + List paths) { + Predicate roFilePredicate = + path -> path.contains(metaClient.getTableConfig().getROFileFormat().getFileExtension()); + Predicate rtFilePredicate = + path -> path.contains(metaClient.getTableConfig().getRTFileFormat().getFileExtension()); + Stream> stream1 = paths.stream().filter(roFilePredicate).map(fullPath -> { + String fileName = Paths.get(fullPath).getFileName().toString(); + return Pair.of(FSUtils.getFileId(fileName), FSUtils.getCommitTime(fileName)); + }); + Stream> stream2 = paths.stream().filter(rtFilePredicate).map(path -> { + return Pair.of(FSUtils.getFileIdFromLogPath(new Path(path)), + FSUtils.getBaseCommitTimeFromLogPath(new Path(path))); + }); return Stream.concat(stream1, stream2); } } diff --git a/hudi-client/src/test/java/org/apache/hudi/TestClientRollback.java b/hudi-client/src/test/java/org/apache/hudi/TestClientRollback.java index 11504a479..f5523eab5 100644 --- a/hudi-client/src/test/java/org/apache/hudi/TestClientRollback.java +++ b/hudi-client/src/test/java/org/apache/hudi/TestClientRollback.java @@ -54,9 +54,8 @@ public class TestClientRollback extends TestHoodieClientBase { */ @Test public void testSavepointAndRollback() throws Exception { - HoodieWriteConfig cfg = getConfigBuilder().withCompactionConfig( - HoodieCompactionConfig.newBuilder().withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(1) - .build()).build(); + HoodieWriteConfig cfg = getConfigBuilder().withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(1).build()).build(); try (HoodieWriteClient client = getHoodieWriteClient(cfg);) { HoodieTestDataGenerator.writePartitionMetadata(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, basePath); @@ -95,8 +94,8 @@ public class TestClientRollback extends TestHoodieClientBase { statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); // Verify there are no errors assertNoWriteErrors(statuses); - List partitionPaths = FSUtils.getAllPartitionPaths(fs, cfg.getBasePath(), - getConfig().shouldAssumeDatePartitioning()); + List partitionPaths = + FSUtils.getAllPartitionPaths(fs, cfg.getBasePath(), getConfig().shouldAssumeDatePartitioning()); metaClient = HoodieTableMetaClient.reload(metaClient); HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig(), jsc); final ReadOptimizedView view1 = table.getROFileSystemView(); @@ -173,8 +172,8 @@ public class TestClientRollback extends TestHoodieClientBase { String commitTime2 = "20160502020601"; String commitTime3 = "20160506030611"; new File(basePath + "/.hoodie").mkdirs(); - HoodieTestDataGenerator - .writePartitionMetadata(fs, new String[]{"2016/05/01", "2016/05/02", "2016/05/06"}, basePath); + HoodieTestDataGenerator.writePartitionMetadata(fs, new String[] {"2016/05/01", "2016/05/02", "2016/05/06"}, + basePath); // Only first two have commit files HoodieTestUtils.createCommitFiles(basePath, commitTime1, commitTime2); @@ -196,8 +195,8 @@ public class TestClientRollback extends TestHoodieClientBase { String file32 = HoodieTestUtils.createDataFile(basePath, "2016/05/02", commitTime3, "id32"); String file33 = HoodieTestUtils.createDataFile(basePath, "2016/05/06", commitTime3, "id33"); - HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withIndexConfig( - HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY).build()).build(); + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY).build()).build(); try (HoodieWriteClient client = getHoodieWriteClient(config, false);) { @@ -263,8 +262,8 @@ public class TestClientRollback extends TestHoodieClientBase { String commitTime2 = "20160502020601"; String commitTime3 = "20160506030611"; new File(basePath + "/.hoodie").mkdirs(); - HoodieTestDataGenerator - .writePartitionMetadata(fs, new String[]{"2016/05/01", "2016/05/02", "2016/05/06"}, basePath); + HoodieTestDataGenerator.writePartitionMetadata(fs, new String[] {"2016/05/01", "2016/05/02", "2016/05/06"}, + basePath); // One good commit HoodieTestUtils.createCommitFiles(basePath, commitTime1); @@ -287,8 +286,8 @@ public class TestClientRollback extends TestHoodieClientBase { String file33 = HoodieTestUtils.createDataFile(basePath, "2016/05/06", commitTime3, "id33"); // Turn auto rollback off - HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withIndexConfig( - HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY).build()).build(); + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY).build()).build(); try (HoodieWriteClient client = getHoodieWriteClient(config, false);) { diff --git a/hudi-client/src/test/java/org/apache/hudi/TestCompactionAdminClient.java b/hudi-client/src/test/java/org/apache/hudi/TestCompactionAdminClient.java index dc2b23690..a987f8bd5 100644 --- a/hudi-client/src/test/java/org/apache/hudi/TestCompactionAdminClient.java +++ b/hudi-client/src/test/java/org/apache/hudi/TestCompactionAdminClient.java @@ -67,29 +67,23 @@ public class TestCompactionAdminClient extends TestHoodieClientBase { @Test public void testUnscheduleCompactionPlan() throws Exception { int numEntriesPerInstant = 10; - CompactionTestUtils - .setupAndValidateCompactionOperations(metaClient, false, numEntriesPerInstant, numEntriesPerInstant, - numEntriesPerInstant, numEntriesPerInstant); + CompactionTestUtils.setupAndValidateCompactionOperations(metaClient, false, numEntriesPerInstant, + numEntriesPerInstant, numEntriesPerInstant, numEntriesPerInstant); // THere are delta-commits after compaction instant - validateUnSchedulePlan(client, - "000", "001", numEntriesPerInstant, 2 * numEntriesPerInstant); + validateUnSchedulePlan(client, "000", "001", numEntriesPerInstant, 2 * numEntriesPerInstant); // THere are delta-commits after compaction instant - validateUnSchedulePlan(client, - "002", "003", numEntriesPerInstant, 2 * numEntriesPerInstant); + validateUnSchedulePlan(client, "002", "003", numEntriesPerInstant, 2 * numEntriesPerInstant); // THere are no delta-commits after compaction instant - validateUnSchedulePlan(client, - "004", "005", numEntriesPerInstant, 0); + validateUnSchedulePlan(client, "004", "005", numEntriesPerInstant, 0); // THere are no delta-commits after compaction instant - validateUnSchedulePlan(client, - "006", "007", numEntriesPerInstant, 0); + validateUnSchedulePlan(client, "006", "007", numEntriesPerInstant, 0); } @Test public void testUnscheduleCompactionFileId() throws Exception { int numEntriesPerInstant = 10; - CompactionTestUtils - .setupAndValidateCompactionOperations(metaClient, false, numEntriesPerInstant, numEntriesPerInstant, - numEntriesPerInstant, numEntriesPerInstant); + CompactionTestUtils.setupAndValidateCompactionOperations(metaClient, false, numEntriesPerInstant, + numEntriesPerInstant, numEntriesPerInstant, numEntriesPerInstant); Map instantsWithOp = Arrays.asList("001", "003", "005", "007").stream().map(instant -> { try { @@ -97,29 +91,24 @@ public class TestCompactionAdminClient extends TestHoodieClientBase { } catch (IOException ioe) { throw new HoodieException(ioe); } - }).map(instantWithPlan -> instantWithPlan.getRight().getOperations().stream().map(op -> Pair.of( - instantWithPlan.getLeft(), CompactionOperation.convertFromAvroRecordInstance(op))).findFirst().get()) - .collect(Collectors.toMap(Pair::getLeft, Pair::getRight)); + }).map(instantWithPlan -> instantWithPlan.getRight().getOperations().stream() + .map(op -> Pair.of(instantWithPlan.getLeft(), CompactionOperation.convertFromAvroRecordInstance(op))) + .findFirst().get()).collect(Collectors.toMap(Pair::getLeft, Pair::getRight)); // THere are delta-commits after compaction instant - validateUnScheduleFileId(client, - "000", "001", instantsWithOp.get("001"), 2); + validateUnScheduleFileId(client, "000", "001", instantsWithOp.get("001"), 2); // THere are delta-commits after compaction instant - validateUnScheduleFileId(client, - "002", "003", instantsWithOp.get("003"), 2); + validateUnScheduleFileId(client, "002", "003", instantsWithOp.get("003"), 2); // THere are no delta-commits after compaction instant - validateUnScheduleFileId(client, - "004", "005", instantsWithOp.get("005"), 0); + validateUnScheduleFileId(client, "004", "005", instantsWithOp.get("005"), 0); // THere are no delta-commits after compaction instant - validateUnScheduleFileId(client, - "006", "007", instantsWithOp.get("007"), 0); + validateUnScheduleFileId(client, "006", "007", instantsWithOp.get("007"), 0); } @Test public void testRepairCompactionPlan() throws Exception { int numEntriesPerInstant = 10; - CompactionTestUtils - .setupAndValidateCompactionOperations(metaClient, false, numEntriesPerInstant, numEntriesPerInstant, - numEntriesPerInstant, numEntriesPerInstant); + CompactionTestUtils.setupAndValidateCompactionOperations(metaClient, false, numEntriesPerInstant, + numEntriesPerInstant, numEntriesPerInstant, numEntriesPerInstant); // THere are delta-commits after compaction instant validateRepair("000", "001", numEntriesPerInstant, 2 * numEntriesPerInstant); // THere are delta-commits after compaction instant @@ -140,23 +129,20 @@ public class TestCompactionAdminClient extends TestHoodieClientBase { Assert.assertTrue("Expect some failures in validation", result.stream().filter(r -> !r.isSuccess()).count() > 0); } // Now repair - List> undoFiles = result.stream().flatMap(r -> - client.getRenamingActionsToAlignWithCompactionOperation(metaClient, - compactionInstant, r.getOperation(), Option.empty()).stream()) - .map(rn -> { - try { - client.renameLogFile(metaClient, rn.getKey(), rn.getValue()); - } catch (IOException e) { - throw new HoodieIOException(e.getMessage(), e); - } - return rn; - }).collect(Collectors.toList()); - Map renameFilesFromUndo = - undoFiles.stream().collect(Collectors.toMap(p -> p.getRight().getPath().toString(), - x -> x.getLeft().getPath().toString())); - Map expRenameFiles = - renameFiles.stream().collect(Collectors.toMap(p -> p.getLeft().getPath().toString(), - x -> x.getRight().getPath().toString())); + List> undoFiles = + result.stream().flatMap(r -> client.getRenamingActionsToAlignWithCompactionOperation(metaClient, + compactionInstant, r.getOperation(), Option.empty()).stream()).map(rn -> { + try { + client.renameLogFile(metaClient, rn.getKey(), rn.getValue()); + } catch (IOException e) { + throw new HoodieIOException(e.getMessage(), e); + } + return rn; + }).collect(Collectors.toList()); + Map renameFilesFromUndo = undoFiles.stream() + .collect(Collectors.toMap(p -> p.getRight().getPath().toString(), x -> x.getLeft().getPath().toString())); + Map expRenameFiles = renameFiles.stream() + .collect(Collectors.toMap(p -> p.getLeft().getPath().toString(), x -> x.getRight().getPath().toString())); if (expNumRepairs > 0) { Assert.assertFalse("Rename Files must be non-empty", renameFiles.isEmpty()); } else { @@ -182,14 +168,13 @@ public class TestCompactionAdminClient extends TestHoodieClientBase { private void ensureValidCompactionPlan(String compactionInstant) throws Exception { metaClient = new HoodieTableMetaClient(metaClient.getHadoopConf(), basePath, true); // Ensure compaction-plan is good to begin with - List validationResults = client.validateCompactionPlan(metaClient, - compactionInstant, 1); + List validationResults = client.validateCompactionPlan(metaClient, compactionInstant, 1); Assert.assertFalse("Some validations failed", validationResults.stream().filter(v -> !v.isSuccess()).findAny().isPresent()); } - private void validateRenameFiles(List> renameFiles, - String ingestionInstant, String compactionInstant, HoodieTableFileSystemView fsView) { + private void validateRenameFiles(List> renameFiles, String ingestionInstant, + String compactionInstant, HoodieTableFileSystemView fsView) { // Ensure new names of log-files are on expected lines Set uniqNewLogFiles = new HashSet<>(); Set uniqOldLogFiles = new HashSet<>(); @@ -209,11 +194,10 @@ public class TestCompactionAdminClient extends TestHoodieClientBase { Assert.assertEquals("File Id is expected", oldLogFile.getFileId(), newLogFile.getFileId()); HoodieLogFile lastLogFileBeforeCompaction = fsView.getLatestMergedFileSlicesBeforeOrOn(HoodieTestUtils.DEFAULT_PARTITION_PATHS[0], ingestionInstant) - .filter(fs -> fs.getFileId().equals(oldLogFile.getFileId())) - .map(fs -> fs.getLogFiles().findFirst().get()).findFirst().get(); + .filter(fs -> fs.getFileId().equals(oldLogFile.getFileId())).map(fs -> fs.getLogFiles().findFirst().get()) + .findFirst().get(); Assert.assertEquals("Log Version expected", - lastLogFileBeforeCompaction.getLogVersion() + oldLogFile.getLogVersion(), - newLogFile.getLogVersion()); + lastLogFileBeforeCompaction.getLogVersion() + oldLogFile.getLogVersion(), newLogFile.getLogVersion()); Assert.assertTrue("Log version does not collide", newLogFile.getLogVersion() > lastLogFileBeforeCompaction.getLogVersion()); }); @@ -223,10 +207,9 @@ public class TestCompactionAdminClient extends TestHoodieClientBase { * Validate Unschedule operations */ private List> validateUnSchedulePlan(CompactionAdminClient client, - String ingestionInstant, String compactionInstant, int numEntriesPerInstant, int expNumRenames) - throws Exception { - return validateUnSchedulePlan(client, ingestionInstant, compactionInstant, numEntriesPerInstant, - expNumRenames, false); + String ingestionInstant, String compactionInstant, int numEntriesPerInstant, int expNumRenames) throws Exception { + return validateUnSchedulePlan(client, ingestionInstant, compactionInstant, numEntriesPerInstant, expNumRenames, + false); } /** @@ -240,8 +223,7 @@ public class TestCompactionAdminClient extends TestHoodieClientBase { // Check suggested rename operations List> renameFiles = - client.getRenamingActionsForUnschedulingCompactionPlan(metaClient, compactionInstant, 1, - Option.empty(), false); + client.getRenamingActionsForUnschedulingCompactionPlan(metaClient, compactionInstant, 1, Option.empty(), false); metaClient = new HoodieTableMetaClient(metaClient.getHadoopConf(), basePath, true); // Log files belonging to file-slices created because of compaction request must be renamed @@ -250,8 +232,7 @@ public class TestCompactionAdminClient extends TestHoodieClientBase { final HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline()); Set expLogFilesToBeRenamed = fsView.getLatestFileSlices(HoodieTestUtils.DEFAULT_PARTITION_PATHS[0]) - .filter(fs -> fs.getBaseInstantTime().equals(compactionInstant)) - .flatMap(fs -> fs.getLogFiles()) + .filter(fs -> fs.getBaseInstantTime().equals(compactionInstant)).flatMap(fs -> fs.getLogFiles()) .collect(Collectors.toSet()); Assert.assertEquals("Log files belonging to file-slices created because of compaction request must be renamed", expLogFilesToBeRenamed, gotLogFilesToBeRenamed); @@ -286,7 +267,8 @@ public class TestCompactionAdminClient extends TestHoodieClientBase { newFsView.getLatestFileSlicesBeforeOrOn(HoodieTestUtils.DEFAULT_PARTITION_PATHS[0], compactionInstant, true) .filter(fs -> fs.getBaseInstantTime().equals(compactionInstant)).forEach(fs -> { Assert.assertFalse("No Data file must be present", fs.getDataFile().isPresent()); - Assert.assertTrue("No Log Files", fs.getLogFiles().count() == 0); }); + Assert.assertTrue("No Log Files", fs.getLogFiles().count() == 0); + }); // Ensure same number of log-files before and after renaming per fileId Map fileIdToCountsAfterRenaming = @@ -295,8 +277,8 @@ public class TestCompactionAdminClient extends TestHoodieClientBase { .map(fs -> Pair.of(fs.getFileId(), fs.getLogFiles().count())) .collect(Collectors.toMap(Pair::getKey, Pair::getValue)); - Assert.assertEquals("Each File Id has same number of log-files", - fileIdToCountsBeforeRenaming, fileIdToCountsAfterRenaming); + Assert.assertEquals("Each File Id has same number of log-files", fileIdToCountsBeforeRenaming, + fileIdToCountsAfterRenaming); Assert.assertEquals("Not Empty", numEntriesPerInstant, fileIdToCountsAfterRenaming.size()); Assert.assertEquals("Expected number of renames", expNumRenames, renameFiles.size()); return renameFiles; @@ -305,15 +287,14 @@ public class TestCompactionAdminClient extends TestHoodieClientBase { /** * Validate Unschedule operations */ - private void validateUnScheduleFileId(CompactionAdminClient client, String ingestionInstant, - String compactionInstant, CompactionOperation op, int expNumRenames) throws Exception { + private void validateUnScheduleFileId(CompactionAdminClient client, String ingestionInstant, String compactionInstant, + CompactionOperation op, int expNumRenames) throws Exception { ensureValidCompactionPlan(compactionInstant); // Check suggested rename operations - List> renameFiles = - client.getRenamingActionsForUnschedulingCompactionOperation(metaClient, compactionInstant, op, - Option.empty(), false); + List> renameFiles = client + .getRenamingActionsForUnschedulingCompactionOperation(metaClient, compactionInstant, op, Option.empty(), false); metaClient = new HoodieTableMetaClient(metaClient.getHadoopConf(), basePath, true); // Log files belonging to file-slices created because of compaction request must be renamed @@ -323,8 +304,7 @@ public class TestCompactionAdminClient extends TestHoodieClientBase { new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline()); Set expLogFilesToBeRenamed = fsView.getLatestFileSlices(HoodieTestUtils.DEFAULT_PARTITION_PATHS[0]) .filter(fs -> fs.getBaseInstantTime().equals(compactionInstant)) - .filter(fs -> fs.getFileId().equals(op.getFileId())) - .flatMap(fs -> fs.getLogFiles()) + .filter(fs -> fs.getFileId().equals(op.getFileId())).flatMap(fs -> fs.getLogFiles()) .collect(Collectors.toSet()); Assert.assertEquals("Log files belonging to file-slices created because of compaction request must be renamed", expLogFilesToBeRenamed, gotLogFilesToBeRenamed); @@ -359,8 +339,8 @@ public class TestCompactionAdminClient extends TestHoodieClientBase { .map(fs -> Pair.of(fs.getFileId(), fs.getLogFiles().count())) .collect(Collectors.toMap(Pair::getKey, Pair::getValue)); - Assert.assertEquals("Each File Id has same number of log-files", - fileIdToCountsBeforeRenaming, fileIdToCountsAfterRenaming); + Assert.assertEquals("Each File Id has same number of log-files", fileIdToCountsBeforeRenaming, + fileIdToCountsAfterRenaming); Assert.assertEquals("Not Empty", 1, fileIdToCountsAfterRenaming.size()); Assert.assertEquals("Expected number of renames", expNumRenames, renameFiles.size()); } diff --git a/hudi-client/src/test/java/org/apache/hudi/TestConsistencyGuard.java b/hudi-client/src/test/java/org/apache/hudi/TestConsistencyGuard.java index c6b6d11d8..53b08c2d7 100644 --- a/hudi-client/src/test/java/org/apache/hudi/TestConsistencyGuard.java +++ b/hudi-client/src/test/java/org/apache/hudi/TestConsistencyGuard.java @@ -51,26 +51,23 @@ public class TestConsistencyGuard extends HoodieClientTestHarness { ConsistencyGuard passing = new FailSafeConsistencyGuard(fs, getConsistencyGuardConfig(1, 1000, 1000)); passing.waitTillFileAppears(new Path(basePath + "/partition/path/f1_1-0-1_000.parquet")); passing.waitTillFileAppears(new Path(basePath + "/partition/path/f2_1-0-1_000.parquet")); - passing.waitTillAllFilesAppear(basePath + "/partition/path", - Arrays.asList(basePath + "/partition/path/f1_1-0-1_000.parquet", - basePath + "/partition/path/f2_1-0-1_000.parquet")); + passing.waitTillAllFilesAppear(basePath + "/partition/path", Arrays + .asList(basePath + "/partition/path/f1_1-0-1_000.parquet", basePath + "/partition/path/f2_1-0-1_000.parquet")); fs.delete(new Path(basePath + "/partition/path/f1_1-0-1_000.parquet"), false); fs.delete(new Path(basePath + "/partition/path/f2_1-0-1_000.parquet"), false); passing.waitTillFileDisappears(new Path(basePath + "/partition/path/f1_1-0-1_000.parquet")); passing.waitTillFileDisappears(new Path(basePath + "/partition/path/f2_1-0-1_000.parquet")); - passing.waitTillAllFilesDisappear(basePath + "/partition/path", - Arrays.asList(basePath + "/partition/path/f1_1-0-1_000.parquet", - basePath + "/partition/path/f2_1-0-1_000.parquet")); + passing.waitTillAllFilesDisappear(basePath + "/partition/path", Arrays + .asList(basePath + "/partition/path/f1_1-0-1_000.parquet", basePath + "/partition/path/f2_1-0-1_000.parquet")); } @Test(expected = TimeoutException.class) public void testCheckFailingAppear() throws Exception { HoodieClientTestUtils.fakeDataFile(basePath, "partition/path", "000", "f1"); ConsistencyGuard passing = new FailSafeConsistencyGuard(fs, getConsistencyGuardConfig()); - passing.waitTillAllFilesAppear(basePath + "/partition/path", - Arrays.asList(basePath + "/partition/path/f1_1-0-2_000.parquet", - basePath + "/partition/path/f2_1-0-2_000.parquet")); + passing.waitTillAllFilesAppear(basePath + "/partition/path", Arrays + .asList(basePath + "/partition/path/f1_1-0-2_000.parquet", basePath + "/partition/path/f2_1-0-2_000.parquet")); } @@ -85,9 +82,8 @@ public class TestConsistencyGuard extends HoodieClientTestHarness { public void testCheckFailingDisappear() throws Exception { HoodieClientTestUtils.fakeDataFile(basePath, "partition/path", "000", "f1"); ConsistencyGuard passing = new FailSafeConsistencyGuard(fs, getConsistencyGuardConfig()); - passing.waitTillAllFilesDisappear(basePath + "/partition/path", - Arrays.asList(basePath + "/partition/path/f1_1-0-1_000.parquet", - basePath + "/partition/path/f2_1-0-2_000.parquet")); + passing.waitTillAllFilesDisappear(basePath + "/partition/path", Arrays + .asList(basePath + "/partition/path/f1_1-0-1_000.parquet", basePath + "/partition/path/f2_1-0-2_000.parquet")); } @Test(expected = TimeoutException.class) diff --git a/hudi-client/src/test/java/org/apache/hudi/TestHoodieClientBase.java b/hudi-client/src/test/java/org/apache/hudi/TestHoodieClientBase.java index 76d2bea7b..ff9f798c7 100644 --- a/hudi-client/src/test/java/org/apache/hudi/TestHoodieClientBase.java +++ b/hudi-client/src/test/java/org/apache/hudi/TestHoodieClientBase.java @@ -110,17 +110,15 @@ public class TestHoodieClientBase extends HoodieClientTestHarness { */ HoodieWriteConfig.Builder getConfigBuilder() { return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) - .withParallelism(2, 2) - .withBulkInsertParallelism(2).withFinalizeWriteParallelism(2) + .withParallelism(2, 2).withBulkInsertParallelism(2).withFinalizeWriteParallelism(2) .withWriteStatusClass(MetadataMergeWriteStatus.class) .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build()) .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).build()) .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build()) .forTable("test-trip-table") .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(IndexType.BLOOM).build()) - .withEmbeddedTimelineServerEnabled(true).withFileSystemViewConfig( - FileSystemViewStorageConfig.newBuilder().withStorageType(FileSystemViewStorageType.EMBEDDED_KV_STORE) - .build()); + .withEmbeddedTimelineServerEnabled(true).withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() + .withStorageType(FileSystemViewStorageType.EMBEDDED_KV_STORE).build()); } protected HoodieTable getHoodieTable(HoodieTableMetaClient metaClient, HoodieWriteConfig config) { @@ -200,15 +198,13 @@ public class TestHoodieClientBase extends HoodieClientTestHarness { * @return Wrapped function */ private Function2, String, Integer> wrapRecordsGenFunctionForPreppedCalls( - final HoodieWriteConfig writeConfig, - final Function2, String, Integer> recordGenFunction) { + final HoodieWriteConfig writeConfig, final Function2, String, Integer> recordGenFunction) { return (commit, numRecords) -> { final HoodieIndex index = HoodieIndex.createIndex(writeConfig, jsc); List records = recordGenFunction.apply(commit, numRecords); final HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath, true); HoodieTable table = HoodieTable.getHoodieTable(metaClient, writeConfig, jsc); - JavaRDD taggedRecords = - index.tagLocation(jsc.parallelize(records, 1), jsc, table); + JavaRDD taggedRecords = index.tagLocation(jsc.parallelize(records, 1), jsc, table); return taggedRecords.collect(); }; } @@ -221,10 +217,8 @@ public class TestHoodieClientBase extends HoodieClientTestHarness { * @param wrapped Actual Records Generation function * @return Wrapped Function */ - Function2, String, Integer> generateWrapRecordsFn( - boolean isPreppedAPI, - HoodieWriteConfig writeConfig, - Function2, String, Integer> wrapped) { + Function2, String, Integer> generateWrapRecordsFn(boolean isPreppedAPI, + HoodieWriteConfig writeConfig, Function2, String, Integer> wrapped) { if (isPreppedAPI) { return wrapRecordsGenFunctionForPreppedCalls(writeConfig, wrapped); } else { @@ -247,22 +241,15 @@ public class TestHoodieClientBase extends HoodieClientTestHarness { * @return RDD of write-status * @throws Exception in case of error */ - JavaRDD insertFirstBatch( - HoodieWriteConfig writeConfig, - HoodieWriteClient client, - String newCommitTime, - String initCommitTime, - int numRecordsInThisCommit, - Function3, HoodieWriteClient, JavaRDD, String> writeFn, - boolean isPreppedAPI, - boolean assertForCommit, - int expRecordsInThisCommit) throws Exception { + JavaRDD insertFirstBatch(HoodieWriteConfig writeConfig, HoodieWriteClient client, String newCommitTime, + String initCommitTime, int numRecordsInThisCommit, + Function3, HoodieWriteClient, JavaRDD, String> writeFn, boolean isPreppedAPI, + boolean assertForCommit, int expRecordsInThisCommit) throws Exception { final Function2, String, Integer> recordGenFunction = generateWrapRecordsFn(isPreppedAPI, writeConfig, dataGen::generateInserts); - return writeBatch(client, newCommitTime, initCommitTime, Option.empty(), initCommitTime, - numRecordsInThisCommit, recordGenFunction, writeFn, assertForCommit, - expRecordsInThisCommit, expRecordsInThisCommit, 1); + return writeBatch(client, newCommitTime, initCommitTime, Option.empty(), initCommitTime, numRecordsInThisCommit, + recordGenFunction, writeFn, assertForCommit, expRecordsInThisCommit, expRecordsInThisCommit, 1); } /** @@ -284,27 +271,17 @@ public class TestHoodieClientBase extends HoodieClientTestHarness { * @return RDD of write-status * @throws Exception in case of error */ - JavaRDD updateBatch( - HoodieWriteConfig writeConfig, - HoodieWriteClient client, - String newCommitTime, - String prevCommitTime, - Option> commitTimesBetweenPrevAndNew, - String initCommitTime, + JavaRDD updateBatch(HoodieWriteConfig writeConfig, HoodieWriteClient client, String newCommitTime, + String prevCommitTime, Option> commitTimesBetweenPrevAndNew, String initCommitTime, int numRecordsInThisCommit, - Function3, HoodieWriteClient, JavaRDD, String> writeFn, - boolean isPreppedAPI, - boolean assertForCommit, - int expRecordsInThisCommit, - int expTotalRecords, - int expTotalCommits) - throws Exception { + Function3, HoodieWriteClient, JavaRDD, String> writeFn, boolean isPreppedAPI, + boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, int expTotalCommits) throws Exception { final Function2, String, Integer> recordGenFunction = generateWrapRecordsFn(isPreppedAPI, writeConfig, dataGen::generateUniqueUpdates); return writeBatch(client, newCommitTime, prevCommitTime, commitTimesBetweenPrevAndNew, initCommitTime, - numRecordsInThisCommit, recordGenFunction, writeFn, assertForCommit, - expRecordsInThisCommit, expTotalRecords, expTotalCommits); + numRecordsInThisCommit, recordGenFunction, writeFn, assertForCommit, expRecordsInThisCommit, expTotalRecords, + expTotalCommits); } /** @@ -324,22 +301,13 @@ public class TestHoodieClientBase extends HoodieClientTestHarness { * @param expTotalCommits Expected number of commits (including this commit) * @throws Exception in case of error */ - JavaRDD writeBatch( - HoodieWriteClient client, - String newCommitTime, - String prevCommitTime, - Option> commitTimesBetweenPrevAndNew, - String initCommitTime, - int numRecordsInThisCommit, + JavaRDD writeBatch(HoodieWriteClient client, String newCommitTime, String prevCommitTime, + Option> commitTimesBetweenPrevAndNew, String initCommitTime, int numRecordsInThisCommit, Function2, String, Integer> recordGenFunction, Function3, HoodieWriteClient, JavaRDD, String> writeFn, - boolean assertForCommit, - int expRecordsInThisCommit, - int expTotalRecords, - int expTotalCommits) - throws Exception { + boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, int expTotalCommits) throws Exception { - //Write 1 (only inserts) + // Write 1 (only inserts) client.startCommitWithTime(newCommitTime); List records = recordGenFunction.apply(newCommitTime, numRecordsInThisCommit); @@ -373,8 +341,7 @@ public class TestHoodieClientBase extends HoodieClientTestHarness { HoodieClientTestUtils.read(jsc, basePath, sqlContext, fs, fullPartitionPaths).count()); // Check that the incremental consumption from prevCommitTime - assertEquals("Incremental consumption from " + prevCommitTime - + " should give all records in latest commit", + assertEquals("Incremental consumption from " + prevCommitTime + " should give all records in latest commit", HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(), HoodieClientTestUtils.readSince(basePath, sqlContext, timeline, prevCommitTime).count()); if (commitTimesBetweenPrevAndNew.isPresent()) { @@ -407,8 +374,7 @@ public class TestHoodieClientBase extends HoodieClientTestHarness { * @param commitTime Commit Timestamp * @throws IOException in case of error */ - void updateAllFilesInPartition(List files, String partitionPath, String commitTime) - throws IOException { + void updateAllFilesInPartition(List files, String partitionPath, String commitTime) throws IOException { for (String fileId : files) { HoodieTestUtils.createDataFile(basePath, partitionPath, commitTime, fileId); } @@ -423,8 +389,7 @@ public class TestHoodieClientBase extends HoodieClientTestHarness { * @return Created files * @throws IOException in case of error */ - List createFilesInPartition(String partitionPath, String commitTime, int numFiles) - throws IOException { + List createFilesInPartition(String partitionPath, String commitTime, int numFiles) throws IOException { List files = new ArrayList<>(); for (int i = 0; i < numFiles; i++) { files.add(HoodieTestUtils.createNewDataFile(basePath, partitionPath, commitTime)); diff --git a/hudi-client/src/test/java/org/apache/hudi/TestHoodieClientOnCopyOnWriteStorage.java b/hudi-client/src/test/java/org/apache/hudi/TestHoodieClientOnCopyOnWriteStorage.java index e5351493e..c2dedffa7 100644 --- a/hudi-client/src/test/java/org/apache/hudi/TestHoodieClientOnCopyOnWriteStorage.java +++ b/hudi-client/src/test/java/org/apache/hudi/TestHoodieClientOnCopyOnWriteStorage.java @@ -116,8 +116,8 @@ public class TestHoodieClientOnCopyOnWriteStorage extends TestHoodieClientBase { */ @Test public void testAutoCommitOnBulkInsertPrepped() throws Exception { - testAutoCommit((writeClient, recordRDD, commitTime) - -> writeClient.bulkInsertPreppedRecords(recordRDD, commitTime, Option.empty()), true); + testAutoCommit((writeClient, recordRDD, commitTime) -> writeClient.bulkInsertPreppedRecords(recordRDD, commitTime, + Option.empty()), true); } /** @@ -126,8 +126,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends TestHoodieClientBase { * @param writeFn One of HoodieWriteClient Write API * @throws Exception in case of failure */ - private void testAutoCommit( - Function3, HoodieWriteClient, JavaRDD, String> writeFn, + private void testAutoCommit(Function3, HoodieWriteClient, JavaRDD, String> writeFn, boolean isPrepped) throws Exception { // Set autoCommit false HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).build(); @@ -136,9 +135,8 @@ public class TestHoodieClientOnCopyOnWriteStorage extends TestHoodieClientBase { String prevCommitTime = "000"; String newCommitTime = "001"; int numRecords = 200; - JavaRDD result = - insertFirstBatch(cfg, client, newCommitTime, prevCommitTime, numRecords, writeFn, isPrepped, false, - numRecords); + JavaRDD result = insertFirstBatch(cfg, client, newCommitTime, prevCommitTime, numRecords, writeFn, + isPrepped, false, numRecords); assertFalse("If Autocommit is false, then commit should not be made automatically", HoodieTestUtils.doesCommitExist(basePath, newCommitTime)); @@ -184,16 +182,16 @@ public class TestHoodieClientOnCopyOnWriteStorage extends TestHoodieClientBase { String recordKey = UUID.randomUUID().toString(); HoodieKey keyOne = new HoodieKey(recordKey, "2018-01-01"); - HoodieRecord recordOne = new HoodieRecord(keyOne, - HoodieTestDataGenerator.generateRandomValue(keyOne, newCommitTime)); + HoodieRecord recordOne = + new HoodieRecord(keyOne, HoodieTestDataGenerator.generateRandomValue(keyOne, newCommitTime)); HoodieKey keyTwo = new HoodieKey(recordKey, "2018-02-01"); - HoodieRecord recordTwo = new HoodieRecord(keyTwo, - HoodieTestDataGenerator.generateRandomValue(keyTwo, newCommitTime)); + HoodieRecord recordTwo = + new HoodieRecord(keyTwo, HoodieTestDataGenerator.generateRandomValue(keyTwo, newCommitTime)); // Same key and partition as keyTwo - HoodieRecord recordThree = new HoodieRecord(keyTwo, - HoodieTestDataGenerator.generateRandomValue(keyTwo, newCommitTime)); + HoodieRecord recordThree = + new HoodieRecord(keyTwo, HoodieTestDataGenerator.generateRandomValue(keyTwo, newCommitTime)); JavaRDD records = jsc.parallelize(Arrays.asList(recordOne, recordTwo, recordThree), 1); @@ -205,21 +203,18 @@ public class TestHoodieClientOnCopyOnWriteStorage extends TestHoodieClientBase { // dedup should be done based on both recordKey and partitionPath HoodieWriteClient clientWithDummyNonGlobalIndex = getWriteClientWithDummyIndex(false); - dedupedRecs = - clientWithDummyNonGlobalIndex.deduplicateRecords(records, 1).collect(); + dedupedRecs = clientWithDummyNonGlobalIndex.deduplicateRecords(records, 1).collect(); assertEquals(2, dedupedRecs.size()); assertNodupesWithinPartition(dedupedRecs); // Perform write-action and check - try (HoodieWriteClient client = getHoodieWriteClient( - getConfigBuilder().combineInput(true, true).build(), false);) { + try (HoodieWriteClient client = getHoodieWriteClient(getConfigBuilder().combineInput(true, true).build(), false);) { client.startCommitWithTime(newCommitTime); List statuses = writeFn.apply(client, records, newCommitTime).collect(); assertNoWriteErrors(statuses); assertEquals(2, statuses.size()); - assertNodupesWithinPartition( - statuses.stream().map(WriteStatus::getWrittenRecords) - .flatMap(Collection::stream).collect(Collectors.toList())); + assertNodupesWithinPartition(statuses.stream().map(WriteStatus::getWrittenRecords).flatMap(Collection::stream) + .collect(Collectors.toList())); } } @@ -241,8 +236,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends TestHoodieClientBase { */ @Test public void testUpserts() throws Exception { - testUpsertsInternal(getConfig(), - HoodieWriteClient::upsert, false); + testUpsertsInternal(getConfig(), HoodieWriteClient::upsert, false); } /** @@ -250,8 +244,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends TestHoodieClientBase { */ @Test public void testUpsertsPrepped() throws Exception { - testUpsertsInternal(getConfig(), - HoodieWriteClient::upsertPreppedRecords, true); + testUpsertsInternal(getConfig(), HoodieWriteClient::upsertPreppedRecords, true); } /** @@ -262,16 +255,16 @@ public class TestHoodieClientOnCopyOnWriteStorage extends TestHoodieClientBase { * @throws Exception in case of error */ private void testUpsertsInternal(HoodieWriteConfig hoodieWriteConfig, - Function3, HoodieWriteClient, JavaRDD, String> writeFn, - boolean isPrepped) throws Exception { + Function3, HoodieWriteClient, JavaRDD, String> writeFn, boolean isPrepped) + throws Exception { HoodieWriteClient client = getHoodieWriteClient(hoodieWriteConfig, false); - //Write 1 (only inserts) + // Write 1 (only inserts) String newCommitTime = "001"; String initCommitTime = "000"; int numRecords = 200; - insertFirstBatch(hoodieWriteConfig, - client, newCommitTime, initCommitTime, numRecords, HoodieWriteClient::insert, isPrepped, true, numRecords); + insertFirstBatch(hoodieWriteConfig, client, newCommitTime, initCommitTime, numRecords, HoodieWriteClient::insert, + isPrepped, true, numRecords); // Write 2 (updates) String prevCommitTime = newCommitTime; @@ -279,8 +272,8 @@ public class TestHoodieClientOnCopyOnWriteStorage extends TestHoodieClientBase { numRecords = 100; String commitTimeBetweenPrevAndNew = "002"; updateBatch(hoodieWriteConfig, client, newCommitTime, prevCommitTime, - Option.of(Arrays.asList(commitTimeBetweenPrevAndNew)), - initCommitTime, numRecords, writeFn, isPrepped, true, numRecords, 200, 2); + Option.of(Arrays.asList(commitTimeBetweenPrevAndNew)), initCommitTime, numRecords, writeFn, isPrepped, true, + numRecords, 200, 2); } /** @@ -291,8 +284,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends TestHoodieClientBase { HoodieWriteClient client = getHoodieWriteClient(getConfig(), false); /** - * Write 1 (inserts and deletes) - * Write actual 200 insert records and ignore 100 delete records + * Write 1 (inserts and deletes) Write actual 200 insert records and ignore 100 delete records */ String initCommitTime = "000"; String newCommitTime = "001"; @@ -308,10 +300,8 @@ public class TestHoodieClientOnCopyOnWriteStorage extends TestHoodieClientBase { return recordsInFirstBatch; }; writeBatch(client, newCommitTime, initCommitTime, Option.empty(), initCommitTime, - //unused as genFn uses hard-coded number of inserts/updates/deletes - -1, - recordGenFunction, HoodieWriteClient::upsert, true, - 200, 200, 1); + // unused as genFn uses hard-coded number of inserts/updates/deletes + -1, recordGenFunction, HoodieWriteClient::upsert, true, 200, 200, 1); /** * Write 2 (deletes+writes) @@ -320,17 +310,15 @@ public class TestHoodieClientOnCopyOnWriteStorage extends TestHoodieClientBase { newCommitTime = "004"; final List recordsInSecondBatch = new ArrayList<>(); - recordGenFunction = - (String commitTime, Integer numRecordsInThisCommit) -> { - List fewRecordsForDelete = recordsInFirstBatch.subList(0, 50); - List fewRecordsForUpdate = recordsInFirstBatch.subList(50, 100); - recordsInSecondBatch.addAll(dataGen.generateDeletesFromExistingRecords(fewRecordsForDelete)); - recordsInSecondBatch.addAll(fewRecordsForUpdate); - return recordsInSecondBatch; - }; - writeBatch(client, newCommitTime, prevCommitTime, Option.empty(), initCommitTime, - 100, recordGenFunction, HoodieWriteClient::upsert, true, - 50, 150, 2); + recordGenFunction = (String commitTime, Integer numRecordsInThisCommit) -> { + List fewRecordsForDelete = recordsInFirstBatch.subList(0, 50); + List fewRecordsForUpdate = recordsInFirstBatch.subList(50, 100); + recordsInSecondBatch.addAll(dataGen.generateDeletesFromExistingRecords(fewRecordsForDelete)); + recordsInSecondBatch.addAll(fewRecordsForUpdate); + return recordsInSecondBatch; + }; + writeBatch(client, newCommitTime, prevCommitTime, Option.empty(), initCommitTime, 100, recordGenFunction, + HoodieWriteClient::upsert, true, 50, 150, 2); } /** @@ -342,7 +330,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends TestHoodieClientBase { final int insertSplitLimit = 100; // setup the small file handling params HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit); // hold upto 200 records max - dataGen = new HoodieTestDataGenerator(new String[]{testPartitionPath}); + dataGen = new HoodieTestDataGenerator(new String[] {testPartitionPath}); HoodieWriteClient client = getHoodieWriteClient(config, false); @@ -359,8 +347,10 @@ public class TestHoodieClientOnCopyOnWriteStorage extends TestHoodieClientBase { assertEquals("Just 1 file needs to be added.", 1, statuses.size()); String file1 = statuses.get(0).getFileId(); - Assert.assertEquals("file should contain 100 records", readRowKeysFromParquet(jsc.hadoopConfiguration(), - new Path(basePath, statuses.get(0).getStat().getPath())).size(), 100); + Assert.assertEquals("file should contain 100 records", + readRowKeysFromParquet(jsc.hadoopConfiguration(), new Path(basePath, statuses.get(0).getStat().getPath())) + .size(), + 100); // Update + Inserts such that they just expand file1 String commitTime2 = "002"; @@ -379,8 +369,8 @@ public class TestHoodieClientOnCopyOnWriteStorage extends TestHoodieClientBase { assertEquals("Existing file should be expanded", file1, statuses.get(0).getFileId()); assertEquals("Existing file should be expanded", commitTime1, statuses.get(0).getStat().getPrevCommit()); Path newFile = new Path(basePath, statuses.get(0).getStat().getPath()); - assertEquals("file should contain 140 records", - readRowKeysFromParquet(jsc.hadoopConfiguration(), newFile).size(), 140); + assertEquals("file should contain 140 records", readRowKeysFromParquet(jsc.hadoopConfiguration(), newFile).size(), + 140); List records = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), newFile); for (GenericRecord record : records) { @@ -406,8 +396,8 @@ public class TestHoodieClientOnCopyOnWriteStorage extends TestHoodieClientBase { HoodieTable table = getHoodieTable(metadata, config); ReadOptimizedView fileSystemView = table.getROFileSystemView(); - List files = fileSystemView.getLatestDataFilesBeforeOrOn(testPartitionPath, commitTime3) - .collect(Collectors.toList()); + List files = + fileSystemView.getLatestDataFilesBeforeOrOn(testPartitionPath, commitTime3).collect(Collectors.toList()); int numTotalInsertsInCommit3 = 0; int numTotalUpdatesInCommit3 = 0; for (HoodieDataFile file : files) { @@ -453,7 +443,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends TestHoodieClientBase { final int insertSplitLimit = 100; // setup the small file handling params HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit); // hold upto 200 records max - dataGen = new HoodieTestDataGenerator(new String[]{testPartitionPath}); + dataGen = new HoodieTestDataGenerator(new String[] {testPartitionPath}); HoodieWriteClient client = getHoodieWriteClient(config, false); // Inserts => will write file1 @@ -465,12 +455,14 @@ public class TestHoodieClientOnCopyOnWriteStorage extends TestHoodieClientBase { List statuses = client.insert(insertRecordsRDD1, commitTime1).collect(); assertNoWriteErrors(statuses); - assertPartitionMetadata(new String[]{testPartitionPath}, fs); + assertPartitionMetadata(new String[] {testPartitionPath}, fs); assertEquals("Just 1 file needs to be added.", 1, statuses.size()); String file1 = statuses.get(0).getFileId(); - assertEquals("file should contain 100 records", readRowKeysFromParquet(jsc.hadoopConfiguration(), - new Path(basePath, statuses.get(0).getStat().getPath())).size(), 100); + assertEquals("file should contain 100 records", + readRowKeysFromParquet(jsc.hadoopConfiguration(), new Path(basePath, statuses.get(0).getStat().getPath())) + .size(), + 100); // Second, set of Inserts should just expand file1 String commitTime2 = "002"; @@ -485,8 +477,8 @@ public class TestHoodieClientOnCopyOnWriteStorage extends TestHoodieClientBase { assertEquals("Existing file should be expanded", file1, statuses.get(0).getFileId()); assertEquals("Existing file should be expanded", commitTime1, statuses.get(0).getStat().getPrevCommit()); Path newFile = new Path(basePath, statuses.get(0).getStat().getPath()); - assertEquals("file should contain 140 records", - readRowKeysFromParquet(jsc.hadoopConfiguration(), newFile).size(), 140); + assertEquals("file should contain 140 records", readRowKeysFromParquet(jsc.hadoopConfiguration(), newFile).size(), + 140); List records = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), newFile); for (GenericRecord record : records) { @@ -510,8 +502,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends TestHoodieClientBase { HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); HoodieTable table = getHoodieTable(metaClient, config); List files = table.getROFileSystemView() - .getLatestDataFilesBeforeOrOn(testPartitionPath, commitTime3) - .collect(Collectors.toList()); + .getLatestDataFilesBeforeOrOn(testPartitionPath, commitTime3).collect(Collectors.toList()); assertEquals("Total of 2 valid data files", 2, files.size()); int totalInserts = 0; @@ -598,10 +589,11 @@ public class TestHoodieClientOnCopyOnWriteStorage extends TestHoodieClientBase { String filename = HoodieTestUtils.getCommitFilePath(basePath, commitTime); FileInputStream inputStream = new FileInputStream(filename); String everything = FileIOUtils.readAsUTFString(inputStream); - HoodieCommitMetadata metadata = HoodieCommitMetadata.fromJsonString(everything.toString(), - HoodieCommitMetadata.class); - HoodieRollingStatMetadata rollingStatMetadata = HoodieCommitMetadata.fromJsonString(metadata.getExtraMetadata() - .get(HoodieRollingStatMetadata.ROLLING_STAT_METADATA_KEY), HoodieRollingStatMetadata.class); + HoodieCommitMetadata metadata = + HoodieCommitMetadata.fromJsonString(everything.toString(), HoodieCommitMetadata.class); + HoodieRollingStatMetadata rollingStatMetadata = HoodieCommitMetadata.fromJsonString( + metadata.getExtraMetadata().get(HoodieRollingStatMetadata.ROLLING_STAT_METADATA_KEY), + HoodieRollingStatMetadata.class); int inserts = 0; for (Map.Entry> pstat : rollingStatMetadata.getPartitionToRollingStats() .entrySet()) { @@ -628,8 +620,9 @@ public class TestHoodieClientOnCopyOnWriteStorage extends TestHoodieClientBase { inputStream = new FileInputStream(filename); everything = FileIOUtils.readAsUTFString(inputStream); metadata = HoodieCommitMetadata.fromJsonString(everything.toString(), HoodieCommitMetadata.class); - rollingStatMetadata = HoodieCommitMetadata.fromJsonString(metadata.getExtraMetadata() - .get(HoodieRollingStatMetadata.ROLLING_STAT_METADATA_KEY), HoodieRollingStatMetadata.class); + rollingStatMetadata = HoodieCommitMetadata.fromJsonString( + metadata.getExtraMetadata().get(HoodieRollingStatMetadata.ROLLING_STAT_METADATA_KEY), + HoodieRollingStatMetadata.class); inserts = 0; int upserts = 0; for (Map.Entry> pstat : rollingStatMetadata.getPartitionToRollingStats() @@ -649,8 +642,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends TestHoodieClientBase { */ @Test public void testConsistencyCheckDuringFinalize() throws Exception { - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), - basePath); + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); String commitTime = "000"; HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).build(); HoodieWriteClient client = getHoodieWriteClient(cfg); @@ -684,11 +676,8 @@ public class TestHoodieClientOnCopyOnWriteStorage extends TestHoodieClientBase { private Pair> testConsistencyCheck(HoodieTableMetaClient metaClient, String commitTime) throws Exception { HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false) - .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder() - .withConsistencyCheckEnabled(true) - .withMaxConsistencyCheckIntervalMs(1) - .withInitialConsistencyCheckIntervalMs(1) - .build()) + .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true) + .withMaxConsistencyCheckIntervalMs(1).withInitialConsistencyCheckIntervalMs(1).build()) .build(); HoodieWriteClient client = getHoodieWriteClient(cfg); @@ -699,10 +688,10 @@ public class TestHoodieClientOnCopyOnWriteStorage extends TestHoodieClientBase { // Create a dummy marker file to simulate the case that a marker file was created without data file. // This should fail the commit - String partitionPath = Arrays.stream(fs.globStatus(new Path(String.format("%s/*/*/*/*", - metaClient.getMarkerFolderPath(commitTime))), - path -> path.toString().endsWith(HoodieTableMetaClient.MARKER_EXTN))).limit(1) - .map(status -> status.getPath().getParent().toString()).collect(Collectors.toList()).get(0); + String partitionPath = Arrays + .stream(fs.globStatus(new Path(String.format("%s/*/*/*/*", metaClient.getMarkerFolderPath(commitTime))), + path -> path.toString().endsWith(HoodieTableMetaClient.MARKER_EXTN))) + .limit(1).map(status -> status.getPath().getParent().toString()).collect(Collectors.toList()).get(0); Path markerFilePath = new Path(String.format("%s/%s", partitionPath, FSUtils.makeMarkerFile(commitTime, "1-0-1", UUID.randomUUID().toString()))); metaClient.getFs().create(markerFilePath); @@ -722,9 +711,10 @@ public class TestHoodieClientOnCopyOnWriteStorage extends TestHoodieClientBase { */ private HoodieWriteConfig getSmallInsertWriteConfig(int insertSplitSize) { HoodieWriteConfig.Builder builder = getConfigBuilder(); - return builder.withCompactionConfig( - HoodieCompactionConfig.newBuilder().compactionSmallFileSize(HoodieTestDataGenerator.SIZE_PER_RECORD * 15) - .insertSplitSize(insertSplitSize).build()) // tolerate upto 15 records + return builder + .withCompactionConfig( + HoodieCompactionConfig.newBuilder().compactionSmallFileSize(HoodieTestDataGenerator.SIZE_PER_RECORD * 15) + .insertSplitSize(insertSplitSize).build()) // tolerate upto 15 records .withStorageConfig( HoodieStorageConfig.newBuilder().limitFileSize(HoodieTestDataGenerator.SIZE_PER_RECORD * 20).build()) .build(); diff --git a/hudi-client/src/test/java/org/apache/hudi/TestHoodieReadClient.java b/hudi-client/src/test/java/org/apache/hudi/TestHoodieReadClient.java index d279cbc04..24e7ae4ff 100644 --- a/hudi-client/src/test/java/org/apache/hudi/TestHoodieReadClient.java +++ b/hudi-client/src/test/java/org/apache/hudi/TestHoodieReadClient.java @@ -113,8 +113,7 @@ public class TestHoodieReadClient extends TestHoodieClientBase { */ @Test public void testTagLocationAfterInsert() throws Exception { - testTagLocation(getConfig(), HoodieWriteClient::insert, - HoodieWriteClient::upsert, false); + testTagLocation(getConfig(), HoodieWriteClient::insert, HoodieWriteClient::upsert, false); } /** @@ -122,8 +121,8 @@ public class TestHoodieReadClient extends TestHoodieClientBase { */ @Test public void testTagLocationAfterInsertPrepped() throws Exception { - testTagLocation(getConfig(), HoodieWriteClient::insertPreppedRecords, - HoodieWriteClient::upsertPreppedRecords, true); + testTagLocation(getConfig(), HoodieWriteClient::insertPreppedRecords, HoodieWriteClient::upsertPreppedRecords, + true); } /** @@ -140,9 +139,9 @@ public class TestHoodieReadClient extends TestHoodieClientBase { */ @Test public void testTagLocationAfterBulkInsertPrepped() throws Exception { - testTagLocation(getConfigBuilder().withBulkInsertParallelism(1).build(), - (writeClient, recordRDD, commitTime) - -> writeClient.bulkInsertPreppedRecords(recordRDD, commitTime, Option.empty()), + testTagLocation( + getConfigBuilder().withBulkInsertParallelism(1).build(), (writeClient, recordRDD, commitTime) -> writeClient + .bulkInsertPreppedRecords(recordRDD, commitTime, Option.empty()), HoodieWriteClient::upsertPreppedRecords, true); } @@ -155,27 +154,22 @@ public class TestHoodieReadClient extends TestHoodieClientBase { * @param isPrepped isPrepped flag. * @throws Exception in case of error */ - private void testTagLocation( - HoodieWriteConfig hoodieWriteConfig, + private void testTagLocation(HoodieWriteConfig hoodieWriteConfig, Function3, HoodieWriteClient, JavaRDD, String> insertFn, - Function3, HoodieWriteClient, JavaRDD, String> updateFn, - boolean isPrepped) + Function3, HoodieWriteClient, JavaRDD, String> updateFn, boolean isPrepped) throws Exception { try (HoodieWriteClient client = getHoodieWriteClient(hoodieWriteConfig);) { - //Write 1 (only inserts) + // Write 1 (only inserts) String newCommitTime = "001"; String initCommitTime = "000"; int numRecords = 200; - JavaRDD result = - insertFirstBatch(hoodieWriteConfig, client, newCommitTime, initCommitTime, numRecords, insertFn, isPrepped, - true, numRecords); + JavaRDD result = insertFirstBatch(hoodieWriteConfig, client, newCommitTime, initCommitTime, + numRecords, insertFn, isPrepped, true, numRecords); // Construct HoodieRecord from the WriteStatus but set HoodieKey, Data and HoodieRecordLocation accordingly // since they have been modified in the DAG JavaRDD recordRDD = - jsc.parallelize( - result.collect().stream().map(WriteStatus::getWrittenRecords).flatMap(Collection::stream) - .map(record -> new HoodieRecord(record.getKey(), null)) - .collect(Collectors.toList())); + jsc.parallelize(result.collect().stream().map(WriteStatus::getWrittenRecords).flatMap(Collection::stream) + .map(record -> new HoodieRecord(record.getKey(), null)).collect(Collectors.toList())); // Should have 100 records in table (check using Index), all in locations marked at commit HoodieReadClient readClient = getHoodieReadClient(hoodieWriteConfig.getBasePath()); List taggedRecords = readClient.tagLocation(recordRDD).collect(); @@ -187,14 +181,11 @@ public class TestHoodieReadClient extends TestHoodieClientBase { numRecords = 100; String commitTimeBetweenPrevAndNew = "002"; result = updateBatch(hoodieWriteConfig, client, newCommitTime, prevCommitTime, - Option.of(Arrays.asList(commitTimeBetweenPrevAndNew)), - initCommitTime, numRecords, updateFn, isPrepped, - true, numRecords, 200, 2); + Option.of(Arrays.asList(commitTimeBetweenPrevAndNew)), initCommitTime, numRecords, updateFn, isPrepped, true, + numRecords, 200, 2); recordRDD = - jsc.parallelize( - result.collect().stream().map(WriteStatus::getWrittenRecords).flatMap(Collection::stream) - .map(record -> new HoodieRecord(record.getKey(), null)) - .collect(Collectors.toList())); + jsc.parallelize(result.collect().stream().map(WriteStatus::getWrittenRecords).flatMap(Collection::stream) + .map(record -> new HoodieRecord(record.getKey(), null)).collect(Collectors.toList())); // Index should be able to locate all updates in correct locations. readClient = getHoodieReadClient(hoodieWriteConfig.getBasePath()); taggedRecords = readClient.tagLocation(recordRDD).collect(); diff --git a/hudi-client/src/test/java/org/apache/hudi/TestMultiFS.java b/hudi-client/src/test/java/org/apache/hudi/TestMultiFS.java index 08d0a0366..8dac99b5e 100644 --- a/hudi-client/src/test/java/org/apache/hudi/TestMultiFS.java +++ b/hudi-client/src/test/java/org/apache/hudi/TestMultiFS.java @@ -71,19 +71,17 @@ public class TestMultiFS extends HoodieClientTestHarness { protected HoodieWriteConfig getHoodieWriteConfig(String basePath) { return HoodieWriteConfig.newBuilder().withPath(basePath).withEmbeddedTimelineServerEnabled(true) - .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) - .forTable(tableName).withIndexConfig( - HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build(); + .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2).forTable(tableName) + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build(); } @Test public void readLocalWriteHDFS() throws Exception { // Initialize table and filesystem - HoodieTableMetaClient - .initTableType(jsc.hadoopConfiguration(), dfsBasePath, HoodieTableType.valueOf(tableType), tableName, - HoodieAvroPayload.class.getName()); + HoodieTableMetaClient.initTableType(jsc.hadoopConfiguration(), dfsBasePath, HoodieTableType.valueOf(tableType), + tableName, HoodieAvroPayload.class.getName()); - //Create write client to write some records in + // Create write client to write some records in HoodieWriteConfig cfg = getHoodieWriteConfig(dfsBasePath); HoodieWriteConfig localConfig = getHoodieWriteConfig(tablePath); @@ -105,9 +103,8 @@ public class TestMultiFS extends HoodieClientTestHarness { assertEquals("Should contain 100 records", readRecords.count(), records.size()); // Write to local - HoodieTableMetaClient - .initTableType(jsc.hadoopConfiguration(), tablePath, HoodieTableType.valueOf(tableType), tableName, - HoodieAvroPayload.class.getName()); + HoodieTableMetaClient.initTableType(jsc.hadoopConfiguration(), tablePath, HoodieTableType.valueOf(tableType), + tableName, HoodieAvroPayload.class.getName()); String writeCommitTime = localWriteClient.startCommit(); logger.info("Starting write commit " + writeCommitTime); @@ -120,8 +117,8 @@ public class TestMultiFS extends HoodieClientTestHarness { fs = FSUtils.getFs(tablePath, HoodieTestUtils.getDefaultHadoopConf()); metaClient = new HoodieTableMetaClient(fs.getConf(), tablePath); timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline(); - Dataset localReadRecords = HoodieClientTestUtils - .readCommit(tablePath, sqlContext, timeline, writeCommitTime); + Dataset localReadRecords = + HoodieClientTestUtils.readCommit(tablePath, sqlContext, timeline, writeCommitTime); assertEquals("Should contain 100 records", localReadRecords.count(), localRecords.size()); } } diff --git a/hudi-client/src/test/java/org/apache/hudi/TestWriteStatus.java b/hudi-client/src/test/java/org/apache/hudi/TestWriteStatus.java index 352acd500..f7f949266 100644 --- a/hudi-client/src/test/java/org/apache/hudi/TestWriteStatus.java +++ b/hudi-client/src/test/java/org/apache/hudi/TestWriteStatus.java @@ -28,14 +28,14 @@ import org.mockito.Mockito; public class TestWriteStatus { @Test - public void testFailureFraction() throws IOException { + public void testFailureFraction() throws IOException { WriteStatus status = new WriteStatus(true, 0.1); Throwable t = new Exception("some error in writing"); for (int i = 0; i < 1000; i++) { status.markFailure(Mockito.mock(HoodieRecord.class), t, null); } assertTrue(status.getFailedRecords().size() > 0); - assertTrue(status.getFailedRecords().size() < 150); //150 instead of 100, to prevent flaky test + assertTrue(status.getFailedRecords().size() < 150); // 150 instead of 100, to prevent flaky test assertTrue(status.hasErrors()); } diff --git a/hudi-client/src/test/java/org/apache/hudi/common/HoodieClientTestUtils.java b/hudi-client/src/test/java/org/apache/hudi/common/HoodieClientTestUtils.java index ba930aa3d..8dfb824b7 100644 --- a/hudi-client/src/test/java/org/apache/hudi/common/HoodieClientTestUtils.java +++ b/hudi-client/src/test/java/org/apache/hudi/common/HoodieClientTestUtils.java @@ -117,8 +117,7 @@ public class HoodieClientTestUtils { public static SparkConf getSparkConfForTest(String appName) { SparkConf sparkConf = new SparkConf().setAppName(appName) - .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") - .setMaster("local[8]"); + .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer").setMaster("local[8]"); return HoodieReadClient.addHoodieSupport(sparkConf); } @@ -126,8 +125,8 @@ public class HoodieClientTestUtils { List commitsToReturn) throws IOException { HashMap fileIdToFullPath = new HashMap<>(); for (HoodieInstant commit : commitsToReturn) { - HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes(commitTimeline.getInstantDetails(commit).get(), - HoodieCommitMetadata.class); + HoodieCommitMetadata metadata = + HoodieCommitMetadata.fromBytes(commitTimeline.getInstantDetails(commit).get(), HoodieCommitMetadata.class); fileIdToFullPath.putAll(metadata.getFileIdAndFullPaths(basePath)); } return fileIdToFullPath; @@ -140,8 +139,8 @@ public class HoodieClientTestUtils { new HoodieException("No commit exists at " + commitTime); } try { - HashMap paths = getLatestFileIDsToFullPath(basePath, commitTimeline, - Arrays.asList(commitInstant)); + HashMap paths = + getLatestFileIDsToFullPath(basePath, commitTimeline, Arrays.asList(commitInstant)); log.info("Path :" + paths.values()); return sqlContext.read().parquet(paths.values().toArray(new String[paths.size()])) .filter(String.format("%s ='%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitTime)); @@ -155,8 +154,8 @@ public class HoodieClientTestUtils { */ public static Dataset readSince(String basePath, SQLContext sqlContext, HoodieTimeline commitTimeline, String lastCommitTime) { - List commitsToReturn = commitTimeline.findInstantsAfter(lastCommitTime, Integer.MAX_VALUE) - .getInstants().collect(Collectors.toList()); + List commitsToReturn = + commitTimeline.findInstantsAfter(lastCommitTime, Integer.MAX_VALUE).getInstants().collect(Collectors.toList()); try { // Go over the commit metadata, and obtain the new files that need to be read. HashMap fileIdToFullPath = getLatestFileIDsToFullPath(basePath, commitTimeline, commitsToReturn); @@ -170,17 +169,14 @@ public class HoodieClientTestUtils { /** * Reads the paths under the a hoodie dataset out as a DataFrame */ - public static Dataset read(JavaSparkContext jsc, String basePath, SQLContext - sqlContext, - FileSystem - fs, String... - paths) { + public static Dataset read(JavaSparkContext jsc, String basePath, SQLContext sqlContext, FileSystem fs, + String... paths) { List filteredPaths = new ArrayList<>(); try { HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), basePath, true); for (String path : paths) { - ReadOptimizedView fileSystemView = new HoodieTableFileSystemView( - metaClient, metaClient.getCommitsTimeline().filterCompletedInstants(), fs.globStatus(new Path(path))); + ReadOptimizedView fileSystemView = new HoodieTableFileSystemView(metaClient, + metaClient.getCommitsTimeline().filterCompletedInstants(), fs.globStatus(new Path(path))); List latestFiles = fileSystemView.getLatestDataFiles().collect(Collectors.toList()); for (HoodieDataFile file : latestFiles) { filteredPaths.add(file.getPath()); @@ -192,29 +188,20 @@ public class HoodieClientTestUtils { } } - public static String writeParquetFile(String basePath, - String partitionPath, - String filename, - List records, - Schema schema, - BloomFilter filter, - boolean createCommitTime) throws IOException { + public static String writeParquetFile(String basePath, String partitionPath, String filename, + List records, Schema schema, BloomFilter filter, boolean createCommitTime) throws IOException { if (filter == null) { filter = new BloomFilter(10000, 0.0000001); } - HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, - filter); + HoodieAvroWriteSupport writeSupport = + new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter); String commitTime = FSUtils.getCommitTime(filename); HoodieParquetConfig config = new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP, ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024, - HoodieTestUtils.getDefaultHadoopConf(), - Double.valueOf(HoodieStorageConfig.DEFAULT_STREAM_COMPRESSION_RATIO)); - HoodieParquetWriter writer = new HoodieParquetWriter( - commitTime, - new Path(basePath + "/" + partitionPath + "/" + filename), - config, - schema); + HoodieTestUtils.getDefaultHadoopConf(), Double.valueOf(HoodieStorageConfig.DEFAULT_STREAM_COMPRESSION_RATIO)); + HoodieParquetWriter writer = + new HoodieParquetWriter(commitTime, new Path(basePath + "/" + partitionPath + "/" + filename), config, schema); int seqId = 1; for (HoodieRecord record : records) { GenericRecord avroRecord = (GenericRecord) record.getData().getInsertValue(schema).get(); @@ -232,18 +219,14 @@ public class HoodieClientTestUtils { return filename; } - public static String writeParquetFile(String basePath, - String partitionPath, - List records, - Schema schema, - BloomFilter filter, - boolean createCommitTime) throws IOException, InterruptedException { + public static String writeParquetFile(String basePath, String partitionPath, List records, + Schema schema, BloomFilter filter, boolean createCommitTime) throws IOException, InterruptedException { Thread.sleep(1000); String commitTime = HoodieTestUtils.makeNewCommitTime(); String fileId = UUID.randomUUID().toString(); String filename = FSUtils.makeDataFileName(commitTime, "1-0-1", fileId); HoodieTestUtils.createCommitFiles(basePath, commitTime); - return HoodieClientTestUtils - .writeParquetFile(basePath, partitionPath, filename, records, schema, filter, createCommitTime); + return HoodieClientTestUtils.writeParquetFile(basePath, partitionPath, filename, records, schema, filter, + createCommitTime); } } diff --git a/hudi-client/src/test/java/org/apache/hudi/common/HoodieTestDataGenerator.java b/hudi-client/src/test/java/org/apache/hudi/common/HoodieTestDataGenerator.java index bc90b680c..9514bcebe 100644 --- a/hudi-client/src/test/java/org/apache/hudi/common/HoodieTestDataGenerator.java +++ b/hudi-client/src/test/java/org/apache/hudi/common/HoodieTestDataGenerator.java @@ -69,21 +69,14 @@ public class HoodieTestDataGenerator { public static final String DEFAULT_SECOND_PARTITION_PATH = "2015/03/16"; public static final String DEFAULT_THIRD_PARTITION_PATH = "2015/03/17"; - public static final String[] DEFAULT_PARTITION_PATHS = { - DEFAULT_FIRST_PARTITION_PATH, - DEFAULT_SECOND_PARTITION_PATH, - DEFAULT_THIRD_PARTITION_PATH - }; + public static final String[] DEFAULT_PARTITION_PATHS = + {DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH, DEFAULT_THIRD_PARTITION_PATH}; public static final int DEFAULT_PARTITION_DEPTH = 3; public static String TRIP_EXAMPLE_SCHEMA = "{\"type\": \"record\"," + "\"name\": \"triprec\"," + "\"fields\": [ " - + "{\"name\": \"timestamp\",\"type\": \"double\"}," - + "{\"name\": \"_row_key\", \"type\": \"string\"}," - + "{\"name\": \"rider\", \"type\": \"string\"}," - + "{\"name\": \"driver\", \"type\": \"string\"}," - + "{\"name\": \"begin_lat\", \"type\": \"double\"}," - + "{\"name\": \"begin_lon\", \"type\": \"double\"}," - + "{\"name\": \"end_lat\", \"type\": \"double\"}," - + "{\"name\": \"end_lon\", \"type\": \"double\"}," + + "{\"name\": \"timestamp\",\"type\": \"double\"}," + "{\"name\": \"_row_key\", \"type\": \"string\"}," + + "{\"name\": \"rider\", \"type\": \"string\"}," + "{\"name\": \"driver\", \"type\": \"string\"}," + + "{\"name\": \"begin_lat\", \"type\": \"double\"}," + "{\"name\": \"begin_lon\", \"type\": \"double\"}," + + "{\"name\": \"end_lat\", \"type\": \"double\"}," + "{\"name\": \"end_lon\", \"type\": \"double\"}," + "{\"name\":\"fare\",\"type\": \"double\"}]}"; public static Schema avroSchema = new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA); public static Schema avroSchemaWithMetadataFields = HoodieAvroUtils.addMetadataFields(avroSchema); @@ -174,8 +167,8 @@ public class HoodieTestDataGenerator { public static void createCompactionAuxiliaryMetadata(String basePath, HoodieInstant instant, Configuration configuration) throws IOException { - Path commitFile = new Path( - basePath + "/" + HoodieTableMetaClient.AUXILIARYFOLDER_NAME + "/" + instant.getFileName()); + Path commitFile = + new Path(basePath + "/" + HoodieTableMetaClient.AUXILIARYFOLDER_NAME + "/" + instant.getFileName()); FileSystem fs = FSUtils.getFs(basePath, configuration); FSDataOutputStream os = fs.create(commitFile, true); HoodieCompactionPlan workload = new HoodieCompactionPlan(); @@ -189,8 +182,8 @@ public class HoodieTestDataGenerator { public static void createSavepointFile(String basePath, String commitTime, Configuration configuration) throws IOException { - Path commitFile = new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME - + "/" + HoodieTimeline.makeSavePointFileName(commitTime)); + Path commitFile = new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + + HoodieTimeline.makeSavePointFileName(commitTime)); FileSystem fs = FSUtils.getFs(basePath, configuration); FSDataOutputStream os = fs.create(commitFile, true); HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); @@ -212,7 +205,7 @@ public class HoodieTestDataGenerator { /** * Generates new inserts, uniformly across the partition paths above. It also updates the list of existing keys. */ - public Stream generateInsertsStream(String commitTime, Integer n) { + public Stream generateInsertsStream(String commitTime, Integer n) { int currSize = getNumExistingKeys(); return IntStream.range(0, n).boxed().map(i -> { @@ -233,16 +226,15 @@ public class HoodieTestDataGenerator { public List generateSameKeyInserts(String commitTime, List origin) throws IOException { List copy = new ArrayList<>(); - for (HoodieRecord r: origin) { + for (HoodieRecord r : origin) { HoodieKey key = r.getKey(); - HoodieRecord record = new HoodieRecord(key, generateRandomValue(key, commitTime)); + HoodieRecord record = new HoodieRecord(key, generateRandomValue(key, commitTime)); copy.add(record); } return copy; } - public List generateInsertsWithHoodieAvroPayload(String commitTime, int limit) throws - IOException { + public List generateInsertsWithHoodieAvroPayload(String commitTime, int limit) throws IOException { List inserts = new ArrayList<>(); int currSize = getNumExistingKeys(); for (int i = 0; i < limit; i++) { @@ -290,8 +282,8 @@ public class HoodieTestDataGenerator { } public HoodieRecord generateDeleteRecord(HoodieKey key) throws IOException { - TestRawTripPayload payload = new TestRawTripPayload(Option.empty(), key.getRecordKey(), key.getPartitionPath(), - null, true); + TestRawTripPayload payload = + new TestRawTripPayload(Option.empty(), key.getRecordKey(), key.getPartitionPath(), null, true); return new HoodieRecord(key, payload); } diff --git a/hudi-client/src/test/java/org/apache/hudi/func/TestBoundedInMemoryExecutor.java b/hudi-client/src/test/java/org/apache/hudi/func/TestBoundedInMemoryExecutor.java index 6a57eb1b6..fbbe67fd0 100644 --- a/hudi-client/src/test/java/org/apache/hudi/func/TestBoundedInMemoryExecutor.java +++ b/hudi-client/src/test/java/org/apache/hudi/func/TestBoundedInMemoryExecutor.java @@ -70,8 +70,7 @@ public class TestBoundedInMemoryExecutor extends HoodieClientTestHarness { } @Override - protected void finish() { - } + protected void finish() {} @Override protected Integer getResult() { @@ -79,11 +78,10 @@ public class TestBoundedInMemoryExecutor extends HoodieClientTestHarness { } }; - SparkBoundedInMemoryExecutor>, Integer> executor = null; + SparkBoundedInMemoryExecutor>, Integer> executor = null; try { - executor = new SparkBoundedInMemoryExecutor(hoodieWriteConfig, - hoodieRecords.iterator(), consumer, getTransformFunction(HoodieTestDataGenerator.avroSchema)); + executor = new SparkBoundedInMemoryExecutor(hoodieWriteConfig, hoodieRecords.iterator(), consumer, + getTransformFunction(HoodieTestDataGenerator.avroSchema)); int result = executor.execute(); // It should buffer and write 100 records Assert.assertEquals(result, 100); diff --git a/hudi-client/src/test/java/org/apache/hudi/func/TestBoundedInMemoryQueue.java b/hudi-client/src/test/java/org/apache/hudi/func/TestBoundedInMemoryQueue.java index 5f7969dc0..ee4dc5859 100644 --- a/hudi-client/src/test/java/org/apache/hudi/func/TestBoundedInMemoryQueue.java +++ b/hudi-client/src/test/java/org/apache/hudi/func/TestBoundedInMemoryQueue.java @@ -80,18 +80,17 @@ public class TestBoundedInMemoryQueue extends HoodieClientTestHarness { final BoundedInMemoryQueue> queue = new BoundedInMemoryQueue(FileIOUtils.KB, getTransformFunction(HoodieTestDataGenerator.avroSchema)); // Produce - Future resFuture = - executorService.submit(() -> { - new IteratorBasedQueueProducer<>(hoodieRecords.iterator()).produce(queue); - queue.close(); - return true; - }); + Future resFuture = executorService.submit(() -> { + new IteratorBasedQueueProducer<>(hoodieRecords.iterator()).produce(queue); + queue.close(); + return true; + }); final Iterator originalRecordIterator = hoodieRecords.iterator(); int recordsRead = 0; while (queue.iterator().hasNext()) { final HoodieRecord originalRecord = originalRecordIterator.next(); - final Option originalInsertValue = originalRecord.getData() - .getInsertValue(HoodieTestDataGenerator.avroSchema); + final Option originalInsertValue = + originalRecord.getData().getInsertValue(HoodieTestDataGenerator.avroSchema); final HoodieInsertValueGenResult payload = queue.iterator().next(); // Ensure that record ordering is guaranteed. Assert.assertEquals(originalRecord, payload.record); @@ -176,10 +175,10 @@ public class TestBoundedInMemoryQueue extends HoodieClientTestHarness { }); // Used to ensure that consumer sees the records generated by a single producer in FIFO order - Map lastSeenMap = IntStream.range(0, numProducers).boxed() - .collect(Collectors.toMap(Function.identity(), x -> -1)); - Map countMap = IntStream.range(0, numProducers).boxed() - .collect(Collectors.toMap(Function.identity(), x -> 0)); + Map lastSeenMap = + IntStream.range(0, numProducers).boxed().collect(Collectors.toMap(Function.identity(), x -> -1)); + Map countMap = + IntStream.range(0, numProducers).boxed().collect(Collectors.toMap(Function.identity(), x -> 0)); // Read recs and ensure we have covered all producer recs. while (queue.iterator().hasNext()) { @@ -198,7 +197,7 @@ public class TestBoundedInMemoryQueue extends HoodieClientTestHarness { Assert.assertEquals(Integer.valueOf(numRecords), countMap.get(i)); } - //Ensure Close future is done + // Ensure Close future is done closeFuture.get(); } @@ -210,15 +209,13 @@ public class TestBoundedInMemoryQueue extends HoodieClientTestHarness { final List hoodieRecords = dataGen.generateInserts(commitTime, numRecords); // maximum number of records to keep in memory. final int recordLimit = 5; - final SizeEstimator> sizeEstimator = - new DefaultSizeEstimator<>(); - HoodieInsertValueGenResult payload = getTransformFunction(HoodieTestDataGenerator.avroSchema) - .apply(hoodieRecords.get(0)); + final SizeEstimator> sizeEstimator = new DefaultSizeEstimator<>(); + HoodieInsertValueGenResult payload = + getTransformFunction(HoodieTestDataGenerator.avroSchema).apply(hoodieRecords.get(0)); final long objSize = sizeEstimator.sizeEstimate(payload); final long memoryLimitInBytes = recordLimit * objSize; final BoundedInMemoryQueue> queue = - new BoundedInMemoryQueue(memoryLimitInBytes, - getTransformFunction(HoodieTestDataGenerator.avroSchema)); + new BoundedInMemoryQueue(memoryLimitInBytes, getTransformFunction(HoodieTestDataGenerator.avroSchema)); // Produce Future resFuture = executorService.submit(() -> { @@ -259,11 +256,10 @@ public class TestBoundedInMemoryQueue extends HoodieClientTestHarness { public void testException() throws Exception { final int numRecords = 256; final List hoodieRecords = dataGen.generateInserts(commitTime, numRecords); - final SizeEstimator>> sizeEstimator = - new DefaultSizeEstimator<>(); + final SizeEstimator>> sizeEstimator = new DefaultSizeEstimator<>(); // queue memory limit - HoodieInsertValueGenResult payload = getTransformFunction(HoodieTestDataGenerator.avroSchema) - .apply(hoodieRecords.get(0)); + HoodieInsertValueGenResult payload = + getTransformFunction(HoodieTestDataGenerator.avroSchema).apply(hoodieRecords.get(0)); final long objSize = sizeEstimator.sizeEstimate(new Tuple2<>(payload.record, payload.insertValue)); final long memoryLimitInBytes = 4 * objSize; diff --git a/hudi-client/src/test/java/org/apache/hudi/func/TestUpdateMapFunction.java b/hudi-client/src/test/java/org/apache/hudi/func/TestUpdateMapFunction.java index 8bd29b1d9..1af2f209e 100644 --- a/hudi-client/src/test/java/org/apache/hudi/func/TestUpdateMapFunction.java +++ b/hudi-client/src/test/java/org/apache/hudi/func/TestUpdateMapFunction.java @@ -87,8 +87,8 @@ public class TestUpdateMapFunction extends HoodieClientTestHarness { insertRecords .add(new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3)); - HoodieCreateHandle createHandle = new HoodieCreateHandle(config, "100", table, rowChange1.getPartitionPath(), - "f1-0", insertRecords.iterator()); + HoodieCreateHandle createHandle = + new HoodieCreateHandle(config, "100", table, rowChange1.getPartitionPath(), "f1-0", insertRecords.iterator()); createHandle.write(); WriteStatus insertResult = createHandle.close(); return insertResult; @@ -111,8 +111,8 @@ public class TestUpdateMapFunction extends HoodieClientTestHarness { + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12,\"added_field\":1}"; List updateRecords = new ArrayList<>(); TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); - HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), - rowChange1); + HoodieRecord record1 = + new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); record1.unseal(); record1.setCurrentLocation(new HoodieRecordLocation("100", fileId)); record1.seal(); diff --git a/hudi-client/src/test/java/org/apache/hudi/index/TestHBaseQPSResourceAllocator.java b/hudi-client/src/test/java/org/apache/hudi/index/TestHBaseQPSResourceAllocator.java index 520b93ba3..f1a9af4d1 100644 --- a/hudi-client/src/test/java/org/apache/hudi/index/TestHBaseQPSResourceAllocator.java +++ b/hudi-client/src/test/java/org/apache/hudi/index/TestHBaseQPSResourceAllocator.java @@ -105,22 +105,18 @@ public class TestHBaseQPSResourceAllocator extends HoodieClientTestHarness { private HoodieWriteConfig.Builder getConfigBuilder(HoodieHBaseIndexConfig hoodieHBaseIndexConfig) { return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) - .withParallelism(1, 1).withCompactionConfig( - HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).withInlineCompaction(false) - .build()).withAutoCommit(false) - .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build()) - .forTable("test-trip-table").withIndexConfig( - HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.HBASE) - .withHBaseIndexConfig(hoodieHBaseIndexConfig) - .build()); + .withParallelism(1, 1) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024) + .withInlineCompaction(false).build()) + .withAutoCommit(false).withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build()) + .forTable("test-trip-table").withIndexConfig(HoodieIndexConfig.newBuilder() + .withIndexType(HoodieIndex.IndexType.HBASE).withHBaseIndexConfig(hoodieHBaseIndexConfig).build()); } private HoodieHBaseIndexConfig getConfigWithResourceAllocator(Option resourceAllocatorClass) { - HoodieHBaseIndexConfig.Builder builder = - new HoodieHBaseIndexConfig.Builder() - .hbaseZkPort(Integer.valueOf(hbaseConfig.get("hbase.zookeeper.property.clientPort"))) - .hbaseZkQuorum(hbaseConfig.get("hbase.zookeeper.quorum")).hbaseTableName(tableName) - .hbaseIndexGetBatchSize(100); + HoodieHBaseIndexConfig.Builder builder = new HoodieHBaseIndexConfig.Builder() + .hbaseZkPort(Integer.valueOf(hbaseConfig.get("hbase.zookeeper.property.clientPort"))) + .hbaseZkQuorum(hbaseConfig.get("hbase.zookeeper.quorum")).hbaseTableName(tableName).hbaseIndexGetBatchSize(100); if (resourceAllocatorClass.isPresent()) { builder.withQPSResourceAllocatorType(resourceAllocatorClass.get()); } diff --git a/hudi-client/src/test/java/org/apache/hudi/index/TestHbaseIndex.java b/hudi-client/src/test/java/org/apache/hudi/index/TestHbaseIndex.java index 8817ad356..8fd294336 100644 --- a/hudi-client/src/test/java/org/apache/hudi/index/TestHbaseIndex.java +++ b/hudi-client/src/test/java/org/apache/hudi/index/TestHbaseIndex.java @@ -76,8 +76,7 @@ public class TestHbaseIndex extends HoodieClientTestHarness { private static Configuration hbaseConfig; private static String tableName = "test_table"; - public TestHbaseIndex() throws Exception { - } + public TestHbaseIndex() throws Exception {} @AfterClass public static void clean() throws Exception { @@ -154,9 +153,8 @@ public class TestHbaseIndex extends HoodieClientTestHarness { javaRDD = index.tagLocation(writeRecords, jsc, hoodieTable); assertTrue(javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 200); assertTrue(javaRDD.map(record -> record.getKey().getRecordKey()).distinct().count() == 200); - assertTrue(javaRDD.filter( - record -> (record.getCurrentLocation() != null && record.getCurrentLocation().getInstantTime() - .equals(newCommitTime))).distinct().count() == 200); + assertTrue(javaRDD.filter(record -> (record.getCurrentLocation() != null + && record.getCurrentLocation().getInstantTime().equals(newCommitTime))).distinct().count() == 200); } } @@ -188,9 +186,8 @@ public class TestHbaseIndex extends HoodieClientTestHarness { JavaRDD javaRDD = index.tagLocation(writeRecords, jsc, hoodieTable); assertTrue(javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 10); assertTrue(javaRDD.map(record -> record.getKey().getRecordKey()).distinct().count() == 10); - assertTrue(javaRDD.filter( - record -> (record.getCurrentLocation() != null && record.getCurrentLocation().getInstantTime() - .equals(newCommitTime))).distinct().count() == 10); + assertTrue(javaRDD.filter(record -> (record.getCurrentLocation() != null + && record.getCurrentLocation().getInstantTime().equals(newCommitTime))).distinct().count() == 10); } @Test @@ -348,11 +345,7 @@ public class TestHbaseIndex extends HoodieClientTestHarness { HoodieWriteConfig config = getConfig(); HBaseIndex index = new HBaseIndex(config); final JavaRDD writeStatusRDD = jsc.parallelize( - Arrays.asList( - getSampleWriteStatus(1, 2), - getSampleWriteStatus(0, 3), - getSampleWriteStatus(10, 0)), - 10); + Arrays.asList(getSampleWriteStatus(1, 2), getSampleWriteStatus(0, 3), getSampleWriteStatus(10, 0)), 10); final Tuple2 tuple = index.getHBasePutAccessParallelism(writeStatusRDD); final int hbasePutAccessParallelism = Integer.parseInt(tuple._2.toString()); final int hbaseNumPuts = Integer.parseInt(tuple._1.toString()); @@ -365,11 +358,8 @@ public class TestHbaseIndex extends HoodieClientTestHarness { public void testsHBasePutAccessParallelismWithNoInserts() { HoodieWriteConfig config = getConfig(); HBaseIndex index = new HBaseIndex(config); - final JavaRDD writeStatusRDD = jsc.parallelize( - Arrays.asList( - getSampleWriteStatus(0, 2), - getSampleWriteStatus(0, 1)), - 10); + final JavaRDD writeStatusRDD = + jsc.parallelize(Arrays.asList(getSampleWriteStatus(0, 2), getSampleWriteStatus(0, 1)), 10); final Tuple2 tuple = index.getHBasePutAccessParallelism(writeStatusRDD); final int hbasePutAccessParallelism = Integer.parseInt(tuple._2.toString()); final int hbaseNumPuts = Integer.parseInt(tuple._1.toString()); @@ -411,17 +401,16 @@ public class TestHbaseIndex extends HoodieClientTestHarness { private HoodieWriteConfig.Builder getConfigBuilder() { return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) - .withParallelism(1, 1).withCompactionConfig( - HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).withInlineCompaction(false) - .build()).withAutoCommit(false) - .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build()) - .forTable("test-trip-table").withIndexConfig( - HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.HBASE) - .withHBaseIndexConfig( - new HoodieHBaseIndexConfig.Builder() - .hbaseZkPort(Integer.valueOf(hbaseConfig.get("hbase.zookeeper.property.clientPort"))) - .hbaseZkQuorum(hbaseConfig.get("hbase.zookeeper.quorum")).hbaseTableName(tableName) - .hbaseIndexGetBatchSize(100).build()) - .build()); + .withParallelism(1, 1) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024) + .withInlineCompaction(false).build()) + .withAutoCommit(false).withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build()) + .forTable("test-trip-table") + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.HBASE) + .withHBaseIndexConfig(new HoodieHBaseIndexConfig.Builder() + .hbaseZkPort(Integer.valueOf(hbaseConfig.get("hbase.zookeeper.property.clientPort"))) + .hbaseZkQuorum(hbaseConfig.get("hbase.zookeeper.quorum")).hbaseTableName(tableName) + .hbaseIndexGetBatchSize(100).build()) + .build()); } } diff --git a/hudi-client/src/test/java/org/apache/hudi/index/TestHoodieIndex.java b/hudi-client/src/test/java/org/apache/hudi/index/TestHoodieIndex.java index 44451e1dc..8ca90805b 100644 --- a/hudi-client/src/test/java/org/apache/hudi/index/TestHoodieIndex.java +++ b/hudi-client/src/test/java/org/apache/hudi/index/TestHoodieIndex.java @@ -50,9 +50,10 @@ public class TestHoodieIndex extends HoodieClientTestHarness { HoodieWriteConfig.Builder clientConfigBuilder = HoodieWriteConfig.newBuilder(); HoodieIndexConfig.Builder indexConfigBuilder = HoodieIndexConfig.newBuilder(); // Different types - HoodieWriteConfig config = clientConfigBuilder.withPath(basePath).withIndexConfig( - indexConfigBuilder.withIndexType(HoodieIndex.IndexType.HBASE) - .withHBaseIndexConfig(new HoodieHBaseIndexConfig.Builder().build()).build()).build(); + HoodieWriteConfig config = clientConfigBuilder.withPath(basePath) + .withIndexConfig(indexConfigBuilder.withIndexType(HoodieIndex.IndexType.HBASE) + .withHBaseIndexConfig(new HoodieHBaseIndexConfig.Builder().build()).build()) + .build(); assertTrue(HoodieIndex.createIndex(config, jsc) instanceof HBaseIndex); config = clientConfigBuilder.withPath(basePath) .withIndexConfig(indexConfigBuilder.withIndexType(HoodieIndex.IndexType.INMEMORY).build()).build(); diff --git a/hudi-client/src/test/java/org/apache/hudi/index/bloom/TestBucketizedBloomCheckPartitioner.java b/hudi-client/src/test/java/org/apache/hudi/index/bloom/TestBucketizedBloomCheckPartitioner.java index e2282b0e9..22bb21861 100644 --- a/hudi-client/src/test/java/org/apache/hudi/index/bloom/TestBucketizedBloomCheckPartitioner.java +++ b/hudi-client/src/test/java/org/apache/hudi/index/bloom/TestBucketizedBloomCheckPartitioner.java @@ -45,12 +45,9 @@ public class TestBucketizedBloomCheckPartitioner { assertEquals("f1 should have 4 buckets", 4, assignments.get("f1").size()); assertEquals("f2 should have 4 buckets", 4, assignments.get("f2").size()); assertEquals("f3 should have 2 buckets", 2, assignments.get("f3").size()); - assertArrayEquals("f1 spread across 3 partitions", new Integer[]{0, 0, 1, 3}, - assignments.get("f1").toArray()); - assertArrayEquals("f2 spread across 3 partitions", new Integer[]{1, 2, 2, 0}, - assignments.get("f2").toArray()); - assertArrayEquals("f3 spread across 2 partitions", new Integer[]{3, 1}, - assignments.get("f3").toArray()); + assertArrayEquals("f1 spread across 3 partitions", new Integer[] {0, 0, 1, 3}, assignments.get("f1").toArray()); + assertArrayEquals("f2 spread across 3 partitions", new Integer[] {1, 2, 2, 0}, assignments.get("f2").toArray()); + assertArrayEquals("f3 spread across 2 partitions", new Integer[] {3, 1}, assignments.get("f3").toArray()); } @Test @@ -64,9 +61,9 @@ public class TestBucketizedBloomCheckPartitioner { BucketizedBloomCheckPartitioner partitioner = new BucketizedBloomCheckPartitioner(100, comparisons1, 10); Map> assignments = partitioner.getFileGroupToPartitions(); assignments.entrySet().stream().forEach(e -> assertEquals(10, e.getValue().size())); - Map partitionToNumBuckets = assignments.entrySet().stream() - .flatMap(e -> e.getValue().stream().map(p -> Pair.of(p, e.getKey()))) - .collect(Collectors.groupingBy(Pair::getLeft, Collectors.counting())); + Map partitionToNumBuckets = + assignments.entrySet().stream().flatMap(e -> e.getValue().stream().map(p -> Pair.of(p, e.getKey()))) + .collect(Collectors.groupingBy(Pair::getLeft, Collectors.counting())); partitionToNumBuckets.entrySet().stream().forEach(e -> assertEquals(1L, e.getValue().longValue())); } diff --git a/hudi-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieBloomIndex.java b/hudi-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieBloomIndex.java index a9234b908..68b820ab1 100644 --- a/hudi-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieBloomIndex.java +++ b/hudi-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieBloomIndex.java @@ -75,8 +75,8 @@ public class TestHoodieBloomIndex extends HoodieClientTestHarness { @Parameterized.Parameters(name = "{index}: Test with rangePruning={0}, treeFiltering ={1}, bucketizedChecking is:{2}") public static Collection data() { - Object[][] data = new Object[][]{{true, true, true}, {false, true, true}, {true, true, false}, - {true, false, true}}; + Object[][] data = + new Object[][] {{true, true, true}, {false, true, true}, {true, true, false}, {true, false, true}}; return Arrays.asList(data); } @@ -106,12 +106,9 @@ public class TestHoodieBloomIndex extends HoodieClientTestHarness { private HoodieWriteConfig makeConfig() { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) - .withIndexConfig(HoodieIndexConfig.newBuilder() - .bloomIndexPruneByRanges(rangePruning) - .bloomIndexTreebasedFilter(treeFiltering) - .bloomIndexBucketizedChecking(bucketizedChecking) - .bloomIndexKeysPerBucket(2) - .build()) + .withIndexConfig(HoodieIndexConfig.newBuilder().bloomIndexPruneByRanges(rangePruning) + .bloomIndexTreebasedFilter(treeFiltering).bloomIndexBucketizedChecking(bucketizedChecking) + .bloomIndexKeysPerBucket(2).build()) .build(); return config; } @@ -130,36 +127,31 @@ public class TestHoodieBloomIndex extends HoodieClientTestHarness { new File(basePath + "/2016/04/01").mkdirs(); new File(basePath + "/2015/03/12").mkdirs(); - TestRawTripPayload rowChange1 = new TestRawTripPayload( - "{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); - HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), - rowChange1); - TestRawTripPayload rowChange2 = new TestRawTripPayload( - "{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); - HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), - rowChange2); - TestRawTripPayload rowChange3 = new TestRawTripPayload( - "{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); - HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), - rowChange3); - TestRawTripPayload rowChange4 = new TestRawTripPayload( - "{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); - HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), - rowChange4); + TestRawTripPayload rowChange1 = + new TestRawTripPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); + HoodieRecord record1 = + new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + TestRawTripPayload rowChange2 = + new TestRawTripPayload("{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); + HoodieRecord record2 = + new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); + TestRawTripPayload rowChange3 = + new TestRawTripPayload("{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); + HoodieRecord record3 = + new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); + TestRawTripPayload rowChange4 = + new TestRawTripPayload("{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); + HoodieRecord record4 = + new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); - HoodieClientTestUtils - .writeParquetFile(basePath, "2016/04/01", "2_0_20160401010101.parquet", - Lists.newArrayList(), schema, null, false); - HoodieClientTestUtils - .writeParquetFile(basePath, "2015/03/12", "1_0_20150312101010.parquet", - Lists.newArrayList(), schema, null, false); - HoodieClientTestUtils - .writeParquetFile(basePath, "2015/03/12", "3_0_20150312101010.parquet", - Arrays.asList(record1), schema, null, false); - HoodieClientTestUtils - .writeParquetFile(basePath, "2015/03/12", "4_0_20150312101010.parquet", - Arrays.asList(record2, record3, record4), schema, null, - false); + HoodieClientTestUtils.writeParquetFile(basePath, "2016/04/01", "2_0_20160401010101.parquet", Lists.newArrayList(), + schema, null, false); + HoodieClientTestUtils.writeParquetFile(basePath, "2015/03/12", "1_0_20150312101010.parquet", Lists.newArrayList(), + schema, null, false); + HoodieClientTestUtils.writeParquetFile(basePath, "2015/03/12", "3_0_20150312101010.parquet", Arrays.asList(record1), + schema, null, false); + HoodieClientTestUtils.writeParquetFile(basePath, "2015/03/12", "4_0_20150312101010.parquet", + Arrays.asList(record2, record3, record4), schema, null, false); List partitions = Arrays.asList("2016/01/21", "2016/04/01", "2015/03/12"); metaClient = HoodieTableMetaClient.reload(metaClient); @@ -188,11 +180,11 @@ public class TestHoodieBloomIndex extends HoodieClientTestHarness { // no longer sorted, but should have same files. - List> expected = Arrays.asList( - new Tuple2<>("2016/04/01", new BloomIndexFileInfo("2")), - new Tuple2<>("2015/03/12", new BloomIndexFileInfo("1")), - new Tuple2<>("2015/03/12", new BloomIndexFileInfo("3", "000", "000")), - new Tuple2<>("2015/03/12", new BloomIndexFileInfo("4", "001", "003"))); + List> expected = + Arrays.asList(new Tuple2<>("2016/04/01", new BloomIndexFileInfo("2")), + new Tuple2<>("2015/03/12", new BloomIndexFileInfo("1")), + new Tuple2<>("2015/03/12", new BloomIndexFileInfo("3", "000", "000")), + new Tuple2<>("2015/03/12", new BloomIndexFileInfo("4", "001", "003"))); assertEquals(expected, filesList); } } @@ -203,20 +195,21 @@ public class TestHoodieBloomIndex extends HoodieClientTestHarness { HoodieBloomIndex index = new HoodieBloomIndex(config); final Map> partitionToFileIndexInfo = new HashMap<>(); - partitionToFileIndexInfo.put("2017/10/22", Arrays.asList(new BloomIndexFileInfo("f1"), - new BloomIndexFileInfo("f2", "000", "000"), new BloomIndexFileInfo("f3", "001", "003"), - new BloomIndexFileInfo("f4", "002", "007"), new BloomIndexFileInfo("f5", "009", "010"))); + partitionToFileIndexInfo.put("2017/10/22", + Arrays.asList(new BloomIndexFileInfo("f1"), new BloomIndexFileInfo("f2", "000", "000"), + new BloomIndexFileInfo("f3", "001", "003"), new BloomIndexFileInfo("f4", "002", "007"), + new BloomIndexFileInfo("f5", "009", "010"))); - JavaPairRDD partitionRecordKeyPairRDD = jsc.parallelize(Arrays.asList( - new Tuple2<>("2017/10/22", "003"), new Tuple2<>("2017/10/22", "002"), new Tuple2<>("2017/10/22", "005"), - new Tuple2<>("2017/10/22", "004"))).mapToPair(t -> t); + JavaPairRDD partitionRecordKeyPairRDD = + jsc.parallelize(Arrays.asList(new Tuple2<>("2017/10/22", "003"), new Tuple2<>("2017/10/22", "002"), + new Tuple2<>("2017/10/22", "005"), new Tuple2<>("2017/10/22", "004"))).mapToPair(t -> t); - List> comparisonKeyList = index.explodeRecordRDDWithFileComparisons( - partitionToFileIndexInfo, partitionRecordKeyPairRDD).collect(); + List> comparisonKeyList = + index.explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD).collect(); assertEquals(10, comparisonKeyList.size()); - Map> recordKeyToFileComps = comparisonKeyList.stream().collect(Collectors.groupingBy( - t -> t._2.getRecordKey(), Collectors.mapping(t -> t._1, Collectors.toList()))); + Map> recordKeyToFileComps = comparisonKeyList.stream() + .collect(Collectors.groupingBy(t -> t._2.getRecordKey(), Collectors.mapping(t -> t._1, Collectors.toList()))); assertEquals(4, recordKeyToFileComps.size()); assertEquals(new HashSet<>(Arrays.asList("f1", "f3", "f4")), new HashSet<>(recordKeyToFileComps.get("002"))); @@ -238,25 +231,24 @@ public class TestHoodieBloomIndex extends HoodieClientTestHarness { String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":32}"; TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); - HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), - rowChange1); + HoodieRecord record1 = + new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); - HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), - rowChange2); + HoodieRecord record2 = + new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3); - HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), - rowChange3); + HoodieRecord record3 = + new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4); - HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), - rowChange4); + HoodieRecord record4 = + new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); // We write record1, record2 to a parquet file, but the bloom filter contains (record1, // record2, record3). BloomFilter filter = new BloomFilter(10000, 0.0000001); filter.add(record3.getRecordKey()); - String filename = HoodieClientTestUtils - .writeParquetFile(basePath, "2016/01/31", - Arrays.asList(record1, record2), schema, filter, true); + String filename = HoodieClientTestUtils.writeParquetFile(basePath, "2016/01/31", Arrays.asList(record1, record2), + schema, filter, true); // The bloom filter contains 3 records assertTrue(filter.mightContain(record1.getRecordKey())); @@ -265,19 +257,19 @@ public class TestHoodieBloomIndex extends HoodieClientTestHarness { assertFalse(filter.mightContain(record4.getRecordKey())); // Compare with file - List uuids = Arrays.asList(record1.getRecordKey(), record2.getRecordKey(), record3.getRecordKey(), - record4.getRecordKey()); + List uuids = + Arrays.asList(record1.getRecordKey(), record2.getRecordKey(), record3.getRecordKey(), record4.getRecordKey()); List results = HoodieKeyLookupHandle.checkCandidatesAgainstFile(jsc.hadoopConfiguration(), uuids, new Path(basePath + "/2016/01/31/" + filename)); assertEquals(results.size(), 2); - assertTrue(results.get(0).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0") || results.get(1).equals( - "1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")); - assertTrue(results.get(0).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0") || results.get(1).equals( - "2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")); + assertTrue(results.get(0).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0") + || results.get(1).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")); + assertTrue(results.get(0).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0") + || results.get(1).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")); // TODO(vc): Need more coverage on actual filenames - //assertTrue(results.get(0)._2().equals(filename)); - //assertTrue(results.get(1)._2().equals(filename)); + // assertTrue(results.get(0)._2().equals(filename)); + // assertTrue(results.get(1)._2().equals(filename)); } @Test @@ -306,27 +298,23 @@ public class TestHoodieBloomIndex extends HoodieClientTestHarness { String rowKey1 = UUID.randomUUID().toString(); String rowKey2 = UUID.randomUUID().toString(); String rowKey3 = UUID.randomUUID().toString(); - String recordStr1 = "{\"_row_key\":\"" + rowKey1 + "\"," - + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; - String recordStr2 = "{\"_row_key\":\"" + rowKey2 + "\"," - + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; - String recordStr3 = "{\"_row_key\":\"" + rowKey3 + "\"," - + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; + String recordStr1 = "{\"_row_key\":\"" + rowKey1 + "\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; + String recordStr2 = "{\"_row_key\":\"" + rowKey2 + "\"," + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; + String recordStr3 = "{\"_row_key\":\"" + rowKey3 + "\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; // place same row key under a different partition. - String recordStr4 = "{\"_row_key\":\"" + rowKey1 + "\"," - + "\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}"; + String recordStr4 = "{\"_row_key\":\"" + rowKey1 + "\"," + "\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}"; TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); - HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), - rowChange1); + HoodieRecord record1 = + new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); - HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), - rowChange2); + HoodieRecord record2 = + new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3); - HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), - rowChange3); + HoodieRecord record3 = + new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4); - HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), - rowChange4); + HoodieRecord record4 = + new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); JavaRDD recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4)); // Also create the metadata and config @@ -406,8 +394,8 @@ public class TestHoodieBloomIndex extends HoodieClientTestHarness { // Let's tag HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config); - JavaPairRDD>> taggedRecordRDD = bloomIndex - .fetchRecordLocation(keysRDD, jsc, table); + JavaPairRDD>> taggedRecordRDD = + bloomIndex.fetchRecordLocation(keysRDD, jsc, table); // Should not find any files for (Tuple2>> record : taggedRecordRDD.collect()) { @@ -456,17 +444,16 @@ public class TestHoodieBloomIndex extends HoodieClientTestHarness { // We write record1 to a parquet file, using a bloom filter having both records TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); - HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), - rowChange1); + HoodieRecord record1 = + new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); - HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), - rowChange2); + HoodieRecord record2 = + new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); BloomFilter filter = new BloomFilter(10000, 0.0000001); filter.add(record2.getRecordKey()); - String filename = HoodieClientTestUtils - .writeParquetFile(basePath, "2016/01/31", - Arrays.asList(record1), schema, filter, true); + String filename = + HoodieClientTestUtils.writeParquetFile(basePath, "2016/01/31", Arrays.asList(record1), schema, filter, true); assertTrue(filter.mightContain(record1.getRecordKey())); assertTrue(filter.mightContain(record2.getRecordKey())); diff --git a/hudi-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieGlobalBloomIndex.java b/hudi-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieGlobalBloomIndex.java index cdf44419b..a6cdc40e1 100644 --- a/hudi-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieGlobalBloomIndex.java +++ b/hudi-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieGlobalBloomIndex.java @@ -58,8 +58,7 @@ public class TestHoodieGlobalBloomIndex extends HoodieClientTestHarness { private String schemaStr; private Schema schema; - public TestHoodieGlobalBloomIndex() throws Exception { - } + public TestHoodieGlobalBloomIndex() throws Exception {} @Before public void setUp() throws Exception { @@ -94,35 +93,31 @@ public class TestHoodieGlobalBloomIndex extends HoodieClientTestHarness { new File(basePath + "/2015/03/12").mkdirs(); new File(basePath + "/2015/03/12/" + HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE).createNewFile(); - TestRawTripPayload rowChange1 = new TestRawTripPayload( - "{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); - HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), - rowChange1); - TestRawTripPayload rowChange2 = new TestRawTripPayload( - "{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); - HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), - rowChange2); - TestRawTripPayload rowChange3 = new TestRawTripPayload( - "{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); - HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), - rowChange3); - TestRawTripPayload rowChange4 = new TestRawTripPayload( - "{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); - HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), - rowChange4); + TestRawTripPayload rowChange1 = + new TestRawTripPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); + HoodieRecord record1 = + new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + TestRawTripPayload rowChange2 = + new TestRawTripPayload("{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); + HoodieRecord record2 = + new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); + TestRawTripPayload rowChange3 = + new TestRawTripPayload("{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); + HoodieRecord record3 = + new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); + TestRawTripPayload rowChange4 = + new TestRawTripPayload("{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); + HoodieRecord record4 = + new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); - HoodieClientTestUtils - .writeParquetFile(basePath, "2016/04/01", "2_0_20160401010101.parquet", - Lists.newArrayList(), schema, null, false); - HoodieClientTestUtils - .writeParquetFile(basePath, "2015/03/12", "1_0_20150312101010.parquet", - Lists.newArrayList(), schema, null, false); - HoodieClientTestUtils - .writeParquetFile(basePath, "2015/03/12", "3_0_20150312101010.parquet", - Arrays.asList(record1), schema, null, false); - HoodieClientTestUtils - .writeParquetFile(basePath, "2015/03/12", "4_0_20150312101010.parquet", - Arrays.asList(record2, record3, record4), schema, null, false); + HoodieClientTestUtils.writeParquetFile(basePath, "2016/04/01", "2_0_20160401010101.parquet", Lists.newArrayList(), + schema, null, false); + HoodieClientTestUtils.writeParquetFile(basePath, "2015/03/12", "1_0_20150312101010.parquet", Lists.newArrayList(), + schema, null, false); + HoodieClientTestUtils.writeParquetFile(basePath, "2015/03/12", "3_0_20150312101010.parquet", Arrays.asList(record1), + schema, null, false); + HoodieClientTestUtils.writeParquetFile(basePath, "2015/03/12", "4_0_20150312101010.parquet", + Arrays.asList(record2, record3, record4), schema, null, false); // intentionally missed the partition "2015/03/12" to see if the GlobalBloomIndex can pick it up List partitions = Arrays.asList("2016/01/21", "2016/04/01"); @@ -154,10 +149,8 @@ public class TestHoodieGlobalBloomIndex extends HoodieClientTestHarness { Map expected = new HashMap<>(); expected.put("2016/04/01/2", new BloomIndexFileInfo("2")); expected.put("2015/03/12/1", new BloomIndexFileInfo("1")); - expected.put("2015/03/12/3", - new BloomIndexFileInfo("3", "000", "000")); - expected.put("2015/03/12/4", - new BloomIndexFileInfo("4", "001", "003")); + expected.put("2015/03/12/3", new BloomIndexFileInfo("3", "000", "000")); + expected.put("2015/03/12/4", new BloomIndexFileInfo("4", "001", "003")); assertEquals(expected, filesMap); } @@ -172,28 +165,24 @@ public class TestHoodieGlobalBloomIndex extends HoodieClientTestHarness { partitionToFileIndexInfo.put("2017/10/22", Arrays.asList(new BloomIndexFileInfo("f1"), new BloomIndexFileInfo("f2", "000", "000"), new BloomIndexFileInfo("f3", "001", "003"))); - partitionToFileIndexInfo.put("2017/10/23", Arrays.asList( - new BloomIndexFileInfo("f4", "002", "007"), new BloomIndexFileInfo("f5", "009", "010"))); + partitionToFileIndexInfo.put("2017/10/23", + Arrays.asList(new BloomIndexFileInfo("f4", "002", "007"), new BloomIndexFileInfo("f5", "009", "010"))); // the partition partition of the key of the incoming records will be ignored - JavaPairRDD partitionRecordKeyPairRDD = jsc.parallelize(Arrays.asList( - new Tuple2<>("2017/10/21", "003"), new Tuple2<>("2017/10/22", "002"), new Tuple2<>("2017/10/22", "005"), - new Tuple2<>("2017/10/23", "004"))).mapToPair(t -> t); + JavaPairRDD partitionRecordKeyPairRDD = + jsc.parallelize(Arrays.asList(new Tuple2<>("2017/10/21", "003"), new Tuple2<>("2017/10/22", "002"), + new Tuple2<>("2017/10/22", "005"), new Tuple2<>("2017/10/23", "004"))).mapToPair(t -> t); List> comparisonKeyList = index.explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD).collect(); - /* expecting: - f4, HoodieKey { recordKey=003 partitionPath=2017/10/23} - f1, HoodieKey { recordKey=003 partitionPath=2017/10/22} - f3, HoodieKey { recordKey=003 partitionPath=2017/10/22} - f4, HoodieKey { recordKey=002 partitionPath=2017/10/23} - f1, HoodieKey { recordKey=002 partitionPath=2017/10/22} - f3, HoodieKey { recordKey=002 partitionPath=2017/10/22} - f4, HoodieKey { recordKey=005 partitionPath=2017/10/23} - f1, HoodieKey { recordKey=005 partitionPath=2017/10/22} - f4, HoodieKey { recordKey=004 partitionPath=2017/10/23} - f1, HoodieKey { recordKey=004 partitionPath=2017/10/22} + /* + * expecting: f4, HoodieKey { recordKey=003 partitionPath=2017/10/23} f1, HoodieKey { recordKey=003 + * partitionPath=2017/10/22} f3, HoodieKey { recordKey=003 partitionPath=2017/10/22} f4, HoodieKey { recordKey=002 + * partitionPath=2017/10/23} f1, HoodieKey { recordKey=002 partitionPath=2017/10/22} f3, HoodieKey { recordKey=002 + * partitionPath=2017/10/22} f4, HoodieKey { recordKey=005 partitionPath=2017/10/23} f1, HoodieKey { recordKey=005 + * partitionPath=2017/10/22} f4, HoodieKey { recordKey=004 partitionPath=2017/10/23} f1, HoodieKey { recordKey=004 + * partitionPath=2017/10/22} */ assertEquals(10, comparisonKeyList.size()); @@ -225,31 +214,31 @@ public class TestHoodieGlobalBloomIndex extends HoodieClientTestHarness { new File(basePath + "/2015/03/12").mkdirs(); new File(basePath + "/2015/03/12/" + HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE).createNewFile(); - TestRawTripPayload rowChange1 = new TestRawTripPayload( - "{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); - HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), - rowChange1); - TestRawTripPayload rowChange2 = new TestRawTripPayload( - "{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); - HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), - rowChange2); - TestRawTripPayload rowChange3 = new TestRawTripPayload( - "{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); - HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), - rowChange3); + TestRawTripPayload rowChange1 = + new TestRawTripPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); + HoodieRecord record1 = + new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + TestRawTripPayload rowChange2 = + new TestRawTripPayload("{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); + HoodieRecord record2 = + new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); + TestRawTripPayload rowChange3 = + new TestRawTripPayload("{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); + HoodieRecord record3 = + new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); // this record will be saved in table and will be tagged to the incoming record5 - TestRawTripPayload rowChange4 = new TestRawTripPayload( - "{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); - HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), - rowChange4); + TestRawTripPayload rowChange4 = + new TestRawTripPayload("{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); + HoodieRecord record4 = + new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); // this has the same record key as record4 but different time so different partition, but globalbloomIndex should // tag the original partition of the saved record4 - TestRawTripPayload rowChange5 = new TestRawTripPayload( - "{\"_row_key\":\"003\",\"time\":\"2016-02-31T03:16:41.415Z\",\"number\":12}"); - HoodieRecord record5 = new HoodieRecord(new HoodieKey(rowChange5.getRowKey(), rowChange5.getPartitionPath()), - rowChange4); + TestRawTripPayload rowChange5 = + new TestRawTripPayload("{\"_row_key\":\"003\",\"time\":\"2016-02-31T03:16:41.415Z\",\"number\":12}"); + HoodieRecord record5 = + new HoodieRecord(new HoodieKey(rowChange5.getRowKey(), rowChange5.getPartitionPath()), rowChange4); JavaRDD recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record5)); diff --git a/hudi-client/src/test/java/org/apache/hudi/io/TestHoodieCommitArchiveLog.java b/hudi-client/src/test/java/org/apache/hudi/io/TestHoodieCommitArchiveLog.java index 8ea0ae4a2..1f1d32432 100644 --- a/hudi-client/src/test/java/org/apache/hudi/io/TestHoodieCommitArchiveLog.java +++ b/hudi-client/src/test/java/org/apache/hudi/io/TestHoodieCommitArchiveLog.java @@ -75,9 +75,9 @@ public class TestHoodieCommitArchiveLog extends HoodieClientTestHarness { @Test public void testArchiveEmptyDataset() throws IOException { - HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) - .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) - .forTable("test-trip-table").build(); + HoodieWriteConfig cfg = + HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) + .withParallelism(2, 2).forTable("test-trip-table").build(); metaClient = HoodieTableMetaClient.reload(metaClient); HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, metaClient); boolean result = archiveLog.archiveIfRequired(jsc); @@ -88,8 +88,7 @@ public class TestHoodieCommitArchiveLog extends HoodieClientTestHarness { public void testArchiveDatasetWithArchival() throws IOException { HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) - .withCompactionConfig( - HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 4).build()) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 4).build()) .forTable("test-trip-table").build(); HoodieTestUtils.init(hadoopConf, basePath); // Requested Compaction @@ -149,7 +148,7 @@ public class TestHoodieCommitArchiveLog extends HoodieClientTestHarness { HoodieTestUtils.createCleanFiles(basePath, "105", dfs.getConf()); HoodieTestUtils.createInflightCleanFiles(basePath, dfs.getConf(), "106", "107"); - //reload the timeline and get all the commmits before archive + // reload the timeline and get all the commmits before archive timeline = metaClient.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants(); List originalCommits = timeline.getInstants().collect(Collectors.toList()); @@ -163,49 +162,47 @@ public class TestHoodieCommitArchiveLog extends HoodieClientTestHarness { assertTrue(archiveLog.archiveIfRequired(jsc)); - //reload the timeline and remove the remaining commits + // reload the timeline and remove the remaining commits timeline = metaClient.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants(); originalCommits.removeAll(timeline.getInstants().collect(Collectors.toList())); // Check compaction instants - List instants = - HoodieTableMetaClient.scanHoodieInstantsFromFileSystem(metaClient.getFs(), - new Path(metaClient.getMetaAuxiliaryPath()), - HoodieActiveTimeline.VALID_EXTENSIONS_IN_ACTIVE_TIMELINE); + List instants = HoodieTableMetaClient.scanHoodieInstantsFromFileSystem(metaClient.getFs(), + new Path(metaClient.getMetaAuxiliaryPath()), HoodieActiveTimeline.VALID_EXTENSIONS_IN_ACTIVE_TIMELINE); assertEquals("Should delete all compaction instants < 104", 4, instants.size()); - assertFalse("Requested Compaction must be absent for 100", instants.contains( - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "100"))); - assertFalse("Inflight Compaction must be absent for 100", instants.contains( - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "100"))); - assertFalse("Requested Compaction must be absent for 101", instants.contains( - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "101"))); - assertFalse("Inflight Compaction must be absent for 101", instants.contains( - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "101"))); - assertFalse("Requested Compaction must be absent for 102", instants.contains( - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "102"))); - assertFalse("Inflight Compaction must be absent for 102", instants.contains( - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "102"))); - assertFalse("Requested Compaction must be absent for 103", instants.contains( - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "103"))); - assertFalse("Inflight Compaction must be absent for 103", instants.contains( - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "103"))); - assertTrue("Requested Compaction must be present for 104", instants.contains( - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "104"))); - assertTrue("Inflight Compaction must be present for 104", instants.contains( - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "104"))); - assertTrue("Requested Compaction must be present for 105", instants.contains( - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "105"))); - assertTrue("Inflight Compaction must be present for 105", instants.contains( - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "105"))); + assertFalse("Requested Compaction must be absent for 100", + instants.contains(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "100"))); + assertFalse("Inflight Compaction must be absent for 100", + instants.contains(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "100"))); + assertFalse("Requested Compaction must be absent for 101", + instants.contains(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "101"))); + assertFalse("Inflight Compaction must be absent for 101", + instants.contains(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "101"))); + assertFalse("Requested Compaction must be absent for 102", + instants.contains(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "102"))); + assertFalse("Inflight Compaction must be absent for 102", + instants.contains(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "102"))); + assertFalse("Requested Compaction must be absent for 103", + instants.contains(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "103"))); + assertFalse("Inflight Compaction must be absent for 103", + instants.contains(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "103"))); + assertTrue("Requested Compaction must be present for 104", + instants.contains(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "104"))); + assertTrue("Inflight Compaction must be present for 104", + instants.contains(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "104"))); + assertTrue("Requested Compaction must be present for 105", + instants.contains(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "105"))); + assertTrue("Inflight Compaction must be present for 105", + instants.contains(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "105"))); - //read the file - Reader reader = HoodieLogFormat.newReader(dfs, - new HoodieLogFile(new Path(basePath + "/.hoodie/.commits_.archive.1_1-0-1")), - HoodieArchivedMetaEntry.getClassSchema()); + // read the file + Reader reader = + HoodieLogFormat.newReader(dfs, new HoodieLogFile(new Path(basePath + "/.hoodie/.commits_.archive.1_1-0-1")), + HoodieArchivedMetaEntry.getClassSchema()); int archivedRecordsCount = 0; List readRecords = new ArrayList<>(); - //read the avro blocks and validate the number of records written in each avro block + // read the avro blocks and validate the number of records written in each avro block while (reader.hasNext()) { HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next(); List records = blk.getRecords(); @@ -215,7 +212,7 @@ public class TestHoodieCommitArchiveLog extends HoodieClientTestHarness { } assertEquals("Total archived records and total read records are the same count", 8, archivedRecordsCount); - //make sure the archived commits are the same as the (originalcommits - commitsleft) + // make sure the archived commits are the same as the (originalcommits - commitsleft) List readCommits = readRecords.stream().map(r -> (GenericRecord) r).map(r -> { return r.get("commitTime").toString(); }).collect(Collectors.toList()); @@ -232,9 +229,9 @@ public class TestHoodieCommitArchiveLog extends HoodieClientTestHarness { @Test public void testArchiveDatasetWithNoArchival() throws IOException { HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) - .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) - .forTable("test-trip-table").withCompactionConfig( - HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 5).build()).build(); + .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2).forTable("test-trip-table") + .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 5).build()) + .build(); metaClient = HoodieTableMetaClient.reload(metaClient); HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, metaClient); // Requested Compaction @@ -273,35 +270,33 @@ public class TestHoodieCommitArchiveLog extends HoodieClientTestHarness { timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants(); assertEquals("Should not archive commits when maxCommitsToKeep is 5", 4, timeline.countInstants()); - List instants = - HoodieTableMetaClient.scanHoodieInstantsFromFileSystem(metaClient.getFs(), - new Path(metaClient.getMetaAuxiliaryPath()), - HoodieActiveTimeline.VALID_EXTENSIONS_IN_ACTIVE_TIMELINE); + List instants = HoodieTableMetaClient.scanHoodieInstantsFromFileSystem(metaClient.getFs(), + new Path(metaClient.getMetaAuxiliaryPath()), HoodieActiveTimeline.VALID_EXTENSIONS_IN_ACTIVE_TIMELINE); assertEquals("Should not delete any aux compaction files when maxCommitsToKeep is 5", 8, instants.size()); - assertTrue("Requested Compaction must be present for 100", instants.contains( - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "100"))); - assertTrue("Inflight Compaction must be present for 100", instants.contains( - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "100"))); - assertTrue("Requested Compaction must be present for 101", instants.contains( - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "101"))); - assertTrue("Inflight Compaction must be present for 101", instants.contains( - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "101"))); - assertTrue("Requested Compaction must be present for 102", instants.contains( - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "102"))); - assertTrue("Inflight Compaction must be present for 102", instants.contains( - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "102"))); - assertTrue("Requested Compaction must be present for 103", instants.contains( - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "103"))); - assertTrue("Inflight Compaction must be present for 103", instants.contains( - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "103"))); + assertTrue("Requested Compaction must be present for 100", + instants.contains(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "100"))); + assertTrue("Inflight Compaction must be present for 100", + instants.contains(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "100"))); + assertTrue("Requested Compaction must be present for 101", + instants.contains(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "101"))); + assertTrue("Inflight Compaction must be present for 101", + instants.contains(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "101"))); + assertTrue("Requested Compaction must be present for 102", + instants.contains(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "102"))); + assertTrue("Inflight Compaction must be present for 102", + instants.contains(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "102"))); + assertTrue("Requested Compaction must be present for 103", + instants.contains(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "103"))); + assertTrue("Inflight Compaction must be present for 103", + instants.contains(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "103"))); } @Test public void testArchiveCommitSafety() throws IOException { HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) - .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) - .forTable("test-trip-table").withCompactionConfig( - HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 5).build()).build(); + .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2).forTable("test-trip-table") + .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 5).build()) + .build(); metaClient = HoodieTableMetaClient.reload(metaClient); HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, metaClient); HoodieTestDataGenerator.createCommitFile(basePath, "100", dfs.getConf()); @@ -325,9 +320,9 @@ public class TestHoodieCommitArchiveLog extends HoodieClientTestHarness { @Test public void testArchiveCommitSavepointNoHole() throws IOException { HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) - .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) - .forTable("test-trip-table").withCompactionConfig( - HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 5).build()).build(); + .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2).forTable("test-trip-table") + .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 5).build()) + .build(); metaClient = HoodieTableMetaClient.reload(metaClient); HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, metaClient); HoodieTestDataGenerator.createCommitFile(basePath, "100", dfs.getConf()); @@ -357,9 +352,9 @@ public class TestHoodieCommitArchiveLog extends HoodieClientTestHarness { @Test public void testArchiveCommitCompactionNoHole() throws IOException { HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) - .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) - .forTable("test-trip-table").withCompactionConfig( - HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 5).build()).build(); + .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2).forTable("test-trip-table") + .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 5).build()) + .build(); metaClient = HoodieTableMetaClient.reload(metaClient); HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, metaClient); HoodieTestDataGenerator.createCommitFile(basePath, "100", dfs.getConf()); @@ -382,9 +377,8 @@ public class TestHoodieCommitArchiveLog extends HoodieClientTestHarness { timeline = metaClient.getActiveTimeline().reload().getCommitsAndCompactionTimeline(); assertFalse("Instants before oldest pending compaction can be removed", timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "100"))); - assertEquals( - "Since we have a pending compaction at 101, we should never archive any commit " - + "after 101 (we only " + "archive 100)", 7, timeline.countInstants()); + assertEquals("Since we have a pending compaction at 101, we should never archive any commit " + + "after 101 (we only " + "archive 100)", 7, timeline.countInstants()); assertTrue("Requested Compaction must still be present", timeline.containsInstant(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "101"))); assertTrue("Instants greater than oldest pending compaction must be present", diff --git a/hudi-client/src/test/java/org/apache/hudi/io/TestHoodieCompactor.java b/hudi-client/src/test/java/org/apache/hudi/io/TestHoodieCompactor.java index 150f6e6fd..ea2631748 100644 --- a/hudi-client/src/test/java/org/apache/hudi/io/TestHoodieCompactor.java +++ b/hudi-client/src/test/java/org/apache/hudi/io/TestHoodieCompactor.java @@ -86,9 +86,10 @@ public class TestHoodieCompactor extends HoodieClientTestHarness { private HoodieWriteConfig.Builder getConfigBuilder() { return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) - .withParallelism(2, 2).withCompactionConfig( - HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).withInlineCompaction(false) - .build()).withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build()) + .withParallelism(2, 2) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024) + .withInlineCompaction(false).build()) + .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build()) .withMemoryConfig(HoodieMemoryConfig.newBuilder().withMaxDFSStreamBufferSize(1 * 1024 * 1024).build()) .forTable("test-trip-table") .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()); @@ -146,16 +147,15 @@ public class TestHoodieCompactor extends HoodieClientTestHarness { updatedRecords = index.tagLocation(updatedRecordsRDD, jsc, table).collect(); // Write them to corresponding avro logfiles - HoodieTestUtils - .writeRecordsToLogFiles(fs, metaClient.getBasePath(), HoodieTestDataGenerator.avroSchemaWithMetadataFields, - updatedRecords); + HoodieTestUtils.writeRecordsToLogFiles(fs, metaClient.getBasePath(), + HoodieTestDataGenerator.avroSchemaWithMetadataFields, updatedRecords); // Verify that all data file has one log file metaClient = HoodieTableMetaClient.reload(metaClient); table = HoodieTable.getHoodieTable(metaClient, config, jsc); for (String partitionPath : dataGen.getPartitionPaths()) { - List groupedLogFiles = table.getRTFileSystemView().getLatestFileSlices(partitionPath) - .collect(Collectors.toList()); + List groupedLogFiles = + table.getRTFileSystemView().getLatestFileSlices(partitionPath).collect(Collectors.toList()); for (FileSlice fileSlice : groupedLogFiles) { assertEquals("There should be 1 log file written for every data file", 1, fileSlice.getLogFiles().count()); } @@ -173,8 +173,7 @@ public class TestHoodieCompactor extends HoodieClientTestHarness { for (String partitionPath : dataGen.getPartitionPaths()) { List writeStatuses = result.collect(); assertTrue(writeStatuses.stream() - .filter(writeStatus -> writeStatus.getStat().getPartitionPath().contentEquals(partitionPath)) - .count() > 0); + .filter(writeStatus -> writeStatus.getStat().getPartitionPath().contentEquals(partitionPath)).count() > 0); } } } diff --git a/hudi-client/src/test/java/org/apache/hudi/io/TestHoodieMergeHandle.java b/hudi-client/src/test/java/org/apache/hudi/io/TestHoodieMergeHandle.java index 5fdde9978..664063067 100644 --- a/hudi-client/src/test/java/org/apache/hudi/io/TestHoodieMergeHandle.java +++ b/hudi-client/src/test/java/org/apache/hudi/io/TestHoodieMergeHandle.java @@ -78,7 +78,7 @@ public class TestHoodieMergeHandle extends HoodieClientTestHarness { public void testUpsertsForMultipleRecordsInSameFile() throws Exception { // Create records in a single partition String partitionPath = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0]; - dataGen = new HoodieTestDataGenerator(new String[]{partitionPath}); + dataGen = new HoodieTestDataGenerator(new String[] {partitionPath}); // Build a write config with bulkinsertparallelism set HoodieWriteConfig cfg = getConfigBuilder().build(); @@ -86,9 +86,8 @@ public class TestHoodieMergeHandle extends HoodieClientTestHarness { FileSystem fs = FSUtils.getFs(basePath, jsc.hadoopConfiguration()); /** - * Write 1 (only inserts) - * This will do a bulk insert of 44 records of which there are 2 records repeated 21 times each. - * id1 (21 records), id2 (21 records), id3, id4 + * Write 1 (only inserts) This will do a bulk insert of 44 records of which there are 2 records repeated 21 times + * each. id1 (21 records), id2 (21 records), id3, id4 */ String newCommitTime = "001"; client.startCommitWithTime(newCommitTime); @@ -113,16 +112,13 @@ public class TestHoodieMergeHandle extends HoodieClientTestHarness { assertEquals("Expecting a single commit.", 1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants()); Assert.assertEquals("Latest commit should be 001", newCommitTime, timeline.lastInstant().get().getTimestamp()); - assertEquals("Must contain 44 records", - records.size(), + assertEquals("Must contain 44 records", records.size(), HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count()); /** - * Write 2 (insert) - * This will do a bulk insert of 1 record with the same row_key as record1 in the previous insert - id1. - * At this point, we will have 2 files with the row_keys as shown here - - * File 1 - id1 (21 records), id2 (21 records), id3, id4 - * File 2 - id1 + * Write 2 (insert) This will do a bulk insert of 1 record with the same row_key as record1 in the previous insert + * - id1. At this point, we will have 2 files with the row_keys as shown here - File 1 - id1 (21 records), id2 (21 + * records), id3, id4 File 2 - id1 */ newCommitTime = "002"; client.startCommitWithTime(newCommitTime); @@ -138,19 +134,15 @@ public class TestHoodieMergeHandle extends HoodieClientTestHarness { // verify that there are 2 commits metaClient = HoodieTableMetaClient.reload(metaClient); timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline(); - assertEquals("Expecting two commits.", 2, timeline.findInstantsAfter("000", Integer.MAX_VALUE) - .countInstants()); + assertEquals("Expecting two commits.", 2, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants()); Assert.assertEquals("Latest commit should be 002", newCommitTime, timeline.lastInstant().get().getTimestamp()); Dataset dataSet = getRecords(); assertEquals("Must contain 45 records", 45, dataSet.count()); /** - * Write 3 (insert) - * This will bulk insert 2 new completely new records. - * At this point, we will have 2 files with the row_keys as shown here - - * File 1 - id1 (21 records), id2 (21 records), id3, id4 - * File 2 - id1 - * File 3 - id5, id6 + * Write 3 (insert) This will bulk insert 2 new completely new records. At this point, we will have 2 files with + * the row_keys as shown here - File 1 - id1 (21 records), id2 (21 records), id3, id4 File 2 - id1 File 3 - id5, + * id6 */ newCommitTime = "003"; client.startCommitWithTime(newCommitTime); @@ -162,19 +154,16 @@ public class TestHoodieMergeHandle extends HoodieClientTestHarness { // verify that there are now 3 commits metaClient = HoodieTableMetaClient.reload(metaClient); timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline(); - assertEquals("Expecting three commits.", 3, timeline.findInstantsAfter("000", Integer.MAX_VALUE) - .countInstants()); + assertEquals("Expecting three commits.", 3, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants()); Assert.assertEquals("Latest commit should be 003", newCommitTime, timeline.lastInstant().get().getTimestamp()); dataSet = getRecords(); assertEquals("Must contain 47 records", 47, dataSet.count()); /** - * Write 4 (updates) - * This will generate 2 upsert records with id1 and id2. The rider and driver names in the update records - * will be rider-004 and driver-004. - * After the upsert is complete, all the records with id1 in File 1 and File 2 must be updated, all the records - * with id2 in File 2 must also be updated. - * Also, none of the other records in File 1, File 2 and File 3 must be updated. + * Write 4 (updates) This will generate 2 upsert records with id1 and id2. The rider and driver names in the + * update records will be rider-004 and driver-004. After the upsert is complete, all the records with id1 in File + * 1 and File 2 must be updated, all the records with id2 in File 2 must also be updated. Also, none of the other + * records in File 1, File 2 and File 3 must be updated. */ newCommitTime = "004"; client.startCommitWithTime(newCommitTime); @@ -195,8 +184,7 @@ public class TestHoodieMergeHandle extends HoodieClientTestHarness { // verify there are now 4 commits timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline(); - assertEquals("Expecting four commits.", 4, timeline.findInstantsAfter("000", Integer.MAX_VALUE) - .countInstants()); + assertEquals("Expecting four commits.", 4, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants()); Assert.assertEquals("Latest commit should be 004", timeline.lastInstant().get().getTimestamp(), newCommitTime); // Check the entire dataset has 47 records still @@ -247,15 +235,16 @@ public class TestHoodieMergeHandle extends HoodieClientTestHarness { Assert.assertTrue(statuses.stream() .filter(status -> status.getStat().getPrevCommit() != HoodieWriteStat.NULL_COMMIT).count() > 0); // Num writes should be equal to the number of records inserted - Assert.assertEquals((long) statuses.stream() - .map(status -> status.getStat().getNumWrites()).reduce((a, b) -> a + b).get(), 100); + Assert.assertEquals( + (long) statuses.stream().map(status -> status.getStat().getNumWrites()).reduce((a, b) -> a + b).get(), 100); // Num update writes should be equal to the number of records updated - Assert.assertEquals((long) statuses.stream() - .map(status -> status.getStat().getNumUpdateWrites()).reduce((a, b) -> a + b).get(), 0); + Assert.assertEquals( + (long) statuses.stream().map(status -> status.getStat().getNumUpdateWrites()).reduce((a, b) -> a + b).get(), + 0); // Num update writes should be equal to the number of insert records converted to updates as part of small file // handling - Assert.assertEquals((long) statuses.stream() - .map(status -> status.getStat().getNumInserts()).reduce((a, b) -> a + b).get(), 100); + Assert.assertEquals( + (long) statuses.stream().map(status -> status.getStat().getNumInserts()).reduce((a, b) -> a + b).get(), 100); // Update all the 100 records metaClient = HoodieTableMetaClient.reload(metaClient); @@ -269,18 +258,20 @@ public class TestHoodieMergeHandle extends HoodieClientTestHarness { statuses = writeClient.upsert(updatedRecordsRDD, newCommitTime).collect(); // All records should be upserts into existing parquet - Assert.assertEquals(statuses.stream() - .filter(status -> status.getStat().getPrevCommit() == HoodieWriteStat.NULL_COMMIT).count(), 0); + Assert.assertEquals( + statuses.stream().filter(status -> status.getStat().getPrevCommit() == HoodieWriteStat.NULL_COMMIT).count(), + 0); // Num writes should be equal to the number of records inserted - Assert.assertEquals((long) statuses.stream() - .map(status -> status.getStat().getNumWrites()).reduce((a, b) -> a + b).get(), 100); + Assert.assertEquals( + (long) statuses.stream().map(status -> status.getStat().getNumWrites()).reduce((a, b) -> a + b).get(), 100); // Num update writes should be equal to the number of records updated - Assert.assertEquals((long) statuses.stream() - .map(status -> status.getStat().getNumUpdateWrites()).reduce((a, b) -> a + b).get(), 100); + Assert.assertEquals( + (long) statuses.stream().map(status -> status.getStat().getNumUpdateWrites()).reduce((a, b) -> a + b).get(), + 100); // Num update writes should be equal to the number of insert records converted to updates as part of small file // handling - Assert.assertEquals((long) statuses.stream() - .map(status -> status.getStat().getNumInserts()).reduce((a, b) -> a + b).get(), 0); + Assert.assertEquals( + (long) statuses.stream().map(status -> status.getStat().getNumInserts()).reduce((a, b) -> a + b).get(), 0); newCommitTime = "102"; writeClient.startCommitWithTime(newCommitTime); @@ -294,15 +285,16 @@ public class TestHoodieMergeHandle extends HoodieClientTestHarness { Assert.assertEquals((long) statuses.stream() .filter(status -> status.getStat().getPrevCommit() == HoodieWriteStat.NULL_COMMIT).count(), 0); // Num writes should be equal to the total number of records written - Assert.assertEquals((long) statuses.stream() - .map(status -> status.getStat().getNumWrites()).reduce((a, b) -> a + b).get(), 200); + Assert.assertEquals( + (long) statuses.stream().map(status -> status.getStat().getNumWrites()).reduce((a, b) -> a + b).get(), 200); // Num update writes should be equal to the number of records updated (including inserts converted as updates) - Assert.assertEquals((long) statuses.stream() - .map(status -> status.getStat().getNumUpdateWrites()).reduce((a, b) -> a + b).get(), 100); + Assert.assertEquals( + (long) statuses.stream().map(status -> status.getStat().getNumUpdateWrites()).reduce((a, b) -> a + b).get(), + 100); // Num update writes should be equal to the number of insert records converted to updates as part of small file // handling - Assert.assertEquals((long) statuses.stream() - .map(status -> status.getStat().getNumInserts()).reduce((a, b) -> a + b).get(), 100); + Assert.assertEquals( + (long) statuses.stream().map(status -> status.getStat().getNumInserts()).reduce((a, b) -> a + b).get(), 100); // Verify all records have location set statuses.forEach(writeStatus -> { writeStatus.getWrittenRecords().forEach(r -> { @@ -319,8 +311,7 @@ public class TestHoodieMergeHandle extends HoodieClientTestHarness { for (int i = 0; i < fullPartitionPaths.length; i++) { fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]); } - Dataset dataSet = HoodieClientTestUtils.read(jsc, basePath, sqlContext, fs, - fullPartitionPaths); + Dataset dataSet = HoodieClientTestUtils.read(jsc, basePath, sqlContext, fs, fullPartitionPaths); return dataSet; } @@ -343,8 +334,7 @@ public class TestHoodieMergeHandle extends HoodieClientTestHarness { .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build()) .forTable("test-trip-table") .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) - .withBulkInsertParallelism(2) - .withWriteStatusClass(TestWriteStatus.class); + .withBulkInsertParallelism(2).withWriteStatusClass(TestWriteStatus.class); } /** diff --git a/hudi-client/src/test/java/org/apache/hudi/io/storage/TestHoodieStorageWriterFactory.java b/hudi-client/src/test/java/org/apache/hudi/io/storage/TestHoodieStorageWriterFactory.java index c4316442b..88a67ab45 100755 --- a/hudi-client/src/test/java/org/apache/hudi/io/storage/TestHoodieStorageWriterFactory.java +++ b/hudi-client/src/test/java/org/apache/hudi/io/storage/TestHoodieStorageWriterFactory.java @@ -1,62 +1,60 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.io.storage; - -import static org.junit.Assert.fail; - -import java.io.IOException; -import org.apache.avro.generic.IndexedRecord; -import org.apache.hadoop.fs.Path; -import org.apache.hudi.TestHoodieClientBase; -import org.apache.hudi.common.HoodieTestDataGenerator; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.HoodieTable; -import org.junit.Assert; -import org.junit.Test; - -/** - * Tests for {@link HoodieStorageWriterFactory}. - */ -public class TestHoodieStorageWriterFactory extends TestHoodieClientBase { - - @Test - public void testGetStorageWriter() throws IOException { - // parquet file format. - final String commitTime = "100"; - final Path parquetPath = new Path(basePath + "/partition/path/f1_1-0-1_000.parquet"); - final HoodieWriteConfig cfg = getConfig(); - HoodieTable table = HoodieTable.getHoodieTable(metaClient, cfg, jsc); - HoodieStorageWriter parquetWriter = - HoodieStorageWriterFactory.getStorageWriter( - commitTime, parquetPath, table, cfg, HoodieTestDataGenerator.avroSchema); - Assert.assertTrue(parquetWriter instanceof HoodieParquetWriter); - - // other file format exception. - final Path logPath = new Path(basePath + "/partition/path/f.b51192a8-574b-4a85-b246-bcfec03ac8bf_100.log.2_1-0-1"); - try { - HoodieStorageWriter logWriter = - HoodieStorageWriterFactory.getStorageWriter( - commitTime, logPath, table, cfg, HoodieTestDataGenerator.avroSchema); - fail("should fail since log storage writer is not supported yet."); - } catch (Exception e) { - Assert.assertTrue(e instanceof UnsupportedOperationException); - Assert.assertTrue(e.getMessage().contains("format not supported yet.")); - } - } -} +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io.storage; + +import static org.junit.Assert.fail; + +import java.io.IOException; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.TestHoodieClientBase; +import org.apache.hudi.common.HoodieTestDataGenerator; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; +import org.junit.Assert; +import org.junit.Test; + +/** + * Tests for {@link HoodieStorageWriterFactory}. + */ +public class TestHoodieStorageWriterFactory extends TestHoodieClientBase { + + @Test + public void testGetStorageWriter() throws IOException { + // parquet file format. + final String commitTime = "100"; + final Path parquetPath = new Path(basePath + "/partition/path/f1_1-0-1_000.parquet"); + final HoodieWriteConfig cfg = getConfig(); + HoodieTable table = HoodieTable.getHoodieTable(metaClient, cfg, jsc); + HoodieStorageWriter parquetWriter = HoodieStorageWriterFactory.getStorageWriter(commitTime, + parquetPath, table, cfg, HoodieTestDataGenerator.avroSchema); + Assert.assertTrue(parquetWriter instanceof HoodieParquetWriter); + + // other file format exception. + final Path logPath = new Path(basePath + "/partition/path/f.b51192a8-574b-4a85-b246-bcfec03ac8bf_100.log.2_1-0-1"); + try { + HoodieStorageWriter logWriter = HoodieStorageWriterFactory.getStorageWriter(commitTime, logPath, + table, cfg, HoodieTestDataGenerator.avroSchema); + fail("should fail since log storage writer is not supported yet."); + } catch (Exception e) { + Assert.assertTrue(e instanceof UnsupportedOperationException); + Assert.assertTrue(e.getMessage().contains("format not supported yet.")); + } + } +} diff --git a/hudi-client/src/test/java/org/apache/hudi/io/strategy/TestHoodieCompactionStrategy.java b/hudi-client/src/test/java/org/apache/hudi/io/strategy/TestHoodieCompactionStrategy.java index 9a2f0c5ef..93513a8ec 100644 --- a/hudi-client/src/test/java/org/apache/hudi/io/strategy/TestHoodieCompactionStrategy.java +++ b/hudi-client/src/test/java/org/apache/hudi/io/strategy/TestHoodieCompactionStrategy.java @@ -60,8 +60,8 @@ public class TestHoodieCompactionStrategy { sizesMap.put(100 * MB, Lists.newArrayList(MB)); sizesMap.put(90 * MB, Lists.newArrayList(1024 * MB)); UnBoundedCompactionStrategy strategy = new UnBoundedCompactionStrategy(); - HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp").withCompactionConfig( - HoodieCompactionConfig.newBuilder().withCompactionStrategy(strategy).build()).build(); + HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp") + .withCompactionConfig(HoodieCompactionConfig.newBuilder().withCompactionStrategy(strategy).build()).build(); List operations = createCompactionOperations(writeConfig, sizesMap); List returned = strategy.orderAndFilter(writeConfig, operations, new ArrayList<>()); assertEquals("UnBounded should not re-order or filter", operations, returned); @@ -123,26 +123,21 @@ public class TestHoodieCompactionStrategy { sizesMap.put(90 * MB, Lists.newArrayList(1024 * MB)); Map keyToPartitionMap = new ImmutableMap.Builder().put(120 * MB, partitionPaths[2]) - .put(110 * MB, partitionPaths[2]) - .put(100 * MB, partitionPaths[1]) - .put(90 * MB, partitionPaths[0]) - .build(); + .put(110 * MB, partitionPaths[2]).put(100 * MB, partitionPaths[1]).put(90 * MB, partitionPaths[0]).build(); DayBasedCompactionStrategy strategy = new DayBasedCompactionStrategy(); - HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp").withCompactionConfig( - HoodieCompactionConfig.newBuilder().withCompactionStrategy(strategy) - .withTargetPartitionsPerDayBasedCompaction(1) - .build()).build(); + HoodieWriteConfig writeConfig = + HoodieWriteConfig.newBuilder().withPath("/tmp").withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withCompactionStrategy(strategy).withTargetPartitionsPerDayBasedCompaction(1).build()).build(); List operations = createCompactionOperations(writeConfig, sizesMap, keyToPartitionMap); List returned = strategy.orderAndFilter(writeConfig, operations, new ArrayList<>()); assertTrue("DayBasedCompactionStrategy should have resulted in fewer compactions", returned.size() < operations.size()); - Assert.assertEquals("DayBasedCompactionStrategy should have resulted in fewer compactions", - returned.size(), 2); + Assert.assertEquals("DayBasedCompactionStrategy should have resulted in fewer compactions", returned.size(), 2); - int comparision = strategy.getComparator().compare(returned.get(returned.size() - 1).getPartitionPath(), returned - .get(0).getPartitionPath()); + int comparision = strategy.getComparator().compare(returned.get(returned.size() - 1).getPartitionPath(), + returned.get(0).getPartitionPath()); // Either the partition paths are sorted in descending order or they are equal assertTrue("DayBasedCompactionStrategy should sort partitions in descending order", comparision >= 0); } @@ -167,20 +162,14 @@ public class TestHoodieCompactionStrategy { String currentDayPlus1 = format.format(BoundedPartitionAwareCompactionStrategy.getDateAtOffsetFromToday(1)); String currentDayPlus5 = format.format(BoundedPartitionAwareCompactionStrategy.getDateAtOffsetFromToday(5)); - Map keyToPartitionMap = new ImmutableMap.Builder() - .put(120 * MB, currentDay) - .put(110 * MB, currentDayMinus1) - .put(100 * MB, currentDayMinus2) - .put(80 * MB, currentDayMinus3) - .put(90 * MB, currentDayPlus1) - .put(70 * MB, currentDayPlus5) - .build(); + Map keyToPartitionMap = new ImmutableMap.Builder().put(120 * MB, currentDay) + .put(110 * MB, currentDayMinus1).put(100 * MB, currentDayMinus2).put(80 * MB, currentDayMinus3) + .put(90 * MB, currentDayPlus1).put(70 * MB, currentDayPlus5).build(); BoundedPartitionAwareCompactionStrategy strategy = new BoundedPartitionAwareCompactionStrategy(); - HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp").withCompactionConfig( - HoodieCompactionConfig.newBuilder().withCompactionStrategy(strategy) - .withTargetPartitionsPerDayBasedCompaction(2) - .build()).build(); + HoodieWriteConfig writeConfig = + HoodieWriteConfig.newBuilder().withPath("/tmp").withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withCompactionStrategy(strategy).withTargetPartitionsPerDayBasedCompaction(2).build()).build(); List operations = createCompactionOperations(writeConfig, sizesMap, keyToPartitionMap); List returned = strategy.orderAndFilter(writeConfig, operations, new ArrayList<>()); @@ -189,8 +178,8 @@ public class TestHoodieCompactionStrategy { Assert.assertEquals("BoundedPartitionAwareCompactionStrategy should have resulted in fewer compactions", returned.size(), 5); - int comparision = strategy.getComparator().compare(returned.get(returned.size() - 1).getPartitionPath(), returned - .get(0).getPartitionPath()); + int comparision = strategy.getComparator().compare(returned.get(returned.size() - 1).getPartitionPath(), + returned.get(0).getPartitionPath()); // Either the partition paths are sorted in descending order or they are equal assertTrue("BoundedPartitionAwareCompactionStrategy should sort partitions in descending order", comparision >= 0); } @@ -215,34 +204,29 @@ public class TestHoodieCompactionStrategy { String currentDayPlus1 = format.format(BoundedPartitionAwareCompactionStrategy.getDateAtOffsetFromToday(1)); String currentDayPlus5 = format.format(BoundedPartitionAwareCompactionStrategy.getDateAtOffsetFromToday(5)); - Map keyToPartitionMap = new ImmutableMap.Builder() - .put(120 * MB, currentDay) - .put(110 * MB, currentDayMinus1) - .put(100 * MB, currentDayMinus2) - .put(80 * MB, currentDayMinus3) - .put(90 * MB, currentDayPlus1) - .put(70 * MB, currentDayPlus5) - .build(); + Map keyToPartitionMap = new ImmutableMap.Builder().put(120 * MB, currentDay) + .put(110 * MB, currentDayMinus1).put(100 * MB, currentDayMinus2).put(80 * MB, currentDayMinus3) + .put(90 * MB, currentDayPlus1).put(70 * MB, currentDayPlus5).build(); UnBoundedPartitionAwareCompactionStrategy strategy = new UnBoundedPartitionAwareCompactionStrategy(); - HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp").withCompactionConfig( - HoodieCompactionConfig.newBuilder().withCompactionStrategy(strategy) - .withTargetPartitionsPerDayBasedCompaction(2) - .build()).build(); + HoodieWriteConfig writeConfig = + HoodieWriteConfig.newBuilder().withPath("/tmp").withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withCompactionStrategy(strategy).withTargetPartitionsPerDayBasedCompaction(2).build()).build(); List operations = createCompactionOperations(writeConfig, sizesMap, keyToPartitionMap); List returned = strategy.orderAndFilter(writeConfig, operations, new ArrayList<>()); - assertTrue("UnBoundedPartitionAwareCompactionStrategy should not include last " + writeConfig - .getTargetPartitionsPerDayBasedCompaction() + " partitions or later partitions from today", + assertTrue( + "UnBoundedPartitionAwareCompactionStrategy should not include last " + + writeConfig.getTargetPartitionsPerDayBasedCompaction() + " partitions or later partitions from today", returned.size() < operations.size()); - Assert.assertEquals("BoundedPartitionAwareCompactionStrategy should have resulted in 1 compaction", - returned.size(), 1); + Assert.assertEquals("BoundedPartitionAwareCompactionStrategy should have resulted in 1 compaction", returned.size(), + 1); } private List createCompactionOperations(HoodieWriteConfig config, Map> sizesMap) { - Map keyToPartitionMap = sizesMap.entrySet().stream().map(e -> - Pair.of(e.getKey(), partitionPaths[new Random().nextInt(partitionPaths.length - 1)])) + Map keyToPartitionMap = sizesMap.entrySet().stream() + .map(e -> Pair.of(e.getKey(), partitionPaths[new Random().nextInt(partitionPaths.length - 1)])) .collect(Collectors.toMap(Pair::getKey, Pair::getValue)); return createCompactionOperations(config, sizesMap, keyToPartitionMap); } @@ -256,9 +240,7 @@ public class TestHoodieCompactionStrategy { String partitionPath = keyToPartitionMap.get(k); List logFiles = v.stream().map(TestHoodieLogFile::newLogFile).collect(Collectors.toList()); operations.add(new HoodieCompactionOperation(df.getCommitTime(), - logFiles.stream().map(s -> s.getPath().toString()).collect(Collectors.toList()), - df.getPath(), - df.getFileId(), + logFiles.stream().map(s -> s.getPath().toString()).collect(Collectors.toList()), df.getPath(), df.getFileId(), partitionPath, config.getCompactionStrategy().captureMetrics(config, Option.of(df), partitionPath, logFiles))); }); diff --git a/hudi-client/src/test/java/org/apache/hudi/table/TestCopyOnWriteTable.java b/hudi-client/src/test/java/org/apache/hudi/table/TestCopyOnWriteTable.java index 12ef63323..52c6fb419 100644 --- a/hudi-client/src/test/java/org/apache/hudi/table/TestCopyOnWriteTable.java +++ b/hudi-client/src/test/java/org/apache/hudi/table/TestCopyOnWriteTable.java @@ -105,9 +105,8 @@ public class TestCopyOnWriteTable extends HoodieClientTestHarness { return Pair.of(io.makeNewPath(record.getPartitionPath()), writeToken); }).collect().get(0); - Assert.assertEquals(newPathWithWriteToken.getKey().toString(), - this.basePath + "/" + partitionPath + "/" - + FSUtils.makeDataFileName(commitTime, newPathWithWriteToken.getRight(), fileName)); + Assert.assertEquals(newPathWithWriteToken.getKey().toString(), this.basePath + "/" + partitionPath + "/" + + FSUtils.makeDataFileName(commitTime, newPathWithWriteToken.getRight(), fileName)); } private HoodieWriteConfig makeHoodieClientConfig() throws Exception { @@ -173,9 +172,8 @@ public class TestCopyOnWriteTable extends HoodieClientTestHarness { assertTrue(filter.mightContain(record.getRecordKey())); } // Create a commit file - new File( - this.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + FSUtils.getCommitTime(parquetFile.getName()) - + ".commit").createNewFile(); + new File(this.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + + FSUtils.getCommitTime(parquetFile.getName()) + ".commit").createNewFile(); // Read the parquet file, check the record content List fileRecords = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), parquetFilePath); @@ -197,8 +195,8 @@ public class TestCopyOnWriteTable extends HoodieClientTestHarness { updatedRecord1.seal(); TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4); - HoodieRecord insertedRecord1 = new HoodieRecord( - new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); + HoodieRecord insertedRecord1 = + new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); List updatedRecords = Arrays.asList(updatedRecord1, insertedRecord1); @@ -206,11 +204,10 @@ public class TestCopyOnWriteTable extends HoodieClientTestHarness { String newCommitTime = HoodieTestUtils.makeNewCommitTime(); metaClient = HoodieTableMetaClient.reload(metaClient); final HoodieCopyOnWriteTable newTable = new HoodieCopyOnWriteTable(config, jsc); - List statuses = - jsc.parallelize(Arrays.asList(1)).map(x -> { - return newTable.handleUpdate(newCommitTime, updatedRecord1.getCurrentLocation().getFileId(), - updatedRecords.iterator()); - }).flatMap(x -> HoodieClientTestUtils.collectStatuses(x).iterator()).collect(); + List statuses = jsc.parallelize(Arrays.asList(1)).map(x -> { + return newTable.handleUpdate(newCommitTime, updatedRecord1.getCurrentLocation().getFileId(), + updatedRecords.iterator()); + }).flatMap(x -> HoodieClientTestUtils.collectStatuses(x).iterator()).collect(); // Check the updated file File updatedParquetFile = null; @@ -218,7 +215,7 @@ public class TestCopyOnWriteTable extends HoodieClientTestHarness { if (file.getName().endsWith(".parquet")) { if (FSUtils.getFileId(file.getName()).equals(FSUtils.getFileId(parquetFile.getName())) && HoodieTimeline.compareTimestamps(FSUtils.getCommitTime(file.getName()), - FSUtils.getCommitTime(parquetFile.getName()), HoodieTimeline.GREATER)) { + FSUtils.getCommitTime(parquetFile.getName()), HoodieTimeline.GREATER)) { updatedParquetFile = file; break; } @@ -227,8 +224,8 @@ public class TestCopyOnWriteTable extends HoodieClientTestHarness { assertTrue(updatedParquetFile != null); // Check whether the record has been updated Path updatedParquetFilePath = new Path(updatedParquetFile.getAbsolutePath()); - BloomFilter updatedFilter = ParquetUtils.readBloomFilterFromParquetMetadata(jsc.hadoopConfiguration(), - updatedParquetFilePath); + BloomFilter updatedFilter = + ParquetUtils.readBloomFilterFromParquetMetadata(jsc.hadoopConfiguration(), updatedParquetFilePath); for (HoodieRecord record : records) { // No change to the _row_key assertTrue(updatedFilter.mightContain(record.getRecordKey())); @@ -250,15 +247,15 @@ public class TestCopyOnWriteTable extends HoodieClientTestHarness { // Also check the numRecordsWritten WriteStatus writeStatus = statuses.get(0); assertTrue("Should be only one file generated", statuses.size() == 1); - assertEquals(4, writeStatus.getStat().getNumWrites());//3 rewritten records + 1 new record + assertEquals(4, writeStatus.getStat().getNumWrites());// 3 rewritten records + 1 new record } private List newHoodieRecords(int n, String time) throws Exception { List records = new ArrayList<>(); for (int i = 0; i < n; i++) { - String recordStr = String.format("{\"_row_key\":\"%s\",\"time\":\"%s\",\"number\":%d}", - UUID.randomUUID().toString(), time, i); + String recordStr = + String.format("{\"_row_key\":\"%s\",\"time\":\"%s\",\"number\":%d}", UUID.randomUUID().toString(), time, i); TestRawTripPayload rowChange = new TestRawTripPayload(recordStr); records.add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange)); } @@ -269,8 +266,8 @@ public class TestCopyOnWriteTable extends HoodieClientTestHarness { @Test public void testMetadataAggregateFromWriteStatus() throws Exception { // Prepare the AvroParquetIO - HoodieWriteConfig config = makeHoodieClientConfigBuilder().withWriteStatusClass(MetadataMergeWriteStatus.class) - .build(); + HoodieWriteConfig config = + makeHoodieClientConfigBuilder().withWriteStatusClass(MetadataMergeWriteStatus.class).build(); String firstCommitTime = HoodieTestUtils.makeNewCommitTime(); metaClient = HoodieTableMetaClient.reload(metaClient); @@ -297,8 +294,8 @@ public class TestCopyOnWriteTable extends HoodieClientTestHarness { return table.handleInsert(firstCommitTime, FSUtils.createNewFileIdPfx(), records.iterator()); }).flatMap(x -> HoodieClientTestUtils.collectStatuses(x).iterator()).collect(); - Map allWriteStatusMergedMetadataMap = MetadataMergeWriteStatus - .mergeMetadataForWriteStatuses(writeStatuses); + Map allWriteStatusMergedMetadataMap = + MetadataMergeWriteStatus.mergeMetadataForWriteStatuses(writeStatuses); assertTrue(allWriteStatusMergedMetadataMap.containsKey("InputRecordCount_1506582000")); // For metadata key InputRecordCount_1506582000, value is 2 for each record. So sum of this // should be 2 * 3 @@ -359,9 +356,8 @@ public class TestCopyOnWriteTable extends HoodieClientTestHarness { @Test public void testFileSizeUpsertRecords() throws Exception { - HoodieWriteConfig config = makeHoodieClientConfigBuilder().withStorageConfig( - HoodieStorageConfig.newBuilder().limitFileSize(64 * 1024).parquetBlockSize(64 * 1024).parquetPageSize(64 * 1024) - .build()).build(); + HoodieWriteConfig config = makeHoodieClientConfigBuilder().withStorageConfig(HoodieStorageConfig.newBuilder() + .limitFileSize(64 * 1024).parquetBlockSize(64 * 1024).parquetPageSize(64 * 1024).build()).build(); String commitTime = HoodieTestUtils.makeNewCommitTime(); metaClient = HoodieTableMetaClient.reload(metaClient); HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc); @@ -369,9 +365,8 @@ public class TestCopyOnWriteTable extends HoodieClientTestHarness { List records = new ArrayList<>(); // Approx 1150 records are written for block size of 64KB for (int i = 0; i < 2000; i++) { - String recordStr = - "{\"_row_key\":\"" + UUID.randomUUID().toString() + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":" + i - + "}"; + String recordStr = "{\"_row_key\":\"" + UUID.randomUUID().toString() + + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":" + i + "}"; TestRawTripPayload rowChange = new TestRawTripPayload(recordStr); records.add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange)); } @@ -393,19 +388,19 @@ public class TestCopyOnWriteTable extends HoodieClientTestHarness { } - private UpsertPartitioner getUpsertPartitioner(int smallFileSize, int numInserts, - int numUpdates, int fileSize, String testPartitionPath, boolean autoSplitInserts) throws Exception { - HoodieWriteConfig config = makeHoodieClientConfigBuilder().withCompactionConfig( - HoodieCompactionConfig.newBuilder().compactionSmallFileSize(smallFileSize).insertSplitSize(100) - .autoTuneInsertSplits(autoSplitInserts).build()).withStorageConfig( - HoodieStorageConfig.newBuilder().limitFileSize(1000 * 1024).build()).build(); + private UpsertPartitioner getUpsertPartitioner(int smallFileSize, int numInserts, int numUpdates, int fileSize, + String testPartitionPath, boolean autoSplitInserts) throws Exception { + HoodieWriteConfig config = makeHoodieClientConfigBuilder() + .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(smallFileSize) + .insertSplitSize(100).autoTuneInsertSplits(autoSplitInserts).build()) + .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1000 * 1024).build()).build(); HoodieClientTestUtils.fakeCommitFile(basePath, "001"); HoodieClientTestUtils.fakeDataFile(basePath, testPartitionPath, "001", "file1", fileSize); metaClient = HoodieTableMetaClient.reload(metaClient); HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc); - HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[]{testPartitionPath}); + HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[] {testPartitionPath}); List insertRecords = dataGenerator.generateInserts("001", numInserts); List updateRecords = dataGenerator.generateUpdates("001", numUpdates); for (HoodieRecord updateRec : updateRecords) { @@ -429,8 +424,7 @@ public class TestCopyOnWriteTable extends HoodieClientTestHarness { public void testUpsertPartitioner() throws Exception { final String testPartitionPath = "2016/09/26"; // Inserts + Updates... Check all updates go together & inserts subsplit - UpsertPartitioner partitioner = getUpsertPartitioner(0, 200, 100, 1024, - testPartitionPath, false); + UpsertPartitioner partitioner = getUpsertPartitioner(0, 200, 100, 1024, testPartitionPath, false); List insertBuckets = partitioner.getInsertBuckets(testPartitionPath); assertEquals("Total of 2 insert buckets", 2, insertBuckets.size()); } @@ -441,8 +435,7 @@ public class TestCopyOnWriteTable extends HoodieClientTestHarness { final String testPartitionPath = "2016/09/26"; // Inserts + Updates .. Check updates go together & inserts subsplit, after expanding // smallest file - UpsertPartitioner partitioner = getUpsertPartitioner(1000 * 1024, 400, 100, 800 * 1024, testPartitionPath, - false); + UpsertPartitioner partitioner = getUpsertPartitioner(1000 * 1024, 400, 100, 800 * 1024, testPartitionPath, false); List insertBuckets = partitioner.getInsertBuckets(testPartitionPath); assertEquals("Should have 3 partitions", 3, partitioner.numPartitions()); @@ -476,8 +469,8 @@ public class TestCopyOnWriteTable extends HoodieClientTestHarness { @Test public void testInsertUpsertWithHoodieAvroPayload() throws Exception { - HoodieWriteConfig config = makeHoodieClientConfigBuilder().withStorageConfig( - HoodieStorageConfig.newBuilder().limitFileSize(1000 * 1024).build()).build(); + HoodieWriteConfig config = makeHoodieClientConfigBuilder() + .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1000 * 1024).build()).build(); metaClient = HoodieTableMetaClient.reload(metaClient); final HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc); String commitTime = "000"; diff --git a/hudi-client/src/test/java/org/apache/hudi/table/TestHoodieRecordSizing.java b/hudi-client/src/test/java/org/apache/hudi/table/TestHoodieRecordSizing.java index 67370c1cd..a2424588d 100644 --- a/hudi-client/src/test/java/org/apache/hudi/table/TestHoodieRecordSizing.java +++ b/hudi-client/src/test/java/org/apache/hudi/table/TestHoodieRecordSizing.java @@ -79,20 +79,15 @@ public class TestHoodieRecordSizing { private static LinkedList> generateCommitMetadataList() throws IOException { LinkedList> commits = new LinkedList<>(); // First commit with non zero records and bytes - commits.push(Option.of(generateCommitMetadataWith(2000, 10000).toJsonString() - .getBytes(StandardCharsets.UTF_8))); + commits.push(Option.of(generateCommitMetadataWith(2000, 10000).toJsonString().getBytes(StandardCharsets.UTF_8))); // Second commit with non zero records and bytes - commits.push(Option.of(generateCommitMetadataWith(1500, 7500).toJsonString() - .getBytes(StandardCharsets.UTF_8))); + commits.push(Option.of(generateCommitMetadataWith(1500, 7500).toJsonString().getBytes(StandardCharsets.UTF_8))); // Third commit with both zero records and zero bytes - commits.push(Option.of(generateCommitMetadataWith(0, 0).toJsonString() - .getBytes(StandardCharsets.UTF_8))); + commits.push(Option.of(generateCommitMetadataWith(0, 0).toJsonString().getBytes(StandardCharsets.UTF_8))); // Fourth commit with zero records - commits.push(Option.of(generateCommitMetadataWith(0, 1500).toJsonString() - .getBytes(StandardCharsets.UTF_8))); + commits.push(Option.of(generateCommitMetadataWith(0, 1500).toJsonString().getBytes(StandardCharsets.UTF_8))); // Fifth commit with zero bytes - commits.push(Option.of(generateCommitMetadataWith(2500, 0).toJsonString() - .getBytes(StandardCharsets.UTF_8))); + commits.push(Option.of(generateCommitMetadataWith(2500, 0).toJsonString().getBytes(StandardCharsets.UTF_8))); return commits; } diff --git a/hudi-client/src/test/java/org/apache/hudi/table/TestMergeOnReadTable.java b/hudi-client/src/test/java/org/apache/hudi/table/TestMergeOnReadTable.java index 316e1713d..152ad0f8d 100644 --- a/hudi-client/src/test/java/org/apache/hudi/table/TestMergeOnReadTable.java +++ b/hudi-client/src/test/java/org/apache/hudi/table/TestMergeOnReadTable.java @@ -128,8 +128,8 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { assertFalse(commit.isPresent()); FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath()); - ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient, - metaClient.getCommitTimeline().filterCompletedInstants(), allFiles); + ReadOptimizedView roView = + new HoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles); Stream dataFilesToRead = roView.getLatestDataFiles(); assertTrue(!dataFilesToRead.findAny().isPresent()); @@ -198,8 +198,8 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { List statuses = client.upsert(writeRecords, newCommitTime).collect(); assertNoWriteErrors(statuses); - Map allWriteStatusMergedMetadataMap = MetadataMergeWriteStatus - .mergeMetadataForWriteStatuses(statuses); + Map allWriteStatusMergedMetadataMap = + MetadataMergeWriteStatus.mergeMetadataForWriteStatuses(statuses); assertTrue(allWriteStatusMergedMetadataMap.containsKey("InputRecordCount_1506582000")); // For metadata key InputRecordCount_1506582000, value is 2 for each record. So sum of this // should be 2 * records.size() @@ -236,8 +236,8 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { assertFalse(commit.isPresent()); FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath()); - ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient, - metaClient.getCommitTimeline().filterCompletedInstants(), allFiles); + ReadOptimizedView roView = + new HoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles); Stream dataFilesToRead = roView.getLatestDataFiles(); assertTrue(!dataFilesToRead.findAny().isPresent()); @@ -284,7 +284,7 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { List dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()).collect(Collectors.toList()); List recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles, basePath); - //Wrote 20 records and deleted 20 records, so remaining 20-20 = 0 + // Wrote 20 records and deleted 20 records, so remaining 20-20 = 0 assertEquals("Must contain 0 records", 0, recordsRead.size()); } } @@ -292,7 +292,7 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { @Test public void testCOWToMORConvertedDatasetRollback() throws Exception { - //Set TableType to COW + // Set TableType to COW HoodieTestUtils.init(jsc.hadoopConfiguration(), basePath, HoodieTableType.COPY_ON_WRITE); HoodieWriteConfig cfg = getConfig(true); @@ -308,7 +308,7 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { JavaRDD writeRecords = jsc.parallelize(records, 1); List statuses = client.upsert(writeRecords, newCommitTime).collect(); - //verify there are no errors + // verify there are no errors assertNoWriteErrors(statuses); HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath()); @@ -328,17 +328,17 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { // Verify there are no errors assertNoWriteErrors(statuses); - //Set TableType to MOR + // Set TableType to MOR HoodieTestUtils.init(jsc.hadoopConfiguration(), basePath, HoodieTableType.MERGE_ON_READ); - //rollback a COW commit when TableType is MOR + // rollback a COW commit when TableType is MOR client.rollback(newCommitTime); metaClient = HoodieTableMetaClient.reload(metaClient); HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg, jsc); FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath()); - HoodieTableFileSystemView roView = new HoodieTableFileSystemView(metaClient, - hoodieTable.getCompletedCommitsTimeline(), allFiles); + HoodieTableFileSystemView roView = + new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles); final String absentCommit = newCommitTime; assertFalse(roView.getLatestDataFiles().filter(file -> { @@ -383,8 +383,8 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { assertFalse(commit.isPresent()); FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath()); - ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient, - metaClient.getCommitTimeline().filterCompletedInstants(), allFiles); + ReadOptimizedView roView = + new HoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles); Stream dataFilesToRead = roView.getLatestDataFiles(); assertTrue(!dataFilesToRead.findAny().isPresent()); @@ -417,8 +417,8 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { secondClient.rollback(commitTime1); allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath()); // After rollback, there should be no parquet file with the failed commit time - Assert.assertEquals(Arrays.asList(allFiles).stream().filter(file -> file.getPath().getName() - .contains(commitTime1)).collect(Collectors.toList()).size(), 0); + Assert.assertEquals(Arrays.asList(allFiles).stream() + .filter(file -> file.getPath().getName().contains(commitTime1)).collect(Collectors.toList()).size(), 0); dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()).collect(Collectors.toList()); recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles, basePath); assertEquals(recordsRead.size(), 200); @@ -450,8 +450,8 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { thirdClient.rollback(commitTime2); allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath()); // After rollback, there should be no parquet file with the failed commit time - Assert.assertEquals(Arrays.asList(allFiles).stream().filter(file -> file.getPath().getName() - .contains(commitTime2)).collect(Collectors.toList()).size(), 0); + Assert.assertEquals(Arrays.asList(allFiles).stream() + .filter(file -> file.getPath().getName().contains(commitTime2)).collect(Collectors.toList()).size(), 0); metaClient = HoodieTableMetaClient.reload(metaClient); hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg, jsc); @@ -488,9 +488,8 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { roView = new HoodieTableFileSystemView(metaClient, metaClient.getCommitsTimeline(), allFiles); List dataFiles2 = roView.getLatestDataFiles().collect(Collectors.toList()); - final String compactedCommitTime = metaClient.getActiveTimeline().reload().getCommitsTimeline().lastInstant() - .get() - .getTimestamp(); + final String compactedCommitTime = + metaClient.getActiveTimeline().reload().getCommitsTimeline().lastInstant().get().getTimestamp(); assertTrue(roView.getLatestDataFiles().filter(file -> { if (compactedCommitTime.equals(file.getCommitTime())) { @@ -550,8 +549,8 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { assertFalse(commit.isPresent()); FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath()); - ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient, - metaClient.getCommitTimeline().filterCompletedInstants(), allFiles); + ReadOptimizedView roView = + new HoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles); Stream dataFilesToRead = roView.getLatestDataFiles(); assertTrue(!dataFilesToRead.findAny().isPresent()); @@ -637,9 +636,8 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { metaClient = HoodieTableMetaClient.reload(metaClient); roView = new HoodieTableFileSystemView(metaClient, metaClient.getCommitsTimeline(), allFiles); - final String compactedCommitTime = metaClient.getActiveTimeline().reload().getCommitsTimeline().lastInstant() - .get() - .getTimestamp(); + final String compactedCommitTime = + metaClient.getActiveTimeline().reload().getCommitsTimeline().lastInstant().get().getTimestamp(); assertTrue(roView.getLatestDataFiles().filter(file -> { if (compactedCommitTime.equals(file.getCommitTime())) { @@ -670,33 +668,32 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { metaClient = HoodieTableMetaClient.reload(metaClient); allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath()); - roView = new HoodieTableFileSystemView(metaClient, - metaClient.getCommitTimeline().filterCompletedInstants(), allFiles); + roView = + new HoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles); dataFilesToRead = roView.getLatestDataFiles(); assertTrue(!dataFilesToRead.findAny().isPresent()); - RealtimeView rtView = new HoodieTableFileSystemView(metaClient, - metaClient.getCommitTimeline().filterCompletedInstants(), allFiles); - List fileGroups = ((HoodieTableFileSystemView) rtView).getAllFileGroups().collect(Collectors - .toList()); + RealtimeView rtView = + new HoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles); + List fileGroups = + ((HoodieTableFileSystemView) rtView).getAllFileGroups().collect(Collectors.toList()); assertTrue(fileGroups.isEmpty()); // make sure there are no log files remaining - assertTrue(((HoodieTableFileSystemView) rtView).getAllFileGroups().filter(fileGroup -> fileGroup - .getAllRawFileSlices().filter(f -> f.getLogFiles().count() == 0).count() == 0).count() == 0L); + assertTrue(((HoodieTableFileSystemView) rtView).getAllFileGroups() + .filter(fileGroup -> fileGroup.getAllRawFileSlices().filter(f -> f.getLogFiles().count() == 0).count() == 0) + .count() == 0L); } } protected HoodieWriteConfig getHoodieWriteConfigWithSmallFileHandlingOff() { - return HoodieWriteConfig.newBuilder().withPath(basePath) - .withSchema(TRIP_EXAMPLE_SCHEMA) - .withParallelism(2, 2) - .withAutoCommit(false).withAssumeDatePartitioning(true).withCompactionConfig(HoodieCompactionConfig.newBuilder() - .compactionSmallFileSize(1 * 1024).withInlineCompaction(false) - .withMaxNumDeltaCommitsBeforeCompaction(1).build()) + return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) + .withAutoCommit(false).withAssumeDatePartitioning(true) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1 * 1024) + .withInlineCompaction(false).withMaxNumDeltaCommitsBeforeCompaction(1).build()) .withEmbeddedTimelineServerEnabled(true) - .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1 * 1024).build()) - .forTable("test-trip-table").build(); + .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1 * 1024).build()).forTable("test-trip-table") + .build(); } @Test @@ -730,8 +727,8 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient, metaClient.getCommitsTimeline().filterCompletedInstants(), allFiles); Stream dataFilesToRead = roView.getLatestDataFiles(); - Map parquetFileIdToSize = dataFilesToRead.collect( - Collectors.toMap(HoodieDataFile::getFileId, HoodieDataFile::getFileSize)); + Map parquetFileIdToSize = + dataFilesToRead.collect(Collectors.toMap(HoodieDataFile::getFileId, HoodieDataFile::getFileSize)); roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles); dataFilesToRead = roView.getLatestDataFiles(); @@ -740,8 +737,7 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { dataFilesList.size() > 0); /** - * Write 2 (only updates + inserts, written to .log file + correction of existing parquet - * file size) + * Write 2 (only updates + inserts, written to .log file + correction of existing parquet file size) */ newCommitTime = "002"; client.startCommitWithTime(newCommitTime); @@ -766,15 +762,15 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { hoodieTable.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants(), allFiles); dataFilesToRead = roView.getLatestDataFiles(); List newDataFilesList = dataFilesToRead.collect(Collectors.toList()); - Map parquetFileIdToNewSize = newDataFilesList.stream().collect( - Collectors.toMap(HoodieDataFile::getFileId, HoodieDataFile::getFileSize)); + Map parquetFileIdToNewSize = + newDataFilesList.stream().collect(Collectors.toMap(HoodieDataFile::getFileId, HoodieDataFile::getFileSize)); assertTrue(parquetFileIdToNewSize.entrySet().stream() .filter(entry -> parquetFileIdToSize.get(entry.getKey()) < entry.getValue()).count() > 0); List dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()).collect(Collectors.toList()); List recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles, basePath); - //Wrote 20 records in 2 batches + // Wrote 20 records in 2 batches assertEquals("Must contain 40 records", 40, recordsRead.size()); } } @@ -805,9 +801,8 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { updatedRecords = readClient.tagLocation(updatedRecordsRDD).collect(); // Write them to corresponding avro logfiles - HoodieTestUtils - .writeRecordsToLogFiles(metaClient.getFs(), metaClient.getBasePath(), - HoodieTestDataGenerator.avroSchemaWithMetadataFields, updatedRecords); + HoodieTestUtils.writeRecordsToLogFiles(metaClient.getFs(), metaClient.getBasePath(), + HoodieTestDataGenerator.avroSchemaWithMetadataFields, updatedRecords); // Verify that all data file has one log file metaClient = HoodieTableMetaClient.reload(metaClient); @@ -816,8 +811,8 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { ((SyncableFileSystemView) (table.getRTFileSystemView())).reset(); for (String partitionPath : dataGen.getPartitionPaths()) { - List groupedLogFiles = table.getRTFileSystemView().getLatestFileSlices(partitionPath) - .collect(Collectors.toList()); + List groupedLogFiles = + table.getRTFileSystemView().getLatestFileSlices(partitionPath).collect(Collectors.toList()); for (FileSlice fileSlice : groupedLogFiles) { assertEquals("There should be 1 log file written for every data file", 1, fileSlice.getLogFiles().count()); } @@ -836,12 +831,12 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { table = HoodieTable.getHoodieTable(metaClient, config, jsc); HoodieActiveTimeline timeline = metaClient.getActiveTimeline(); - assertTrue("Compaction commit should be > than last insert", HoodieTimeline.compareTimestamps( - timeline.lastInstant().get().getTimestamp(), newCommitTime, HoodieTimeline.GREATER)); + assertTrue("Compaction commit should be > than last insert", HoodieTimeline + .compareTimestamps(timeline.lastInstant().get().getTimestamp(), newCommitTime, HoodieTimeline.GREATER)); for (String partitionPath : dataGen.getPartitionPaths()) { - List groupedLogFiles = table.getRTFileSystemView().getLatestFileSlices(partitionPath) - .collect(Collectors.toList()); + List groupedLogFiles = + table.getRTFileSystemView().getLatestFileSlices(partitionPath).collect(Collectors.toList()); for (FileSlice slice : groupedLogFiles) { assertTrue("After compaction there should be no log files visiable on a Realtime view", slice.getLogFiles().collect(Collectors.toList()).isEmpty()); @@ -869,19 +864,18 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { JavaRDD statuses = writeClient.insert(recordsRDD, newCommitTime); writeClient.commit(newCommitTime, statuses); - HoodieTable table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath), config, - jsc); + HoodieTable table = + HoodieTable.getHoodieTable(new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath), config, jsc); RealtimeView tableRTFileSystemView = table.getRTFileSystemView(); long numLogFiles = 0; for (String partitionPath : dataGen.getPartitionPaths()) { - Assert.assertTrue(tableRTFileSystemView.getLatestFileSlices(partitionPath).filter(fileSlice -> - fileSlice.getDataFile().isPresent()).count() == 0); - Assert.assertTrue(tableRTFileSystemView.getLatestFileSlices(partitionPath).filter(fileSlice -> - fileSlice.getLogFiles().count() > 0).count() > 0); - numLogFiles += tableRTFileSystemView.getLatestFileSlices(partitionPath).filter(fileSlice -> - fileSlice.getLogFiles().count() > 0).count(); + Assert.assertTrue(tableRTFileSystemView.getLatestFileSlices(partitionPath) + .filter(fileSlice -> fileSlice.getDataFile().isPresent()).count() == 0); + Assert.assertTrue(tableRTFileSystemView.getLatestFileSlices(partitionPath) + .filter(fileSlice -> fileSlice.getLogFiles().count() > 0).count() > 0); + numLogFiles += tableRTFileSystemView.getLatestFileSlices(partitionPath) + .filter(fileSlice -> fileSlice.getLogFiles().count() > 0).count(); } Assert.assertTrue(numLogFiles > 0); @@ -910,10 +904,10 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { List writeStatuses = statuses.collect(); // Ensure that inserts are written to only log files - Assert.assertEquals(writeStatuses.stream().filter(writeStatus -> !writeStatus.getStat().getPath().contains("log") - ).count(), 0); - Assert.assertTrue(writeStatuses.stream().filter(writeStatus -> writeStatus.getStat().getPath().contains("log") - ).count() > 0); + Assert.assertEquals( + writeStatuses.stream().filter(writeStatus -> !writeStatus.getStat().getPath().contains("log")).count(), 0); + Assert.assertTrue( + writeStatuses.stream().filter(writeStatus -> writeStatus.getStat().getPath().contains("log")).count() > 0); // rollback a failed commit boolean rollback = writeClient.rollback(newCommitTime); @@ -934,9 +928,8 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { // and calling rollback twice final String lastCommitTime = newCommitTime; HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); - HoodieInstant last = - metaClient.getCommitsTimeline().getInstants().filter(instant -> instant.getTimestamp().equals(lastCommitTime)) - .findFirst().get(); + HoodieInstant last = metaClient.getCommitsTimeline().getInstants() + .filter(instant -> instant.getTimestamp().equals(lastCommitTime)).findFirst().get(); String fileName = last.getFileName(); // Save the .commit file to local directory. // Rollback will be called twice to test the case where rollback failed first time and retried. @@ -944,8 +937,8 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { TemporaryFolder folder = new TemporaryFolder(); folder.create(); File file = folder.newFile(); - metaClient.getFs() - .copyToLocalFile(new Path(metaClient.getMetaPath(), fileName), new Path(file.getAbsolutePath())); + metaClient.getFs().copyToLocalFile(new Path(metaClient.getMetaPath(), fileName), + new Path(file.getAbsolutePath())); writeClient.rollback(newCommitTime); metaClient = HoodieTableMetaClient.reload(metaClient); @@ -954,12 +947,12 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { long numLogFiles = 0; for (String partitionPath : dataGen.getPartitionPaths()) { - Assert.assertTrue(tableRTFileSystemView.getLatestFileSlices(partitionPath).filter(fileSlice -> - fileSlice.getDataFile().isPresent()).count() == 0); - Assert.assertTrue(tableRTFileSystemView.getLatestFileSlices(partitionPath).filter(fileSlice -> - fileSlice.getLogFiles().count() > 0).count() == 0); - numLogFiles += tableRTFileSystemView.getLatestFileSlices(partitionPath).filter(fileSlice -> - fileSlice.getLogFiles().count() > 0).count(); + Assert.assertTrue(tableRTFileSystemView.getLatestFileSlices(partitionPath) + .filter(fileSlice -> fileSlice.getDataFile().isPresent()).count() == 0); + Assert.assertTrue(tableRTFileSystemView.getLatestFileSlices(partitionPath) + .filter(fileSlice -> fileSlice.getLogFiles().count() > 0).count() == 0); + numLogFiles += tableRTFileSystemView.getLatestFileSlices(partitionPath) + .filter(fileSlice -> fileSlice.getLogFiles().count() > 0).count(); } Assert.assertTrue(numLogFiles == 0); metaClient.getFs().copyFromLocalFile(new Path(file.getAbsolutePath()), @@ -987,19 +980,18 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { // trigger an action statuses.collect(); - HoodieTable table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath), config, - jsc); + HoodieTable table = + HoodieTable.getHoodieTable(new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath), config, jsc); RealtimeView tableRTFileSystemView = table.getRTFileSystemView(); long numLogFiles = 0; for (String partitionPath : dataGen.getPartitionPaths()) { - Assert.assertTrue(tableRTFileSystemView.getLatestFileSlices(partitionPath).filter(fileSlice -> - fileSlice.getDataFile().isPresent()).count() == 0); - Assert.assertTrue(tableRTFileSystemView.getLatestFileSlices(partitionPath).filter(fileSlice -> - fileSlice.getLogFiles().count() > 0).count() > 0); - numLogFiles += tableRTFileSystemView.getLatestFileSlices(partitionPath).filter(fileSlice -> - fileSlice.getLogFiles().count() > 0).count(); + Assert.assertTrue(tableRTFileSystemView.getLatestFileSlices(partitionPath) + .filter(fileSlice -> fileSlice.getDataFile().isPresent()).count() == 0); + Assert.assertTrue(tableRTFileSystemView.getLatestFileSlices(partitionPath) + .filter(fileSlice -> fileSlice.getLogFiles().count() > 0).count() > 0); + numLogFiles += tableRTFileSystemView.getLatestFileSlices(partitionPath) + .filter(fileSlice -> fileSlice.getLogFiles().count() > 0).count(); } Assert.assertTrue(numLogFiles > 0); @@ -1016,10 +1008,10 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { tableRTFileSystemView = table.getRTFileSystemView(); ((SyncableFileSystemView) tableRTFileSystemView).reset(); for (String partitionPath : dataGen.getPartitionPaths()) { - Assert.assertTrue(tableRTFileSystemView.getLatestFileSlices(partitionPath).filter(fileSlice -> - fileSlice.getDataFile().isPresent()).count() == 0); - Assert.assertTrue(tableRTFileSystemView.getLatestFileSlices(partitionPath).filter(fileSlice -> - fileSlice.getLogFiles().count() > 0).count() > 0); + Assert.assertTrue(tableRTFileSystemView.getLatestFileSlices(partitionPath) + .filter(fileSlice -> fileSlice.getDataFile().isPresent()).count() == 0); + Assert.assertTrue(tableRTFileSystemView.getLatestFileSlices(partitionPath) + .filter(fileSlice -> fileSlice.getLogFiles().count() > 0).count() > 0); } } } @@ -1053,13 +1045,16 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { // Read from commit file table = HoodieTable.getHoodieTable(metaClient, cfg, jsc); - HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes(table.getActiveTimeline().getInstantDetails(table - .getActiveTimeline().getDeltaCommitTimeline().lastInstant().get()).get(), HoodieCommitMetadata.class); - HoodieRollingStatMetadata rollingStatMetadata = HoodieCommitMetadata.fromBytes(metadata.getExtraMetadata() - .get(HoodieRollingStatMetadata.ROLLING_STAT_METADATA_KEY).getBytes(), HoodieRollingStatMetadata.class); + HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes( + table.getActiveTimeline() + .getInstantDetails(table.getActiveTimeline().getDeltaCommitTimeline().lastInstant().get()).get(), + HoodieCommitMetadata.class); + HoodieRollingStatMetadata rollingStatMetadata = HoodieCommitMetadata.fromBytes( + metadata.getExtraMetadata().get(HoodieRollingStatMetadata.ROLLING_STAT_METADATA_KEY).getBytes(), + HoodieRollingStatMetadata.class); int inserts = 0; - for (Map.Entry> pstat : - rollingStatMetadata.getPartitionToRollingStats().entrySet()) { + for (Map.Entry> pstat : rollingStatMetadata.getPartitionToRollingStats() + .entrySet()) { for (Map.Entry stat : pstat.getValue().entrySet()) { inserts += stat.getValue().getInserts(); } @@ -1075,10 +1070,13 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { // Read from commit file table = HoodieTable.getHoodieTable(metaClient, cfg, jsc); - metadata = HoodieCommitMetadata.fromBytes(table.getActiveTimeline().getInstantDetails(table - .getActiveTimeline().getDeltaCommitTimeline().lastInstant().get()).get(), HoodieCommitMetadata.class); - rollingStatMetadata = HoodieCommitMetadata.fromBytes(metadata.getExtraMetadata() - .get(HoodieRollingStatMetadata.ROLLING_STAT_METADATA_KEY).getBytes(), HoodieRollingStatMetadata.class); + metadata = HoodieCommitMetadata.fromBytes( + table.getActiveTimeline() + .getInstantDetails(table.getActiveTimeline().getDeltaCommitTimeline().lastInstant().get()).get(), + HoodieCommitMetadata.class); + rollingStatMetadata = HoodieCommitMetadata.fromBytes( + metadata.getExtraMetadata().get(HoodieRollingStatMetadata.ROLLING_STAT_METADATA_KEY).getBytes(), + HoodieRollingStatMetadata.class); inserts = 0; int upserts = 0; for (Map.Entry> pstat : rollingStatMetadata.getPartitionToRollingStats() @@ -1096,10 +1094,13 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { // Read from commit file table = HoodieTable.getHoodieTable(metaClient, cfg, jsc); - metadata = HoodieCommitMetadata.fromBytes(table.getActiveTimeline().getInstantDetails(table - .getActiveTimeline().getDeltaCommitTimeline().lastInstant().get()).get(), HoodieCommitMetadata.class); - rollingStatMetadata = HoodieCommitMetadata.fromBytes(metadata.getExtraMetadata() - .get(HoodieRollingStatMetadata.ROLLING_STAT_METADATA_KEY).getBytes(), HoodieRollingStatMetadata.class); + metadata = HoodieCommitMetadata.fromBytes( + table.getActiveTimeline() + .getInstantDetails(table.getActiveTimeline().getDeltaCommitTimeline().lastInstant().get()).get(), + HoodieCommitMetadata.class); + rollingStatMetadata = HoodieCommitMetadata.fromBytes( + metadata.getExtraMetadata().get(HoodieRollingStatMetadata.ROLLING_STAT_METADATA_KEY).getBytes(), + HoodieRollingStatMetadata.class); inserts = 0; upserts = 0; for (Map.Entry> pstat : rollingStatMetadata.getPartitionToRollingStats() @@ -1138,13 +1139,16 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { // Read from commit file table = HoodieTable.getHoodieTable(metaClient, cfg, jsc); - HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes(table.getActiveTimeline().getInstantDetails(table - .getActiveTimeline().getDeltaCommitTimeline().lastInstant().get()).get(), HoodieCommitMetadata.class); - HoodieRollingStatMetadata rollingStatMetadata = HoodieCommitMetadata.fromBytes(metadata.getExtraMetadata() - .get(HoodieRollingStatMetadata.ROLLING_STAT_METADATA_KEY).getBytes(), HoodieRollingStatMetadata.class); + HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes( + table.getActiveTimeline() + .getInstantDetails(table.getActiveTimeline().getDeltaCommitTimeline().lastInstant().get()).get(), + HoodieCommitMetadata.class); + HoodieRollingStatMetadata rollingStatMetadata = HoodieCommitMetadata.fromBytes( + metadata.getExtraMetadata().get(HoodieRollingStatMetadata.ROLLING_STAT_METADATA_KEY).getBytes(), + HoodieRollingStatMetadata.class); int inserts = 0; - for (Map.Entry> pstat : - rollingStatMetadata.getPartitionToRollingStats().entrySet()) { + for (Map.Entry> pstat : rollingStatMetadata.getPartitionToRollingStats() + .entrySet()) { for (Map.Entry stat : pstat.getValue().entrySet()) { inserts += stat.getValue().getInserts(); fileIdToInsertsMap.put(stat.getKey(), stat.getValue().getInserts()); @@ -1164,10 +1168,13 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { // Read from commit file table = HoodieTable.getHoodieTable(metaClient, cfg, jsc); - metadata = HoodieCommitMetadata.fromBytes(table.getActiveTimeline().getInstantDetails(table - .getActiveTimeline().getDeltaCommitTimeline().lastInstant().get()).get(), HoodieCommitMetadata.class); - rollingStatMetadata = HoodieCommitMetadata.fromBytes(metadata.getExtraMetadata() - .get(HoodieRollingStatMetadata.ROLLING_STAT_METADATA_KEY).getBytes(), HoodieRollingStatMetadata.class); + metadata = HoodieCommitMetadata.fromBytes( + table.getActiveTimeline() + .getInstantDetails(table.getActiveTimeline().getDeltaCommitTimeline().lastInstant().get()).get(), + HoodieCommitMetadata.class); + rollingStatMetadata = HoodieCommitMetadata.fromBytes( + metadata.getExtraMetadata().get(HoodieRollingStatMetadata.ROLLING_STAT_METADATA_KEY).getBytes(), + HoodieRollingStatMetadata.class); inserts = 0; int upserts = 0; for (Map.Entry> pstat : rollingStatMetadata.getPartitionToRollingStats() @@ -1192,17 +1199,20 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { // Read from commit file table = HoodieTable.getHoodieTable(metaClient, cfg, jsc); - metadata = HoodieCommitMetadata.fromBytes(table.getActiveTimeline().getInstantDetails(table - .getActiveTimeline().getCommitsTimeline().lastInstant().get()).get(), HoodieCommitMetadata.class); - HoodieRollingStatMetadata rollingStatMetadata1 = HoodieCommitMetadata.fromBytes(metadata.getExtraMetadata() - .get(HoodieRollingStatMetadata.ROLLING_STAT_METADATA_KEY).getBytes(), HoodieRollingStatMetadata.class); + metadata = HoodieCommitMetadata.fromBytes( + table.getActiveTimeline() + .getInstantDetails(table.getActiveTimeline().getCommitsTimeline().lastInstant().get()).get(), + HoodieCommitMetadata.class); + HoodieRollingStatMetadata rollingStatMetadata1 = HoodieCommitMetadata.fromBytes( + metadata.getExtraMetadata().get(HoodieRollingStatMetadata.ROLLING_STAT_METADATA_KEY).getBytes(), + HoodieRollingStatMetadata.class); // Ensure that the rolling stats from the extra metadata of delta commits is copied over to the compaction commit for (Map.Entry> entry : rollingStatMetadata.getPartitionToRollingStats() .entrySet()) { Assert.assertTrue(rollingStatMetadata1.getPartitionToRollingStats().containsKey(entry.getKey())); - Assert.assertEquals(rollingStatMetadata1.getPartitionToRollingStats().get(entry.getKey()).size(), entry - .getValue().size()); + Assert.assertEquals(rollingStatMetadata1.getPartitionToRollingStats().get(entry.getKey()).size(), + entry.getValue().size()); } // Write inserts + updates @@ -1217,10 +1227,13 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { // Read from commit file table = HoodieTable.getHoodieTable(metaClient, cfg, jsc); - metadata = HoodieCommitMetadata.fromBytes(table.getActiveTimeline().getInstantDetails(table - .getActiveTimeline().getDeltaCommitTimeline().lastInstant().get()).get(), HoodieCommitMetadata.class); - rollingStatMetadata = HoodieCommitMetadata.fromBytes(metadata.getExtraMetadata() - .get(HoodieRollingStatMetadata.ROLLING_STAT_METADATA_KEY).getBytes(), HoodieRollingStatMetadata.class); + metadata = HoodieCommitMetadata.fromBytes( + table.getActiveTimeline() + .getInstantDetails(table.getActiveTimeline().getDeltaCommitTimeline().lastInstant().get()).get(), + HoodieCommitMetadata.class); + rollingStatMetadata = HoodieCommitMetadata.fromBytes( + metadata.getExtraMetadata().get(HoodieRollingStatMetadata.ROLLING_STAT_METADATA_KEY).getBytes(), + HoodieRollingStatMetadata.class); inserts = 0; upserts = 0; for (Map.Entry> pstat : rollingStatMetadata.getPartitionToRollingStats() @@ -1249,12 +1262,10 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { protected HoodieWriteConfig.Builder getConfigBuilder(Boolean autoCommit, HoodieIndex.IndexType indexType) { return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) .withAutoCommit(autoCommit).withAssumeDatePartitioning(true) - .withCompactionConfig( - HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024 * 1024).withInlineCompaction(false) - .withMaxNumDeltaCommitsBeforeCompaction(1).build()) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024 * 1024) + .withInlineCompaction(false).withMaxNumDeltaCommitsBeforeCompaction(1).build()) .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024 * 1024).build()) - .withEmbeddedTimelineServerEnabled(true) - .forTable("test-trip-table") + .withEmbeddedTimelineServerEnabled(true).forTable("test-trip-table") .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(indexType).build()); } @@ -1264,4 +1275,4 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness { assertFalse("Errors found in write of " + status.getFileId(), status.hasErrors()); } } -} \ No newline at end of file +} diff --git a/hudi-common/pom.xml b/hudi-common/pom.xml index 103f89efb..4f3d193cb 100644 --- a/hudi-common/pom.xml +++ b/hudi-common/pom.xml @@ -25,6 +25,10 @@ hudi-common + + ${project.parent.basedir} + + diff --git a/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroWriteSupport.java b/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroWriteSupport.java index 8a0164781..2fdfb1b1a 100644 --- a/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroWriteSupport.java +++ b/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroWriteSupport.java @@ -35,8 +35,7 @@ public class HoodieAvroWriteSupport extends AvroWriteSupport { private String maxRecordKey; - public static final String HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY = - "org.apache.hudi.bloomfilter"; + public static final String HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY = "org.apache.hudi.bloomfilter"; public static final String HOODIE_MIN_RECORD_KEY_FOOTER = "hoodie_min_record_key"; public static final String HOODIE_MAX_RECORD_KEY_FOOTER = "hoodie_max_record_key"; @@ -50,8 +49,7 @@ public class HoodieAvroWriteSupport extends AvroWriteSupport { public WriteSupport.FinalizedWriteContext finalizeWrite() { HashMap extraMetaData = new HashMap<>(); if (bloomFilter != null) { - extraMetaData - .put(HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY, bloomFilter.serializeToString()); + extraMetaData.put(HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY, bloomFilter.serializeToString()); if (minRecordKey != null && maxRecordKey != null) { extraMetaData.put(HOODIE_MIN_RECORD_KEY_FOOTER, minRecordKey); extraMetaData.put(HOODIE_MAX_RECORD_KEY_FOOTER, maxRecordKey); diff --git a/hudi-common/src/main/java/org/apache/hudi/avro/MercifulJsonConverter.java b/hudi-common/src/main/java/org/apache/hudi/avro/MercifulJsonConverter.java index e2a8cc72d..ea28cb381 100644 --- a/hudi-common/src/main/java/org/apache/hudi/avro/MercifulJsonConverter.java +++ b/hudi-common/src/main/java/org/apache/hudi/avro/MercifulJsonConverter.java @@ -30,8 +30,8 @@ import org.apache.avro.generic.GenericRecord; /** * Marjority of this is copied from - * https://github.com/jwills/avro-json/blob/master/src/main/java/com/cloudera/science/avro/ - * common/JsonConverter.java Adjusted for expected behavior of our use cases + * https://github.com/jwills/avro-json/blob/master/src/main/java/com/cloudera/science/avro/ common/JsonConverter.java + * Adjusted for expected behavior of our use cases */ public class MercifulJsonConverter { @@ -51,8 +51,7 @@ public class MercifulJsonConverter { } } - private GenericRecord convert(Map raw, Schema schema) - throws IOException { + private GenericRecord convert(Map raw, Schema schema) throws IOException { GenericRecord result = new GenericData.Record(schema); for (Schema.Field f : schema.getFields()) { String name = f.name(); @@ -128,17 +127,15 @@ public class MercifulJsonConverter { } return mapRes; default: - throw new IllegalArgumentException( - "JsonConverter cannot handle type: " + schema.getType()); + throw new IllegalArgumentException("JsonConverter cannot handle type: " + schema.getType()); } throw new JsonConversionException(value, name, schema); } private boolean isOptional(Schema schema) { - return schema.getType().equals(Schema.Type.UNION) - && schema.getTypes().size() == 2 + return schema.getType().equals(Schema.Type.UNION) && schema.getTypes().size() == 2 && (schema.getTypes().get(0).getType().equals(Schema.Type.NULL) - || schema.getTypes().get(1).getType().equals(Schema.Type.NULL)); + || schema.getTypes().get(1).getType().equals(Schema.Type.NULL)); } private Schema getNonNull(Schema schema) { @@ -160,8 +157,7 @@ public class MercifulJsonConverter { @Override public String toString() { - return String.format("Type conversion error for field %s, %s for %s", - fieldName, value, schema); + return String.format("Type conversion error for field %s, %s for %s", fieldName, value, schema); } } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/HoodieCleanStat.java b/hudi-common/src/main/java/org/apache/hudi/common/HoodieCleanStat.java index 00252635e..763281a72 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/HoodieCleanStat.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/HoodieCleanStat.java @@ -41,9 +41,8 @@ public class HoodieCleanStat implements Serializable { // Earliest commit that was retained in this clean private final String earliestCommitToRetain; - public HoodieCleanStat(HoodieCleaningPolicy policy, String partitionPath, - List deletePathPatterns, List successDeleteFiles, - List failedDeleteFiles, String earliestCommitToRetain) { + public HoodieCleanStat(HoodieCleaningPolicy policy, String partitionPath, List deletePathPatterns, + List successDeleteFiles, List failedDeleteFiles, String earliestCommitToRetain) { this.policy = policy; this.partitionPath = partitionPath; this.deletePathPatterns = deletePathPatterns; @@ -115,14 +114,14 @@ public class HoodieCleanStat implements Serializable { } public Builder withEarliestCommitRetained(Option earliestCommitToRetain) { - this.earliestCommitToRetain = (earliestCommitToRetain.isPresent()) - ? earliestCommitToRetain.get().getTimestamp() : "-1"; + this.earliestCommitToRetain = + (earliestCommitToRetain.isPresent()) ? earliestCommitToRetain.get().getTimestamp() : "-1"; return this; } public HoodieCleanStat build() { - return new HoodieCleanStat(policy, partitionPath, deletePathPatterns, - successDeleteFiles, failedDeleteFiles, earliestCommitToRetain); + return new HoodieCleanStat(policy, partitionPath, deletePathPatterns, successDeleteFiles, failedDeleteFiles, + earliestCommitToRetain); } } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/HoodieJsonPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/HoodieJsonPayload.java index 067565367..238dc4ef1 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/HoodieJsonPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/HoodieJsonPayload.java @@ -50,8 +50,7 @@ public class HoodieJsonPayload implements HoodieRecordPayload } @Override - public Option combineAndGetUpdateValue(IndexedRecord oldRec, Schema schema) - throws IOException { + public Option combineAndGetUpdateValue(IndexedRecord oldRec, Schema schema) throws IOException { return getInsertValue(schema); } @@ -68,8 +67,7 @@ public class HoodieJsonPayload implements HoodieRecordPayload private byte[] compressData(String jsonData) throws IOException { ByteArrayOutputStream baos = new ByteArrayOutputStream(); Deflater deflater = new Deflater(Deflater.BEST_COMPRESSION); - DeflaterOutputStream dos = - new DeflaterOutputStream(baos, deflater, true); + DeflaterOutputStream dos = new DeflaterOutputStream(baos, deflater, true); try { dos.write(jsonData.getBytes()); } finally { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/HoodieRollbackStat.java b/hudi-common/src/main/java/org/apache/hudi/common/HoodieRollbackStat.java index ae6e1cc22..e41475b51 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/HoodieRollbackStat.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/HoodieRollbackStat.java @@ -37,8 +37,8 @@ public class HoodieRollbackStat implements Serializable { // Count of HoodieLogFile to commandBlocks written for a particular rollback private final Map commandBlocksCount; - public HoodieRollbackStat(String partitionPath, List successDeleteFiles, - List failedDeleteFiles, Map commandBlocksCount) { + public HoodieRollbackStat(String partitionPath, List successDeleteFiles, List failedDeleteFiles, + Map commandBlocksCount) { this.partitionPath = partitionPath; this.successDeleteFiles = successDeleteFiles; this.failedDeleteFiles = failedDeleteFiles; @@ -73,7 +73,7 @@ public class HoodieRollbackStat implements Serializable { private String partitionPath; public Builder withDeletedFileResults(Map deletedFiles) { - //noinspection Convert2MethodRef + // noinspection Convert2MethodRef successDeleteFiles = deletedFiles.entrySet().stream().filter(s -> s.getValue()) .map(s -> s.getKey().getPath().toString()).collect(Collectors.toList()); failedDeleteFiles = deletedFiles.entrySet().stream().filter(s -> !s.getValue()) @@ -92,8 +92,7 @@ public class HoodieRollbackStat implements Serializable { } public HoodieRollbackStat build() { - return new HoodieRollbackStat(partitionPath, successDeleteFiles, failedDeleteFiles, - commandBlocksCount); + return new HoodieRollbackStat(partitionPath, successDeleteFiles, failedDeleteFiles, commandBlocksCount); } } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/SerializableConfiguration.java b/hudi-common/src/main/java/org/apache/hudi/common/SerializableConfiguration.java index 8f6f0bab9..acd17193f 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/SerializableConfiguration.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/SerializableConfiguration.java @@ -57,8 +57,7 @@ public class SerializableConfiguration implements Serializable { @Override public String toString() { StringBuilder str = new StringBuilder(); - configuration.iterator().forEachRemaining(e -> - str.append(String.format("%s => %s \n", e.getKey(), e.getValue()))); + configuration.iterator().forEachRemaining(e -> str.append(String.format("%s => %s \n", e.getKey(), e.getValue()))); return configuration.toString(); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/io/storage/HoodieWrapperFileSystem.java b/hudi-common/src/main/java/org/apache/hudi/common/io/storage/HoodieWrapperFileSystem.java index d9ff91485..8b7a5f92b 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/io/storage/HoodieWrapperFileSystem.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/io/storage/HoodieWrapperFileSystem.java @@ -59,21 +59,19 @@ import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; /** - * HoodieWrapperFileSystem wraps the default file system. It holds state about the open streams in - * the file system to support getting the written size to each of the open streams. + * HoodieWrapperFileSystem wraps the default file system. It holds state about the open streams in the file system to + * support getting the written size to each of the open streams. */ public class HoodieWrapperFileSystem extends FileSystem { public static final String HOODIE_SCHEME_PREFIX = "hoodie-"; - private ConcurrentMap openStreams = new - ConcurrentHashMap<>(); + private ConcurrentMap openStreams = new ConcurrentHashMap<>(); private FileSystem fileSystem; private URI uri; private ConsistencyGuard consistencyGuard = new NoOpConsistencyGuard(); - public HoodieWrapperFileSystem() { - } + public HoodieWrapperFileSystem() {} public HoodieWrapperFileSystem(FileSystem fileSystem, ConsistencyGuard consistencyGuard) { this.fileSystem = fileSystem; @@ -94,8 +92,8 @@ public class HoodieWrapperFileSystem extends FileSystem { URI oldURI = oldPath.toUri(); URI newURI; try { - newURI = new URI(newScheme, oldURI.getUserInfo(), oldURI.getHost(), oldURI.getPort(), - oldURI.getPath(), oldURI.getQuery(), oldURI.getFragment()); + newURI = new URI(newScheme, oldURI.getUserInfo(), oldURI.getHost(), oldURI.getPort(), oldURI.getPath(), + oldURI.getQuery(), oldURI.getFragment()); return new Path(newURI); } catch (URISyntaxException e) { // TODO - Better Exception handling @@ -108,8 +106,7 @@ public class HoodieWrapperFileSystem extends FileSystem { if (StorageSchemes.isSchemeSupported(scheme)) { newScheme = HOODIE_SCHEME_PREFIX + scheme; } else { - throw new IllegalArgumentException( - "BlockAlignedAvroParquetWriter does not support scheme " + scheme); + throw new IllegalArgumentException("BlockAlignedAvroParquetWriter does not support scheme " + scheme); } return newScheme; } @@ -143,22 +140,21 @@ public class HoodieWrapperFileSystem extends FileSystem { } @Override - public FSDataOutputStream create(Path f, FsPermission permission, boolean overwrite, - int bufferSize, short replication, long blockSize, Progressable progress) throws IOException { + public FSDataOutputStream create(Path f, FsPermission permission, boolean overwrite, int bufferSize, + short replication, long blockSize, Progressable progress) throws IOException { final Path translatedPath = convertToDefaultPath(f); - return wrapOutputStream(f, fileSystem - .create(translatedPath, permission, overwrite, bufferSize, replication, blockSize, - progress)); + return wrapOutputStream(f, + fileSystem.create(translatedPath, permission, overwrite, bufferSize, replication, blockSize, progress)); } - private FSDataOutputStream wrapOutputStream(final Path path, - FSDataOutputStream fsDataOutputStream) throws IOException { + private FSDataOutputStream wrapOutputStream(final Path path, FSDataOutputStream fsDataOutputStream) + throws IOException { if (fsDataOutputStream instanceof SizeAwareFSDataOutputStream) { return fsDataOutputStream; } - SizeAwareFSDataOutputStream os = new SizeAwareFSDataOutputStream(path, - fsDataOutputStream, consistencyGuard, () -> openStreams.remove(path.getName())); + SizeAwareFSDataOutputStream os = new SizeAwareFSDataOutputStream(path, fsDataOutputStream, consistencyGuard, + () -> openStreams.remove(path.getName())); openStreams.put(path.getName(), os); return os; } @@ -184,8 +180,7 @@ public class HoodieWrapperFileSystem extends FileSystem { } @Override - public FSDataOutputStream create(Path f, short replication, Progressable progress) - throws IOException { + public FSDataOutputStream create(Path f, short replication, Progressable progress) throws IOException { return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f), replication, progress)); } @@ -201,39 +196,35 @@ public class HoodieWrapperFileSystem extends FileSystem { } @Override - public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, short replication, - long blockSize, Progressable progress) throws IOException { - return wrapOutputStream(f, fileSystem - .create(convertToDefaultPath(f), overwrite, bufferSize, replication, blockSize, progress)); + public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, short replication, long blockSize, + Progressable progress) throws IOException { + return wrapOutputStream(f, + fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize, replication, blockSize, progress)); } @Override - public FSDataOutputStream create(Path f, FsPermission permission, EnumSet flags, - int bufferSize, short replication, long blockSize, Progressable progress) throws IOException { - return wrapOutputStream(f, fileSystem - .create(convertToDefaultPath(f), permission, flags, bufferSize, replication, blockSize, - progress)); + public FSDataOutputStream create(Path f, FsPermission permission, EnumSet flags, int bufferSize, + short replication, long blockSize, Progressable progress) throws IOException { + return wrapOutputStream(f, + fileSystem.create(convertToDefaultPath(f), permission, flags, bufferSize, replication, blockSize, progress)); } @Override - public FSDataOutputStream create(Path f, FsPermission permission, EnumSet flags, - int bufferSize, short replication, long blockSize, Progressable progress, - Options.ChecksumOpt checksumOpt) throws IOException { - return wrapOutputStream(f, fileSystem - .create(convertToDefaultPath(f), permission, flags, bufferSize, replication, blockSize, - progress, checksumOpt)); + public FSDataOutputStream create(Path f, FsPermission permission, EnumSet flags, int bufferSize, + short replication, long blockSize, Progressable progress, Options.ChecksumOpt checksumOpt) throws IOException { + return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f), permission, flags, bufferSize, replication, + blockSize, progress, checksumOpt)); } @Override - public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, short replication, - long blockSize) throws IOException { - return wrapOutputStream(f, fileSystem - .create(convertToDefaultPath(f), overwrite, bufferSize, replication, blockSize)); - } - - @Override - public FSDataOutputStream append(Path f, int bufferSize, Progressable progress) + public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, short replication, long blockSize) throws IOException { + return wrapOutputStream(f, + fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize, replication, blockSize)); + } + + @Override + public FSDataOutputStream append(Path f, int bufferSize, Progressable progress) throws IOException { return wrapOutputStream(f, fileSystem.append(convertToDefaultPath(f), bufferSize, progress)); } @@ -341,8 +332,7 @@ public class HoodieWrapperFileSystem extends FileSystem { } @Override - public Token[] addDelegationTokens(String renewer, Credentials credentials) - throws IOException { + public Token[] addDelegationTokens(String renewer, Credentials credentials) throws IOException { return fileSystem.addDelegationTokens(renewer, credentials); } @@ -352,8 +342,7 @@ public class HoodieWrapperFileSystem extends FileSystem { } @Override - public BlockLocation[] getFileBlockLocations(FileStatus file, long start, long len) - throws IOException { + public BlockLocation[] getFileBlockLocations(FileStatus file, long start, long len) throws IOException { return fileSystem.getFileBlockLocations(file, start, len); } @@ -383,28 +372,27 @@ public class HoodieWrapperFileSystem extends FileSystem { } @Override - public FSDataOutputStream createNonRecursive(Path f, boolean overwrite, int bufferSize, + public FSDataOutputStream createNonRecursive(Path f, boolean overwrite, int bufferSize, short replication, + long blockSize, Progressable progress) throws IOException { + Path p = convertToDefaultPath(f); + return wrapOutputStream(p, + fileSystem.createNonRecursive(p, overwrite, bufferSize, replication, blockSize, progress)); + } + + @Override + public FSDataOutputStream createNonRecursive(Path f, FsPermission permission, boolean overwrite, int bufferSize, short replication, long blockSize, Progressable progress) throws IOException { Path p = convertToDefaultPath(f); - return wrapOutputStream(p, fileSystem.createNonRecursive(p, overwrite, bufferSize, replication, blockSize, - progress)); + return wrapOutputStream(p, + fileSystem.createNonRecursive(p, permission, overwrite, bufferSize, replication, blockSize, progress)); } @Override - public FSDataOutputStream createNonRecursive(Path f, FsPermission permission, boolean overwrite, + public FSDataOutputStream createNonRecursive(Path f, FsPermission permission, EnumSet flags, int bufferSize, short replication, long blockSize, Progressable progress) throws IOException { Path p = convertToDefaultPath(f); - return wrapOutputStream(p, fileSystem.createNonRecursive(p, permission, overwrite, bufferSize, replication, - blockSize, progress)); - } - - @Override - public FSDataOutputStream createNonRecursive(Path f, FsPermission permission, - EnumSet flags, int bufferSize, short replication, long blockSize, - Progressable progress) throws IOException { - Path p = convertToDefaultPath(f); - return wrapOutputStream(p, fileSystem.createNonRecursive(p, permission, flags, bufferSize, replication, - blockSize, progress)); + return wrapOutputStream(p, + fileSystem.createNonRecursive(p, permission, flags, bufferSize, replication, blockSize, progress)); } @Override @@ -590,10 +578,8 @@ public class HoodieWrapperFileSystem extends FileSystem { } @Override - public void copyFromLocalFile(boolean delSrc, boolean overwrite, Path[] srcs, Path dst) - throws IOException { - fileSystem - .copyFromLocalFile(delSrc, overwrite, convertLocalPaths(srcs), convertToDefaultPath(dst)); + public void copyFromLocalFile(boolean delSrc, boolean overwrite, Path[] srcs, Path dst) throws IOException { + fileSystem.copyFromLocalFile(delSrc, overwrite, convertLocalPaths(srcs), convertToDefaultPath(dst)); try { consistencyGuard.waitTillFileAppears(convertToDefaultPath(dst)); } catch (TimeoutException e) { @@ -602,10 +588,8 @@ public class HoodieWrapperFileSystem extends FileSystem { } @Override - public void copyFromLocalFile(boolean delSrc, boolean overwrite, Path src, Path dst) - throws IOException { - fileSystem - .copyFromLocalFile(delSrc, overwrite, convertToLocalPath(src), convertToDefaultPath(dst)); + public void copyFromLocalFile(boolean delSrc, boolean overwrite, Path src, Path dst) throws IOException { + fileSystem.copyFromLocalFile(delSrc, overwrite, convertToLocalPath(src), convertToDefaultPath(dst)); try { consistencyGuard.waitTillFileAppears(convertToDefaultPath(dst)); } catch (TimeoutException e) { @@ -629,22 +613,19 @@ public class HoodieWrapperFileSystem extends FileSystem { } @Override - public void copyToLocalFile(boolean delSrc, Path src, Path dst, boolean useRawLocalFileSystem) - throws IOException { - fileSystem.copyToLocalFile(delSrc, convertToDefaultPath(src), convertToLocalPath(dst), - useRawLocalFileSystem); + public void copyToLocalFile(boolean delSrc, Path src, Path dst, boolean useRawLocalFileSystem) throws IOException { + fileSystem.copyToLocalFile(delSrc, convertToDefaultPath(src), convertToLocalPath(dst), useRawLocalFileSystem); } @Override public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile) throws IOException { - return convertToHoodiePath(fileSystem - .startLocalOutput(convertToDefaultPath(fsOutputFile), convertToDefaultPath(tmpLocalFile))); + return convertToHoodiePath( + fileSystem.startLocalOutput(convertToDefaultPath(fsOutputFile), convertToDefaultPath(tmpLocalFile))); } @Override public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile) throws IOException { - fileSystem.completeLocalOutput(convertToDefaultPath(fsOutputFile), - convertToDefaultPath(tmpLocalFile)); + fileSystem.completeLocalOutput(convertToDefaultPath(fsOutputFile), convertToDefaultPath(tmpLocalFile)); } @Override @@ -691,8 +672,7 @@ public class HoodieWrapperFileSystem extends FileSystem { @Override public void createSymlink(Path target, Path link, boolean createParent) throws IOException { - fileSystem - .createSymlink(convertToDefaultPath(target), convertToDefaultPath(link), createParent); + fileSystem.createSymlink(convertToDefaultPath(target), convertToDefaultPath(link), createParent); } @Override @@ -761,8 +741,7 @@ public class HoodieWrapperFileSystem extends FileSystem { } @Override - public void renameSnapshot(Path path, String snapshotOldName, String snapshotNewName) - throws IOException { + public void renameSnapshot(Path path, String snapshotOldName, String snapshotNewName) throws IOException { fileSystem.renameSnapshot(convertToDefaultPath(path), snapshotOldName, snapshotNewName); } @@ -807,8 +786,7 @@ public class HoodieWrapperFileSystem extends FileSystem { } @Override - public void setXAttr(Path path, String name, byte[] value, EnumSet flag) - throws IOException { + public void setXAttr(Path path, String name, byte[] value, EnumSet flag) throws IOException { fileSystem.setXAttr(convertToDefaultPath(path), name, value, flag); } @@ -899,8 +877,8 @@ public class HoodieWrapperFileSystem extends FileSystem { return openStreams.get(file.getName()).getBytesWritten(); } // When the file is first written, we do not have a track of it - throw new IllegalArgumentException(file.toString() - + " does not have a open stream. Cannot get the bytes written on the stream"); + throw new IllegalArgumentException( + file.toString() + " does not have a open stream. Cannot get the bytes written on the stream"); } public FileSystem getFileSystem() { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/io/storage/SizeAwareFSDataOutputStream.java b/hudi-common/src/main/java/org/apache/hudi/common/io/storage/SizeAwareFSDataOutputStream.java index 9a8f3edcc..5f1c0ab93 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/io/storage/SizeAwareFSDataOutputStream.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/io/storage/SizeAwareFSDataOutputStream.java @@ -27,8 +27,8 @@ import org.apache.hudi.common.util.ConsistencyGuard; import org.apache.hudi.exception.HoodieException; /** - * Wrapper over FSDataOutputStream to keep track of the size of the written bytes. This - * gives a cheap way to check on the underlying file size. + * Wrapper over FSDataOutputStream to keep track of the size of the written bytes. This gives a cheap way + * to check on the underlying file size. */ public class SizeAwareFSDataOutputStream extends FSDataOutputStream { @@ -41,8 +41,8 @@ public class SizeAwareFSDataOutputStream extends FSDataOutputStream { // Consistency guard private final ConsistencyGuard consistencyGuard; - public SizeAwareFSDataOutputStream(Path path, FSDataOutputStream out, - ConsistencyGuard consistencyGuard, Runnable closeCallback) throws IOException { + public SizeAwareFSDataOutputStream(Path path, FSDataOutputStream out, ConsistencyGuard consistencyGuard, + Runnable closeCallback) throws IOException { super(out); this.path = path; this.closeCallback = closeCallback; diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/CompactionOperation.java b/hudi-common/src/main/java/org/apache/hudi/common/model/CompactionOperation.java index ff6c210ad..4b1a457df 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/CompactionOperation.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/CompactionOperation.java @@ -31,8 +31,8 @@ import org.apache.hudi.common.util.FSUtils; import org.apache.hudi.common.util.Option; /** - * Encapsulates all the needed information about a compaction and make a decision whether this - * compaction is effective or not + * Encapsulates all the needed information about a compaction and make a decision whether this compaction is effective + * or not * */ public class CompactionOperation implements Serializable { @@ -44,10 +44,9 @@ public class CompactionOperation implements Serializable { private HoodieFileGroupId id; private Map metrics; - //Only for serialization/de-serialization + // Only for serialization/de-serialization @Deprecated - public CompactionOperation() { - } + public CompactionOperation() {} public CompactionOperation(String fileId, String partitionPath, String baseInstantTime, Option dataFileCommitTime, List deltaFilePaths, Option dataFilePath, @@ -60,8 +59,8 @@ public class CompactionOperation implements Serializable { this.metrics = metrics; } - public CompactionOperation(Option dataFile, String partitionPath, - List logFiles, Map metrics) { + public CompactionOperation(Option dataFile, String partitionPath, List logFiles, + Map metrics) { if (dataFile.isPresent()) { this.baseInstantTime = dataFile.get().getCommitTime(); this.dataFilePath = Option.of(dataFile.get().getPath()); @@ -75,8 +74,7 @@ public class CompactionOperation implements Serializable { this.dataFileCommitTime = Option.empty(); } - this.deltaFilePaths = logFiles.stream().map(s -> s.getPath().toString()) - .collect(Collectors.toList()); + this.deltaFilePaths = logFiles.stream().map(s -> s.getPath().toString()).collect(Collectors.toList()); this.metrics = metrics; } @@ -113,12 +111,13 @@ public class CompactionOperation implements Serializable { } public Option getBaseFile() { - //TODO: HUDI-130 - Paths return in compaction plan needs to be relative to base-path + // TODO: HUDI-130 - Paths return in compaction plan needs to be relative to base-path return dataFilePath.map(df -> new HoodieDataFile(df)); } /** * Convert Avro generated Compaction operation to POJO for Spark RDD operation + * * @param operation Hoodie Compaction Operation * @return */ @@ -126,8 +125,7 @@ public class CompactionOperation implements Serializable { CompactionOperation op = new CompactionOperation(); op.baseInstantTime = operation.getBaseInstantTime(); op.dataFilePath = Option.ofNullable(operation.getDataFilePath()); - op.dataFileCommitTime = - op.dataFilePath.map(p -> FSUtils.getCommitTime(new Path(p).getName())); + op.dataFileCommitTime = op.dataFilePath.map(p -> FSUtils.getCommitTime(new Path(p).getName())); op.deltaFilePaths = new ArrayList<>(operation.getDeltaFilePaths()); op.id = new HoodieFileGroupId(operation.getPartitionPath(), operation.getFileId()); op.metrics = operation.getMetrics() == null ? new HashMap<>() : new HashMap<>(operation.getMetrics()); @@ -136,14 +134,9 @@ public class CompactionOperation implements Serializable { @Override public String toString() { - return "CompactionOperation{" - + "baseInstantTime='" + baseInstantTime + '\'' - + ", dataFileCommitTime=" + dataFileCommitTime - + ", deltaFilePaths=" + deltaFilePaths - + ", dataFilePath=" + dataFilePath - + ", id='" + id + '\'' - + ", metrics=" + metrics - + '}'; + return "CompactionOperation{" + "baseInstantTime='" + baseInstantTime + '\'' + ", dataFileCommitTime=" + + dataFileCommitTime + ", deltaFilePaths=" + deltaFilePaths + ", dataFilePath=" + dataFilePath + ", id='" + id + + '\'' + ", metrics=" + metrics + '}'; } @Override @@ -158,8 +151,7 @@ public class CompactionOperation implements Serializable { return Objects.equals(baseInstantTime, operation.baseInstantTime) && Objects.equals(dataFileCommitTime, operation.dataFileCommitTime) && Objects.equals(deltaFilePaths, operation.deltaFilePaths) - && Objects.equals(dataFilePath, operation.dataFilePath) - && Objects.equals(id, operation.id); + && Objects.equals(dataFilePath, operation.dataFilePath) && Objects.equals(id, operation.id); } @Override diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/FileSlice.java b/hudi-common/src/main/java/org/apache/hudi/common/model/FileSlice.java index 567f9b673..bb48b5a60 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/FileSlice.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/FileSlice.java @@ -25,8 +25,8 @@ import java.util.stream.Stream; import org.apache.hudi.common.util.Option; /** - * Within a file group, a slice is a combination of data file written at a commit time and list of - * log files, containing changes to the data file from that commit time + * Within a file group, a slice is a combination of data file written at a commit time and list of log files, containing + * changes to the data file from that commit time */ public class FileSlice implements Serializable { @@ -46,8 +46,8 @@ public class FileSlice implements Serializable { private HoodieDataFile dataFile; /** - * List of appendable log files with real time data - Sorted with greater log version first - - * Always empty for copy_on_write storage. + * List of appendable log files with real time data - Sorted with greater log version first - Always empty for + * copy_on_write storage. */ private final TreeSet logFiles; @@ -100,6 +100,7 @@ public class FileSlice implements Serializable { /** * Returns true if there is no data file and no log files. Happens as part of pending compaction + * * @return */ public boolean isEmpty() { @@ -126,10 +127,8 @@ public class FileSlice implements Serializable { return false; } FileSlice slice = (FileSlice) o; - return Objects.equals(fileGroupId, slice.fileGroupId) - && Objects.equals(baseInstantTime, slice.baseInstantTime) - && Objects.equals(dataFile, slice.dataFile) - && Objects.equals(logFiles, slice.logFiles); + return Objects.equals(fileGroupId, slice.fileGroupId) && Objects.equals(baseInstantTime, slice.baseInstantTime) + && Objects.equals(dataFile, slice.dataFile) && Objects.equals(logFiles, slice.logFiles); } @Override diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroPayload.java index 7aefc51c6..4f9224342 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroPayload.java @@ -27,14 +27,14 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.exception.HoodieIOException; /** - * This is a payload to wrap a existing Hoodie Avro Record. Useful to create a HoodieRecord over - * existing GenericRecords in a hoodie datasets (useful in compactions) + * This is a payload to wrap a existing Hoodie Avro Record. Useful to create a HoodieRecord over existing GenericRecords + * in a hoodie datasets (useful in compactions) */ public class HoodieAvroPayload implements HoodieRecordPayload { // Store the GenericRecord converted to bytes - 1) Doesn't store schema hence memory efficient 2) Makes the payload // java serializable - private final byte [] recordBytes; + private final byte[] recordBytes; public HoodieAvroPayload(Option record) { try { @@ -54,8 +54,7 @@ public class HoodieAvroPayload implements HoodieRecordPayload } @Override - public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) - throws IOException { + public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) throws IOException { return getInsertValue(schema); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieCleaningPolicy.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieCleaningPolicy.java index 36d9c3b2a..55be2e393 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieCleaningPolicy.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieCleaningPolicy.java @@ -19,6 +19,5 @@ package org.apache.hudi.common.model; public enum HoodieCleaningPolicy { - KEEP_LATEST_FILE_VERSIONS, - KEEP_LATEST_COMMITS + KEEP_LATEST_FILE_VERSIONS, KEEP_LATEST_COMMITS } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieCommitMetadata.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieCommitMetadata.java index b48d39527..5eb8ce46f 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieCommitMetadata.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieCommitMetadata.java @@ -338,10 +338,7 @@ public class HoodieCommitMetadata implements Serializable { @Override public String toString() { - return "HoodieCommitMetadata{" - + "partitionToWriteStats=" + partitionToWriteStats - + ", compacted=" + compacted - + ", extraMetadataMap=" + extraMetadataMap - + '}'; + return "HoodieCommitMetadata{" + "partitionToWriteStats=" + partitionToWriteStats + ", compacted=" + compacted + + ", extraMetadataMap=" + extraMetadataMap + '}'; } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieDataFile.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieDataFile.java index 8a34abbb0..668fadde1 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieDataFile.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieDataFile.java @@ -93,9 +93,6 @@ public class HoodieDataFile implements Serializable { @Override public String toString() { - return "HoodieDataFile{" - + "fullPath=" + fullPath - + ", fileLen=" + fileLen - + '}'; + return "HoodieDataFile{" + "fullPath=" + fullPath + ", fileLen=" + fileLen + '}'; } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieFileGroup.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieFileGroup.java index 9763b9e4a..af1a9662f 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieFileGroup.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieFileGroup.java @@ -69,8 +69,8 @@ public class HoodieFileGroup implements Serializable { } /** - * Potentially add a new file-slice by adding base-instant time - * A file-slice without any data-file and log-files can exist (if a compaction just got requested) + * Potentially add a new file-slice by adding base-instant time A file-slice without any data-file and log-files can + * exist (if a compaction just got requested) */ public void addNewFileSliceAtInstant(String baseInstantTime) { if (!fileSlices.containsKey(baseInstantTime)) { @@ -107,15 +107,13 @@ public class HoodieFileGroup implements Serializable { } /** - * A FileSlice is considered committed, if one of the following is true - There is a committed - * data file - There are some log files, that are based off a commit or delta commit + * A FileSlice is considered committed, if one of the following is true - There is a committed data file - There are + * some log files, that are based off a commit or delta commit */ private boolean isFileSliceCommitted(FileSlice slice) { String maxCommitTime = lastInstant.get().getTimestamp(); return timeline.containsOrBeforeTimelineStarts(slice.getBaseInstantTime()) - && HoodieTimeline.compareTimestamps(slice.getBaseInstantTime(), - maxCommitTime, - HoodieTimeline.LESSER_OR_EQUAL); + && HoodieTimeline.compareTimestamps(slice.getBaseInstantTime(), maxCommitTime, HoodieTimeline.LESSER_OR_EQUAL); } @@ -138,9 +136,7 @@ public class HoodieFileGroup implements Serializable { */ public Stream getAllFileSlices() { if (!timeline.empty()) { - return fileSlices.entrySet().stream() - .map(Map.Entry::getValue) - .filter(this::isFileSliceCommitted); + return fileSlices.entrySet().stream().map(Map.Entry::getValue).filter(this::isFileSliceCommitted); } return Stream.empty(); } @@ -166,41 +162,32 @@ public class HoodieFileGroup implements Serializable { * Obtain the latest file slice, upto a commitTime i.e <= maxCommitTime */ public Option getLatestFileSliceBeforeOrOn(String maxCommitTime) { - return Option.fromJavaOptional(getAllFileSlices() - .filter(slice -> - HoodieTimeline.compareTimestamps(slice.getBaseInstantTime(), - maxCommitTime, - HoodieTimeline.LESSER_OR_EQUAL)) - .findFirst()); + return Option.fromJavaOptional(getAllFileSlices().filter(slice -> HoodieTimeline + .compareTimestamps(slice.getBaseInstantTime(), maxCommitTime, HoodieTimeline.LESSER_OR_EQUAL)).findFirst()); } /** * Obtain the latest file slice, upto a commitTime i.e < maxInstantTime + * * @param maxInstantTime Max Instant Time * @return */ public Option getLatestFileSliceBefore(String maxInstantTime) { - return Option.fromJavaOptional(getAllFileSlices() - .filter(slice -> - HoodieTimeline.compareTimestamps(slice.getBaseInstantTime(), - maxInstantTime, - HoodieTimeline.LESSER)) + return Option.fromJavaOptional(getAllFileSlices().filter( + slice -> HoodieTimeline.compareTimestamps(slice.getBaseInstantTime(), maxInstantTime, HoodieTimeline.LESSER)) .findFirst()); } public Option getLatestFileSliceInRange(List commitRange) { - return Option.fromJavaOptional(getAllFileSlices() - .filter(slice -> commitRange.contains(slice.getBaseInstantTime())) - .findFirst()); + return Option.fromJavaOptional( + getAllFileSlices().filter(slice -> commitRange.contains(slice.getBaseInstantTime())).findFirst()); } /** * Stream of committed data files, sorted reverse commit time */ public Stream getAllDataFiles() { - return getAllFileSlices() - .filter(slice -> slice.getDataFile().isPresent()) - .map(slice -> slice.getDataFile().get()); + return getAllFileSlices().filter(slice -> slice.getDataFile().isPresent()).map(slice -> slice.getDataFile().get()); } @Override diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieFileGroupId.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieFileGroupId.java index b396de0eb..f764119af 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieFileGroupId.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieFileGroupId.java @@ -52,8 +52,7 @@ public class HoodieFileGroupId implements Serializable { return false; } HoodieFileGroupId that = (HoodieFileGroupId) o; - return Objects.equals(partitionPath, that.partitionPath) - && Objects.equals(fileId, that.fileId); + return Objects.equals(partitionPath, that.partitionPath) && Objects.equals(fileId, that.fileId); } @Override @@ -63,9 +62,6 @@ public class HoodieFileGroupId implements Serializable { @Override public String toString() { - return "HoodieFileGroupId{" - + "partitionPath='" + partitionPath + '\'' - + ", fileId='" + fileId + '\'' - + '}'; + return "HoodieFileGroupId{" + "partitionPath='" + partitionPath + '\'' + ", fileId='" + fileId + '\'' + '}'; } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieKey.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieKey.java index 0b124a557..834589b4c 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieKey.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieKey.java @@ -24,8 +24,8 @@ import java.io.Serializable; /** * HoodieKey consists of *

    - * - recordKey : a recordKey that acts as primary key for a record - partitionPath : path to the - * partition that contains the record + * - recordKey : a recordKey that acts as primary key for a record - partitionPath : path to the partition that contains + * the record */ public class HoodieKey implements Serializable { @@ -56,8 +56,7 @@ public class HoodieKey implements Serializable { return false; } HoodieKey otherKey = (HoodieKey) o; - return Objects.equal(recordKey, otherKey.recordKey) - && Objects.equal(partitionPath, otherKey.partitionPath); + return Objects.equal(recordKey, otherKey.recordKey) && Objects.equal(partitionPath, otherKey.partitionPath); } @Override diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieLogFile.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieLogFile.java index a5445932f..1b478ddba 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieLogFile.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieLogFile.java @@ -109,9 +109,7 @@ public class HoodieLogFile implements Serializable { String baseCommitTime = getBaseCommitTime(); Path path = getPath(); String extension = "." + FSUtils.getFileExtensionFromLog(path); - int newVersion = FSUtils - .computeNextLogVersion(fs, path.getParent(), fileId, - extension, baseCommitTime); + int newVersion = FSUtils.computeNextLogVersion(fs, path.getParent(), fileId, extension, baseCommitTime); return new HoodieLogFile(new Path(path.getParent(), FSUtils.makeLogFileName(fileId, extension, baseCommitTime, newVersion, logWriteToken))); } @@ -179,9 +177,6 @@ public class HoodieLogFile implements Serializable { @Override public String toString() { - return "HoodieLogFile{" - + "pathStr='" + pathStr + '\'' - + ", fileLen=" + fileLen - + '}'; + return "HoodieLogFile{" + "pathStr='" + pathStr + '\'' + ", fileLen=" + fileLen + '}'; } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodiePartitionMetadata.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodiePartitionMetadata.java index d3a45099f..1673871a1 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodiePartitionMetadata.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodiePartitionMetadata.java @@ -64,12 +64,10 @@ public class HoodiePartitionMetadata { /** * Construct metadata object to be written out. */ - public HoodiePartitionMetadata(FileSystem fs, String commitTime, Path basePath, - Path partitionPath) { + public HoodiePartitionMetadata(FileSystem fs, String commitTime, Path basePath, Path partitionPath) { this(fs, partitionPath); props.setProperty(COMMIT_TIME_KEY, commitTime); - props - .setProperty(PARTITION_DEPTH_KEY, String.valueOf(partitionPath.depth() - basePath.depth())); + props.setProperty(PARTITION_DEPTH_KEY, String.valueOf(partitionPath.depth() - basePath.depth())); } public int getPartitionDepth() { @@ -83,8 +81,8 @@ public class HoodiePartitionMetadata { * Write the metadata safely into partition atomically. */ public void trySave(int taskPartitionId) { - Path tmpMetaPath = new Path(partitionPath, - HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE + "_" + taskPartitionId); + Path tmpMetaPath = + new Path(partitionPath, HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE + "_" + taskPartitionId); Path metaPath = new Path(partitionPath, HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE); boolean metafileExists = false; @@ -102,9 +100,8 @@ public class HoodiePartitionMetadata { fs.rename(tmpMetaPath, metaPath); } } catch (IOException ioe) { - log.warn( - "Error trying to save partition metadata (this is okay, as long as " - + "atleast 1 of these succced), " + partitionPath, ioe); + log.warn("Error trying to save partition metadata (this is okay, as long as " + "atleast 1 of these succced), " + + partitionPath, ioe); } finally { if (!metafileExists) { try { @@ -129,8 +126,7 @@ public class HoodiePartitionMetadata { is = fs.open(metaFile); props.load(is); } catch (IOException ioe) { - throw new HoodieException("Error reading Hoodie partition metadata for " + partitionPath, - ioe); + throw new HoodieException("Error reading Hoodie partition metadata for " + partitionPath, ioe); } finally { if (is != null) { is.close(); @@ -143,8 +139,7 @@ public class HoodiePartitionMetadata { try { return fs.exists(new Path(partitionPath, HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE)); } catch (IOException ioe) { - throw new HoodieException("Error checking Hoodie partition metadata for " + partitionPath, - ioe); + throw new HoodieException("Error checking Hoodie partition metadata for " + partitionPath, ioe); } } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecord.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecord.java index a5043de0f..6e02ecc21 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecord.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecord.java @@ -36,12 +36,8 @@ public class HoodieRecord implements Serializable public static String FILENAME_METADATA_FIELD = "_hoodie_file_name"; public static final List HOODIE_META_COLUMNS = - new ImmutableList.Builder().add(COMMIT_TIME_METADATA_FIELD) - .add(COMMIT_SEQNO_METADATA_FIELD) - .add(RECORD_KEY_METADATA_FIELD) - .add(PARTITION_PATH_METADATA_FIELD) - .add(FILENAME_METADATA_FIELD) - .build(); + new ImmutableList.Builder().add(COMMIT_TIME_METADATA_FIELD).add(COMMIT_SEQNO_METADATA_FIELD) + .add(RECORD_KEY_METADATA_FIELD).add(PARTITION_PATH_METADATA_FIELD).add(FILENAME_METADATA_FIELD).build(); /** * Identifies the record across the table @@ -95,8 +91,8 @@ public class HoodieRecord implements Serializable } /** - * Release the actual payload, to ease memory pressure. To be called after the record has been - * written to storage. Once deflated, cannot be inflated. + * Release the actual payload, to ease memory pressure. To be called after the record has been written to storage. + * Once deflated, cannot be inflated. */ public void deflate() { this.data = null; @@ -118,8 +114,7 @@ public class HoodieRecord implements Serializable } /** - * Sets the new currentLocation of the record, after being written. This again should happen - * exactly-once. + * Sets the new currentLocation of the record, after being written. This again should happen exactly-once. */ public HoodieRecord setNewLocation(HoodieRecordLocation location) { checkState(); @@ -145,10 +140,8 @@ public class HoodieRecord implements Serializable return false; } HoodieRecord that = (HoodieRecord) o; - return Objects.equal(key, that.key) - && Objects.equal(data, that.data) - && Objects.equal(currentLocation, that.currentLocation) - && Objects.equal(newLocation, that.newLocation); + return Objects.equal(key, that.key) && Objects.equal(data, that.data) + && Objects.equal(currentLocation, that.currentLocation) && Objects.equal(newLocation, that.newLocation); } @Override diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordLocation.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordLocation.java index ef1e630a8..b689b6e60 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordLocation.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordLocation.java @@ -22,8 +22,7 @@ import com.google.common.base.Objects; import java.io.Serializable; /** - * Location of a HoodieRecord within the partition it belongs to. Ultimately, this points to an - * actual file on disk + * Location of a HoodieRecord within the partition it belongs to. Ultimately, this points to an actual file on disk */ public class HoodieRecordLocation implements Serializable { @@ -44,8 +43,7 @@ public class HoodieRecordLocation implements Serializable { return false; } HoodieRecordLocation otherLoc = (HoodieRecordLocation) o; - return Objects.equal(instantTime, otherLoc.instantTime) - && Objects.equal(fileId, otherLoc.fileId); + return Objects.equal(instantTime, otherLoc.instantTime) && Objects.equal(fileId, otherLoc.fileId); } @Override diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordPayload.java index 0fc7703ae..4839946c6 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordPayload.java @@ -26,45 +26,41 @@ import org.apache.avro.generic.IndexedRecord; import org.apache.hudi.common.util.Option; /** - * Every Hoodie dataset has an implementation of the HoodieRecordPayload This abstracts - * out callbacks which depend on record specific logic + * Every Hoodie dataset has an implementation of the HoodieRecordPayload This abstracts out callbacks which + * depend on record specific logic */ public interface HoodieRecordPayload extends Serializable { /** - * When more than one HoodieRecord have the same HoodieKey, this function combines them before - * attempting to insert/upsert (if combining turned on in HoodieClientConfig) + * When more than one HoodieRecord have the same HoodieKey, this function combines them before attempting to + * insert/upsert (if combining turned on in HoodieClientConfig) */ T preCombine(T another); /** - * This methods lets you write custom merging/combining logic to produce new values as a function - * of current value on storage and whats contained in this object. + * This methods lets you write custom merging/combining logic to produce new values as a function of current value on + * storage and whats contained in this object. *

    - * eg: 1) You are updating counters, you may want to add counts to currentValue and write back - * updated counts 2) You may be reading DB redo logs, and merge them with current image for a - * database row on storage + * eg: 1) You are updating counters, you may want to add counts to currentValue and write back updated counts 2) You + * may be reading DB redo logs, and merge them with current image for a database row on storage * * @param currentValue Current value in storage, to merge/combine this payload with - * @param schema Schema used for record - * @return new combined/merged value to be written back to storage. EMPTY to skip writing this - * record. + * @param schema Schema used for record + * @return new combined/merged value to be written back to storage. EMPTY to skip writing this record. */ - Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) - throws IOException; + Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) throws IOException; /** - * Generates an avro record out of the given HoodieRecordPayload, to be written out to storage. - * Called when writing a new value for the given HoodieKey, wherein there is no existing record in - * storage to be combined against. (i.e insert) Return EMPTY to skip writing this record. + * Generates an avro record out of the given HoodieRecordPayload, to be written out to storage. Called when writing a + * new value for the given HoodieKey, wherein there is no existing record in storage to be combined against. (i.e + * insert) Return EMPTY to skip writing this record. */ Option getInsertValue(Schema schema) throws IOException; /** - * This method can be used to extract some metadata from HoodieRecordPayload. The metadata is - * passed to {@code WriteStatus.markSuccess()} and {@code WriteStatus.markFailure()} in order to - * compute some aggregate metrics using the metadata in the context of a write success or - * failure. + * This method can be used to extract some metadata from HoodieRecordPayload. The metadata is passed to + * {@code WriteStatus.markSuccess()} and {@code WriteStatus.markFailure()} in order to compute some aggregate metrics + * using the metadata in the context of a write success or failure. */ default Option> getMetadata() { return Option.empty(); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieTableType.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieTableType.java index 9fbb01744..17c7fd24f 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieTableType.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieTableType.java @@ -23,14 +23,13 @@ package org.apache.hudi.common.model; *

    * Currently, 1 type is supported *

    - * COPY_ON_WRITE - Performs upserts by versioning entire files, with later versions containing newer - * value of a record. + * COPY_ON_WRITE - Performs upserts by versioning entire files, with later versions containing newer value of a record. *

    * In the future, following might be added. *

    * MERGE_ON_READ - Speeds up upserts, by delaying merge until enough work piles up. *

    - * SIMPLE_LSM - A simple 2 level LSM tree. + * SIMPLE_LSM - A simple 2 level LSM tree. */ public enum HoodieTableType { COPY_ON_WRITE, MERGE_ON_READ diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieWriteStat.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieWriteStat.java index ddc33ff0b..331dca847 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieWriteStat.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieWriteStat.java @@ -48,8 +48,8 @@ public class HoodieWriteStat implements Serializable { private String prevCommit; /** - * Total number of records written for this file. - for updates, its the entire number of records - * in the file - for inserts, its the actual number of records inserted. + * Total number of records written for this file. - for updates, its the entire number of records in the file - for + * inserts, its the actual number of records inserted. */ private long numWrites; @@ -318,25 +318,13 @@ public class HoodieWriteStat implements Serializable { @Override public String toString() { - return "HoodieWriteStat{" - + "fileId='" + fileId + '\'' - + ", path='" + path + '\'' - + ", prevCommit='" + prevCommit + '\'' - + ", numWrites=" + numWrites - + ", numDeletes=" + numDeletes - + ", numUpdateWrites=" + numUpdateWrites - + ", totalWriteBytes=" + totalWriteBytes - + ", totalWriteErrors=" + totalWriteErrors - + ", tempPath='" + tempPath + '\'' - + ", partitionPath='" + partitionPath - + '\'' + ", totalLogRecords=" + totalLogRecords - + ", totalLogFilesCompacted=" + totalLogFilesCompacted - + ", totalLogSizeCompacted=" + totalLogSizeCompacted - + ", totalUpdatedRecordsCompacted=" + totalUpdatedRecordsCompacted - + ", totalLogBlocks=" + totalLogBlocks - + ", totalCorruptLogBlock=" + totalCorruptLogBlock - + ", totalRollbackBlocks=" + totalRollbackBlocks - + '}'; + return "HoodieWriteStat{" + "fileId='" + fileId + '\'' + ", path='" + path + '\'' + ", prevCommit='" + prevCommit + + '\'' + ", numWrites=" + numWrites + ", numDeletes=" + numDeletes + ", numUpdateWrites=" + numUpdateWrites + + ", totalWriteBytes=" + totalWriteBytes + ", totalWriteErrors=" + totalWriteErrors + ", tempPath='" + tempPath + + '\'' + ", partitionPath='" + partitionPath + '\'' + ", totalLogRecords=" + totalLogRecords + + ", totalLogFilesCompacted=" + totalLogFilesCompacted + ", totalLogSizeCompacted=" + totalLogSizeCompacted + + ", totalUpdatedRecordsCompacted=" + totalUpdatedRecordsCompacted + ", totalLogBlocks=" + totalLogBlocks + + ", totalCorruptLogBlock=" + totalCorruptLogBlock + ", totalRollbackBlocks=" + totalRollbackBlocks + '}'; } @Override diff --git a/hudi-common/src/main/java/org/apache/hudi/common/storage/StorageSchemes.java b/hudi-common/src/main/java/org/apache/hudi/common/storage/StorageSchemes.java index 5d6d762b5..19b169b92 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/storage/StorageSchemes.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/storage/StorageSchemes.java @@ -33,8 +33,7 @@ public enum StorageSchemes { // Apache Ignite FS IGNITE("igfs", true), // AWS S3 - S3A("s3a", false), - S3("s3", false), + S3A("s3a", false), S3("s3", false), // Google Cloud Storage GCS("gs", false), // View FS for federated setups. If federating across cloud stores, then append support is false diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java index 6ddc1b348..4421365b3 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java @@ -36,10 +36,9 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; /** - * Configurations on the Hoodie Table like type of ingestion, storage formats, hive table name etc - * Configurations are loaded from hoodie.properties, these properties are usually set during - * initializing a path as hoodie base path and never changes during the lifetime of a hoodie - * dataset. + * Configurations on the Hoodie Table like type of ingestion, storage formats, hive table name etc Configurations are + * loaded from hoodie.properties, these properties are usually set during initializing a path as hoodie base path and + * never changes during the lifetime of a hoodie dataset. * * @see HoodieTableMetaClient * @since 0.3.0 @@ -51,10 +50,8 @@ public class HoodieTableConfig implements Serializable { public static final String HOODIE_PROPERTIES_FILE = "hoodie.properties"; public static final String HOODIE_TABLE_NAME_PROP_NAME = "hoodie.table.name"; public static final String HOODIE_TABLE_TYPE_PROP_NAME = "hoodie.table.type"; - public static final String HOODIE_RO_FILE_FORMAT_PROP_NAME = - "hoodie.table.ro.file.format"; - public static final String HOODIE_RT_FILE_FORMAT_PROP_NAME = - "hoodie.table.rt.file.format"; + public static final String HOODIE_RO_FILE_FORMAT_PROP_NAME = "hoodie.table.ro.file.format"; + public static final String HOODIE_RT_FILE_FORMAT_PROP_NAME = "hoodie.table.rt.file.format"; public static final String HOODIE_PAYLOAD_CLASS_PROP_NAME = "hoodie.compaction.payload.class"; public static final String HOODIE_ARCHIVELOG_FOLDER_PROP_NAME = "hoodie.archivelog.folder"; @@ -88,37 +85,32 @@ public class HoodieTableConfig implements Serializable { * * @deprecated */ - public HoodieTableConfig() { - } + public HoodieTableConfig() {} /** - * Initialize the hoodie meta directory and any necessary files inside the meta (including the - * hoodie.properties) + * Initialize the hoodie meta directory and any necessary files inside the meta (including the hoodie.properties) */ - public static void createHoodieProperties(FileSystem fs, Path metadataFolder, - Properties properties) throws IOException { + public static void createHoodieProperties(FileSystem fs, Path metadataFolder, Properties properties) + throws IOException { if (!fs.exists(metadataFolder)) { fs.mkdirs(metadataFolder); } Path propertyPath = new Path(metadataFolder, HOODIE_PROPERTIES_FILE); try (FSDataOutputStream outputStream = fs.create(propertyPath)) { if (!properties.containsKey(HOODIE_TABLE_NAME_PROP_NAME)) { - throw new IllegalArgumentException( - HOODIE_TABLE_NAME_PROP_NAME + " property needs to be specified"); + throw new IllegalArgumentException(HOODIE_TABLE_NAME_PROP_NAME + " property needs to be specified"); } if (!properties.containsKey(HOODIE_TABLE_TYPE_PROP_NAME)) { properties.setProperty(HOODIE_TABLE_TYPE_PROP_NAME, DEFAULT_TABLE_TYPE.name()); } - if (properties.getProperty(HOODIE_TABLE_TYPE_PROP_NAME) == HoodieTableType.MERGE_ON_READ - .name() - && !properties.containsKey(HOODIE_PAYLOAD_CLASS_PROP_NAME)) { + if (properties.getProperty(HOODIE_TABLE_TYPE_PROP_NAME) == HoodieTableType.MERGE_ON_READ.name() + && !properties.containsKey(HOODIE_PAYLOAD_CLASS_PROP_NAME)) { properties.setProperty(HOODIE_PAYLOAD_CLASS_PROP_NAME, DEFAULT_PAYLOAD_CLASS); } if (!properties.containsKey(HOODIE_ARCHIVELOG_FOLDER_PROP_NAME)) { properties.setProperty(HOODIE_ARCHIVELOG_FOLDER_PROP_NAME, DEFAULT_ARCHIVELOG_FOLDER); } - properties - .store(outputStream, "Properties saved on " + new Date(System.currentTimeMillis())); + properties.store(outputStream, "Properties saved on " + new Date(System.currentTimeMillis())); } } @@ -139,8 +131,8 @@ public class HoodieTableConfig implements Serializable { public String getPayloadClass() { // There could be datasets written with payload class from com.uber.hoodie. Need to transparently // change to org.apache.hudi - return props.getProperty(HOODIE_PAYLOAD_CLASS_PROP_NAME, DEFAULT_PAYLOAD_CLASS) - .replace("com.uber.hoodie", "org.apache.hudi"); + return props.getProperty(HOODIE_PAYLOAD_CLASS_PROP_NAME, DEFAULT_PAYLOAD_CLASS).replace("com.uber.hoodie", + "org.apache.hudi"); } /** @@ -182,7 +174,7 @@ public class HoodieTableConfig implements Serializable { } public Map getProps() { - return props.entrySet().stream().collect( - Collectors.toMap(e -> String.valueOf(e.getKey()), e -> String.valueOf(e.getValue()))); + return props.entrySet().stream() + .collect(Collectors.toMap(e -> String.valueOf(e.getKey()), e -> String.valueOf(e.getValue()))); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java index 7cf095b2d..7a770e6d2 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java @@ -50,12 +50,12 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; /** - * HoodieTableMetaClient allows to access meta-data about a hoodie table It returns - * meta-data about commits, savepoints, compactions, cleanups as a HoodieTimeline - * Create an instance of the HoodieTableMetaClient with FileSystem and basePath to - * start getting the meta-data.

    All the timelines are computed lazily, once computed the - * timeline is cached and never refreshed. Use the HoodieTimeline.reload() to refresh - * timelines. + * HoodieTableMetaClient allows to access meta-data about a hoodie table It returns meta-data about + * commits, savepoints, compactions, cleanups as a HoodieTimeline Create an instance of the + * HoodieTableMetaClient with FileSystem and basePath to start getting the meta-data. + *

    + * All the timelines are computed lazily, once computed the timeline is cached and never refreshed. Use the + * HoodieTimeline.reload() to refresh timelines. * * @see HoodieTimeline * @since 0.3.0 @@ -79,20 +79,17 @@ public class HoodieTableMetaClient implements Serializable { private HoodieArchivedTimeline archivedTimeline; private ConsistencyGuardConfig consistencyGuardConfig = ConsistencyGuardConfig.newBuilder().build(); - public HoodieTableMetaClient(Configuration conf, String basePath) - throws DatasetNotFoundException { + public HoodieTableMetaClient(Configuration conf, String basePath) throws DatasetNotFoundException { // Do not load any timeline by default this(conf, basePath, false); } - public HoodieTableMetaClient(Configuration conf, String basePath, - boolean loadActiveTimelineOnLoad) { + public HoodieTableMetaClient(Configuration conf, String basePath, boolean loadActiveTimelineOnLoad) { this(conf, basePath, loadActiveTimelineOnLoad, ConsistencyGuardConfig.newBuilder().build()); } - public HoodieTableMetaClient(Configuration conf, String basePath, - boolean loadActiveTimelineOnLoad, ConsistencyGuardConfig consistencyGuardConfig) - throws DatasetNotFoundException { + public HoodieTableMetaClient(Configuration conf, String basePath, boolean loadActiveTimelineOnLoad, + ConsistencyGuardConfig consistencyGuardConfig) throws DatasetNotFoundException { log.info("Loading HoodieTableMetaClient from " + basePath); this.basePath = basePath; this.consistencyGuardConfig = consistencyGuardConfig; @@ -117,15 +114,11 @@ public class HoodieTableMetaClient implements Serializable { * * @deprecated */ - public HoodieTableMetaClient() { - } + public HoodieTableMetaClient() {} public static HoodieTableMetaClient reload(HoodieTableMetaClient oldMetaClient) { - return new HoodieTableMetaClient( - oldMetaClient.hadoopConf.get(), - oldMetaClient.basePath, - oldMetaClient.loadActiveTimelineOnLoad, - oldMetaClient.consistencyGuardConfig); + return new HoodieTableMetaClient(oldMetaClient.hadoopConf.get(), oldMetaClient.basePath, + oldMetaClient.loadActiveTimelineOnLoad, oldMetaClient.consistencyGuardConfig); } /** @@ -133,14 +126,12 @@ public class HoodieTableMetaClient implements Serializable { * * @deprecated */ - private void readObject(java.io.ObjectInputStream in) - throws IOException, ClassNotFoundException { + private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException { in.defaultReadObject(); fs = null; // will be lazily inited } - private void writeObject(java.io.ObjectOutputStream out) - throws IOException { + private void writeObject(java.io.ObjectOutputStream out) throws IOException { out.defaultWriteObject(); } @@ -173,8 +164,9 @@ public class HoodieTableMetaClient implements Serializable { } /** - * Returns Marker folder path - * @param instantTs Instant Timestamp + * Returns Marker folder path + * + * @param instantTs Instant Timestamp * @return */ public String getMarkerFolderPath(String instantTs) { @@ -215,14 +207,17 @@ public class HoodieTableMetaClient implements Serializable { FileSystem fileSystem = FSUtils.getFs(metaPath, hadoopConf.newCopy()); Preconditions.checkArgument(!(fileSystem instanceof HoodieWrapperFileSystem), "File System not expected to be that of HoodieWrapperFileSystem"); - fs = new HoodieWrapperFileSystem(fileSystem, consistencyGuardConfig.isConsistencyCheckEnabled() - ? new FailSafeConsistencyGuard(fileSystem, consistencyGuardConfig) : new NoOpConsistencyGuard()); + fs = new HoodieWrapperFileSystem(fileSystem, + consistencyGuardConfig.isConsistencyCheckEnabled() + ? new FailSafeConsistencyGuard(fileSystem, consistencyGuardConfig) + : new NoOpConsistencyGuard()); } return fs; } /** * Return raw file-system + * * @return */ public FileSystem getRawFs() { @@ -260,8 +255,8 @@ public class HoodieTableMetaClient implements Serializable { } /** - * Get the archived commits as a timeline. This is costly operation, as all data from the archived - * files are read. This should not be used, unless for historical debugging purposes + * Get the archived commits as a timeline. This is costly operation, as all data from the archived files are read. + * This should not be used, unless for historical debugging purposes * * @return Active commit timeline */ @@ -276,8 +271,8 @@ public class HoodieTableMetaClient implements Serializable { /** * Helper method to initialize a dataset, with given basePath, tableType, name, archiveFolder */ - public static HoodieTableMetaClient initTableType(Configuration hadoopConf, String basePath, - String tableType, String tableName, String archiveLogFolder) throws IOException { + public static HoodieTableMetaClient initTableType(Configuration hadoopConf, String basePath, String tableType, + String tableName, String archiveLogFolder) throws IOException { HoodieTableType type = HoodieTableType.valueOf(tableType); Properties properties = new Properties(); properties.put(HoodieTableConfig.HOODIE_TABLE_NAME_PROP_NAME, tableName); @@ -301,13 +296,12 @@ public class HoodieTableMetaClient implements Serializable { } /** - * Helper method to initialize a given path as a hoodie dataset with configs passed in as as - * Properties + * Helper method to initialize a given path as a hoodie dataset with configs passed in as as Properties * * @return Instance of HoodieTableMetaClient */ - public static HoodieTableMetaClient initDatasetAndGetMetaClient(Configuration hadoopConf, - String basePath, Properties props) throws IOException { + public static HoodieTableMetaClient initDatasetAndGetMetaClient(Configuration hadoopConf, String basePath, + Properties props) throws IOException { log.info("Initializing " + basePath + " as hoodie dataset " + basePath); Path basePathDir = new Path(basePath); final FileSystem fs = FSUtils.getFs(basePath, hadoopConf); @@ -320,9 +314,8 @@ public class HoodieTableMetaClient implements Serializable { } // if anything other than default archive log folder is specified, create that too - String archiveLogPropVal = props - .getProperty(HoodieTableConfig.HOODIE_ARCHIVELOG_FOLDER_PROP_NAME, - HoodieTableConfig.DEFAULT_ARCHIVELOG_FOLDER); + String archiveLogPropVal = props.getProperty(HoodieTableConfig.HOODIE_ARCHIVELOG_FOLDER_PROP_NAME, + HoodieTableConfig.DEFAULT_ARCHIVELOG_FOLDER); if (!archiveLogPropVal.equals(HoodieTableConfig.DEFAULT_ARCHIVELOG_FOLDER)) { Path archiveLogDir = new Path(metaPathDir, archiveLogPropVal); if (!fs.exists(archiveLogDir)) { @@ -346,14 +339,12 @@ public class HoodieTableMetaClient implements Serializable { // We should not use fs.getConf as this might be different from the original configuration // used to create the fs in unit tests HoodieTableMetaClient metaClient = new HoodieTableMetaClient(hadoopConf, basePath); - log.info("Finished initializing Table of type " + metaClient.getTableConfig().getTableType() - + " from " + basePath); + log.info("Finished initializing Table of type " + metaClient.getTableConfig().getTableType() + " from " + basePath); return metaClient; } // HELPER METHODS TO CREATE META FILE NAMES - public static FileStatus[] scanFiles(FileSystem fs, Path metaPath, PathFilter nameFilter) - throws IOException { + public static FileStatus[] scanFiles(FileSystem fs, Path metaPath, PathFilter nameFilter) throws IOException { return fs.listStatus(metaPath, nameFilter); } @@ -375,10 +366,10 @@ public class HoodieTableMetaClient implements Serializable { } /** - * Get the commit + pending-compaction timeline visible for this table. - * A RT filesystem view is constructed with this timeline so that file-slice after pending compaction-requested - * instant-time is also considered valid. A RT file-system view for reading must then merge the file-slices before - * and after pending compaction instant so that all delta-commits are read. + * Get the commit + pending-compaction timeline visible for this table. A RT filesystem view is constructed with this + * timeline so that file-slice after pending compaction-requested instant-time is also considered valid. A RT + * file-system view for reading must then merge the file-slices before and after pending compaction instant so that + * all delta-commits are read. */ public HoodieTimeline getCommitsAndCompactionTimeline() { switch (this.getTableType()) { @@ -415,8 +406,7 @@ public class HoodieTableMetaClient implements Serializable { case MERGE_ON_READ: return HoodieActiveTimeline.DELTA_COMMIT_ACTION; default: - throw new HoodieException( - "Could not commit on unknown storage type " + this.getTableType()); + throw new HoodieException("Could not commit on unknown storage type " + this.getTableType()); } } @@ -424,23 +414,21 @@ public class HoodieTableMetaClient implements Serializable { /** * Helper method to scan all hoodie-instant metafiles and construct HoodieInstant objects * - * @param fs FileSystem - * @param metaPath Meta Path where hoodie instants are present + * @param fs FileSystem + * @param metaPath Meta Path where hoodie instants are present * @param includedExtensions Included hoodie extensions * @return List of Hoodie Instants generated * @throws IOException in case of failure */ - public static List scanHoodieInstantsFromFileSystem( - FileSystem fs, Path metaPath, Set includedExtensions) throws IOException { - return Arrays.stream( - HoodieTableMetaClient - .scanFiles(fs, metaPath, path -> { - // Include only the meta files with extensions that needs to be included - String extension = FSUtils.getFileExtension(path.getName()); - return includedExtensions.contains(extension); - })).sorted(Comparator.comparing( - // Sort the meta-data by the instant time (first part of the file name) - fileStatus -> FSUtils.getInstantTime(fileStatus.getPath().getName()))) + public static List scanHoodieInstantsFromFileSystem(FileSystem fs, Path metaPath, + Set includedExtensions) throws IOException { + return Arrays.stream(HoodieTableMetaClient.scanFiles(fs, metaPath, path -> { + // Include only the meta files with extensions that needs to be included + String extension = FSUtils.getFileExtension(path.getName()); + return includedExtensions.contains(extension); + })).sorted(Comparator.comparing( + // Sort the meta-data by the instant time (first part of the file name) + fileStatus -> FSUtils.getInstantTime(fileStatus.getPath().getName()))) // create HoodieInstantMarkers from FileStatus, which extracts properties .map(HoodieInstant::new).collect(Collectors.toList()); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTimeline.java b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTimeline.java index 4a1511451..4e42c9910 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTimeline.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTimeline.java @@ -29,10 +29,11 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.StringUtils; /** - * HoodieTimeline is a view of meta-data instants in the hoodie dataset. Instants are specific - * points in time represented as HoodieInstant.

    Timelines are immutable once created and - * operations create new instance of timelines which filter on the instants and this can be - * chained. + * HoodieTimeline is a view of meta-data instants in the hoodie dataset. Instants are specific points in time + * represented as HoodieInstant. + *

    + * Timelines are immutable once created and operations create new instance of timelines which filter on the instants and + * this can be chained. * * @see HoodieTableMetaClient * @see HoodieDefaultTimeline @@ -58,22 +59,19 @@ public interface HoodieTimeline extends Serializable { String CLEAN_EXTENSION = "." + CLEAN_ACTION; String ROLLBACK_EXTENSION = "." + ROLLBACK_ACTION; String SAVEPOINT_EXTENSION = "." + SAVEPOINT_ACTION; - //this is to preserve backwards compatibility on commit in-flight filenames + // this is to preserve backwards compatibility on commit in-flight filenames String INFLIGHT_COMMIT_EXTENSION = INFLIGHT_EXTENSION; String INFLIGHT_DELTA_COMMIT_EXTENSION = "." + DELTA_COMMIT_ACTION + INFLIGHT_EXTENSION; String INFLIGHT_CLEAN_EXTENSION = "." + CLEAN_ACTION + INFLIGHT_EXTENSION; String INFLIGHT_ROLLBACK_EXTENSION = "." + ROLLBACK_ACTION + INFLIGHT_EXTENSION; String INFLIGHT_SAVEPOINT_EXTENSION = "." + SAVEPOINT_ACTION + INFLIGHT_EXTENSION; - String REQUESTED_COMPACTION_SUFFIX = - StringUtils.join(COMPACTION_ACTION, REQUESTED_EXTENSION); - String REQUESTED_COMPACTION_EXTENSION = - StringUtils.join(".", REQUESTED_COMPACTION_SUFFIX); - String INFLIGHT_COMPACTION_EXTENSION = - StringUtils.join(".", COMPACTION_ACTION, INFLIGHT_EXTENSION); + String REQUESTED_COMPACTION_SUFFIX = StringUtils.join(COMPACTION_ACTION, REQUESTED_EXTENSION); + String REQUESTED_COMPACTION_EXTENSION = StringUtils.join(".", REQUESTED_COMPACTION_SUFFIX); + String INFLIGHT_COMPACTION_EXTENSION = StringUtils.join(".", COMPACTION_ACTION, INFLIGHT_EXTENSION); String INFLIGHT_RESTORE_EXTENSION = "." + RESTORE_ACTION + INFLIGHT_EXTENSION; String RESTORE_EXTENSION = "." + RESTORE_ACTION; - String INVALID_INSTANT_TS = "0"; + String INVALID_INSTANT_TS = "0"; /** * Filter this timeline to just include the in-flights @@ -97,22 +95,25 @@ public interface HoodieTimeline extends Serializable { HoodieTimeline filterCompletedInstants(); /** - * Filter this timeline to just include the completed + compaction (inflight + requested) instants - * A RT filesystem view is constructed with this timeline so that file-slice after pending compaction-requested - * instant-time is also considered valid. A RT file-system view for reading must then merge the file-slices before - * and after pending compaction instant so that all delta-commits are read. + * Filter this timeline to just include the completed + compaction (inflight + requested) instants A RT filesystem + * view is constructed with this timeline so that file-slice after pending compaction-requested instant-time is also + * considered valid. A RT file-system view for reading must then merge the file-slices before and after pending + * compaction instant so that all delta-commits are read. + * * @return New instance of HoodieTimeline with just completed instants */ HoodieTimeline filterCompletedAndCompactionInstants(); /** - * Timeline to just include commits (commit/deltacommit) and compaction actions + * Timeline to just include commits (commit/deltacommit) and compaction actions + * * @return */ HoodieTimeline getCommitsAndCompactionTimeline(); /** * Filter this timeline to just include requested and inflight compaction instants + * * @return */ HoodieTimeline filterPendingCompactionTimeline(); @@ -162,6 +163,7 @@ public interface HoodieTimeline extends Serializable { /** * Get hash of timeline + * * @return */ String getTimelineHash(); @@ -177,8 +179,8 @@ public interface HoodieTimeline extends Serializable { boolean containsInstant(HoodieInstant instant); /** - * @return true if the passed instant is present as a completed instant on the timeline or if the - * instant is before the first completed instant in the timeline + * @return true if the passed instant is present as a completed instant on the timeline or if the instant is before + * the first completed instant in the timeline */ boolean containsOrBeforeTimelineStarts(String ts); @@ -188,8 +190,8 @@ public interface HoodieTimeline extends Serializable { Stream getInstants(); /** - * @return Get the stream of completed instants in reverse order - * TODO Change code references to getInstants() that reverse the instants later on to use this method instead. + * @return Get the stream of completed instants in reverse order TODO Change code references to getInstants() that + * reverse the instants later on to use this method instead. */ Stream getReverseOrderedInstants(); @@ -206,17 +208,13 @@ public interface HoodieTimeline extends Serializable { /** * Helper methods to compare instants **/ - BiPredicate EQUAL = - (commit1, commit2) -> commit1.compareTo(commit2) == 0; - BiPredicate GREATER_OR_EQUAL = - (commit1, commit2) -> commit1.compareTo(commit2) >= 0; + BiPredicate EQUAL = (commit1, commit2) -> commit1.compareTo(commit2) == 0; + BiPredicate GREATER_OR_EQUAL = (commit1, commit2) -> commit1.compareTo(commit2) >= 0; BiPredicate GREATER = (commit1, commit2) -> commit1.compareTo(commit2) > 0; - BiPredicate LESSER_OR_EQUAL = - (commit1, commit2) -> commit1.compareTo(commit2) <= 0; + BiPredicate LESSER_OR_EQUAL = (commit1, commit2) -> commit1.compareTo(commit2) <= 0; BiPredicate LESSER = (commit1, commit2) -> commit1.compareTo(commit2) < 0; - static boolean compareTimestamps(String commit1, String commit2, - BiPredicate predicateToApply) { + static boolean compareTimestamps(String commit1, String commit2, BiPredicate predicateToApply) { return predicateToApply.test(commit1, commit2); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/SyncableFileSystemView.java b/hudi-common/src/main/java/org/apache/hudi/common/table/SyncableFileSystemView.java index 88f61e9c6..44919f398 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/SyncableFileSystemView.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/SyncableFileSystemView.java @@ -22,8 +22,8 @@ package org.apache.hudi.common.table; * A consolidated file-system view interface exposing both realtime and read-optimized views along with * update operations. */ -public interface SyncableFileSystemView extends TableFileSystemView, TableFileSystemView.ReadOptimizedView, - TableFileSystemView.RealtimeView { +public interface SyncableFileSystemView + extends TableFileSystemView, TableFileSystemView.ReadOptimizedView, TableFileSystemView.RealtimeView { @@ -38,9 +38,9 @@ public interface SyncableFileSystemView extends TableFileSystemView, TableFileSy void reset(); /** - * Read the latest timeline and refresh the file-system view to match the current state of the file-system. - * The refresh can either be done incrementally (from reading file-slices in metadata files) or from scratch by - * reseting view storage + * Read the latest timeline and refresh the file-system view to match the current state of the file-system. The + * refresh can either be done incrementally (from reading file-slices in metadata files) or from scratch by reseting + * view storage */ void sync(); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/TableFileSystemView.java b/hudi-common/src/main/java/org/apache/hudi/common/table/TableFileSystemView.java index f5f04a706..57f971d63 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/TableFileSystemView.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/TableFileSystemView.java @@ -59,8 +59,7 @@ public interface TableFileSystemView { * Stream all the latest version data files in the given partition with precondition that commitTime(file) before * maxCommitTime */ - Stream getLatestDataFilesBeforeOrOn(String partitionPath, - String maxCommitTime); + Stream getLatestDataFilesBeforeOrOn(String partitionPath, String maxCommitTime); /** * Stream all the latest data files pass @@ -105,20 +104,20 @@ public interface TableFileSystemView { Stream getLatestUnCompactedFileSlices(String partitionPath); /** - * Stream all latest file slices in given partition with precondition that commitTime(file) before maxCommitTime + * Stream all latest file slices in given partition with precondition that commitTime(file) before maxCommitTime * * @param partitionPath Partition path * @param maxCommitTime Max Instant Time * @param includeFileSlicesInPendingCompaction include file-slices that are in pending compaction */ - Stream getLatestFileSlicesBeforeOrOn(String partitionPath, - String maxCommitTime, boolean includeFileSlicesInPendingCompaction); + Stream getLatestFileSlicesBeforeOrOn(String partitionPath, String maxCommitTime, + boolean includeFileSlicesInPendingCompaction); /** - * Stream all "merged" file-slices before on an instant time - * If a file-group has a pending compaction request, the file-slice before and after compaction request instant - * is merged and returned. - * @param partitionPath Partition Path + * Stream all "merged" file-slices before on an instant time If a file-group has a pending compaction request, the + * file-slice before and after compaction request instant is merged and returned. + * + * @param partitionPath Partition Path * @param maxInstantTime Max Instant Time * @return */ diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordScanner.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordScanner.java index 16d788f57..88bfe3957 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordScanner.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordScanner.java @@ -51,15 +51,16 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; /** - * Implements logic to scan log blocks and expose valid and deleted log records to subclass implementation. - * Subclass is free to either apply merging or expose raw data back to the caller. + * Implements logic to scan log blocks and expose valid and deleted log records to subclass implementation. Subclass is + * free to either apply merging or expose raw data back to the caller. * - * NOTE: If readBlockLazily is - * turned on, does not merge, instead keeps reading log blocks and merges everything at once This is an optimization to - * avoid seek() back and forth to read new block (forward seek()) and lazily read content of seen block (reverse and - * forward seek()) during merge | | Read Block 1 Metadata | | Read Block 1 Data | | | Read Block 2 - * Metadata | | Read Block 2 Data | | I/O Pass 1 | ..................... | I/O Pass 2 | ................. | | - * | Read Block N Metadata | | Read Block N Data |

    This results in two I/O passes over the log file. + * NOTE: If readBlockLazily is turned on, does not merge, instead keeps reading log blocks and merges everything at once + * This is an optimization to avoid seek() back and forth to read new block (forward seek()) and lazily read content of + * seen block (reverse and forward seek()) during merge | | Read Block 1 Metadata | | Read Block 1 Data | | | Read Block + * 2 Metadata | | Read Block 2 Data | | I/O Pass 1 | ..................... | I/O Pass 2 | ................. | | | Read + * Block N Metadata | | Read Block N Data | + *

    + * This results in two I/O passes over the log file. */ public abstract class AbstractHoodieLogRecordScanner { @@ -122,10 +123,9 @@ public abstract class AbstractHoodieLogRecordScanner { HoodieLogFormatReader logFormatReaderWrapper = null; try { // iterate over the paths - logFormatReaderWrapper = - new HoodieLogFormatReader(fs, - logFilePaths.stream().map(logFile -> new HoodieLogFile(new Path(logFile))) - .collect(Collectors.toList()), readerSchema, readBlocksLazily, reverseReader, bufferSize); + logFormatReaderWrapper = new HoodieLogFormatReader(fs, + logFilePaths.stream().map(logFile -> new HoodieLogFile(new Path(logFile))).collect(Collectors.toList()), + readerSchema, readBlocksLazily, reverseReader, bufferSize); Set scannedLogFiles = new HashSet<>(); while (logFormatReaderWrapper.hasNext()) { HoodieLogFile logFile = logFormatReaderWrapper.getLogFile(); @@ -136,10 +136,9 @@ public abstract class AbstractHoodieLogRecordScanner { HoodieLogBlock r = logFormatReaderWrapper.next(); totalLogBlocks.incrementAndGet(); if (r.getBlockType() != CORRUPT_BLOCK - && !HoodieTimeline.compareTimestamps(r.getLogBlockHeader().get(INSTANT_TIME), - this.latestInstantTime, - HoodieTimeline.LESSER_OR_EQUAL)) { - //hit a block with instant time greater than should be processed, stop processing further + && !HoodieTimeline.compareTimestamps(r.getLogBlockHeader().get(INSTANT_TIME), this.latestInstantTime, + HoodieTimeline.LESSER_OR_EQUAL)) { + // hit a block with instant time greater than should be processed, stop processing further break; } switch (r.getBlockType()) { @@ -167,7 +166,7 @@ public abstract class AbstractHoodieLogRecordScanner { // Consider the following scenario // (Time 0, C1, Task T1) -> Running // (Time 1, C1, Task T1) -> Failed (Wrote either a corrupt block or a correct - // DataBlock (B1) with commitTime C1 + // DataBlock (B1) with commitTime C1 // (Time 2, C1, Task T1.2) -> Running (Task T1 was retried and the attempt number is 2) // (Time 3, C1, Task T1.2) -> Finished (Wrote a correct DataBlock B2) // Now a logFile L1 can have 2 correct Datablocks (B1 and B2) which are the same. @@ -179,8 +178,8 @@ public abstract class AbstractHoodieLogRecordScanner { log.info("Reading a command block from file " + logFile.getPath()); // This is a command block - take appropriate action based on the command HoodieCommandBlock commandBlock = (HoodieCommandBlock) r; - String targetInstantForCommandBlock = r.getLogBlockHeader() - .get(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME); + String targetInstantForCommandBlock = + r.getLogBlockHeader().get(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME); switch (commandBlock.getType()) { // there can be different types of command blocks case ROLLBACK_PREVIOUS_BLOCK: // Rollback the last read log block @@ -195,20 +194,17 @@ public abstract class AbstractHoodieLogRecordScanner { HoodieLogBlock lastBlock = currentInstantLogBlocks.peek(); // handle corrupt blocks separately since they may not have metadata if (lastBlock.getBlockType() == CORRUPT_BLOCK) { - log.info( - "Rolling back the last corrupted log block read in " + logFile.getPath()); + log.info("Rolling back the last corrupted log block read in " + logFile.getPath()); currentInstantLogBlocks.pop(); numBlocksRolledBack++; } else if (lastBlock.getBlockType() != CORRUPT_BLOCK - && targetInstantForCommandBlock - .contentEquals(lastBlock.getLogBlockHeader().get(INSTANT_TIME))) { + && targetInstantForCommandBlock.contentEquals(lastBlock.getLogBlockHeader().get(INSTANT_TIME))) { // rollback last data block or delete block log.info("Rolling back the last log block read in " + logFile.getPath()); currentInstantLogBlocks.pop(); numBlocksRolledBack++; } else if (!targetInstantForCommandBlock - .contentEquals( - currentInstantLogBlocks.peek().getLogBlockHeader().get(INSTANT_TIME))) { + .contentEquals(currentInstantLogBlocks.peek().getLogBlockHeader().get(INSTANT_TIME))) { // invalid or extra rollback block log.warn("TargetInstantTime " + targetInstantForCommandBlock + " invalid or extra rollback command block in " + logFile.getPath()); @@ -260,15 +256,14 @@ public abstract class AbstractHoodieLogRecordScanner { * Checks if the current logblock belongs to a later instant */ private boolean isNewInstantBlock(HoodieLogBlock logBlock) { - return currentInstantLogBlocks.size() > 0 - && currentInstantLogBlocks.peek().getBlockType() != CORRUPT_BLOCK + return currentInstantLogBlocks.size() > 0 && currentInstantLogBlocks.peek().getBlockType() != CORRUPT_BLOCK && !logBlock.getLogBlockHeader().get(INSTANT_TIME) - .contentEquals(currentInstantLogBlocks.peek().getLogBlockHeader().get(INSTANT_TIME)); + .contentEquals(currentInstantLogBlocks.peek().getLogBlockHeader().get(INSTANT_TIME)); } /** - * Iterate over the GenericRecord in the block, read the hoodie key and partition path and - * call subclass processors to handle it. + * Iterate over the GenericRecord in the block, read the hoodie key and partition path and call subclass processors to + * handle it. */ private void processAvroDataBlock(HoodieAvroDataBlock dataBlock) throws Exception { // TODO (NA) - Implement getRecordItr() in HoodieAvroDataBlock and use that here @@ -286,8 +281,7 @@ public abstract class AbstractHoodieLogRecordScanner { * * @param hoodieRecord Hoodie Record to process */ - protected abstract void processNextRecord(HoodieRecord hoodieRecord) - throws Exception; + protected abstract void processNextRecord(HoodieRecord hoodieRecord) throws Exception; /** * Process next deleted key @@ -299,8 +293,7 @@ public abstract class AbstractHoodieLogRecordScanner { /** * Process the set of log blocks belonging to the last instant which is read fully. */ - private void processQueuedBlocksForInstant(Deque lastBlocks, int numLogFilesSeen) - throws Exception { + private void processQueuedBlocksForInstant(Deque lastBlocks, int numLogFilesSeen) throws Exception { while (!lastBlocks.isEmpty()) { log.info("Number of remaining logblocks to merge " + lastBlocks.size()); // poll the element at the bottom of the stack since that's the order it was inserted diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFileReader.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFileReader.java index b479bfd79..312d09ec0 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFileReader.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFileReader.java @@ -46,9 +46,8 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; /** - * Scans a log file and provides block level iterator on the log file Loads the entire block - * contents in memory Can emit either a DataBlock, CommandBlock, DeleteBlock or CorruptBlock (if one - * is found) + * Scans a log file and provides block level iterator on the log file Loads the entire block contents in memory Can emit + * either a DataBlock, CommandBlock, DeleteBlock or CorruptBlock (if one is found) */ class HoodieLogFileReader implements HoodieLogFormat.Reader { @@ -71,8 +70,7 @@ class HoodieLogFileReader implements HoodieLogFormat.Reader { FSDataInputStream fsDataInputStream = fs.open(logFile.getPath(), bufferSize); if (fsDataInputStream.getWrappedStream() instanceof FSInputStream) { this.inputStream = new FSDataInputStream( - new BufferedFSInputStream((FSInputStream) fsDataInputStream.getWrappedStream(), - bufferSize)); + new BufferedFSInputStream((FSInputStream) fsDataInputStream.getWrappedStream(), bufferSize)); } else { // fsDataInputStream.getWrappedStream() maybe a BufferedFSInputStream // need to wrap in another BufferedFSInputStream the make bufferSize work? @@ -84,19 +82,17 @@ class HoodieLogFileReader implements HoodieLogFormat.Reader { this.readBlockLazily = readBlockLazily; this.reverseReader = reverseReader; if (this.reverseReader) { - this.reverseLogFilePosition = this.lastReverseLogFilePosition = fs - .getFileStatus(logFile.getPath()).getLen(); + this.reverseLogFilePosition = this.lastReverseLogFilePosition = fs.getFileStatus(logFile.getPath()).getLen(); } addShutDownHook(); } - HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema, - boolean readBlockLazily, boolean reverseReader) throws IOException { + HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema, boolean readBlockLazily, + boolean reverseReader) throws IOException { this(fs, logFile, readerSchema, DEFAULT_BUFFER_SIZE, readBlockLazily, reverseReader); } - HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema) - throws IOException { + HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema) throws IOException { this(fs, logFile, readerSchema, DEFAULT_BUFFER_SIZE, false, false); } @@ -154,8 +150,7 @@ class HoodieLogFileReader implements HoodieLogFormat.Reader { if (nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION) { type = inputStream.readInt(); - Preconditions.checkArgument(type < HoodieLogBlockType.values().length, - "Invalid block byte type found " + type); + Preconditions.checkArgument(type < HoodieLogBlockType.values().length, "Invalid block byte type found " + type); blockType = HoodieLogBlockType.values()[type]; } @@ -198,18 +193,15 @@ class HoodieLogFileReader implements HoodieLogFormat.Reader { if (nextBlockVersion.getVersion() == HoodieLogFormatVersion.DEFAULT_VERSION) { return HoodieAvroDataBlock.getBlock(content, readerSchema); } else { - return HoodieAvroDataBlock - .getBlock(logFile, inputStream, Option.ofNullable(content), readBlockLazily, - contentPosition, contentLength, blockEndPos, readerSchema, header, footer); + return HoodieAvroDataBlock.getBlock(logFile, inputStream, Option.ofNullable(content), readBlockLazily, + contentPosition, contentLength, blockEndPos, readerSchema, header, footer); } case DELETE_BLOCK: - return HoodieDeleteBlock - .getBlock(logFile, inputStream, Option.ofNullable(content), readBlockLazily, - contentPosition, contentLength, blockEndPos, header, footer); + return HoodieDeleteBlock.getBlock(logFile, inputStream, Option.ofNullable(content), readBlockLazily, + contentPosition, contentLength, blockEndPos, header, footer); case COMMAND_BLOCK: - return HoodieCommandBlock - .getBlock(logFile, inputStream, Option.ofNullable(content), readBlockLazily, - contentPosition, contentLength, blockEndPos, header, footer); + return HoodieCommandBlock.getBlock(logFile, inputStream, Option.ofNullable(content), readBlockLazily, + contentPosition, contentLength, blockEndPos, header, footer); default: throw new HoodieNotSupportedException("Unsupported Block " + blockType); } @@ -224,12 +216,9 @@ class HoodieLogFileReader implements HoodieLogFormat.Reader { log.info("Next available block in " + logFile + " starts at " + nextBlockOffset); int corruptedBlockSize = (int) (nextBlockOffset - currentPos); long contentPosition = inputStream.getPos(); - byte[] corruptedBytes = HoodieLogBlock - .readOrSkipContent(inputStream, corruptedBlockSize, readBlockLazily); - return HoodieCorruptBlock - .getBlock(logFile, inputStream, Option.ofNullable(corruptedBytes), readBlockLazily, - contentPosition, corruptedBlockSize, corruptedBlockSize, new HashMap<>(), - new HashMap<>()); + byte[] corruptedBytes = HoodieLogBlock.readOrSkipContent(inputStream, corruptedBlockSize, readBlockLazily); + return HoodieCorruptBlock.getBlock(logFile, inputStream, Option.ofNullable(corruptedBytes), readBlockLazily, + contentPosition, corruptedBlockSize, corruptedBlockSize, new HashMap<>(), new HashMap<>()); } private boolean isBlockCorrupt(int blocksize) throws IOException { @@ -311,8 +300,7 @@ class HoodieLogFileReader implements HoodieLogFormat.Reader { boolean hasMagic = hasNextMagic(); if (!hasMagic) { throw new CorruptedLogFileException( - logFile - + "could not be read. Did not find the magic bytes at the start of the block"); + logFile + "could not be read. Did not find the magic bytes at the start of the block"); } return hasMagic; } catch (EOFException e) { @@ -362,9 +350,9 @@ class HoodieLogFileReader implements HoodieLogFormat.Reader { } /** - * This is a reverse iterator Note: At any point, an instance of HoodieLogFileReader should either - * iterate reverse (prev) or forward (next). Doing both in the same instance is not supported - * WARNING : Every call to prev() should be preceded with hasPrev() + * This is a reverse iterator Note: At any point, an instance of HoodieLogFileReader should either iterate reverse + * (prev) or forward (next). Doing both in the same instance is not supported WARNING : Every call to prev() should be + * preceded with hasPrev() */ @Override public HoodieLogBlock prev() throws IOException { @@ -380,9 +368,8 @@ class HoodieLogFileReader implements HoodieLogFormat.Reader { } catch (Exception e) { // this could be a corrupt block inputStream.seek(blockEndPos); - throw new CorruptedLogFileException( - "Found possible corrupted block, cannot read log file in reverse, " - + "fallback to forward reading of logfile"); + throw new CorruptedLogFileException("Found possible corrupted block, cannot read log file in reverse, " + + "fallback to forward reading of logfile"); } boolean hasNext = hasNext(); reverseLogFilePosition -= blockSize; @@ -391,10 +378,9 @@ class HoodieLogFileReader implements HoodieLogFormat.Reader { } /** - * Reverse pointer, does not read the block. Return the current position of the log file (in - * reverse) If the pointer (inputstream) is moved in any way, it is the job of the client of this - * class to seek/reset it back to the file position returned from the method to expect correct - * results + * Reverse pointer, does not read the block. Return the current position of the log file (in reverse) If the pointer + * (inputstream) is moved in any way, it is the job of the client of this class to seek/reset it back to the file + * position returned from the method to expect correct results */ public long moveToPrev() throws IOException { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormat.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormat.java index aada3e2da..c0ec90bd8 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormat.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormat.java @@ -33,11 +33,10 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; /** - * File Format for Hoodie Log Files. The File Format consists of blocks each separated with a - * MAGIC sync marker. A Block can either be a Data block, Command block or Delete Block. Data - * Block - Contains log records serialized as Avro Binary Format Command Block - Specific commands - * like ROLLBACK_PREVIOUS-BLOCK - Tombstone for the previously written block Delete Block - List of - * keys to delete - tombstone for keys + * File Format for Hoodie Log Files. The File Format consists of blocks each separated with a MAGIC sync marker. A Block + * can either be a Data block, Command block or Delete Block. Data Block - Contains log records serialized as Avro + * Binary Format Command Block - Specific commands like ROLLBACK_PREVIOUS-BLOCK - Tombstone for the previously written + * block Delete Block - List of keys to delete - tombstone for keys */ public interface HoodieLogFormat { @@ -47,8 +46,8 @@ public interface HoodieLogFormat { byte[] MAGIC = new byte[] {'#', 'H', 'U', 'D', 'I', '#'}; /** - * The current version of the log format. Anytime the log format changes this version needs to be - * bumped and corresponding changes need to be made to {@link HoodieLogFormatVersion} + * The current version of the log format. Anytime the log format changes this version needs to be bumped and + * corresponding changes need to be made to {@link HoodieLogFormatVersion} */ int currentVersion = 1; @@ -84,12 +83,14 @@ public interface HoodieLogFormat { /** * Read log file in reverse order and check if prev block is present + * * @return */ public boolean hasPrev(); /** * Read log file in reverse order and return prev block if present + * * @return * @throws IOException */ @@ -220,9 +221,8 @@ public interface HoodieLogFormat { // Use rollover write token as write token to create new log file with tokens logWriteToken = rolloverLogWriteToken; } - log.info( - "Computed the next log version for " + logFileId + " in " + parentPath + " as " - + logVersion + " with write-token " + logWriteToken); + log.info("Computed the next log version for " + logFileId + " in " + parentPath + " as " + logVersion + + " with write-token " + logWriteToken); } if (logWriteToken == null) { @@ -259,16 +259,15 @@ public interface HoodieLogFormat { return new HoodieLogFileReader(fs, logFile, readerSchema, HoodieLogFileReader.DEFAULT_BUFFER_SIZE, false, false); } - static HoodieLogFormat.Reader newReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema, boolean - readBlockLazily, boolean reverseReader) - throws IOException { - return new HoodieLogFileReader(fs, logFile, readerSchema, HoodieLogFileReader.DEFAULT_BUFFER_SIZE, - readBlockLazily, reverseReader); + static HoodieLogFormat.Reader newReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema, + boolean readBlockLazily, boolean reverseReader) throws IOException { + return new HoodieLogFileReader(fs, logFile, readerSchema, HoodieLogFileReader.DEFAULT_BUFFER_SIZE, readBlockLazily, + reverseReader); } /** - * A set of feature flags associated with a log format. Versions are changed when the log format - * changes. TODO(na) - Implement policies around major/minor versions + * A set of feature flags associated with a log format. Versions are changed when the log format changes. TODO(na) - + * Implement policies around major/minor versions */ abstract class LogFormatVersion { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormatReader.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormatReader.java index fe935de88..c9dbafd2c 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormatReader.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormatReader.java @@ -43,8 +43,8 @@ public class HoodieLogFormatReader implements HoodieLogFormat.Reader { private static final Logger log = LogManager.getLogger(HoodieLogFormatReader.class); - HoodieLogFormatReader(FileSystem fs, List logFiles, - Schema readerSchema, boolean readBlocksLazily, boolean reverseLogReader, int bufferSize) throws IOException { + HoodieLogFormatReader(FileSystem fs, List logFiles, Schema readerSchema, boolean readBlocksLazily, + boolean reverseLogReader, int bufferSize) throws IOException { this.logFiles = logFiles; this.fs = fs; this.readerSchema = readerSchema; @@ -60,10 +60,9 @@ public class HoodieLogFormatReader implements HoodieLogFormat.Reader { @Override /** - * Note : In lazy mode, clients must ensure close() should be called only after processing - * all log-blocks as the underlying inputstream will be closed. - * TODO: We can introduce invalidate() API at HoodieLogBlock and this object can call invalidate on - * all returned log-blocks so that we check this scenario specifically in HoodieLogBlock + * Note : In lazy mode, clients must ensure close() should be called only after processing all log-blocks as the + * underlying inputstream will be closed. TODO: We can introduce invalidate() API at HoodieLogBlock and this object + * can call invalidate on all returned log-blocks so that we check this scenario specifically in HoodieLogBlock */ public void close() throws IOException { @@ -94,8 +93,8 @@ public class HoodieLogFormatReader implements HoodieLogFormat.Reader { } else { this.prevReadersInOpenState.add(currentReader); } - this.currentReader = new HoodieLogFileReader(fs, nextLogFile, readerSchema, bufferSize, readBlocksLazily, - false); + this.currentReader = + new HoodieLogFileReader(fs, nextLogFile, readerSchema, bufferSize, readBlocksLazily, false); } catch (IOException io) { throw new HoodieIOException("unable to initialize read with log file ", io); } @@ -116,8 +115,7 @@ public class HoodieLogFormatReader implements HoodieLogFormat.Reader { } @Override - public void remove() { - } + public void remove() {} @Override public boolean hasPrev() { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormatVersion.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormatVersion.java index 35a5f811f..1401085f7 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormatVersion.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormatVersion.java @@ -19,8 +19,7 @@ package org.apache.hudi.common.table.log; /** - * Implements logic to determine behavior for feature flags for - * {@link HoodieLogFormat.LogFormatVersion}. + * Implements logic to determine behavior for feature flags for {@link HoodieLogFormat.LogFormatVersion}. */ final class HoodieLogFormatVersion extends HoodieLogFormat.LogFormatVersion { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormatWriter.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormatWriter.java index 2bf9b334e..17b5028c8 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormatWriter.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormatWriter.java @@ -38,8 +38,7 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; /** - * HoodieLogFormatWriter can be used to append blocks to a log file Use - * HoodieLogFormat.WriterBuilder to construct + * HoodieLogFormatWriter can be used to append blocks to a log file Use HoodieLogFormat.WriterBuilder to construct */ public class HoodieLogFormatWriter implements HoodieLogFormat.Writer { @@ -62,9 +61,8 @@ public class HoodieLogFormatWriter implements HoodieLogFormat.Writer { * @param replication * @param sizeThreshold */ - HoodieLogFormatWriter(FileSystem fs, HoodieLogFile logFile, Integer bufferSize, - Short replication, Long sizeThreshold, String logWriteToken, String rolloverLogWriteToken) - throws IOException, InterruptedException { + HoodieLogFormatWriter(FileSystem fs, HoodieLogFile logFile, Integer bufferSize, Short replication, Long sizeThreshold, + String logWriteToken, String rolloverLogWriteToken) throws IOException, InterruptedException { this.fs = fs; this.logFile = logFile; this.sizeThreshold = sizeThreshold; @@ -116,12 +114,11 @@ public class HoodieLogFormatWriter implements HoodieLogFormat.Writer { } @Override - public Writer appendBlock(HoodieLogBlock block) - throws IOException, InterruptedException { + public Writer appendBlock(HoodieLogBlock block) throws IOException, InterruptedException { // Find current version - HoodieLogFormat.LogFormatVersion currentLogFormatVersion = new HoodieLogFormatVersion( - HoodieLogFormat.currentVersion); + HoodieLogFormat.LogFormatVersion currentLogFormatVersion = + new HoodieLogFormatVersion(HoodieLogFormat.currentVersion); long currentSize = this.output.size(); // 1. Write the magic header for the start of the block @@ -135,8 +132,7 @@ public class HoodieLogFormatWriter implements HoodieLogFormat.Writer { byte[] footerBytes = HoodieLogBlock.getLogMetadataBytes(block.getLogBlockFooter()); // 2. Write the total size of the block (excluding Magic) - this.output - .writeLong(getLogBlockLength(content.length, headerBytes.length, footerBytes.length)); + this.output.writeLong(getLogBlockLength(content.length, headerBytes.length, footerBytes.length)); // 3. Write the version of this log block this.output.writeInt(currentLogFormatVersion.getVersion()); @@ -162,26 +158,24 @@ public class HoodieLogFormatWriter implements HoodieLogFormat.Writer { } /** - * This method returns the total LogBlock Length which is the sum of 1. Number of bytes to write - * version 2. Number of bytes to write ordinal 3. Length of the headers 4. Number of bytes used to - * write content length 5. Length of the content 6. Length of the footers 7. Number of bytes to - * write totalLogBlockLength + * This method returns the total LogBlock Length which is the sum of 1. Number of bytes to write version 2. Number of + * bytes to write ordinal 3. Length of the headers 4. Number of bytes used to write content length 5. Length of the + * content 6. Length of the footers 7. Number of bytes to write totalLogBlockLength */ private int getLogBlockLength(int contentLength, int headerLength, int footerLength) { - return - Integer.BYTES + // Number of bytes to write version - Integer.BYTES + // Number of bytes to write ordinal - headerLength + // Length of the headers - Long.BYTES + // Number of bytes used to write content length - contentLength + // Length of the content - footerLength + // Length of the footers - Long.BYTES; // bytes to write totalLogBlockLength at end of block (for reverse ptr) + return Integer.BYTES + // Number of bytes to write version + Integer.BYTES + // Number of bytes to write ordinal + headerLength + // Length of the headers + Long.BYTES + // Number of bytes used to write content length + contentLength + // Length of the content + footerLength + // Length of the footers + Long.BYTES; // bytes to write totalLogBlockLength at end of block (for reverse ptr) } private Writer rolloverIfNeeded() throws IOException, InterruptedException { // Roll over if the size is past the threshold if (getCurrentSize() > sizeThreshold) { - //TODO - make an end marker which seals the old log file (no more appends possible to that + // TODO - make an end marker which seals the old log file (no more appends possible to that // file). log.info("CurrentSize " + getCurrentSize() + " has reached threshold " + sizeThreshold + ". Rolling over to the next version"); @@ -195,8 +189,8 @@ public class HoodieLogFormatWriter implements HoodieLogFormat.Writer { } private void createNewFile() throws IOException { - this.output = fs.create(this.logFile.getPath(), false, bufferSize, replication, - WriterBuilder.DEFAULT_SIZE_THRESHOLD, null); + this.output = + fs.create(this.logFile.getPath(), false, bufferSize, replication, WriterBuilder.DEFAULT_SIZE_THRESHOLD, null); } @Override @@ -218,14 +212,13 @@ public class HoodieLogFormatWriter implements HoodieLogFormat.Writer { public long getCurrentSize() throws IOException { if (output == null) { - throw new IllegalStateException( - "Cannot get current size as the underlying stream has been closed already"); + throw new IllegalStateException("Cannot get current size as the underlying stream has been closed already"); } return output.getPos(); } - private void handleAppendExceptionOrRecoverLease(Path path, RemoteException e) throws IOException, - InterruptedException { + private void handleAppendExceptionOrRecoverLease(Path path, RemoteException e) + throws IOException, InterruptedException { if (e.getMessage().contains(APPEND_UNAVAILABLE_EXCEPTION_MESSAGE)) { // This issue happens when all replicas for a file are down and/or being decommissioned. // The fs.append() API could append to the last block for a file. If the last block is full, a new block is diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieMergedLogRecordScanner.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieMergedLogRecordScanner.java index b3361c368..a07d661b5 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieMergedLogRecordScanner.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieMergedLogRecordScanner.java @@ -40,12 +40,13 @@ import org.apache.log4j.Logger; * Scans through all the blocks in a list of HoodieLogFile and builds up a compacted/merged list of records which will * be used as a lookup table when merging the base columnar file with the redo log file. * - * NOTE: If readBlockLazily is - * turned on, does not merge, instead keeps reading log blocks and merges everything at once This is an optimization to - * avoid seek() back and forth to read new block (forward seek()) and lazily read content of seen block (reverse and - * forward seek()) during merge | | Read Block 1 Metadata | | Read Block 1 Data | | | Read Block 2 - * Metadata | | Read Block 2 Data | | I/O Pass 1 | ..................... | I/O Pass 2 | ................. | | - * | Read Block N Metadata | | Read Block N Data |

    This results in two I/O passes over the log file. + * NOTE: If readBlockLazily is turned on, does not merge, instead keeps reading log blocks and merges everything at once + * This is an optimization to avoid seek() back and forth to read new block (forward seek()) and lazily read content of + * seen block (reverse and forward seek()) during merge | | Read Block 1 Metadata | | Read Block 1 Data | | | Read Block + * 2 Metadata | | Read Block 2 Data | | I/O Pass 1 | ..................... | I/O Pass 2 | ................. | | | Read + * Block N Metadata | | Read Block N Data | + *

    + * This results in two I/O passes over the log file. */ public class HoodieMergedLogRecordScanner extends AbstractHoodieLogRecordScanner @@ -65,26 +66,24 @@ public class HoodieMergedLogRecordScanner extends AbstractHoodieLogRecordScanner public final HoodieTimer timer = new HoodieTimer(); @SuppressWarnings("unchecked") - public HoodieMergedLogRecordScanner(FileSystem fs, String basePath, List logFilePaths, - Schema readerSchema, String latestInstantTime, Long maxMemorySizeInBytes, - boolean readBlocksLazily, boolean reverseReader, int bufferSize, String spillableMapBasePath) { + public HoodieMergedLogRecordScanner(FileSystem fs, String basePath, List logFilePaths, Schema readerSchema, + String latestInstantTime, Long maxMemorySizeInBytes, boolean readBlocksLazily, boolean reverseReader, + int bufferSize, String spillableMapBasePath) { super(fs, basePath, logFilePaths, readerSchema, latestInstantTime, readBlocksLazily, reverseReader, bufferSize); try { // Store merged records for all versions for this log file, set the in-memory footprint to maxInMemoryMapSize - this.records = new ExternalSpillableMap<>(maxMemorySizeInBytes, spillableMapBasePath, - new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(readerSchema)); + this.records = new ExternalSpillableMap<>(maxMemorySizeInBytes, spillableMapBasePath, new DefaultSizeEstimator(), + new HoodieRecordSizeEstimator(readerSchema)); // Do the scan and merge timer.startTimer(); scan(); this.totalTimeTakenToReadAndMergeBlocks = timer.endTimer(); this.numMergedRecordsInLog = records.size(); log.info("MaxMemoryInBytes allowed for compaction => " + maxMemorySizeInBytes); - log.info("Number of entries in MemoryBasedMap in ExternalSpillableMap => " + records - .getInMemoryMapNumEntries()); - log.info("Total size in bytes of MemoryBasedMap in ExternalSpillableMap => " + records - .getCurrentInMemoryMapSize()); - log.info("Number of entries in DiskBasedMap in ExternalSpillableMap => " + records - .getDiskBasedMapNumEntries()); + log.info("Number of entries in MemoryBasedMap in ExternalSpillableMap => " + records.getInMemoryMapNumEntries()); + log.info( + "Total size in bytes of MemoryBasedMap in ExternalSpillableMap => " + records.getCurrentInMemoryMapSize()); + log.info("Number of entries in DiskBasedMap in ExternalSpillableMap => " + records.getDiskBasedMapNumEntries()); log.info("Size of file spilled to disk => " + records.getSizeOfFileOnDiskInBytes()); } catch (IOException e) { throw new HoodieIOException("IOException when reading log file "); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java index 894db45dc..dc279952b 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java @@ -29,9 +29,8 @@ public class HoodieUnMergedLogRecordScanner extends AbstractHoodieLogRecordScann private final LogRecordScannerCallback callback; - public HoodieUnMergedLogRecordScanner(FileSystem fs, String basePath, - List logFilePaths, Schema readerSchema, String latestInstantTime, - boolean readBlocksLazily, boolean reverseReader, int bufferSize, + public HoodieUnMergedLogRecordScanner(FileSystem fs, String basePath, List logFilePaths, Schema readerSchema, + String latestInstantTime, boolean readBlocksLazily, boolean reverseReader, int bufferSize, LogRecordScannerCallback callback) { super(fs, basePath, logFilePaths, readerSchema, latestInstantTime, readBlocksLazily, reverseReader, bufferSize); this.callback = callback; diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieAvroDataBlock.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieAvroDataBlock.java index bc402e438..f53fa3bac 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieAvroDataBlock.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieAvroDataBlock.java @@ -48,9 +48,8 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.exception.HoodieIOException; /** - * DataBlock contains a list of records serialized using Avro. The Datablock contains 1. Data Block - * version 2. Total number of records in the block 3. Size of a record 4. Actual avro serialized - * content of the record + * DataBlock contains a list of records serialized using Avro. The Datablock contains 1. Data Block version 2. Total + * number of records in the block 3. Size of a record 4. Actual avro serialized content of the record */ public class HoodieAvroDataBlock extends HoodieLogBlock { @@ -59,41 +58,31 @@ public class HoodieAvroDataBlock extends HoodieLogBlock { private ThreadLocal encoderCache = new ThreadLocal<>(); private ThreadLocal decoderCache = new ThreadLocal<>(); - public HoodieAvroDataBlock(@Nonnull List records, - @Nonnull Map header, + public HoodieAvroDataBlock(@Nonnull List records, @Nonnull Map header, @Nonnull Map footer) { super(header, footer, Option.empty(), Option.empty(), null, false); this.records = records; this.schema = Schema.parse(super.getLogBlockHeader().get(HeaderMetadataType.SCHEMA)); } - public HoodieAvroDataBlock(@Nonnull List records, - @Nonnull Map header) { + public HoodieAvroDataBlock(@Nonnull List records, @Nonnull Map header) { this(records, header, new HashMap<>()); } - private HoodieAvroDataBlock(Option content, @Nonnull FSDataInputStream inputStream, - boolean readBlockLazily, Option blockContentLocation, - Schema readerSchema, @Nonnull Map headers, - @Nonnull Map footer) { + private HoodieAvroDataBlock(Option content, @Nonnull FSDataInputStream inputStream, boolean readBlockLazily, + Option blockContentLocation, Schema readerSchema, + @Nonnull Map headers, @Nonnull Map footer) { super(headers, footer, blockContentLocation, content, inputStream, readBlockLazily); this.schema = readerSchema; } - public static HoodieLogBlock getBlock(HoodieLogFile logFile, - FSDataInputStream inputStream, - Option content, - boolean readBlockLazily, - long position, - long blockSize, - long blockEndpos, - Schema readerSchema, - Map header, - Map footer) { + public static HoodieLogBlock getBlock(HoodieLogFile logFile, FSDataInputStream inputStream, Option content, + boolean readBlockLazily, long position, long blockSize, long blockEndpos, Schema readerSchema, + Map header, Map footer) { return new HoodieAvroDataBlock(content, inputStream, readBlockLazily, - Option.of(new HoodieLogBlockContentLocation(logFile, position, blockSize, blockEndpos)), - readerSchema, header, footer); + Option.of(new HoodieLogBlockContentLocation(logFile, position, blockSize, blockEndpos)), readerSchema, header, + footer); } @@ -171,8 +160,8 @@ public class HoodieAvroDataBlock extends HoodieLogBlock { return schema; } - //TODO (na) - Break down content into smaller chunks of byte [] to be GC as they are used - //TODO (na) - Implement a recordItr instead of recordList + // TODO (na) - Break down content into smaller chunks of byte [] to be GC as they are used + // TODO (na) - Implement a recordItr instead of recordList private void createRecordsFromContentBytes() throws IOException { if (readBlockLazily && !getContent().isPresent()) { @@ -181,16 +170,14 @@ public class HoodieAvroDataBlock extends HoodieLogBlock { } SizeAwareDataInputStream dis = - new SizeAwareDataInputStream( - new DataInputStream(new ByteArrayInputStream(getContent().get()))); + new SizeAwareDataInputStream(new DataInputStream(new ByteArrayInputStream(getContent().get()))); // 1. Read version for this data block int version = dis.readInt(); HoodieAvroDataBlockVersion logBlockVersion = new HoodieAvroDataBlockVersion(version); // Get schema from the header - Schema writerSchema = new Schema.Parser() - .parse(super.getLogBlockHeader().get(HeaderMetadataType.SCHEMA)); + Schema writerSchema = new Schema.Parser().parse(super.getLogBlockHeader().get(HeaderMetadataType.SCHEMA)); // If readerSchema was not present, use writerSchema if (schema == null) { @@ -208,8 +195,8 @@ public class HoodieAvroDataBlock extends HoodieLogBlock { // 3. Read the content for (int i = 0; i < totalRecords; i++) { int recordLength = dis.readInt(); - BinaryDecoder decoder = DecoderFactory.get() - .binaryDecoder(getContent().get(), dis.getNumberOfBytesRead(), recordLength, decoderCache.get()); + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(getContent().get(), dis.getNumberOfBytesRead(), + recordLength, decoderCache.get()); decoderCache.set(decoder); IndexedRecord record = reader.read(null, decoder); records.add(record); @@ -221,13 +208,13 @@ public class HoodieAvroDataBlock extends HoodieLogBlock { deflate(); } - /*********************************DEPRECATED METHODS***********************************/ + /********************************* DEPRECATED METHODS ***********************************/ @Deprecated @VisibleForTesting /** - * This constructor is retained to provide backwards compatibility to HoodieArchivedLogs - * which were written using HoodieLogFormat V1 + * This constructor is retained to provide backwards compatibility to HoodieArchivedLogs which were written using + * HoodieLogFormat V1 */ public HoodieAvroDataBlock(List records, Schema schema) { super(new HashMap<>(), new HashMap<>(), Option.empty(), Option.empty(), null, false); @@ -237,13 +224,12 @@ public class HoodieAvroDataBlock extends HoodieLogBlock { @Deprecated /** - * This method is retained to provide backwards compatibility to HoodieArchivedLogs which - * were written using HoodieLogFormat V1 + * This method is retained to provide backwards compatibility to HoodieArchivedLogs which were written using + * HoodieLogFormat V1 */ public static HoodieLogBlock getBlock(byte[] content, Schema readerSchema) throws IOException { - SizeAwareDataInputStream dis = new SizeAwareDataInputStream( - new DataInputStream(new ByteArrayInputStream(content))); + SizeAwareDataInputStream dis = new SizeAwareDataInputStream(new DataInputStream(new ByteArrayInputStream(content))); // 1. Read the schema written out int schemaLength = dis.readInt(); @@ -263,8 +249,7 @@ public class HoodieAvroDataBlock extends HoodieLogBlock { // 3. Read the content for (int i = 0; i < totalRecords; i++) { int recordLength = dis.readInt(); - Decoder decoder = DecoderFactory.get() - .binaryDecoder(content, dis.getNumberOfBytesRead(), recordLength, null); + Decoder decoder = DecoderFactory.get().binaryDecoder(content, dis.getNumberOfBytesRead(), recordLength, null); IndexedRecord record = reader.read(null, decoder); records.add(record); dis.skipBytes(recordLength); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieAvroDataBlockVersion.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieAvroDataBlockVersion.java index 84f88f668..e1bfdc21f 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieAvroDataBlockVersion.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieAvroDataBlockVersion.java @@ -19,8 +19,8 @@ package org.apache.hudi.common.table.log.block; /** - * A set of feature flags associated with a data log block format. Versions are changed when the log - * block format changes. TODO(na) - Implement policies around major/minor versions + * A set of feature flags associated with a data log block format. Versions are changed when the log block format + * changes. TODO(na) - Implement policies around major/minor versions */ final class HoodieAvroDataBlockVersion extends HoodieLogBlockVersion { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieCommandBlock.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieCommandBlock.java index dc4ff6622..2ac9cf6a3 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieCommandBlock.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieCommandBlock.java @@ -39,12 +39,12 @@ public class HoodieCommandBlock extends HoodieLogBlock { this(Option.empty(), null, false, Option.empty(), header, new HashMap<>()); } - private HoodieCommandBlock(Option content, FSDataInputStream inputStream, - boolean readBlockLazily, Option blockContentLocation, - Map header, Map footer) { + private HoodieCommandBlock(Option content, FSDataInputStream inputStream, boolean readBlockLazily, + Option blockContentLocation, Map header, + Map footer) { super(header, footer, blockContentLocation, content, inputStream, readBlockLazily); - this.type = HoodieCommandBlockTypeEnum.values()[Integer - .parseInt(header.get(HeaderMetadataType.COMMAND_BLOCK_TYPE))]; + this.type = + HoodieCommandBlockTypeEnum.values()[Integer.parseInt(header.get(HeaderMetadataType.COMMAND_BLOCK_TYPE))]; } public HoodieCommandBlockTypeEnum getType() { @@ -61,18 +61,11 @@ public class HoodieCommandBlock extends HoodieLogBlock { return new byte[0]; } - public static HoodieLogBlock getBlock(HoodieLogFile logFile, - FSDataInputStream inputStream, - Option content, - boolean readBlockLazily, - long position, - long blockSize, - long blockEndpos, - Map header, + public static HoodieLogBlock getBlock(HoodieLogFile logFile, FSDataInputStream inputStream, Option content, + boolean readBlockLazily, long position, long blockSize, long blockEndpos, Map header, Map footer) { return new HoodieCommandBlock(content, inputStream, readBlockLazily, - Option.of(new HoodieLogBlockContentLocation(logFile, position, blockSize, blockEndpos)), - header, footer); + Option.of(new HoodieLogBlockContentLocation(logFile, position, blockSize, blockEndpos)), header, footer); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieCommandBlockVersion.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieCommandBlockVersion.java index 4ba7db646..c2f2a2f39 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieCommandBlockVersion.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieCommandBlockVersion.java @@ -19,8 +19,8 @@ package org.apache.hudi.common.table.log.block; /** - * A set of feature flags associated with a command log block format. Versions are changed when the - * log block format changes. TODO(na) - Implement policies around major/minor versions + * A set of feature flags associated with a command log block format. Versions are changed when the log block format + * changes. TODO(na) - Implement policies around major/minor versions */ final class HoodieCommandBlockVersion extends HoodieLogBlockVersion { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieCorruptBlock.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieCorruptBlock.java index fb26c6f24..d5e0c6cb2 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieCorruptBlock.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieCorruptBlock.java @@ -25,14 +25,14 @@ import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.util.Option; /** - * Corrupt block is emitted whenever the scanner finds the length of the block written at the - * beginning does not match (did not find a EOF or a sync marker after the length) + * Corrupt block is emitted whenever the scanner finds the length of the block written at the beginning does not match + * (did not find a EOF or a sync marker after the length) */ public class HoodieCorruptBlock extends HoodieLogBlock { - private HoodieCorruptBlock(Option corruptedBytes, FSDataInputStream inputStream, - boolean readBlockLazily, Option blockContentLocation, - Map header, Map footer) { + private HoodieCorruptBlock(Option corruptedBytes, FSDataInputStream inputStream, boolean readBlockLazily, + Option blockContentLocation, Map header, + Map footer) { super(header, footer, blockContentLocation, corruptedBytes, inputStream, readBlockLazily); } @@ -51,18 +51,11 @@ public class HoodieCorruptBlock extends HoodieLogBlock { return HoodieLogBlockType.CORRUPT_BLOCK; } - public static HoodieLogBlock getBlock(HoodieLogFile logFile, - FSDataInputStream inputStream, - Option corruptedBytes, - boolean readBlockLazily, - long position, - long blockSize, - long blockEndPos, - Map header, - Map footer) { + public static HoodieLogBlock getBlock(HoodieLogFile logFile, FSDataInputStream inputStream, + Option corruptedBytes, boolean readBlockLazily, long position, long blockSize, long blockEndPos, + Map header, Map footer) { return new HoodieCorruptBlock(corruptedBytes, inputStream, readBlockLazily, - Option.of(new HoodieLogBlockContentLocation(logFile, position, blockSize, blockEndPos)), - header, footer); + Option.of(new HoodieLogBlockContentLocation(logFile, position, blockSize, blockEndPos)), header, footer); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieDeleteBlock.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieDeleteBlock.java index 71942aa25..ae0a20df8 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieDeleteBlock.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieDeleteBlock.java @@ -40,16 +40,15 @@ public class HoodieDeleteBlock extends HoodieLogBlock { private HoodieKey[] keysToDelete; - public HoodieDeleteBlock(HoodieKey[] keysToDelete, - Map header) { + public HoodieDeleteBlock(HoodieKey[] keysToDelete, Map header) { this(Option.empty(), null, false, Option.empty(), header, new HashMap<>()); this.keysToDelete = keysToDelete; } - private HoodieDeleteBlock(Option content, FSDataInputStream inputStream, - boolean readBlockLazily, Option blockContentLocation, - Map header, Map footer) { + private HoodieDeleteBlock(Option content, FSDataInputStream inputStream, boolean readBlockLazily, + Option blockContentLocation, Map header, + Map footer) { super(header, footer, blockContentLocation, content, inputStream, readBlockLazily); } @@ -81,8 +80,7 @@ public class HoodieDeleteBlock extends HoodieLogBlock { inflate(); } SizeAwareDataInputStream dis = - new SizeAwareDataInputStream( - new DataInputStream(new ByteArrayInputStream(getContent().get()))); + new SizeAwareDataInputStream(new DataInputStream(new ByteArrayInputStream(getContent().get()))); int version = dis.readInt(); int dataLength = dis.readInt(); byte[] data = new byte[dataLength]; @@ -101,18 +99,11 @@ public class HoodieDeleteBlock extends HoodieLogBlock { return HoodieLogBlockType.DELETE_BLOCK; } - public static HoodieLogBlock getBlock(HoodieLogFile logFile, - FSDataInputStream inputStream, - Option content, - boolean readBlockLazily, - long position, - long blockSize, - long blockEndPos, - Map header, + public static HoodieLogBlock getBlock(HoodieLogFile logFile, FSDataInputStream inputStream, Option content, + boolean readBlockLazily, long position, long blockSize, long blockEndPos, Map header, Map footer) throws IOException { return new HoodieDeleteBlock(content, inputStream, readBlockLazily, - Option.of(new HoodieLogBlockContentLocation(logFile, position, blockSize, blockEndPos)), - header, footer); + Option.of(new HoodieLogBlockContentLocation(logFile, position, blockSize, blockEndPos)), header, footer); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieDeleteBlockVersion.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieDeleteBlockVersion.java index 6fa9e7f40..220ef0ed6 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieDeleteBlockVersion.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieDeleteBlockVersion.java @@ -19,8 +19,8 @@ package org.apache.hudi.common.table.log.block; /** - * A set of feature flags associated with a delete log block format. Versions are changed when the - * log block format changes. TODO(na) - Implement policies around major/minor versions + * A set of feature flags associated with a delete log block format. Versions are changed when the log block format + * changes. TODO(na) - Implement policies around major/minor versions */ final class HoodieDeleteBlockVersion extends HoodieLogBlockVersion { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieLogBlock.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieLogBlock.java index c558a032f..3bf8fb21b 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieLogBlock.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieLogBlock.java @@ -40,10 +40,9 @@ import org.apache.hudi.exception.HoodieIOException; public abstract class HoodieLogBlock { /** - * The current version of the log block. Anytime the logBlock format changes this version needs to - * be bumped and corresponding changes need to be made to {@link HoodieLogBlockVersion} TODO : - * Change this to a class, something like HoodieLogBlockVersionV1/V2 and implement/override - * operations there + * The current version of the log block. Anytime the logBlock format changes this version needs to be bumped and + * corresponding changes need to be made to {@link HoodieLogBlockVersion} TODO : Change this to a class, something + * like HoodieLogBlockVersionV1/V2 and implement/override operations there */ public static int version = 1; // Header for each log block @@ -63,10 +62,8 @@ public abstract class HoodieLogBlock { public HoodieLogBlock(@Nonnull Map logBlockHeader, @Nonnull Map logBlockFooter, - @Nonnull Option blockContentLocation, - @Nonnull Option content, - FSDataInputStream inputStream, - boolean readBlockLazily) { + @Nonnull Option blockContentLocation, @Nonnull Option content, + FSDataInputStream inputStream, boolean readBlockLazily) { this.logBlockHeader = logBlockHeader; this.logBlockFooter = logBlockFooter; this.blockContentLocation = blockContentLocation; @@ -109,38 +106,30 @@ public abstract class HoodieLogBlock { } /** - * Type of the log block WARNING: This enum is serialized as the ordinal. Only add new enums at - * the end. + * Type of the log block WARNING: This enum is serialized as the ordinal. Only add new enums at the end. */ public enum HoodieLogBlockType { - COMMAND_BLOCK, - DELETE_BLOCK, - CORRUPT_BLOCK, - AVRO_DATA_BLOCK + COMMAND_BLOCK, DELETE_BLOCK, CORRUPT_BLOCK, AVRO_DATA_BLOCK } /** - * Log Metadata headers abstraction for a HoodieLogBlock WARNING : This enum is serialized as the - * ordinal. Only add new enums at the end. + * Log Metadata headers abstraction for a HoodieLogBlock WARNING : This enum is serialized as the ordinal. Only add + * new enums at the end. */ public enum HeaderMetadataType { - INSTANT_TIME, - TARGET_INSTANT_TIME, - SCHEMA, - COMMAND_BLOCK_TYPE + INSTANT_TIME, TARGET_INSTANT_TIME, SCHEMA, COMMAND_BLOCK_TYPE } /** - * Log Metadata footers abstraction for a HoodieLogBlock WARNING : This enum is serialized as the - * ordinal. Only add new enums at the end. + * Log Metadata footers abstraction for a HoodieLogBlock WARNING : This enum is serialized as the ordinal. Only add + * new enums at the end. */ public enum FooterMetadataType { } /** - * This class is used to store the Location of the Content of a Log Block. It's used when a client - * chooses for a IO intensive CompactedScanner, the location helps to lazily read contents from - * the log file + * This class is used to store the Location of the Content of a Log Block. It's used when a client chooses for a IO + * intensive CompactedScanner, the location helps to lazily read contents from the log file */ public static final class HoodieLogBlockContentLocation { @@ -153,8 +142,8 @@ public abstract class HoodieLogBlock { // The final position where the complete block ends private final long blockEndPos; - HoodieLogBlockContentLocation(HoodieLogFile logFile, long contentPositionInLogFile, - long blockSize, long blockEndPos) { + HoodieLogBlockContentLocation(HoodieLogFile logFile, long contentPositionInLogFile, long blockSize, + long blockEndPos) { this.logFile = logFile; this.contentPositionInLogFile = contentPositionInLogFile; this.blockSize = blockSize; @@ -179,11 +168,9 @@ public abstract class HoodieLogBlock { } /** - * Convert log metadata to bytes 1. Write size of metadata 2. Write enum ordinal 3. Write actual - * bytes + * Convert log metadata to bytes 1. Write size of metadata 2. Write enum ordinal 3. Write actual bytes */ - public static byte[] getLogMetadataBytes(Map metadata) - throws IOException { + public static byte[] getLogMetadataBytes(Map metadata) throws IOException { ByteArrayOutputStream baos = new ByteArrayOutputStream(); DataOutputStream output = new DataOutputStream(baos); output.writeInt(metadata.size()); @@ -197,11 +184,9 @@ public abstract class HoodieLogBlock { } /** - * Convert bytes to LogMetadata, follow the same order as - * {@link HoodieLogBlock#getLogMetadataBytes} + * Convert bytes to LogMetadata, follow the same order as {@link HoodieLogBlock#getLogMetadataBytes} */ - public static Map getLogMetadata(DataInputStream dis) - throws IOException { + public static Map getLogMetadata(DataInputStream dis) throws IOException { Map metadata = Maps.newHashMap(); // 1. Read the metadata written out @@ -225,8 +210,8 @@ public abstract class HoodieLogBlock { * Read or Skip block content of a log block in the log file. Depends on lazy reading enabled in * {@link HoodieMergedLogRecordScanner} */ - public static byte[] readOrSkipContent(FSDataInputStream inputStream, - Integer contentLength, boolean readBlockLazily) throws IOException { + public static byte[] readOrSkipContent(FSDataInputStream inputStream, Integer contentLength, boolean readBlockLazily) + throws IOException { byte[] content = null; if (!readBlockLazily) { // Read the contents in memory @@ -261,9 +246,8 @@ public abstract class HoodieLogBlock { } /** - * After the content bytes is converted into the required DataStructure by a logBlock, deflate the - * content to release byte [] and relieve memory pressure when GC kicks in. NOTE: This still - * leaves the heap fragmented + * After the content bytes is converted into the required DataStructure by a logBlock, deflate the content to release + * byte [] and relieve memory pressure when GC kicks in. NOTE: This still leaves the heap fragmented */ protected void deflate() { content = Option.empty(); @@ -271,8 +255,9 @@ public abstract class HoodieLogBlock { /** * Handles difference in seek behavior for GCS and non-GCS input stream + * * @param inputStream Input Stream - * @param pos Position to seek + * @param pos Position to seek * @throws IOException */ private static void safeSeek(FSDataInputStream inputStream, long pos) throws IOException { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieActiveTimeline.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieActiveTimeline.java index fd62eef80..140d4113d 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieActiveTimeline.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieActiveTimeline.java @@ -43,22 +43,25 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; /** - * Represents the Active Timeline for the HoodieDataset. Instants for the last 12 hours - * (configurable) is in the ActiveTimeline and the rest are Archived. ActiveTimeline is a special - * timeline that allows for creation of instants on the timeline.

    The timeline is not - * automatically reloaded on any mutation operation, clients have to manually call reload() so that - * they can chain multiple mutations to the timeline and then call reload() once.

    This class - * can be serialized and de-serialized and on de-serialization the FileSystem is re-initialized. + * Represents the Active Timeline for the HoodieDataset. Instants for the last 12 hours (configurable) is in the + * ActiveTimeline and the rest are Archived. ActiveTimeline is a special timeline that allows for creation of instants + * on the timeline. + *

    + *

    + * The timeline is not automatically reloaded on any mutation operation, clients have to manually call reload() so that + * they can chain multiple mutations to the timeline and then call reload() once. + *

    + *

    + * This class can be serialized and de-serialized and on de-serialization the FileSystem is re-initialized. */ public class HoodieActiveTimeline extends HoodieDefaultTimeline { public static final SimpleDateFormat COMMIT_FORMATTER = new SimpleDateFormat("yyyyMMddHHmmss"); - public static final Set VALID_EXTENSIONS_IN_ACTIVE_TIMELINE = new HashSet<>(Arrays.asList( - new String[]{COMMIT_EXTENSION, INFLIGHT_COMMIT_EXTENSION, DELTA_COMMIT_EXTENSION, - INFLIGHT_DELTA_COMMIT_EXTENSION, SAVEPOINT_EXTENSION, INFLIGHT_SAVEPOINT_EXTENSION, - CLEAN_EXTENSION, INFLIGHT_CLEAN_EXTENSION, INFLIGHT_COMPACTION_EXTENSION, REQUESTED_COMPACTION_EXTENSION, - INFLIGHT_RESTORE_EXTENSION, RESTORE_EXTENSION})); + public static final Set VALID_EXTENSIONS_IN_ACTIVE_TIMELINE = new HashSet<>(Arrays.asList(new String[] { + COMMIT_EXTENSION, INFLIGHT_COMMIT_EXTENSION, DELTA_COMMIT_EXTENSION, INFLIGHT_DELTA_COMMIT_EXTENSION, + SAVEPOINT_EXTENSION, INFLIGHT_SAVEPOINT_EXTENSION, CLEAN_EXTENSION, INFLIGHT_CLEAN_EXTENSION, + INFLIGHT_COMPACTION_EXTENSION, REQUESTED_COMPACTION_EXTENSION, INFLIGHT_RESTORE_EXTENSION, RESTORE_EXTENSION})); private static final transient Logger log = LogManager.getLogger(HoodieActiveTimeline.class); protected HoodieTableMetaClient metaClient; @@ -83,14 +86,11 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline { this.metaClient = metaClient; // multiple casts will make this lambda serializable - // http://docs.oracle.com/javase/specs/jls/se8/html/jls-15.html#jls-15.16 - this.details = - (Function> & Serializable) this::getInstantDetails; + this.details = (Function> & Serializable) this::getInstantDetails; } public HoodieActiveTimeline(HoodieTableMetaClient metaClient) { - this(metaClient, - new ImmutableSet.Builder() - .addAll(VALID_EXTENSIONS_IN_ACTIVE_TIMELINE).build()); + this(metaClient, new ImmutableSet.Builder().addAll(VALID_EXTENSIONS_IN_ACTIVE_TIMELINE).build()); } /** @@ -98,16 +98,14 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline { * * @deprecated */ - public HoodieActiveTimeline() { - } + public HoodieActiveTimeline() {} /** * This method is only used when this object is deserialized in a spark executor. * * @deprecated */ - private void readObject(java.io.ObjectInputStream in) - throws IOException, ClassNotFoundException { + private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException { in.defaultReadObject(); } @@ -116,29 +114,25 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline { * */ public HoodieTimeline getCommitsTimeline() { - return getTimelineOfActions( - Sets.newHashSet(COMMIT_ACTION, DELTA_COMMIT_ACTION)); + return getTimelineOfActions(Sets.newHashSet(COMMIT_ACTION, DELTA_COMMIT_ACTION)); } /** * Get all instants (commits, delta commits, in-flight/request compaction) that produce new data, in the active - * timeline * - * With Async compaction a requested/inflight compaction-instant is a valid baseInstant for a file-slice as there - * could be delta-commits with that baseInstant. + * timeline * With Async compaction a requested/inflight compaction-instant is a valid baseInstant for a file-slice as + * there could be delta-commits with that baseInstant. */ public HoodieTimeline getCommitsAndCompactionTimeline() { - return getTimelineOfActions( - Sets.newHashSet(COMMIT_ACTION, DELTA_COMMIT_ACTION, COMPACTION_ACTION)); + return getTimelineOfActions(Sets.newHashSet(COMMIT_ACTION, DELTA_COMMIT_ACTION, COMPACTION_ACTION)); } /** - * Get all instants (commits, delta commits, clean, savepoint, rollback) that result in actions, - * in the active timeline * + * Get all instants (commits, delta commits, clean, savepoint, rollback) that result in actions, in the active + * timeline * */ public HoodieTimeline getAllCommitsTimeline() { - return getTimelineOfActions( - Sets.newHashSet(COMMIT_ACTION, DELTA_COMMIT_ACTION, CLEAN_ACTION, COMPACTION_ACTION, - SAVEPOINT_ACTION, ROLLBACK_ACTION)); + return getTimelineOfActions(Sets.newHashSet(COMMIT_ACTION, DELTA_COMMIT_ACTION, CLEAN_ACTION, COMPACTION_ACTION, + SAVEPOINT_ACTION, ROLLBACK_ACTION)); } /** @@ -157,8 +151,7 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline { } /** - * Get a timeline of a specific set of actions. useful to create a merged timeline of multiple - * actions + * Get a timeline of a specific set of actions. useful to create a merged timeline of multiple actions * * @param actions actions allowed in the timeline */ @@ -246,8 +239,7 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline { throw new HoodieIOException("Could not delete in-flight instant " + instant); } } catch (IOException e) { - throw new HoodieIOException( - "Could not remove inflight commit " + inFlightCommitFilePath, e); + throw new HoodieIOException("Could not remove inflight commit " + inFlightCommitFilePath, e); } } @@ -299,7 +291,7 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline { * Transition Compaction State from inflight to Committed * * @param inflightInstant Inflight instant - * @param data Extra Metadata + * @param data Extra Metadata * @return commit instant */ public HoodieInstant transitionCompactionInflightToComplete(HoodieInstant inflightInstant, Option data) { @@ -319,8 +311,7 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline { * END - COMPACTION RELATED META-DATA MANAGEMENT **/ - private void transitionState(HoodieInstant fromInstant, HoodieInstant toInstant, - Option data) { + private void transitionState(HoodieInstant fromInstant, HoodieInstant toInstant, Option data) { Preconditions.checkArgument(fromInstant.getTimestamp().equals(toInstant.getTimestamp())); Path commitFilePath = new Path(metaClient.getMetaPath(), toInstant.getFileName()); try { @@ -329,8 +320,7 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline { createFileInMetaPath(fromInstant.getFileName(), data); boolean success = metaClient.getFs().rename(inflightCommitFile, commitFilePath); if (!success) { - throw new HoodieIOException( - "Could not rename " + inflightCommitFile + " to " + commitFilePath); + throw new HoodieIOException("Could not rename " + inflightCommitFile + " to " + commitFilePath); } } catch (IOException e) { throw new HoodieIOException("Could not complete " + fromInstant, e); @@ -345,8 +335,7 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline { Path commitFilePath = new Path(metaClient.getMetaPath(), completed.getFileName()); boolean success = metaClient.getFs().rename(commitFilePath, inFlightCommitFilePath); if (!success) { - throw new HoodieIOException( - "Could not rename " + commitFilePath + " to " + inFlightCommitFilePath); + throw new HoodieIOException("Could not rename " + commitFilePath + " to " + inFlightCommitFilePath); } } } catch (IOException e) { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieArchivedTimeline.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieArchivedTimeline.java index 105b98271..f5790937e 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieArchivedTimeline.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieArchivedTimeline.java @@ -36,11 +36,15 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; /** - * Represents the Archived Timeline for the HoodieDataset. Instants for the last 12 hours - * (configurable) is in the ActiveTimeline and the rest are in ArchivedTimeline.

    Instants - * are read from the archive file during initialization and never refreshed. To refresh, clients - * need to call reload()

    This class can be serialized and de-serialized and on - * de-serialization the FileSystem is re-initialized. + * Represents the Archived Timeline for the HoodieDataset. Instants for the last 12 hours (configurable) is in the + * ActiveTimeline and the rest are in ArchivedTimeline. + *

    + *

    + * Instants are read from the archive file during initialization and never refreshed. To refresh, clients need to call + * reload() + *

    + *

    + * This class can be serialized and de-serialized and on de-serialization the FileSystem is re-initialized. */ public class HoodieArchivedTimeline extends HoodieDefaultTimeline { @@ -54,8 +58,7 @@ public class HoodieArchivedTimeline extends HoodieDefaultTimeline { // Read back the commits to make sure Path archiveLogPath = HoodieArchivedTimeline.getArchiveLogPath(metaClient.getArchivePath()); try (SequenceFile.Reader reader = - new SequenceFile.Reader(metaClient.getHadoopConf(), - SequenceFile.Reader.file(archiveLogPath))) { + new SequenceFile.Reader(metaClient.getHadoopConf(), SequenceFile.Reader.file(archiveLogPath))) { Text key = new Text(); Text val = new Text(); while (reader.next(key, val)) { @@ -63,17 +66,14 @@ public class HoodieArchivedTimeline extends HoodieDefaultTimeline { // This is okay because only tooling will load the archived commit timeline today readCommits.put(key.toString(), Arrays.copyOf(val.getBytes(), val.getLength())); } - this.setInstants(readCommits.keySet().stream().map( - s -> new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, s)).collect( - Collectors.toList())); + this.setInstants(readCommits.keySet().stream().map(s -> new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, s)) + .collect(Collectors.toList())); } catch (IOException e) { - throw new HoodieIOException( - "Could not load archived commit timeline from path " + archiveLogPath, e); + throw new HoodieIOException("Could not load archived commit timeline from path " + archiveLogPath, e); } // multiple casts will make this lambda serializable - // http://docs.oracle.com/javase/specs/jls/se8/html/jls-15.html#jls-15.16 - this.details = - (Function> & Serializable) this::getInstantDetails; + this.details = (Function> & Serializable) this::getInstantDetails; this.metaClient = metaClient; } @@ -82,16 +82,14 @@ public class HoodieArchivedTimeline extends HoodieDefaultTimeline { * * @deprecated */ - public HoodieArchivedTimeline() { - } + public HoodieArchivedTimeline() {} /** * This method is only used when this object is deserialized in a spark executor. * * @deprecated */ - private void readObject(java.io.ObjectInputStream in) - throws IOException, ClassNotFoundException { + private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException { in.defaultReadObject(); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieDefaultTimeline.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieDefaultTimeline.java index 79673f352..156633695 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieDefaultTimeline.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieDefaultTimeline.java @@ -37,9 +37,8 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; /** - * HoodieDefaultTimeline is a default implementation of the HoodieTimeline. It provides methods to - * inspect a List[HoodieInstant]. Function to get the details of the instant is passed in as a - * lamdba. + * HoodieDefaultTimeline is a default implementation of the HoodieTimeline. It provides methods to inspect a + * List[HoodieInstant]. Function to get the details of the instant is passed in as a lamdba. * * @see HoodieTimeline */ @@ -53,8 +52,7 @@ public class HoodieDefaultTimeline implements HoodieTimeline { private List instants; private String timelineHash; - public HoodieDefaultTimeline(Stream instants, - Function> details) { + public HoodieDefaultTimeline(Stream instants, Function> details) { this.details = details; setInstants(instants.collect(Collectors.toList())); } @@ -64,8 +62,8 @@ public class HoodieDefaultTimeline implements HoodieTimeline { final MessageDigest md; try { md = MessageDigest.getInstance(HASHING_ALGORITHM); - this.instants.stream().forEach(i -> md.update( - StringUtils.joinUsingDelim("_", i.getTimestamp(), i.getAction(), i.getState().name()).getBytes())); + this.instants.stream().forEach(i -> md + .update(StringUtils.joinUsingDelim("_", i.getTimestamp(), i.getAction(), i.getState().name()).getBytes())); } catch (NoSuchAlgorithmException nse) { throw new HoodieException(nse); } @@ -78,13 +76,11 @@ public class HoodieDefaultTimeline implements HoodieTimeline { * * @deprecated */ - public HoodieDefaultTimeline() { - } + public HoodieDefaultTimeline() {} @Override public HoodieTimeline filterInflights() { - return new HoodieDefaultTimeline(instants.stream().filter(HoodieInstant::isInflight), - details); + return new HoodieDefaultTimeline(instants.stream().filter(HoodieInstant::isInflight), details); } @Override @@ -115,24 +111,22 @@ public class HoodieDefaultTimeline implements HoodieTimeline { @Override public HoodieTimeline filterPendingCompactionTimeline() { return new HoodieDefaultTimeline( - instants.stream().filter(s -> s.getAction().equals(HoodieTimeline.COMPACTION_ACTION)), - details); + instants.stream().filter(s -> s.getAction().equals(HoodieTimeline.COMPACTION_ACTION)), details); } @Override public HoodieDefaultTimeline findInstantsInRange(String startTs, String endTs) { - return new HoodieDefaultTimeline(instants.stream().filter( - s -> HoodieTimeline.compareTimestamps(s.getTimestamp(), startTs, GREATER) - && HoodieTimeline.compareTimestamps( - s.getTimestamp(), endTs, LESSER_OR_EQUAL)), details); + return new HoodieDefaultTimeline( + instants.stream().filter(s -> HoodieTimeline.compareTimestamps(s.getTimestamp(), startTs, GREATER) + && HoodieTimeline.compareTimestamps(s.getTimestamp(), endTs, LESSER_OR_EQUAL)), + details); } @Override public HoodieDefaultTimeline findInstantsAfter(String commitTime, int numCommits) { - return new HoodieDefaultTimeline( - instants.stream() - .filter(s -> HoodieTimeline.compareTimestamps(s.getTimestamp(), commitTime, GREATER)) - .limit(numCommits), details); + return new HoodieDefaultTimeline(instants.stream() + .filter(s -> HoodieTimeline.compareTimestamps(s.getTimestamp(), commitTime, GREATER)).limit(numCommits), + details); } @Override @@ -183,8 +177,7 @@ public class HoodieDefaultTimeline implements HoodieTimeline { @Override public boolean containsOrBeforeTimelineStarts(String instant) { - return instants.stream().anyMatch(s -> s.getTimestamp().equals(instant)) - || isBeforeTimelineStarts(instant); + return instants.stream().anyMatch(s -> s.getTimestamp().equals(instant)) || isBeforeTimelineStarts(instant); } @Override @@ -218,8 +211,7 @@ public class HoodieDefaultTimeline implements HoodieTimeline { @Override public String toString() { - return this.getClass().getName() + ": " + instants.stream().map(Object::toString) - .collect(Collectors.joining(",")); + return this.getClass().getName() + ": " + instants.stream().map(Object::toString).collect(Collectors.joining(",")); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieInstant.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieInstant.java index 33fb8bd32..e3a268301 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieInstant.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieInstant.java @@ -25,8 +25,8 @@ import org.apache.hudi.common.table.HoodieTimeline; import org.apache.hudi.common.util.FSUtils; /** - * A Hoodie Instant represents a action done on a hoodie dataset. All actions start with a inflight - * instant and then create a completed instant after done. + * A Hoodie Instant represents a action done on a hoodie dataset. All actions start with a inflight instant and then + * create a completed instant after done. * * @see HoodieTimeline */ @@ -76,7 +76,7 @@ public class HoodieInstant implements Serializable { } public HoodieInstant(boolean isInflight, String action, String timestamp) { - //TODO: vb - Preserving for avoiding cascading changes. This constructor will be updated in subsequent PR + // TODO: vb - Preserving for avoiding cascading changes. This constructor will be updated in subsequent PR this.state = isInflight ? State.INFLIGHT : State.COMPLETED; this.action = action; this.timestamp = timestamp; @@ -151,9 +151,7 @@ public class HoodieInstant implements Serializable { return false; } HoodieInstant that = (HoodieInstant) o; - return state == that.state - && Objects.equals(action, that.action) - && Objects.equals(timestamp, that.timestamp); + return state == that.state && Objects.equals(action, that.action) && Objects.equals(timestamp, that.timestamp); } public State getState() { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/dto/CompactionOpDTO.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/dto/CompactionOpDTO.java index 386c303b6..0466505bc 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/dto/CompactionOpDTO.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/dto/CompactionOpDTO.java @@ -55,8 +55,7 @@ public class CompactionOpDTO { @JsonProperty("metrics") private Map metrics; - public static CompactionOpDTO fromCompactionOperation(String compactionInstantTime, - CompactionOperation op) { + public static CompactionOpDTO fromCompactionOperation(String compactionInstantTime, CompactionOperation op) { CompactionOpDTO dto = new CompactionOpDTO(); dto.fileId = op.getFileId(); dto.compactionInstantTime = compactionInstantTime; @@ -70,8 +69,9 @@ public class CompactionOpDTO { } public static Pair toCompactionOperation(CompactionOpDTO dto) { - return Pair.of(dto.compactionInstantTime, new CompactionOperation(dto.fileId, dto.partitionPath, - dto.baseInstantTime, Option.ofNullable(dto.dataFileCommitTime), dto.deltaFilePaths, - Option.ofNullable(dto.dataFilePath), dto.metrics)); + return Pair.of(dto.compactionInstantTime, + new CompactionOperation(dto.fileId, dto.partitionPath, dto.baseInstantTime, + Option.ofNullable(dto.dataFileCommitTime), dto.deltaFilePaths, Option.ofNullable(dto.dataFilePath), + dto.metrics)); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/dto/FileGroupDTO.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/dto/FileGroupDTO.java index 6699ed552..9b26352d1 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/dto/FileGroupDTO.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/dto/FileGroupDTO.java @@ -50,8 +50,8 @@ public class FileGroupDTO { } public static HoodieFileGroup toFileGroup(FileGroupDTO dto, HoodieTableMetaClient metaClient) { - HoodieFileGroup fileGroup = new HoodieFileGroup(dto.partition, dto.id, - TimelineDTO.toTimeline(dto.timeline, metaClient)); + HoodieFileGroup fileGroup = + new HoodieFileGroup(dto.partition, dto.id, TimelineDTO.toTimeline(dto.timeline, metaClient)); dto.slices.stream().map(FileSliceDTO::toFileSlice).forEach(fileSlice -> fileGroup.addFileSlice(fileSlice)); return fileGroup; } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/dto/TimelineDTO.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/dto/TimelineDTO.java index f31a57092..e68ff0b8b 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/dto/TimelineDTO.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/dto/TimelineDTO.java @@ -39,7 +39,7 @@ public class TimelineDTO { } public static HoodieTimeline toTimeline(TimelineDTO dto, HoodieTableMetaClient metaClient) { - //TODO: For Now, we will assume, only active-timeline will be transferred. + // TODO: For Now, we will assume, only active-timeline will be transferred. return new HoodieDefaultTimeline(dto.instants.stream().map(InstantDTO::toInstant), metaClient.getActiveTimeline()::getInstantDetails); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java index 75de33048..74ddb9e37 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java @@ -56,13 +56,11 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; /** - * Common thread-safe implementation for multiple TableFileSystemView Implementations. - * Provides uniform handling of - * (a) Loading file-system views from underlying file-system - * (b) Pending compaction operations and changing file-system views based on that - * (c) Thread-safety in loading and managing file system views for this dataset. - * (d) resetting file-system views - * The actual mechanism of fetching file slices from different view storages is delegated to sub-classes. + * Common thread-safe implementation for multiple TableFileSystemView Implementations. Provides uniform handling of (a) + * Loading file-system views from underlying file-system (b) Pending compaction operations and changing file-system + * views based on that (c) Thread-safety in loading and managing file system views for this dataset. (d) resetting + * file-system views The actual mechanism of fetching file slices from different view storages is delegated to + * sub-classes. */ public abstract class AbstractTableFileSystemView implements SyncableFileSystemView, Serializable { @@ -94,15 +92,14 @@ public abstract class AbstractTableFileSystemView implements SyncableFileSystemV refreshTimeline(visibleActiveTimeline); // Load Pending Compaction Operations - resetPendingCompactionOperations( - CompactionUtils.getAllPendingCompactionOperations(metaClient).values() - .stream().map(e -> Pair.of(e.getKey(), - CompactionOperation.convertFromAvroRecordInstance(e.getValue())))); + resetPendingCompactionOperations(CompactionUtils.getAllPendingCompactionOperations(metaClient).values().stream() + .map(e -> Pair.of(e.getKey(), CompactionOperation.convertFromAvroRecordInstance(e.getValue())))); } /** * Refresh commits timeline - * @param visibleActiveTimeline Visible Active Timeline + * + * @param visibleActiveTimeline Visible Active Timeline */ protected void refreshTimeline(HoodieTimeline visibleActiveTimeline) { this.visibleCommitsAndCompactionTimeline = visibleActiveTimeline.getCommitsAndCompactionTimeline(); @@ -117,13 +114,12 @@ public abstract class AbstractTableFileSystemView implements SyncableFileSystemV long fgBuildTimeTakenMs = timer.endTimer(); timer.startTimer(); // Group by partition for efficient updates for both InMemory and DiskBased stuctures. - fileGroups.stream().collect(Collectors.groupingBy(HoodieFileGroup::getPartitionPath)).entrySet() - .forEach(entry -> { - String partition = entry.getKey(); - if (!isPartitionAvailableInStore(partition)) { - storePartitionView(partition, entry.getValue()); - } - }); + fileGroups.stream().collect(Collectors.groupingBy(HoodieFileGroup::getPartitionPath)).entrySet().forEach(entry -> { + String partition = entry.getKey(); + if (!isPartitionAvailableInStore(partition)) { + storePartitionView(partition, entry.getValue()); + } + }); long storePartitionsTs = timer.endTimer(); log.info("addFilesToView: NumFiles=" + statuses.length + ", FileGroupsCreationTime=" + fgBuildTimeTakenMs + ", StoreTimeTaken=" + storePartitionsTs); @@ -141,19 +137,17 @@ public abstract class AbstractTableFileSystemView implements SyncableFileSystemV protected List buildFileGroups(Stream dataFileStream, Stream logFileStream, HoodieTimeline timeline, boolean addPendingCompactionFileSlice) { - Map, List> dataFiles = dataFileStream - .collect(Collectors.groupingBy((dataFile) -> { + Map, List> dataFiles = + dataFileStream.collect(Collectors.groupingBy((dataFile) -> { String partitionPathStr = getPartitionPathFromFilePath(dataFile.getPath()); return Pair.of(partitionPathStr, dataFile.getFileId()); })); - Map, List> logFiles = logFileStream - .collect(Collectors.groupingBy((logFile) -> { - String partitionPathStr = FSUtils.getRelativePartitionPath( - new Path(metaClient.getBasePath()), - logFile.getPath().getParent()); - return Pair.of(partitionPathStr, logFile.getFileId()); - })); + Map, List> logFiles = logFileStream.collect(Collectors.groupingBy((logFile) -> { + String partitionPathStr = + FSUtils.getRelativePartitionPath(new Path(metaClient.getBasePath()), logFile.getPath().getParent()); + return Pair.of(partitionPathStr, logFile.getFileId()); + })); Set> fileIdSet = new HashSet<>(dataFiles.keySet()); fileIdSet.addAll(logFiles.keySet()); @@ -228,8 +222,8 @@ public abstract class AbstractTableFileSystemView implements SyncableFileSystemV long beginLsTs = System.currentTimeMillis(); FileStatus[] statuses = metaClient.getFs().listStatus(partitionPath); long endLsTs = System.currentTimeMillis(); - log.info("#files found in partition (" + partitionPathStr + ") =" + statuses.length - + ", Time taken =" + (endLsTs - beginLsTs)); + log.info("#files found in partition (" + partitionPathStr + ") =" + statuses.length + ", Time taken =" + + (endLsTs - beginLsTs)); List groups = addFilesToView(statuses); if (groups.isEmpty()) { @@ -253,9 +247,8 @@ public abstract class AbstractTableFileSystemView implements SyncableFileSystemV * @param statuses List of File-Status */ private Stream convertFileStatusesToDataFiles(FileStatus[] statuses) { - Predicate roFilePredicate = fileStatus -> - fileStatus.getPath().getName() - .contains(metaClient.getTableConfig().getROFileFormat().getFileExtension()); + Predicate roFilePredicate = fileStatus -> fileStatus.getPath().getName() + .contains(metaClient.getTableConfig().getROFileFormat().getFileExtension()); return Arrays.stream(statuses).filter(roFilePredicate).map(HoodieDataFile::new); } @@ -265,9 +258,8 @@ public abstract class AbstractTableFileSystemView implements SyncableFileSystemV * @param statuses List of FIle-Status */ private Stream convertFileStatusesToLogFiles(FileStatus[] statuses) { - Predicate rtFilePredicate = fileStatus -> - fileStatus.getPath().getName() - .contains(metaClient.getTableConfig().getRTFileFormat().getFileExtension()); + Predicate rtFilePredicate = fileStatus -> fileStatus.getPath().getName() + .contains(metaClient.getTableConfig().getRTFileFormat().getFileExtension()); return Arrays.stream(statuses).filter(rtFilePredicate).map(HoodieLogFile::new); } @@ -311,8 +303,8 @@ public abstract class AbstractTableFileSystemView implements SyncableFileSystemV log.info("File Slice (" + fileSlice + ") is in pending compaction"); // Data file is filtered out of the file-slice as the corresponding compaction // instant not completed yet. - FileSlice transformed = new FileSlice(fileSlice.getPartitionPath(), - fileSlice.getBaseInstantTime(), fileSlice.getFileId()); + FileSlice transformed = + new FileSlice(fileSlice.getPartitionPath(), fileSlice.getBaseInstantTime(), fileSlice.getFileId()); fileSlice.getLogFiles().forEach(transformed::addLogFile); return transformed; } @@ -359,14 +351,10 @@ public abstract class AbstractTableFileSystemView implements SyncableFileSystemV ensurePartitionLoadedCorrectly(partitionPath); return fetchAllStoredFileGroups(partitionPath) .map(fileGroup -> Option.fromJavaOptional(fileGroup.getAllDataFiles() - .filter(dataFile -> - HoodieTimeline.compareTimestamps(dataFile.getCommitTime(), - maxCommitTime, - HoodieTimeline.LESSER_OR_EQUAL)) - .filter(df -> !isDataFileDueToPendingCompaction(df)) - .findFirst())) - .filter(Option::isPresent) - .map(Option::get); + .filter(dataFile -> HoodieTimeline.compareTimestamps(dataFile.getCommitTime(), maxCommitTime, + HoodieTimeline.LESSER_OR_EQUAL)) + .filter(df -> !isDataFileDueToPendingCompaction(df)).findFirst())) + .filter(Option::isPresent).map(Option::get); } finally { readLock.unlock(); } @@ -378,13 +366,10 @@ public abstract class AbstractTableFileSystemView implements SyncableFileSystemV readLock.lock(); String partitionPath = formatPartitionKey(partitionStr); ensurePartitionLoadedCorrectly(partitionPath); - return fetchHoodieFileGroup(partitionPath, fileId) - .map(fileGroup -> fileGroup.getAllDataFiles() - .filter(dataFile -> - HoodieTimeline.compareTimestamps(dataFile.getCommitTime(), - instantTime, HoodieTimeline.EQUAL)) - .filter(df -> !isDataFileDueToPendingCompaction(df)) - .findFirst().orElse(null)); + return fetchHoodieFileGroup(partitionPath, fileId).map(fileGroup -> fileGroup.getAllDataFiles() + .filter( + dataFile -> HoodieTimeline.compareTimestamps(dataFile.getCommitTime(), instantTime, HoodieTimeline.EQUAL)) + .filter(df -> !isDataFileDueToPendingCompaction(df)).findFirst().orElse(null)); } finally { readLock.unlock(); } @@ -409,10 +394,9 @@ public abstract class AbstractTableFileSystemView implements SyncableFileSystemV try { readLock.lock(); return fetchAllStoredFileGroups().map(fileGroup -> { - return Option.fromJavaOptional(fileGroup.getAllDataFiles() - .filter(dataFile -> commitsToReturn.contains(dataFile.getCommitTime()) - && !isDataFileDueToPendingCompaction(dataFile)) - .findFirst()); + return Option.fromJavaOptional( + fileGroup.getAllDataFiles().filter(dataFile -> commitsToReturn.contains(dataFile.getCommitTime()) + && !isDataFileDueToPendingCompaction(dataFile)).findFirst()); }).filter(Option::isPresent).map(Option::get); } finally { readLock.unlock(); @@ -466,19 +450,17 @@ public abstract class AbstractTableFileSystemView implements SyncableFileSystemV readLock.lock(); String partitionPath = formatPartitionKey(partitionStr); ensurePartitionLoadedCorrectly(partitionPath); - return fetchAllStoredFileGroups(partitionPath) - .map(fileGroup -> { - FileSlice fileSlice = fileGroup.getLatestFileSlice().get(); - // if the file-group is under compaction, pick the latest before compaction instant time. - Option> compactionWithInstantPair = - getPendingCompactionOperationWithInstant(fileSlice.getFileGroupId()); - if (compactionWithInstantPair.isPresent()) { - String compactionInstantTime = compactionWithInstantPair.get().getLeft(); - return fileGroup.getLatestFileSliceBefore(compactionInstantTime); - } - return Option.of(fileSlice); - }) - .map(Option::get); + return fetchAllStoredFileGroups(partitionPath).map(fileGroup -> { + FileSlice fileSlice = fileGroup.getLatestFileSlice().get(); + // if the file-group is under compaction, pick the latest before compaction instant time. + Option> compactionWithInstantPair = + getPendingCompactionOperationWithInstant(fileSlice.getFileGroupId()); + if (compactionWithInstantPair.isPresent()) { + String compactionInstantTime = compactionWithInstantPair.get().getLeft(); + return fileGroup.getLatestFileSliceBefore(compactionInstantTime); + } + return Option.of(fileSlice); + }).map(Option::get); } finally { readLock.unlock(); } @@ -491,8 +473,7 @@ public abstract class AbstractTableFileSystemView implements SyncableFileSystemV readLock.lock(); String partitionPath = formatPartitionKey(partitionStr); ensurePartitionLoadedCorrectly(partitionPath); - Stream fileSliceStream = - fetchLatestFileSlicesBeforeOrOn(partitionPath, maxCommitTime); + Stream fileSliceStream = fetchLatestFileSlicesBeforeOrOn(partitionPath, maxCommitTime); if (includeFileSlicesInPendingCompaction) { return fileSliceStream.map(fs -> filterDataFileAfterPendingCompaction(fs)); } else { @@ -509,17 +490,14 @@ public abstract class AbstractTableFileSystemView implements SyncableFileSystemV readLock.lock(); String partition = formatPartitionKey(partitionStr); ensurePartitionLoadedCorrectly(partition); - return fetchAllStoredFileGroups(partition) - .map(fileGroup -> { - Option fileSlice = fileGroup.getLatestFileSliceBeforeOrOn(maxInstantTime); - // if the file-group is under construction, pick the latest before compaction instant time. - if (fileSlice.isPresent()) { - fileSlice = Option.of(fetchMergedFileSlice(fileGroup, fileSlice.get())); - } - return fileSlice; - }) - .filter(Option::isPresent) - .map(Option::get); + return fetchAllStoredFileGroups(partition).map(fileGroup -> { + Option fileSlice = fileGroup.getLatestFileSliceBeforeOrOn(maxInstantTime); + // if the file-group is under construction, pick the latest before compaction instant time. + if (fileSlice.isPresent()) { + fileSlice = Option.of(fetchMergedFileSlice(fileGroup, fileSlice.get())); + } + return fileSlice; + }).filter(Option::isPresent).map(Option::get); } finally { readLock.unlock(); } @@ -665,8 +643,7 @@ public abstract class AbstractTableFileSystemView implements SyncableFileSystemV * @return file-slice stream */ Stream fetchAllFileSlices(String partitionPath) { - return fetchAllStoredFileGroups(partitionPath) - .map(HoodieFileGroup::getAllFileSlices) + return fetchAllStoredFileGroups(partitionPath).map(HoodieFileGroup::getAllFileSlices) .flatMap(sliceList -> sliceList); } @@ -674,26 +651,21 @@ public abstract class AbstractTableFileSystemView implements SyncableFileSystemV * Default implementation for fetching latest data-files for the partition-path */ Stream fetchLatestDataFiles(final String partitionPath) { - return fetchAllStoredFileGroups(partitionPath) - .map(this::getLatestDataFile) - .filter(Option::isPresent) + return fetchAllStoredFileGroups(partitionPath).map(this::getLatestDataFile).filter(Option::isPresent) .map(Option::get); } protected Option getLatestDataFile(HoodieFileGroup fileGroup) { - return Option.fromJavaOptional( - fileGroup.getAllDataFiles().filter(df -> !isDataFileDueToPendingCompaction(df)).findFirst()); + return Option + .fromJavaOptional(fileGroup.getAllDataFiles().filter(df -> !isDataFileDueToPendingCompaction(df)).findFirst()); } /** * Default implementation for fetching latest data-files across all partitions */ Stream fetchLatestDataFiles() { - return fetchAllStoredFileGroups() - .map(this::getLatestDataFile) - .filter(Option::isPresent) - .map(Option::get); + return fetchAllStoredFileGroups().map(this::getLatestDataFile).filter(Option::isPresent).map(Option::get); } /** @@ -702,8 +674,7 @@ public abstract class AbstractTableFileSystemView implements SyncableFileSystemV * @param partitionPath partition-path */ Stream fetchAllDataFiles(String partitionPath) { - return fetchAllStoredFileGroups(partitionPath) - .map(HoodieFileGroup::getAllDataFiles) + return fetchAllStoredFileGroups(partitionPath).map(HoodieFileGroup::getAllDataFiles) .flatMap(dataFileList -> dataFileList); } @@ -719,9 +690,7 @@ public abstract class AbstractTableFileSystemView implements SyncableFileSystemV * Default implementation for fetching latest file-slices for a partition path */ Stream fetchLatestFileSlices(String partitionPath) { - return fetchAllStoredFileGroups(partitionPath) - .map(HoodieFileGroup::getLatestFileSlice) - .filter(Option::isPresent) + return fetchAllStoredFileGroups(partitionPath).map(HoodieFileGroup::getLatestFileSlice).filter(Option::isPresent) .map(Option::get); } @@ -731,11 +700,9 @@ public abstract class AbstractTableFileSystemView implements SyncableFileSystemV * @param partitionPath Partition Path * @param maxCommitTime Instant Time */ - Stream fetchLatestFileSlicesBeforeOrOn(String partitionPath, - String maxCommitTime) { + Stream fetchLatestFileSlicesBeforeOrOn(String partitionPath, String maxCommitTime) { return fetchAllStoredFileGroups(partitionPath) - .map(fileGroup -> fileGroup.getLatestFileSliceBeforeOrOn(maxCommitTime)) - .filter(Option::isPresent) + .map(fileGroup -> fileGroup.getLatestFileSliceBeforeOrOn(maxCommitTime)).filter(Option::isPresent) .map(Option::get); } @@ -746,8 +713,8 @@ public abstract class AbstractTableFileSystemView implements SyncableFileSystemV * @param penultimateSlice Penultimate file slice for a file-group in commit timeline order */ private static FileSlice mergeCompactionPendingFileSlices(FileSlice lastSlice, FileSlice penultimateSlice) { - FileSlice merged = new FileSlice(penultimateSlice.getPartitionPath(), - penultimateSlice.getBaseInstantTime(), penultimateSlice.getFileId()); + FileSlice merged = new FileSlice(penultimateSlice.getPartitionPath(), penultimateSlice.getBaseInstantTime(), + penultimateSlice.getFileId()); if (penultimateSlice.getDataFile().isPresent()) { merged.setDataFile(penultimateSlice.getDataFile().get()); } @@ -782,24 +749,26 @@ public abstract class AbstractTableFileSystemView implements SyncableFileSystemV /** * Default implementation for fetching latest data-file + * * @param partitionPath Partition path * @param fileId File Id * @return Data File if present */ protected Option fetchLatestDataFile(String partitionPath, String fileId) { - return Option.fromJavaOptional(fetchLatestDataFiles(partitionPath) - .filter(fs -> fs.getFileId().equals(fileId)).findFirst()); + return Option + .fromJavaOptional(fetchLatestDataFiles(partitionPath).filter(fs -> fs.getFileId().equals(fileId)).findFirst()); } /** * Default implementation for fetching file-slice + * * @param partitionPath Partition path * @param fileId File Id * @return File Slice if present */ protected Option fetchLatestFileSlice(String partitionPath, String fileId) { - return Option.fromJavaOptional(fetchLatestFileSlices(partitionPath) - .filter(fs -> fs.getFileId().equals(fileId)).findFirst()); + return Option + .fromJavaOptional(fetchLatestFileSlices(partitionPath).filter(fs -> fs.getFileId().equals(fileId)).findFirst()); } @Override @@ -841,6 +810,7 @@ public abstract class AbstractTableFileSystemView implements SyncableFileSystemV /** * Return Only Commits and Compaction timeline for building file-groups + * * @return */ public HoodieTimeline getVisibleCommitsAndCompactionTimeline() { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/FileSystemViewManager.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/FileSystemViewManager.java index 38e3967c1..c9e556a84 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/view/FileSystemViewManager.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/view/FileSystemViewManager.java @@ -28,25 +28,21 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; /** - * A container that can potentially hold one or more dataset's - * file-system views. There is one view for each dataset. This is a view built against a timeline containing completed - * actions. In an embedded timeline-server mode, this typically holds only one dataset's view. - * In a stand-alone server mode, this can hold more than one dataset's views. + * A container that can potentially hold one or more dataset's file-system views. There is one view for each dataset. + * This is a view built against a timeline containing completed actions. In an embedded timeline-server mode, this + * typically holds only one dataset's view. In a stand-alone server mode, this can hold more than one dataset's views. * - * FileSystemView can be stored "locally" using the following storage mechanisms: - * a. In Memory - * b. Spillable Map - * c. RocksDB + * FileSystemView can be stored "locally" using the following storage mechanisms: a. In Memory b. Spillable Map c. + * RocksDB * * But there can be cases where the file-system view is managed remoted. For example : Embedded Timeline Server). In * this case, the clients will configure a remote filesystem view client (RemoteHoodieTableFileSystemView) for the * dataset which can connect to the remote file system view and fetch views. THere are 2 modes here : REMOTE_FIRST and - * REMOTE_ONLY - * REMOTE_FIRST : The file-system view implementation on client side will act as a remote proxy. In case, if there - * is problem (or exceptions) querying remote file-system view, a backup local file-system view(using - * either one of in-memory, spillable, rocksDB) is used to server file-system view queries - * REMOTE_ONLY : In this case, there is no backup local file-system view. If there is problem (or exceptions) - * querying remote file-system view, then the exceptions are percolated back to client. + * REMOTE_ONLY REMOTE_FIRST : The file-system view implementation on client side will act as a remote proxy. In case, if + * there is problem (or exceptions) querying remote file-system view, a backup local file-system view(using either one + * of in-memory, spillable, rocksDB) is used to server file-system view queries REMOTE_ONLY : In this case, there is no + * backup local file-system view. If there is problem (or exceptions) querying remote file-system view, then the + * exceptions are percolated back to client. * * FileSystemViewManager is designed to encapsulate the file-system view storage from clients using the file-system * view. FileSystemViewManager uses a factory to construct specific implementation of file-system view and passes it to @@ -73,6 +69,7 @@ public class FileSystemViewManager { /** * Drops reference to File-System Views. Future calls to view results in creating a new view + * * @param basePath */ public void clearFileSystemView(String basePath) { @@ -84,12 +81,12 @@ public class FileSystemViewManager { /** * Main API to get the file-system view for the base-path + * * @param basePath * @return */ public SyncableFileSystemView getFileSystemView(String basePath) { - return globalViewMap.computeIfAbsent(basePath, - (path) -> viewCreator.apply(path, viewStorageConfig)); + return globalViewMap.computeIfAbsent(basePath, (path) -> viewCreator.apply(path, viewStorageConfig)); } /** @@ -104,9 +101,10 @@ public class FileSystemViewManager { /** * Create RocksDB based file System view for a dataset + * * @param conf Hadoop Configuration - * @param viewConf View Storage Configuration - * @param basePath Base Path of dataset + * @param viewConf View Storage Configuration + * @param basePath Base Path of dataset * @return */ private static RocksDbBasedFileSystemView createRocksDBBasedFileSystemView(SerializableConfiguration conf, @@ -118,9 +116,10 @@ public class FileSystemViewManager { /** * Create a spillable Map based file System view for a dataset + * * @param conf Hadoop Configuration - * @param viewConf View Storage Configuration - * @param basePath Base Path of dataset + * @param viewConf View Storage Configuration + * @param basePath Base Path of dataset * @return */ private static SpillableMapBasedFileSystemView createSpillableMapBasedFileSystemView(SerializableConfiguration conf, @@ -134,9 +133,10 @@ public class FileSystemViewManager { /** * Create an in-memory file System view for a dataset + * * @param conf Hadoop Configuration - * @param viewConf View Storage Configuration - * @param basePath Base Path of dataset + * @param viewConf View Storage Configuration + * @param basePath Base Path of dataset * @return */ private static HoodieTableFileSystemView createInMemoryFileSystemView(SerializableConfiguration conf, @@ -149,27 +149,29 @@ public class FileSystemViewManager { /** * Create a remote file System view for a dataset + * * @param conf Hadoop Configuration - * @param viewConf View Storage Configuration - * @param metaClient Hoodie Table MetaClient for the dataset. + * @param viewConf View Storage Configuration + * @param metaClient Hoodie Table MetaClient for the dataset. * @return */ private static RemoteHoodieTableFileSystemView createRemoteFileSystemView(SerializableConfiguration conf, FileSystemViewStorageConfig viewConf, HoodieTableMetaClient metaClient) { logger.info("Creating remote view for basePath " + metaClient.getBasePath() + ". Server=" + viewConf.getRemoteViewServerHost() + ":" + viewConf.getRemoteViewServerPort()); - return new RemoteHoodieTableFileSystemView(viewConf.getRemoteViewServerHost(), - viewConf.getRemoteViewServerPort(), metaClient); + return new RemoteHoodieTableFileSystemView(viewConf.getRemoteViewServerHost(), viewConf.getRemoteViewServerPort(), + metaClient); } /** * Main Factory method for building file-system views - * @param conf Hadoop Configuration + * + * @param conf Hadoop Configuration * @param config View Storage Configuration * @return */ - public static FileSystemViewManager createViewManager( - final SerializableConfiguration conf, final FileSystemViewStorageConfig config) { + public static FileSystemViewManager createViewManager(final SerializableConfiguration conf, + final FileSystemViewStorageConfig config) { logger.info("Creating View Manager with storage type :" + config.getStorageType()); switch (config.getStorageType()) { case EMBEDDED_KV_STORE: @@ -186,9 +188,8 @@ public class FileSystemViewManager { (basePath, viewConfig) -> createInMemoryFileSystemView(conf, viewConfig, basePath)); case REMOTE_ONLY: logger.info("Creating remote only table view"); - return new FileSystemViewManager(conf, config, - (basePath, viewConfig) -> createRemoteFileSystemView(conf, viewConfig, - new HoodieTableMetaClient(conf.newCopy(), basePath))); + return new FileSystemViewManager(conf, config, (basePath, viewConfig) -> createRemoteFileSystemView(conf, + viewConfig, new HoodieTableMetaClient(conf.newCopy(), basePath))); case REMOTE_FIRST: logger.info("Creating remote first table view"); return new FileSystemViewManager(conf, config, (basePath, viewConfig) -> { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/FileSystemViewStorageConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/FileSystemViewStorageConfig.java index 3924b8b26..ee556f036 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/view/FileSystemViewStorageConfig.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/view/FileSystemViewStorageConfig.java @@ -30,7 +30,7 @@ import org.apache.hudi.config.DefaultHoodieConfig; */ public class FileSystemViewStorageConfig extends DefaultHoodieConfig { - //Property Names + // Property Names public static final String FILESYSTEM_VIEW_STORAGE_TYPE = "hoodie.filesystem.view.type"; public static final String FILESYSTEM_VIEW_INCREMENTAL_SYNC_MODE = "hoodie.filesystem.view.incr.timeline.sync.enable"; public static final String FILESYSTEM_SECONDARY_VIEW_STORAGE_TYPE = "hoodie.filesystem.view.secondary.type"; @@ -85,8 +85,9 @@ public class FileSystemViewStorageConfig extends DefaultHoodieConfig { public long getMaxMemoryForPendingCompaction() { long totalMemory = Long.parseLong(props.getProperty(FILESYSTEM_VIEW_SPILLABLE_MEM)); - long reservedForPendingComaction = new Double(totalMemory * Double.parseDouble( - props.getProperty(FILESYSTEM_VIEW_PENDING_COMPACTION_MEM_FRACTION))).longValue(); + long reservedForPendingComaction = + new Double(totalMemory * Double.parseDouble(props.getProperty(FILESYSTEM_VIEW_PENDING_COMPACTION_MEM_FRACTION))) + .longValue(); return reservedForPendingComaction; } @@ -167,26 +168,26 @@ public class FileSystemViewStorageConfig extends DefaultHoodieConfig { } public FileSystemViewStorageConfig build() { - setDefaultOnCondition(props, !props.containsKey(FILESYSTEM_VIEW_STORAGE_TYPE), - FILESYSTEM_VIEW_STORAGE_TYPE, DEFAULT_VIEW_STORAGE_TYPE.name()); + setDefaultOnCondition(props, !props.containsKey(FILESYSTEM_VIEW_STORAGE_TYPE), FILESYSTEM_VIEW_STORAGE_TYPE, + DEFAULT_VIEW_STORAGE_TYPE.name()); setDefaultOnCondition(props, !props.containsKey(FILESYSTEM_VIEW_INCREMENTAL_SYNC_MODE), FILESYSTEM_VIEW_INCREMENTAL_SYNC_MODE, DEFAULT_FILESYSTEM_VIEW_INCREMENTAL_SYNC_MODE); setDefaultOnCondition(props, !props.containsKey(FILESYSTEM_SECONDARY_VIEW_STORAGE_TYPE), FILESYSTEM_SECONDARY_VIEW_STORAGE_TYPE, DEFAULT_SECONDARY_VIEW_STORAGE_TYPE.name()); - setDefaultOnCondition(props, !props.containsKey(FILESYSTEM_VIEW_REMOTE_HOST), - FILESYSTEM_VIEW_REMOTE_HOST, DEFUALT_REMOTE_VIEW_SERVER_HOST); - setDefaultOnCondition(props, !props.containsKey(FILESYSTEM_VIEW_REMOTE_PORT), - FILESYSTEM_VIEW_REMOTE_PORT, DEFAULT_REMOTE_VIEW_SERVER_PORT.toString()); + setDefaultOnCondition(props, !props.containsKey(FILESYSTEM_VIEW_REMOTE_HOST), FILESYSTEM_VIEW_REMOTE_HOST, + DEFUALT_REMOTE_VIEW_SERVER_HOST); + setDefaultOnCondition(props, !props.containsKey(FILESYSTEM_VIEW_REMOTE_PORT), FILESYSTEM_VIEW_REMOTE_PORT, + DEFAULT_REMOTE_VIEW_SERVER_PORT.toString()); - setDefaultOnCondition(props, !props.containsKey(FILESYSTEM_VIEW_SPILLABLE_DIR), - FILESYSTEM_VIEW_SPILLABLE_DIR, DEFAULT_VIEW_SPILLABLE_DIR); - setDefaultOnCondition(props, !props.containsKey(FILESYSTEM_VIEW_SPILLABLE_MEM), - FILESYSTEM_VIEW_SPILLABLE_MEM, DEFAULT_MAX_MEMORY_FOR_VIEW.toString()); + setDefaultOnCondition(props, !props.containsKey(FILESYSTEM_VIEW_SPILLABLE_DIR), FILESYSTEM_VIEW_SPILLABLE_DIR, + DEFAULT_VIEW_SPILLABLE_DIR); + setDefaultOnCondition(props, !props.containsKey(FILESYSTEM_VIEW_SPILLABLE_MEM), FILESYSTEM_VIEW_SPILLABLE_MEM, + DEFAULT_MAX_MEMORY_FOR_VIEW.toString()); setDefaultOnCondition(props, !props.containsKey(FILESYSTEM_VIEW_PENDING_COMPACTION_MEM_FRACTION), FILESYSTEM_VIEW_PENDING_COMPACTION_MEM_FRACTION, DEFAULT_MEM_FRACTION_FOR_PENDING_COMPACTION.toString()); - setDefaultOnCondition(props, !props.containsKey(ROCKSDB_BASE_PATH_PROP), - ROCKSDB_BASE_PATH_PROP, DEFAULT_ROCKSDB_BASE_PATH); + setDefaultOnCondition(props, !props.containsKey(ROCKSDB_BASE_PATH_PROP), ROCKSDB_BASE_PATH_PROP, + DEFAULT_ROCKSDB_BASE_PATH); // Validations FileSystemViewStorageType.valueOf(props.getProperty(FILESYSTEM_VIEW_STORAGE_TYPE)); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/HoodieTableFileSystemView.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/HoodieTableFileSystemView.java index 5b9601e6e..c82ac2713 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/view/HoodieTableFileSystemView.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/view/HoodieTableFileSystemView.java @@ -40,6 +40,7 @@ import org.apache.log4j.Logger; /** * TableFileSystemView Implementations based on in-memory storage. + * * @see TableFileSystemView * @since 0.3.0 */ @@ -115,13 +116,11 @@ public class HoodieTableFileSystemView extends IncrementalTimelineSyncFileSystem * * @deprecated */ - private void readObject(java.io.ObjectInputStream in) - throws IOException, ClassNotFoundException { + private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException { in.defaultReadObject(); } - private void writeObject(java.io.ObjectOutputStream out) - throws IOException { + private void writeObject(java.io.ObjectOutputStream out) throws IOException { out.defaultWriteObject(); } @@ -133,10 +132,9 @@ public class HoodieTableFileSystemView extends IncrementalTimelineSyncFileSystem @Override protected void resetPendingCompactionOperations(Stream> operations) { // Build fileId to Pending Compaction Instants - this.fgIdToPendingCompaction = createFileIdToPendingCompactionMap( - operations.map(entry -> { - return Pair.of(entry.getValue().getFileGroupId(), Pair.of(entry.getKey(),entry.getValue())); - }).collect(Collectors.toMap(Pair::getKey, Pair::getValue))); + this.fgIdToPendingCompaction = createFileIdToPendingCompactionMap(operations.map(entry -> { + return Pair.of(entry.getValue().getFileGroupId(), Pair.of(entry.getKey(), entry.getValue())); + }).collect(Collectors.toMap(Pair::getKey, Pair::getValue))); } @Override @@ -161,8 +159,8 @@ public class HoodieTableFileSystemView extends IncrementalTimelineSyncFileSystem } /** - * Given a partition path, obtain all filegroups within that. All methods, that work at the - * partition level go through this. + * Given a partition path, obtain all filegroups within that. All methods, that work at the partition level go through + * this. */ @Override Stream fetchAllStoredFileGroups(String partition) { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/IncrementalTimelineSyncFileSystemView.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/IncrementalTimelineSyncFileSystemView.java index 22cc53bbd..f31ac4de8 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/view/IncrementalTimelineSyncFileSystemView.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/view/IncrementalTimelineSyncFileSystemView.java @@ -157,18 +157,19 @@ public abstract class IncrementalTimelineSyncFileSystemView extends AbstractTabl log.info("Syncing pending compaction instant (" + instant + ")"); HoodieCompactionPlan compactionPlan = CompactionUtils.getCompactionPlan(metaClient, instant.getTimestamp()); List> pendingOps = - CompactionUtils.getPendingCompactionOperations(instant, compactionPlan).map(p -> Pair.of(p.getValue().getKey(), - CompactionOperation.convertFromAvroRecordInstance(p.getValue().getValue()))).collect(Collectors.toList()); + CompactionUtils.getPendingCompactionOperations(instant, compactionPlan) + .map(p -> Pair.of(p.getValue().getKey(), + CompactionOperation.convertFromAvroRecordInstance(p.getValue().getValue()))) + .collect(Collectors.toList()); // First, update Pending compaction instants addPendingCompactionOperations(pendingOps.stream()); - Map>> partitionToFileGroups = - pendingOps.stream().map(opPair -> { - String compactionInstantTime = opPair.getKey(); - HoodieFileGroup fileGroup = new HoodieFileGroup(opPair.getValue().getFileGroupId(), timeline); - fileGroup.addNewFileSliceAtInstant(compactionInstantTime); - return Pair.of(compactionInstantTime, fileGroup); - }).collect(Collectors.groupingBy(x -> x.getValue().getPartitionPath())); + Map>> partitionToFileGroups = pendingOps.stream().map(opPair -> { + String compactionInstantTime = opPair.getKey(); + HoodieFileGroup fileGroup = new HoodieFileGroup(opPair.getValue().getFileGroupId(), timeline); + fileGroup.addNewFileSliceAtInstant(compactionInstantTime); + return Pair.of(compactionInstantTime, fileGroup); + }).collect(Collectors.groupingBy(x -> x.getValue().getPartitionPath())); partitionToFileGroups.entrySet().forEach(entry -> { if (isPartitionAvailableInStore(entry.getKey())) { applyDeltaFileSlicesToPartitionView(entry.getKey(), @@ -185,8 +186,8 @@ public abstract class IncrementalTimelineSyncFileSystemView extends AbstractTabl */ private void addCommitInstant(HoodieTimeline timeline, HoodieInstant instant) throws IOException { log.info("Syncing committed instant (" + instant + ")"); - HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(instant).get(), - HoodieCommitMetadata.class); + HoodieCommitMetadata commitMetadata = + HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(instant).get(), HoodieCommitMetadata.class); commitMetadata.getPartitionToWriteStats().entrySet().stream().forEach(entry -> { String partition = entry.getKey(); if (isPartitionAvailableInStore(partition)) { @@ -196,8 +197,8 @@ public abstract class IncrementalTimelineSyncFileSystemView extends AbstractTabl new Path(String.format("%s/%s", metaClient.getBasePath(), p.getPath()))); return status; }).toArray(FileStatus[]::new); - List fileGroups = buildFileGroups(statuses, timeline.filterCompletedAndCompactionInstants(), - false); + List fileGroups = + buildFileGroups(statuses, timeline.filterCompletedAndCompactionInstants(), false); applyDeltaFileSlicesToPartitionView(partition, fileGroups, DeltaApplyMode.ADD); } else { log.warn("Skipping partition (" + partition + ") when syncing instant (" + instant + ") as it is not loaded"); @@ -214,8 +215,8 @@ public abstract class IncrementalTimelineSyncFileSystemView extends AbstractTabl */ private void addRestoreInstant(HoodieTimeline timeline, HoodieInstant instant) throws IOException { log.info("Syncing restore instant (" + instant + ")"); - HoodieRestoreMetadata metadata = AvroUtils.deserializeAvroMetadata( - timeline.getInstantDetails(instant).get(), HoodieRestoreMetadata.class); + HoodieRestoreMetadata metadata = + AvroUtils.deserializeAvroMetadata(timeline.getInstantDetails(instant).get(), HoodieRestoreMetadata.class); Map>> partitionFiles = metadata.getHoodieRestoreMetadata().entrySet().stream().flatMap(entry -> { @@ -238,8 +239,8 @@ public abstract class IncrementalTimelineSyncFileSystemView extends AbstractTabl */ private void addRollbackInstant(HoodieTimeline timeline, HoodieInstant instant) throws IOException { log.info("Syncing rollback instant (" + instant + ")"); - HoodieRollbackMetadata metadata = AvroUtils.deserializeAvroMetadata( - timeline.getInstantDetails(instant).get(), HoodieRollbackMetadata.class); + HoodieRollbackMetadata metadata = + AvroUtils.deserializeAvroMetadata(timeline.getInstantDetails(instant).get(), HoodieRollbackMetadata.class); metadata.getPartitionMetadata().entrySet().stream().forEach(e -> { removeFileSlicesForPartition(timeline, instant, e.getKey(), e.getValue().getSuccessDeleteFiles()); @@ -255,16 +256,16 @@ public abstract class IncrementalTimelineSyncFileSystemView extends AbstractTabl */ private void addCleanInstant(HoodieTimeline timeline, HoodieInstant instant) throws IOException { log.info("Syncing cleaner instant (" + instant + ")"); - HoodieCleanMetadata cleanMetadata = AvroUtils - .deserializeHoodieCleanMetadata(timeline.getInstantDetails(instant).get()); + HoodieCleanMetadata cleanMetadata = + AvroUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(instant).get()); cleanMetadata.getPartitionMetadata().entrySet().stream().forEach(entry -> { removeFileSlicesForPartition(timeline, instant, entry.getKey(), entry.getValue().getSuccessDeleteFiles()); }); log.info("Done Syncing cleaner instant (" + instant + ")"); } - private void removeFileSlicesForPartition(HoodieTimeline timeline, HoodieInstant instant, - String partition, List paths) { + private void removeFileSlicesForPartition(HoodieTimeline timeline, HoodieInstant instant, String partition, + List paths) { if (isPartitionAvailableInStore(partition)) { log.info("Removing file slices for partition (" + partition + ") for instant (" + instant + ")"); FileStatus[] statuses = paths.stream().map(p -> { @@ -272,8 +273,8 @@ public abstract class IncrementalTimelineSyncFileSystemView extends AbstractTabl status.setPath(new Path(p)); return status; }).toArray(FileStatus[]::new); - List fileGroups = buildFileGroups(statuses, - timeline.filterCompletedAndCompactionInstants(), false); + List fileGroups = + buildFileGroups(statuses, timeline.filterCompletedAndCompactionInstants(), false); applyDeltaFileSlicesToPartitionView(partition, fileGroups, DeltaApplyMode.REMOVE); } else { log.warn("Skipping partition (" + partition + ") when syncing instant (" + instant + ") as it is not loaded"); @@ -284,8 +285,7 @@ public abstract class IncrementalTimelineSyncFileSystemView extends AbstractTabl * Apply mode whether to add or remove the delta view */ enum DeltaApplyMode { - ADD, - REMOVE + ADD, REMOVE } /** @@ -306,27 +306,27 @@ public abstract class IncrementalTimelineSyncFileSystemView extends AbstractTabl List fileGroups = fetchAllStoredFileGroups(partition).collect(Collectors.toList()); /** - * Note that while finding the new data/log files added/removed, the path stored in metadata will be missing - * the base-path,scheme and authority. Ensure the matching process takes care of this discrepancy. + * Note that while finding the new data/log files added/removed, the path stored in metadata will be missing the + * base-path,scheme and authority. Ensure the matching process takes care of this discrepancy. */ Map viewDataFiles = fileGroups.stream().flatMap(HoodieFileGroup::getAllRawFileSlices) .map(FileSlice::getDataFile).filter(Option::isPresent).map(Option::get) .map(df -> Pair.of(Path.getPathWithoutSchemeAndAuthority(new Path(df.getPath())).toString(), df)) .collect(Collectors.toMap(Pair::getKey, Pair::getValue)); - //Note: Delta Log Files and Data FIles can be empty when adding/removing pending compactions + // Note: Delta Log Files and Data FIles can be empty when adding/removing pending compactions Map deltaDataFiles = deltaFileGroups.stream().flatMap(HoodieFileGroup::getAllRawFileSlices) .map(FileSlice::getDataFile).filter(Option::isPresent).map(Option::get) .map(df -> Pair.of(Path.getPathWithoutSchemeAndAuthority(new Path(df.getPath())).toString(), df)) .collect(Collectors.toMap(Pair::getKey, Pair::getValue)); - Map viewLogFiles = fileGroups.stream().flatMap(HoodieFileGroup::getAllRawFileSlices) - .flatMap(FileSlice::getLogFiles) - .map(lf -> Pair.of(Path.getPathWithoutSchemeAndAuthority(lf.getPath()).toString(), lf)) - .collect(Collectors.toMap(Pair::getKey, Pair::getValue)); - Map deltaLogFiles = deltaFileGroups.stream().flatMap(HoodieFileGroup::getAllRawFileSlices) - .flatMap(FileSlice::getLogFiles) - .map(lf -> Pair.of(Path.getPathWithoutSchemeAndAuthority(lf.getPath()).toString(), lf)) - .collect(Collectors.toMap(Pair::getKey, Pair::getValue)); + Map viewLogFiles = + fileGroups.stream().flatMap(HoodieFileGroup::getAllRawFileSlices).flatMap(FileSlice::getLogFiles) + .map(lf -> Pair.of(Path.getPathWithoutSchemeAndAuthority(lf.getPath()).toString(), lf)) + .collect(Collectors.toMap(Pair::getKey, Pair::getValue)); + Map deltaLogFiles = + deltaFileGroups.stream().flatMap(HoodieFileGroup::getAllRawFileSlices).flatMap(FileSlice::getLogFiles) + .map(lf -> Pair.of(Path.getPathWithoutSchemeAndAuthority(lf.getPath()).toString(), lf)) + .collect(Collectors.toMap(Pair::getKey, Pair::getValue)); switch (mode) { case ADD: diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/PriorityBasedFileSystemView.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/PriorityBasedFileSystemView.java index 2275de208..696d77921 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/view/PriorityBasedFileSystemView.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/view/PriorityBasedFileSystemView.java @@ -38,8 +38,8 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; /** - * A file system view which proxies request to a preferred File System View implementation. In case of error, - * flip all subsequent calls to a backup file-system view implementation. + * A file system view which proxies request to a preferred File System View implementation. In case of error, flip all + * subsequent calls to a backup file-system view implementation. */ public class PriorityBasedFileSystemView implements SyncableFileSystemView, Serializable { @@ -140,8 +140,7 @@ public class PriorityBasedFileSystemView implements SyncableFileSystemView, Seri @Override public Option getDataFileOn(String partitionPath, String instantTime, String fileId) { - return execute(partitionPath, instantTime, fileId, preferredView::getDataFileOn, - secondaryView::getDataFileOn); + return execute(partitionPath, instantTime, fileId, preferredView::getDataFileOn, secondaryView::getDataFileOn); } @Override diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/RemoteHoodieTableFileSystemView.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/RemoteHoodieTableFileSystemView.java index 7084acde0..d405cd2e7 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/view/RemoteHoodieTableFileSystemView.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/view/RemoteHoodieTableFileSystemView.java @@ -57,31 +57,26 @@ import org.apache.log4j.Logger; public class RemoteHoodieTableFileSystemView implements SyncableFileSystemView, Serializable { private static final String BASE_URL = "/v1/hoodie/view"; - public static final String LATEST_PARTITION_SLICES_URL = String.format("%s/%s", BASE_URL, - "slices/partition/latest/"); - public static final String LATEST_PARTITION_SLICE_URL = String.format("%s/%s", BASE_URL, - "slices/file/latest/"); - public static final String LATEST_PARTITION_UNCOMPACTED_SLICES_URL = String.format("%s/%s", BASE_URL, - "slices/uncompacted/partition/latest/"); + public static final String LATEST_PARTITION_SLICES_URL = String.format("%s/%s", BASE_URL, "slices/partition/latest/"); + public static final String LATEST_PARTITION_SLICE_URL = String.format("%s/%s", BASE_URL, "slices/file/latest/"); + public static final String LATEST_PARTITION_UNCOMPACTED_SLICES_URL = + String.format("%s/%s", BASE_URL, "slices/uncompacted/partition/latest/"); public static final String ALL_SLICES_URL = String.format("%s/%s", BASE_URL, "slices/all"); public static final String LATEST_SLICES_MERGED_BEFORE_ON_INSTANT_URL = String.format("%s/%s", BASE_URL, "slices/merged/beforeoron/latest/"); - public static final String LATEST_SLICES_RANGE_INSTANT_URL = - String.format("%s/%s", BASE_URL, "slices/range/latest/"); + public static final String LATEST_SLICES_RANGE_INSTANT_URL = String.format("%s/%s", BASE_URL, "slices/range/latest/"); public static final String LATEST_SLICES_BEFORE_ON_INSTANT_URL = String.format("%s/%s", BASE_URL, "slices/beforeoron/latest/"); - public static final String PENDING_COMPACTION_OPS = - String.format("%s/%s", BASE_URL, "compactions/pending/"); + public static final String PENDING_COMPACTION_OPS = String.format("%s/%s", BASE_URL, "compactions/pending/"); - public static final String LATEST_PARTITION_DATA_FILES_URL = String.format("%s/%s", BASE_URL, - "datafiles/latest/partition"); - public static final String LATEST_PARTITION_DATA_FILE_URL = String.format("%s/%s", BASE_URL, - "datafile/latest/partition"); + public static final String LATEST_PARTITION_DATA_FILES_URL = + String.format("%s/%s", BASE_URL, "datafiles/latest/partition"); + public static final String LATEST_PARTITION_DATA_FILE_URL = + String.format("%s/%s", BASE_URL, "datafile/latest/partition"); public static final String ALL_DATA_FILES = String.format("%s/%s", BASE_URL, "datafiles/all"); public static final String LATEST_ALL_DATA_FILES = String.format("%s/%s", BASE_URL, "datafiles/all/latest/"); - public static final String LATEST_DATA_FILE_ON_INSTANT_URL = - String.format("%s/%s", BASE_URL, "datafile/on/latest/"); + public static final String LATEST_DATA_FILE_ON_INSTANT_URL = String.format("%s/%s", BASE_URL, "datafile/on/latest/"); public static final String LATEST_DATA_FILES_RANGE_INSTANT_URL = String.format("%s/%s", BASE_URL, "datafiles/range/latest/"); @@ -123,8 +118,7 @@ public class RemoteHoodieTableFileSystemView implements SyncableFileSystemView, private boolean closed = false; private enum RequestMethod { - GET, - POST + GET, POST } public RemoteHoodieTableFileSystemView(String server, int port, HoodieTableMetaClient metaClient) { @@ -140,8 +134,8 @@ public class RemoteHoodieTableFileSystemView implements SyncableFileSystemView, RequestMethod method) throws IOException { Preconditions.checkArgument(!closed, "View already closed"); - URIBuilder builder = new URIBuilder().setHost(serverHost).setPort(serverPort).setPath(requestPath) - .setScheme("http"); + URIBuilder builder = + new URIBuilder().setHost(serverHost).setPort(serverPort).setPath(requestPath).setScheme("http"); queryParameters.entrySet().stream().forEach(entry -> { builder.addParameter(entry.getKey(), entry.getValue()); @@ -213,8 +207,7 @@ public class RemoteHoodieTableFileSystemView implements SyncableFileSystemView, Map paramsMap = getParamsWithPartitionPath(partitionPath); try { List dataFiles = executeRequest(LATEST_PARTITION_DATA_FILES_URL, paramsMap, - new TypeReference>() { - }, RequestMethod.GET); + new TypeReference>() {}, RequestMethod.GET); return dataFiles.stream().map(DataFileDTO::toHoodieDataFile); } catch (IOException e) { throw new HoodieRemoteException(e); @@ -226,8 +219,7 @@ public class RemoteHoodieTableFileSystemView implements SyncableFileSystemView, Map paramsMap = getParams(); try { List dataFiles = executeRequest(LATEST_ALL_DATA_FILES, paramsMap, - new TypeReference>() { - }, RequestMethod.GET); + new TypeReference>() {}, RequestMethod.GET); return dataFiles.stream().map(DataFileDTO::toHoodieDataFile); } catch (IOException e) { throw new HoodieRemoteException(e); @@ -239,8 +231,7 @@ public class RemoteHoodieTableFileSystemView implements SyncableFileSystemView, Map paramsMap = getParamsWithAdditionalParam(partitionPath, MAX_INSTANT_PARAM, maxCommitTime); try { List dataFiles = executeRequest(LATEST_DATA_FILES_BEFORE_ON_INSTANT_URL, paramsMap, - new TypeReference>() { - }, RequestMethod.GET); + new TypeReference>() {}, RequestMethod.GET); return dataFiles.stream().map(DataFileDTO::toHoodieDataFile); } catch (IOException e) { throw new HoodieRemoteException(e); @@ -250,12 +241,10 @@ public class RemoteHoodieTableFileSystemView implements SyncableFileSystemView, @Override public Option getDataFileOn(String partitionPath, String instantTime, String fileId) { Map paramsMap = getParamsWithAdditionalParams(partitionPath, - new String[]{INSTANT_PARAM, FILEID_PARAM}, - new String[]{instantTime, fileId}); + new String[] {INSTANT_PARAM, FILEID_PARAM}, new String[] {instantTime, fileId}); try { List dataFiles = executeRequest(LATEST_DATA_FILE_ON_INSTANT_URL, paramsMap, - new TypeReference>() { - }, RequestMethod.GET); + new TypeReference>() {}, RequestMethod.GET); return Option.fromJavaOptional(dataFiles.stream().map(DataFileDTO::toHoodieDataFile).findFirst()); } catch (IOException e) { throw new HoodieRemoteException(e); @@ -264,12 +253,11 @@ public class RemoteHoodieTableFileSystemView implements SyncableFileSystemView, @Override public Stream getLatestDataFilesInRange(List commitsToReturn) { - Map paramsMap = getParams(INSTANTS_PARAM, - StringUtils.join(commitsToReturn.toArray(new String[0]), ",")); + Map paramsMap = + getParams(INSTANTS_PARAM, StringUtils.join(commitsToReturn.toArray(new String[0]), ",")); try { List dataFiles = executeRequest(LATEST_DATA_FILES_RANGE_INSTANT_URL, paramsMap, - new TypeReference>() { - }, RequestMethod.GET); + new TypeReference>() {}, RequestMethod.GET); return dataFiles.stream().map(DataFileDTO::toHoodieDataFile); } catch (IOException e) { throw new HoodieRemoteException(e); @@ -280,9 +268,8 @@ public class RemoteHoodieTableFileSystemView implements SyncableFileSystemView, public Stream getAllDataFiles(String partitionPath) { Map paramsMap = getParamsWithPartitionPath(partitionPath); try { - List dataFiles = executeRequest(ALL_DATA_FILES, paramsMap, - new TypeReference>() { - }, RequestMethod.GET); + List dataFiles = + executeRequest(ALL_DATA_FILES, paramsMap, new TypeReference>() {}, RequestMethod.GET); return dataFiles.stream().map(DataFileDTO::toHoodieDataFile); } catch (IOException e) { throw new HoodieRemoteException(e); @@ -294,8 +281,7 @@ public class RemoteHoodieTableFileSystemView implements SyncableFileSystemView, Map paramsMap = getParamsWithPartitionPath(partitionPath); try { List dataFiles = executeRequest(LATEST_PARTITION_SLICES_URL, paramsMap, - new TypeReference>() { - }, RequestMethod.GET); + new TypeReference>() {}, RequestMethod.GET); return dataFiles.stream().map(FileSliceDTO::toFileSlice); } catch (IOException e) { throw new HoodieRemoteException(e); @@ -307,8 +293,7 @@ public class RemoteHoodieTableFileSystemView implements SyncableFileSystemView, Map paramsMap = getParamsWithAdditionalParam(partitionPath, FILEID_PARAM, fileId); try { List dataFiles = executeRequest(LATEST_PARTITION_SLICE_URL, paramsMap, - new TypeReference>() { - }, RequestMethod.GET); + new TypeReference>() {}, RequestMethod.GET); return Option.fromJavaOptional(dataFiles.stream().map(FileSliceDTO::toFileSlice).findFirst()); } catch (IOException e) { throw new HoodieRemoteException(e); @@ -320,8 +305,7 @@ public class RemoteHoodieTableFileSystemView implements SyncableFileSystemView, Map paramsMap = getParamsWithPartitionPath(partitionPath); try { List dataFiles = executeRequest(LATEST_PARTITION_UNCOMPACTED_SLICES_URL, paramsMap, - new TypeReference>() { - }, RequestMethod.GET); + new TypeReference>() {}, RequestMethod.GET); return dataFiles.stream().map(FileSliceDTO::toFileSlice); } catch (IOException e) { throw new HoodieRemoteException(e); @@ -332,12 +316,11 @@ public class RemoteHoodieTableFileSystemView implements SyncableFileSystemView, public Stream getLatestFileSlicesBeforeOrOn(String partitionPath, String maxCommitTime, boolean includeFileSlicesInPendingCompaction) { Map paramsMap = getParamsWithAdditionalParams(partitionPath, - new String[]{MAX_INSTANT_PARAM, INCLUDE_FILES_IN_PENDING_COMPACTION_PARAM}, - new String[]{maxCommitTime, String.valueOf(includeFileSlicesInPendingCompaction)}); + new String[] {MAX_INSTANT_PARAM, INCLUDE_FILES_IN_PENDING_COMPACTION_PARAM}, + new String[] {maxCommitTime, String.valueOf(includeFileSlicesInPendingCompaction)}); try { List dataFiles = executeRequest(LATEST_SLICES_BEFORE_ON_INSTANT_URL, paramsMap, - new TypeReference>() { - }, RequestMethod.GET); + new TypeReference>() {}, RequestMethod.GET); return dataFiles.stream().map(FileSliceDTO::toFileSlice); } catch (IOException e) { throw new HoodieRemoteException(e); @@ -349,8 +332,7 @@ public class RemoteHoodieTableFileSystemView implements SyncableFileSystemView, Map paramsMap = getParamsWithAdditionalParam(partitionPath, MAX_INSTANT_PARAM, maxInstantTime); try { List dataFiles = executeRequest(LATEST_SLICES_MERGED_BEFORE_ON_INSTANT_URL, paramsMap, - new TypeReference>() { - }, RequestMethod.GET); + new TypeReference>() {}, RequestMethod.GET); return dataFiles.stream().map(FileSliceDTO::toFileSlice); } catch (IOException e) { throw new HoodieRemoteException(e); @@ -359,12 +341,11 @@ public class RemoteHoodieTableFileSystemView implements SyncableFileSystemView, @Override public Stream getLatestFileSliceInRange(List commitsToReturn) { - Map paramsMap = getParams(INSTANTS_PARAM, - StringUtils.join(commitsToReturn.toArray(new String[0]), ",")); + Map paramsMap = + getParams(INSTANTS_PARAM, StringUtils.join(commitsToReturn.toArray(new String[0]), ",")); try { List dataFiles = executeRequest(LATEST_SLICES_RANGE_INSTANT_URL, paramsMap, - new TypeReference>() { - }, RequestMethod.GET); + new TypeReference>() {}, RequestMethod.GET); return dataFiles.stream().map(FileSliceDTO::toFileSlice); } catch (IOException e) { throw new HoodieRemoteException(e); @@ -375,9 +356,8 @@ public class RemoteHoodieTableFileSystemView implements SyncableFileSystemView, public Stream getAllFileSlices(String partitionPath) { Map paramsMap = getParamsWithPartitionPath(partitionPath); try { - List dataFiles = executeRequest(ALL_SLICES_URL, paramsMap, - new TypeReference>() { - }, RequestMethod.GET); + List dataFiles = + executeRequest(ALL_SLICES_URL, paramsMap, new TypeReference>() {}, RequestMethod.GET); return dataFiles.stream().map(FileSliceDTO::toFileSlice); } catch (IOException e) { throw new HoodieRemoteException(e); @@ -389,8 +369,7 @@ public class RemoteHoodieTableFileSystemView implements SyncableFileSystemView, Map paramsMap = getParamsWithPartitionPath(partitionPath); try { List fileGroups = executeRequest(ALL_FILEGROUPS_FOR_PARTITION_URL, paramsMap, - new TypeReference>() { - }, RequestMethod.GET); + new TypeReference>() {}, RequestMethod.GET); return fileGroups.stream().map(dto -> FileGroupDTO.toFileGroup(dto, metaClient)); } catch (IOException e) { throw new HoodieRemoteException(e); @@ -400,8 +379,7 @@ public class RemoteHoodieTableFileSystemView implements SyncableFileSystemView, public boolean refresh() { Map paramsMap = getParams(); try { - return executeRequest(REFRESH_DATASET, paramsMap, new TypeReference() { - }, RequestMethod.POST); + return executeRequest(REFRESH_DATASET, paramsMap, new TypeReference() {}, RequestMethod.POST); } catch (IOException e) { throw new HoodieRemoteException(e); } @@ -412,8 +390,7 @@ public class RemoteHoodieTableFileSystemView implements SyncableFileSystemView, Map paramsMap = getParams(); try { List dtos = executeRequest(PENDING_COMPACTION_OPS, paramsMap, - new TypeReference>() { - }, RequestMethod.GET); + new TypeReference>() {}, RequestMethod.GET); return dtos.stream().map(CompactionOpDTO::toCompactionOperation); } catch (IOException e) { throw new HoodieRemoteException(e); @@ -434,9 +411,8 @@ public class RemoteHoodieTableFileSystemView implements SyncableFileSystemView, public Option getLastInstant() { Map paramsMap = getParams(); try { - List instants = executeRequest(LAST_INSTANT, paramsMap, - new TypeReference>() { - }, RequestMethod.GET); + List instants = + executeRequest(LAST_INSTANT, paramsMap, new TypeReference>() {}, RequestMethod.GET); return Option.fromJavaOptional(instants.stream().map(InstantDTO::toInstant).findFirst()); } catch (IOException e) { throw new HoodieRemoteException(e); @@ -447,9 +423,8 @@ public class RemoteHoodieTableFileSystemView implements SyncableFileSystemView, public HoodieTimeline getTimeline() { Map paramsMap = getParams(); try { - TimelineDTO timeline = executeRequest(TIMELINE, paramsMap, - new TypeReference() { - }, RequestMethod.GET); + TimelineDTO timeline = + executeRequest(TIMELINE, paramsMap, new TypeReference() {}, RequestMethod.GET); return TimelineDTO.toTimeline(timeline, metaClient); } catch (IOException e) { throw new HoodieRemoteException(e); @@ -458,7 +433,7 @@ public class RemoteHoodieTableFileSystemView implements SyncableFileSystemView, @Override public void sync() { - //noop + // noop } @Override @@ -466,8 +441,7 @@ public class RemoteHoodieTableFileSystemView implements SyncableFileSystemView, Map paramsMap = getParamsWithAdditionalParam(partitionPath, FILEID_PARAM, fileId); try { List dataFiles = executeRequest(LATEST_PARTITION_DATA_FILE_URL, paramsMap, - new TypeReference>() { - }, RequestMethod.GET); + new TypeReference>() {}, RequestMethod.GET); return Option.fromJavaOptional(dataFiles.stream().map(DataFileDTO::toHoodieDataFile).findFirst()); } catch (IOException e) { throw new HoodieRemoteException(e); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/RocksDbBasedFileSystemView.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/RocksDbBasedFileSystemView.java index 6d2e1c8fa..837d45689 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/view/RocksDbBasedFileSystemView.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/view/RocksDbBasedFileSystemView.java @@ -44,18 +44,15 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; /** - * A file-system view implementation on top of embedded Rocks DB store. - * For each DataSet : 3 column Family is added for storing - * (1) File-Slices and Data Files for View lookups - * (2) Pending compaction operations - * (3) Partitions tracked + * A file-system view implementation on top of embedded Rocks DB store. For each DataSet : 3 column Family is added for + * storing (1) File-Slices and Data Files for View lookups (2) Pending compaction operations (3) Partitions tracked * - * Fine-grained retrieval API to fetch latest file-slice and data-file which are common operations - * for ingestion/compaction are supported. + * Fine-grained retrieval API to fetch latest file-slice and data-file which are common operations for + * ingestion/compaction are supported. * - * TODO: vb The current implementation works in embedded server mode where each restarts blows away the view stores. - * To support view-state preservation across restarts, Hoodie timeline also needs to be stored - * inorder to detect changes to timeline across restarts. + * TODO: vb The current implementation works in embedded server mode where each restarts blows away the view stores. To + * support view-state preservation across restarts, Hoodie timeline also needs to be stored inorder to detect changes to + * timeline across restarts. */ public class RocksDbBasedFileSystemView extends IncrementalTimelineSyncFileSystemView { @@ -69,8 +66,8 @@ public class RocksDbBasedFileSystemView extends IncrementalTimelineSyncFileSyste private boolean closed = false; - public RocksDbBasedFileSystemView(HoodieTableMetaClient metaClient, - HoodieTimeline visibleActiveTimeline, FileSystemViewStorageConfig config) { + public RocksDbBasedFileSystemView(HoodieTableMetaClient metaClient, HoodieTimeline visibleActiveTimeline, + FileSystemViewStorageConfig config) { super(config.isIncrementalTimelineSyncEnabled()); this.config = config; this.schemaHelper = new RocksDBSchemaHelper(metaClient); @@ -78,8 +75,8 @@ public class RocksDbBasedFileSystemView extends IncrementalTimelineSyncFileSyste init(metaClient, visibleActiveTimeline); } - public RocksDbBasedFileSystemView(HoodieTableMetaClient metaClient, - HoodieTimeline visibleActiveTimeline, FileStatus[] fileStatuses, FileSystemViewStorageConfig config) { + public RocksDbBasedFileSystemView(HoodieTableMetaClient metaClient, HoodieTimeline visibleActiveTimeline, + FileStatus[] fileStatuses, FileSystemViewStorageConfig config) { this(metaClient, visibleActiveTimeline, config); addFilesToView(fileStatuses); } @@ -212,9 +209,9 @@ public class RocksDbBasedFileSystemView extends IncrementalTimelineSyncFileSyste Map logFiles = oldSlice.getLogFiles() .map(lf -> Pair.of(Path.getPathWithoutSchemeAndAuthority(lf.getPath()).toString(), lf)) .collect(Collectors.toMap(Pair::getKey, Pair::getValue)); - Map deltaLogFiles = fs.getLogFiles() - .map(lf -> Pair.of(Path.getPathWithoutSchemeAndAuthority(lf.getPath()).toString(), lf)) - .collect(Collectors.toMap(Pair::getKey, Pair::getValue)); + Map deltaLogFiles = + fs.getLogFiles().map(lf -> Pair.of(Path.getPathWithoutSchemeAndAuthority(lf.getPath()).toString(), lf)) + .collect(Collectors.toMap(Pair::getKey, Pair::getValue)); switch (mode) { case ADD: { @@ -237,7 +234,7 @@ public class RocksDbBasedFileSystemView extends IncrementalTimelineSyncFileSyste }); deltaLogFiles.keySet().stream().forEach(p -> logFiles.remove(p)); - //Add remaining log files back + // Add remaining log files back logFiles.values().stream().forEach(lf -> newFileSlice.addLogFile(lf)); if (newFileSlice.getDataFile().isPresent() || (newFileSlice.getLogFiles().count() > 0)) { log.info("Adding back new file-slice after remove FS=" + newFileSlice); @@ -262,15 +259,14 @@ public class RocksDbBasedFileSystemView extends IncrementalTimelineSyncFileSyste @Override Stream> fetchPendingCompactionOperations() { - return rocksDB.>prefixSearch( - schemaHelper.getColFamilyForPendingCompaction(), "").map(Pair::getValue); + return rocksDB.>prefixSearch(schemaHelper.getColFamilyForPendingCompaction(), "") + .map(Pair::getValue); } @Override Stream fetchAllDataFiles(String partitionPath) { return rocksDB.prefixSearch(schemaHelper.getColFamilyForView(), - schemaHelper.getPrefixForDataFileViewByPartition(partitionPath)) - .map(Pair::getValue); + schemaHelper.getPrefixForDataFileViewByPartition(partitionPath)).map(Pair::getValue); } @Override @@ -281,46 +277,50 @@ public class RocksDbBasedFileSystemView extends IncrementalTimelineSyncFileSyste @Override Stream fetchAllStoredFileGroups() { - return getFileGroups(rocksDB.prefixSearch(schemaHelper.getColFamilyForView(), - schemaHelper.getPrefixForSliceView()).map(Pair::getValue)); + return getFileGroups( + rocksDB.prefixSearch(schemaHelper.getColFamilyForView(), schemaHelper.getPrefixForSliceView()) + .map(Pair::getValue)); } @Override protected Option fetchLatestFileSlice(String partitionPath, String fileId) { // Retries only file-slices of the file and filters for the latest - return Option.ofNullable(rocksDB.prefixSearch(schemaHelper.getColFamilyForView(), - schemaHelper.getPrefixForSliceViewByPartitionFile(partitionPath, fileId)) - .map(Pair::getValue) - .reduce(null, (x, y) -> - ((x == null) ? y : (y == null) ? null : HoodieTimeline.compareTimestamps(x.getBaseInstantTime(), - y.getBaseInstantTime(), HoodieTimeline.GREATER) ? x : y))); + return Option.ofNullable(rocksDB + .prefixSearch(schemaHelper.getColFamilyForView(), + schemaHelper.getPrefixForSliceViewByPartitionFile(partitionPath, fileId)) + .map(Pair::getValue).reduce(null, + (x, y) -> ((x == null) ? y + : (y == null) ? null + : HoodieTimeline.compareTimestamps(x.getBaseInstantTime(), y.getBaseInstantTime(), + HoodieTimeline.GREATER) ? x : y))); } @Override protected Option fetchLatestDataFile(String partitionPath, String fileId) { // Retries only file-slices of the file and filters for the latest - return Option.ofNullable(rocksDB.prefixSearch(schemaHelper.getColFamilyForView(), - schemaHelper.getPrefixForDataFileViewByPartitionFile(partitionPath, fileId)) - .map(Pair::getValue) - .reduce(null, (x, y) -> - ((x == null) ? y : (y == null) ? null : HoodieTimeline.compareTimestamps(x.getCommitTime(), - y.getCommitTime(), HoodieTimeline.GREATER) ? x : y))); + return Option + .ofNullable(rocksDB + .prefixSearch(schemaHelper.getColFamilyForView(), + schemaHelper.getPrefixForDataFileViewByPartitionFile(partitionPath, fileId)) + .map(Pair::getValue).reduce(null, + (x, y) -> ((x == null) ? y + : (y == null) ? null + : HoodieTimeline.compareTimestamps(x.getCommitTime(), y.getCommitTime(), HoodieTimeline.GREATER) + ? x + : y))); } @Override - Option fetchHoodieFileGroup(String partitionPath, String fileId) { - return Option.fromJavaOptional( - getFileGroups(rocksDB.prefixSearch(schemaHelper.getColFamilyForView(), - schemaHelper.getPrefixForSliceViewByPartitionFile(partitionPath, fileId)) - .map(Pair::getValue)).findFirst()); + Option fetchHoodieFileGroup(String partitionPath, String fileId) { + return Option.fromJavaOptional(getFileGroups(rocksDB.prefixSearch(schemaHelper.getColFamilyForView(), + schemaHelper.getPrefixForSliceViewByPartitionFile(partitionPath, fileId)).map(Pair::getValue)).findFirst()); } private Stream getFileGroups(Stream sliceStream) { return sliceStream.map(s -> Pair.of(Pair.of(s.getPartitionPath(), s.getFileId()), s)) .collect(Collectors.groupingBy(Pair::getKey)).entrySet().stream().map(slicePair -> { - HoodieFileGroup fg = - new HoodieFileGroup(slicePair.getKey().getKey(), slicePair.getKey().getValue(), - getVisibleCommitsAndCompactionTimeline()); + HoodieFileGroup fg = new HoodieFileGroup(slicePair.getKey().getKey(), slicePair.getKey().getValue(), + getVisibleCommitsAndCompactionTimeline()); slicePair.getValue().forEach(e -> fg.addFileSlice(e.getValue())); return fg; }); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/SpillableMapBasedFileSystemView.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/SpillableMapBasedFileSystemView.java index 9ab5696f0..ad31db4f5 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/view/SpillableMapBasedFileSystemView.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/view/SpillableMapBasedFileSystemView.java @@ -46,8 +46,8 @@ public class SpillableMapBasedFileSystemView extends HoodieTableFileSystemView { private final long maxMemoryForPendingCompaction; private final String baseStoreDir; - public SpillableMapBasedFileSystemView(HoodieTableMetaClient metaClient, - HoodieTimeline visibleActiveTimeline, FileSystemViewStorageConfig config) { + public SpillableMapBasedFileSystemView(HoodieTableMetaClient metaClient, HoodieTimeline visibleActiveTimeline, + FileSystemViewStorageConfig config) { super(config.isIncrementalTimelineSyncEnabled()); this.maxMemoryForFileGroupMap = config.getMaxMemoryForFileGroupMap(); this.maxMemoryForPendingCompaction = config.getMaxMemoryForPendingCompaction(); @@ -55,8 +55,8 @@ public class SpillableMapBasedFileSystemView extends HoodieTableFileSystemView { init(metaClient, visibleActiveTimeline); } - public SpillableMapBasedFileSystemView(HoodieTableMetaClient metaClient, - HoodieTimeline visibleActiveTimeline, FileStatus[] fileStatuses, FileSystemViewStorageConfig config) { + public SpillableMapBasedFileSystemView(HoodieTableMetaClient metaClient, HoodieTimeline visibleActiveTimeline, + FileStatus[] fileStatuses, FileSystemViewStorageConfig config) { this(metaClient, visibleActiveTimeline, config); addFilesToView(fileStatuses); } @@ -64,12 +64,11 @@ public class SpillableMapBasedFileSystemView extends HoodieTableFileSystemView { @Override protected Map> createPartitionToFileGroups() { try { - log.info("Creating Partition To File groups map using external spillable Map. Max Mem=" - + maxMemoryForFileGroupMap + ", BaseDir=" + baseStoreDir); + log.info("Creating Partition To File groups map using external spillable Map. Max Mem=" + maxMemoryForFileGroupMap + + ", BaseDir=" + baseStoreDir); new File(baseStoreDir).mkdirs(); - return (Map>) - (new ExternalSpillableMap<>(maxMemoryForFileGroupMap, baseStoreDir, new DefaultSizeEstimator(), - new DefaultSizeEstimator<>())); + return (Map>) (new ExternalSpillableMap<>(maxMemoryForFileGroupMap, baseStoreDir, + new DefaultSizeEstimator(), new DefaultSizeEstimator<>())); } catch (IOException e) { throw new RuntimeException(e); } @@ -78,12 +77,11 @@ public class SpillableMapBasedFileSystemView extends HoodieTableFileSystemView { protected Map> createFileIdToPendingCompactionMap( Map> fgIdToPendingCompaction) { try { - log.info("Creating Pending Compaction map using external spillable Map. Max Mem=" - + maxMemoryForPendingCompaction + ", BaseDir=" + baseStoreDir); + log.info("Creating Pending Compaction map using external spillable Map. Max Mem=" + maxMemoryForPendingCompaction + + ", BaseDir=" + baseStoreDir); new File(baseStoreDir).mkdirs(); - Map> pendingMap = - new ExternalSpillableMap<>(maxMemoryForPendingCompaction, baseStoreDir, new DefaultSizeEstimator(), - new DefaultSizeEstimator<>()); + Map> pendingMap = new ExternalSpillableMap<>( + maxMemoryForPendingCompaction, baseStoreDir, new DefaultSizeEstimator(), new DefaultSizeEstimator<>()); pendingMap.putAll(fgIdToPendingCompaction); return pendingMap; } catch (IOException e) { @@ -92,20 +90,20 @@ public class SpillableMapBasedFileSystemView extends HoodieTableFileSystemView { } public Stream getAllFileGroups() { - return ((ExternalSpillableMap)partitionToFileGroupsMap).valueStream() - .flatMap(fg -> ((List)fg).stream()); + return ((ExternalSpillableMap) partitionToFileGroupsMap).valueStream() + .flatMap(fg -> ((List) fg).stream()); } @Override Stream> fetchPendingCompactionOperations() { - return ((ExternalSpillableMap)fgIdToPendingCompaction).valueStream(); + return ((ExternalSpillableMap) fgIdToPendingCompaction).valueStream(); } @Override public Stream fetchAllStoredFileGroups() { - return ((ExternalSpillableMap)partitionToFileGroupsMap).valueStream().flatMap(fg -> { - return ((List)fg).stream(); + return ((ExternalSpillableMap) partitionToFileGroupsMap).valueStream().flatMap(fg -> { + return ((List) fg).stream(); }); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/AvroUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/AvroUtils.java index b0a01b138..f6a99acb4 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/AvroUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/AvroUtils.java @@ -47,17 +47,15 @@ import org.apache.hudi.common.HoodieRollbackStat; public class AvroUtils { - public static HoodieCleanMetadata convertCleanMetadata(String startCleanTime, - Option durationInMs, List cleanStats) { - ImmutableMap.Builder partitionMetadataBuilder = - ImmutableMap.builder(); + public static HoodieCleanMetadata convertCleanMetadata(String startCleanTime, Option durationInMs, + List cleanStats) { + ImmutableMap.Builder partitionMetadataBuilder = ImmutableMap.builder(); int totalDeleted = 0; String earliestCommitToRetain = null; for (HoodieCleanStat stat : cleanStats) { HoodieCleanPartitionMetadata metadata = new HoodieCleanPartitionMetadata(stat.getPartitionPath(), stat.getPolicy().name(), - stat.getDeletePathPatterns(), stat.getSuccessDeleteFiles(), - stat.getDeletePathPatterns()); + stat.getDeletePathPatterns(), stat.getSuccessDeleteFiles(), stat.getDeletePathPatterns()); partitionMetadataBuilder.put(stat.getPartitionPath(), metadata); totalDeleted += stat.getSuccessDeleteFiles().size(); if (earliestCommitToRetain == null) { @@ -65,78 +63,67 @@ public class AvroUtils { earliestCommitToRetain = stat.getEarliestCommitToRetain(); } } - return new HoodieCleanMetadata(startCleanTime, durationInMs.orElseGet(() -> -1L), - totalDeleted, earliestCommitToRetain, partitionMetadataBuilder.build()); + return new HoodieCleanMetadata(startCleanTime, durationInMs.orElseGet(() -> -1L), totalDeleted, + earliestCommitToRetain, partitionMetadataBuilder.build()); } - public static HoodieRestoreMetadata convertRestoreMetadata(String startRestoreTime, - Option durationInMs, List commits, Map> commitToStats) { + public static HoodieRestoreMetadata convertRestoreMetadata(String startRestoreTime, Option durationInMs, + List commits, Map> commitToStats) { ImmutableMap.Builder> commitToStatBuilder = ImmutableMap.builder(); for (Map.Entry> commitToStat : commitToStats.entrySet()) { - commitToStatBuilder.put(commitToStat.getKey(), Arrays.asList(convertRollbackMetadata(startRestoreTime, - durationInMs, commits, commitToStat.getValue()))); + commitToStatBuilder.put(commitToStat.getKey(), + Arrays.asList(convertRollbackMetadata(startRestoreTime, durationInMs, commits, commitToStat.getValue()))); } return new HoodieRestoreMetadata(startRestoreTime, durationInMs.orElseGet(() -> -1L), commits, commitToStatBuilder.build()); } - public static HoodieRollbackMetadata convertRollbackMetadata(String startRollbackTime, - Option durationInMs, List commits, List rollbackStats) { - ImmutableMap.Builder partitionMetadataBuilder = - ImmutableMap.builder(); + public static HoodieRollbackMetadata convertRollbackMetadata(String startRollbackTime, Option durationInMs, + List commits, List rollbackStats) { + ImmutableMap.Builder partitionMetadataBuilder = ImmutableMap.builder(); int totalDeleted = 0; for (HoodieRollbackStat stat : rollbackStats) { - HoodieRollbackPartitionMetadata metadata = - new HoodieRollbackPartitionMetadata(stat.getPartitionPath(), - stat.getSuccessDeleteFiles(), stat.getFailedDeleteFiles()); - partitionMetadataBuilder - .put(stat.getPartitionPath(), metadata); + HoodieRollbackPartitionMetadata metadata = new HoodieRollbackPartitionMetadata(stat.getPartitionPath(), + stat.getSuccessDeleteFiles(), stat.getFailedDeleteFiles()); + partitionMetadataBuilder.put(stat.getPartitionPath(), metadata); totalDeleted += stat.getSuccessDeleteFiles().size(); } - return new HoodieRollbackMetadata(startRollbackTime, durationInMs.orElseGet(() -> -1L), - totalDeleted, commits, partitionMetadataBuilder.build()); + return new HoodieRollbackMetadata(startRollbackTime, durationInMs.orElseGet(() -> -1L), totalDeleted, commits, + partitionMetadataBuilder.build()); } public static HoodieSavepointMetadata convertSavepointMetadata(String user, String comment, Map> latestFiles) { - ImmutableMap.Builder partitionMetadataBuilder = - ImmutableMap.builder(); + ImmutableMap.Builder partitionMetadataBuilder = ImmutableMap.builder(); for (Map.Entry> stat : latestFiles.entrySet()) { - HoodieSavepointPartitionMetadata metadata = - new HoodieSavepointPartitionMetadata(stat.getKey(), stat.getValue()); + HoodieSavepointPartitionMetadata metadata = new HoodieSavepointPartitionMetadata(stat.getKey(), stat.getValue()); partitionMetadataBuilder.put(stat.getKey(), metadata); } - return new HoodieSavepointMetadata(user, System.currentTimeMillis(), comment, - partitionMetadataBuilder.build()); + return new HoodieSavepointMetadata(user, System.currentTimeMillis(), comment, partitionMetadataBuilder.build()); } - public static Option serializeCompactionPlan(HoodieCompactionPlan compactionWorkload) - throws IOException { + public static Option serializeCompactionPlan(HoodieCompactionPlan compactionWorkload) throws IOException { return serializeAvroMetadata(compactionWorkload, HoodieCompactionPlan.class); } - public static Option serializeCleanMetadata(HoodieCleanMetadata metadata) - throws IOException { + public static Option serializeCleanMetadata(HoodieCleanMetadata metadata) throws IOException { return serializeAvroMetadata(metadata, HoodieCleanMetadata.class); } - public static Option serializeSavepointMetadata(HoodieSavepointMetadata metadata) - throws IOException { + public static Option serializeSavepointMetadata(HoodieSavepointMetadata metadata) throws IOException { return serializeAvroMetadata(metadata, HoodieSavepointMetadata.class); } - public static Option serializeRollbackMetadata( - HoodieRollbackMetadata rollbackMetadata) throws IOException { + public static Option serializeRollbackMetadata(HoodieRollbackMetadata rollbackMetadata) throws IOException { return serializeAvroMetadata(rollbackMetadata, HoodieRollbackMetadata.class); } - public static Option serializeRestoreMetadata( - HoodieRestoreMetadata restoreMetadata) throws IOException { + public static Option serializeRestoreMetadata(HoodieRestoreMetadata restoreMetadata) throws IOException { return serializeAvroMetadata(restoreMetadata, HoodieRestoreMetadata.class); } - public static Option serializeAvroMetadata(T metadata, - Class clazz) throws IOException { + public static Option serializeAvroMetadata(T metadata, Class clazz) + throws IOException { DatumWriter datumWriter = new SpecificDatumWriter<>(clazz); DataFileWriter fileWriter = new DataFileWriter<>(datumWriter); ByteArrayOutputStream baos = new ByteArrayOutputStream(); @@ -146,28 +133,23 @@ public class AvroUtils { return Option.of(baos.toByteArray()); } - public static HoodieCompactionPlan deserializeCompactionPlan(byte[] bytes) - throws IOException { + public static HoodieCompactionPlan deserializeCompactionPlan(byte[] bytes) throws IOException { return deserializeAvroMetadata(bytes, HoodieCompactionPlan.class); } - public static HoodieCleanMetadata deserializeHoodieCleanMetadata(byte[] bytes) - throws IOException { + public static HoodieCleanMetadata deserializeHoodieCleanMetadata(byte[] bytes) throws IOException { return deserializeAvroMetadata(bytes, HoodieCleanMetadata.class); } - public static HoodieSavepointMetadata deserializeHoodieSavepointMetadata(byte[] bytes) - throws IOException { + public static HoodieSavepointMetadata deserializeHoodieSavepointMetadata(byte[] bytes) throws IOException { return deserializeAvroMetadata(bytes, HoodieSavepointMetadata.class); } - public static T deserializeAvroMetadata(byte[] bytes, - Class clazz) throws IOException { + public static T deserializeAvroMetadata(byte[] bytes, Class clazz) + throws IOException { DatumReader reader = new SpecificDatumReader<>(clazz); - FileReader fileReader = - DataFileReader.openReader(new SeekableByteArrayInput(bytes), reader); - Preconditions - .checkArgument(fileReader.hasNext(), "Could not deserialize metadata of type " + clazz); + FileReader fileReader = DataFileReader.openReader(new SeekableByteArrayInput(bytes), reader); + Preconditions.checkArgument(fileReader.hasNext(), "Could not deserialize metadata of type " + clazz); return fileReader.next(); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/CompactionUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/CompactionUtils.java index 8a2aa23f9..acb781d67 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/CompactionUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/CompactionUtils.java @@ -48,8 +48,8 @@ public class CompactionUtils { /** * Generate compaction operation from file-slice * - * @param partitionPath Partition path - * @param fileSlice File Slice + * @param partitionPath Partition path + * @param fileSlice File Slice * @param metricsCaptureFunction Metrics Capture function * @return Compaction Operation */ @@ -74,17 +74,17 @@ public class CompactionUtils { * Generate compaction plan from file-slices * * @param partitionFileSlicePairs list of partition file-slice pairs - * @param extraMetadata Extra Metadata - * @param metricsCaptureFunction Metrics Capture function + * @param extraMetadata Extra Metadata + * @param metricsCaptureFunction Metrics Capture function */ - public static HoodieCompactionPlan buildFromFileSlices( - List> partitionFileSlicePairs, + public static HoodieCompactionPlan buildFromFileSlices(List> partitionFileSlicePairs, Option> extraMetadata, Option, Map>> metricsCaptureFunction) { HoodieCompactionPlan.Builder builder = HoodieCompactionPlan.newBuilder(); extraMetadata.ifPresent(m -> builder.setExtraMetadata(m)); - builder.setOperations(partitionFileSlicePairs.stream().map(pfPair -> - buildFromFileSlice(pfPair.getKey(), pfPair.getValue(), metricsCaptureFunction)).collect(Collectors.toList())); + builder.setOperations(partitionFileSlicePairs.stream() + .map(pfPair -> buildFromFileSlice(pfPair.getKey(), pfPair.getValue(), metricsCaptureFunction)) + .collect(Collectors.toList())); return builder.build(); } @@ -92,12 +92,10 @@ public class CompactionUtils { * Build Avro generated Compaction operation payload from compaction operation POJO for serialization */ public static HoodieCompactionOperation buildHoodieCompactionOperation(CompactionOperation op) { - return HoodieCompactionOperation.newBuilder().setFileId(op.getFileId()) - .setBaseInstantTime(op.getBaseInstantTime()) + return HoodieCompactionOperation.newBuilder().setFileId(op.getFileId()).setBaseInstantTime(op.getBaseInstantTime()) .setPartitionPath(op.getPartitionPath()) .setDataFilePath(op.getDataFilePath().isPresent() ? op.getDataFilePath().get() : null) - .setDeltaFilePaths(op.getDeltaFilePaths()) - .setMetrics(op.getMetrics()).build(); + .setDeltaFilePaths(op.getDeltaFilePaths()).setMetrics(op.getMetrics()).build(); } /** @@ -127,11 +125,10 @@ public class CompactionUtils { }).collect(Collectors.toList()); } - public static HoodieCompactionPlan getCompactionPlan(HoodieTableMetaClient metaClient, - String compactionInstant) throws IOException { - HoodieCompactionPlan compactionPlan = AvroUtils.deserializeCompactionPlan( - metaClient.getActiveTimeline().getInstantAuxiliaryDetails( - HoodieTimeline.getCompactionRequestedInstant(compactionInstant)).get()); + public static HoodieCompactionPlan getCompactionPlan(HoodieTableMetaClient metaClient, String compactionInstant) + throws IOException { + HoodieCompactionPlan compactionPlan = AvroUtils.deserializeCompactionPlan(metaClient.getActiveTimeline() + .getInstantAuxiliaryDetails(HoodieTimeline.getCompactionRequestedInstant(compactionInstant)).get()); return compactionPlan; } @@ -184,6 +181,7 @@ public class CompactionUtils { /** * Return all pending compaction instant times + * * @return */ public static List getPendingCompactionInstantTimes(HoodieTableMetaClient metaClient) { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/ConsistencyGuard.java b/hudi-common/src/main/java/org/apache/hudi/common/util/ConsistencyGuard.java index 57be07b5f..0327c62f7 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/ConsistencyGuard.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/ConsistencyGuard.java @@ -32,12 +32,12 @@ public interface ConsistencyGuard { * File Visibility */ enum FileVisibility { - APPEAR, - DISAPPEAR, + APPEAR, DISAPPEAR, } /** * Wait for file to be listable based on configurable timeout + * * @param filePath * @throws IOException when having trouble listing the path * @throws TimeoutException when retries exhausted @@ -46,6 +46,7 @@ public interface ConsistencyGuard { /** * Wait for file to be listable based on configurable timeout + * * @param filePath * @throws IOException when having trouble listing the path * @throws TimeoutException when retries exhausted @@ -65,8 +66,9 @@ public interface ConsistencyGuard { /** * Wait Till target visibility is reached - * @param dirPath Directory Path - * @param files Files + * + * @param dirPath Directory Path + * @param files Files * @param targetVisibility Target Visibitlity * @throws IOException * @throws TimeoutException diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/ConsistencyGuardConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/util/ConsistencyGuardConfig.java index a73fc5ec9..761aabf89 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/ConsistencyGuardConfig.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/ConsistencyGuardConfig.java @@ -106,14 +106,14 @@ public class ConsistencyGuardConfig extends DefaultHoodieConfig { } public ConsistencyGuardConfig build() { - setDefaultOnCondition(props, !props.containsKey(CONSISTENCY_CHECK_ENABLED_PROP), - CONSISTENCY_CHECK_ENABLED_PROP, DEFAULT_CONSISTENCY_CHECK_ENABLED); + setDefaultOnCondition(props, !props.containsKey(CONSISTENCY_CHECK_ENABLED_PROP), CONSISTENCY_CHECK_ENABLED_PROP, + DEFAULT_CONSISTENCY_CHECK_ENABLED); setDefaultOnCondition(props, !props.containsKey(INITIAL_CONSISTENCY_CHECK_INTERVAL_MS_PROP), INITIAL_CONSISTENCY_CHECK_INTERVAL_MS_PROP, String.valueOf(DEFAULT_INITIAL_CONSISTENCY_CHECK_INTERVAL_MS)); setDefaultOnCondition(props, !props.containsKey(MAX_CONSISTENCY_CHECK_INTERVAL_MS_PROP), MAX_CONSISTENCY_CHECK_INTERVAL_MS_PROP, String.valueOf(DEFAULT_MAX_CONSISTENCY_CHECK_INTERVAL_MS)); - setDefaultOnCondition(props, !props.containsKey(MAX_CONSISTENCY_CHECKS_PROP), - MAX_CONSISTENCY_CHECKS_PROP, String.valueOf(DEFAULT_MAX_CONSISTENCY_CHECKS)); + setDefaultOnCondition(props, !props.containsKey(MAX_CONSISTENCY_CHECKS_PROP), MAX_CONSISTENCY_CHECKS_PROP, + String.valueOf(DEFAULT_MAX_CONSISTENCY_CHECKS)); return new ConsistencyGuardConfig(props); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/DFSPropertiesConfiguration.java b/hudi-common/src/main/java/org/apache/hudi/common/util/DFSPropertiesConfiguration.java index 982098e36..0d37a0f3f 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/DFSPropertiesConfiguration.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/DFSPropertiesConfiguration.java @@ -63,7 +63,7 @@ public class DFSPropertiesConfiguration { int ind = line.indexOf('='); String k = line.substring(0, ind).trim(); String v = line.substring(ind + 1).trim(); - return new String[]{k, v}; + return new String[] {k, v}; } private void visitFile(Path file) { @@ -82,6 +82,7 @@ public class DFSPropertiesConfiguration { /** * Add properties from input stream + * * @param reader Buffered Reader * @throws IOException */ diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/DefaultSizeEstimator.java b/hudi-common/src/main/java/org/apache/hudi/common/util/DefaultSizeEstimator.java index 883c4cfa5..9cae75edb 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/DefaultSizeEstimator.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/DefaultSizeEstimator.java @@ -20,12 +20,13 @@ package org.apache.hudi.common.util; /** * Default implementation of size-estimator that uses Twitter's ObjectSizeCalculator + * * @param */ public class DefaultSizeEstimator implements SizeEstimator { @Override - public long sizeEstimate(T t) { + public long sizeEstimate(T t) { return ObjectSizeCalculator.getObjectSize(t); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/FSUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/FSUtils.java index df2054777..06a2cfa2a 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/FSUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/FSUtils.java @@ -83,10 +83,7 @@ public class FSUtils { for (Entry prop : System.getenv().entrySet()) { if (prop.getKey().startsWith(HOODIE_ENV_PROPS_PREFIX)) { LOG.info("Picking up value for hoodie env var :" + prop.getKey()); - conf.set(prop.getKey() - .replace(HOODIE_ENV_PROPS_PREFIX, "") - .replaceAll("_DOT_", "."), - prop.getValue()); + conf.set(prop.getKey().replace(HOODIE_ENV_PROPS_PREFIX, "").replaceAll("_DOT_", "."), prop.getValue()); } } return conf; @@ -98,12 +95,10 @@ public class FSUtils { try { fs = new Path(path).getFileSystem(conf); } catch (IOException e) { - throw new HoodieIOException("Failed to get instance of " + FileSystem.class.getName(), - e); + throw new HoodieIOException("Failed to get instance of " + FileSystem.class.getName(), e); } - LOG.info( - String.format("Hadoop Configuration: fs.defaultFS: [%s], Config:[%s], FileSystem: [%s]", - conf.getRaw("fs.defaultFS"), conf.toString(), fs.toString())); + LOG.info(String.format("Hadoop Configuration: fs.defaultFS: [%s], Config:[%s], FileSystem: [%s]", + conf.getRaw("fs.defaultFS"), conf.toString(), fs.toString())); return fs; } @@ -125,11 +120,11 @@ public class FSUtils { public static String translateMarkerToDataPath(String basePath, String markerPath, String instantTs) { Preconditions.checkArgument(markerPath.endsWith(HoodieTableMetaClient.MARKER_EXTN)); - String markerRootPath = Path.getPathWithoutSchemeAndAuthority(new Path( - String.format("%s/%s/%s", basePath, HoodieTableMetaClient.TEMPFOLDER_NAME, instantTs))).toString(); + String markerRootPath = Path.getPathWithoutSchemeAndAuthority( + new Path(String.format("%s/%s/%s", basePath, HoodieTableMetaClient.TEMPFOLDER_NAME, instantTs))).toString(); int begin = markerPath.indexOf(markerRootPath); - Preconditions.checkArgument(begin >= 0, "Not in marker dir. Marker Path=" + markerPath - + ", Expected Marker Root=" + markerRootPath); + Preconditions.checkArgument(begin >= 0, + "Not in marker dir. Marker Path=" + markerPath + ", Expected Marker Root=" + markerRootPath); String rPath = markerPath.substring(begin + markerRootPath.length() + 1); return String.format("%s/%s%s", basePath, rPath.replace(HoodieTableMetaClient.MARKER_EXTN, ""), HoodieFileFormat.PARQUET.getFileExtension()); @@ -159,42 +154,38 @@ public class FSUtils { /** * Gets all partition paths assuming date partitioning (year, month, day) three levels down. */ - public static List getAllPartitionFoldersThreeLevelsDown(FileSystem fs, String basePath) - throws IOException { + public static List getAllPartitionFoldersThreeLevelsDown(FileSystem fs, String basePath) throws IOException { List datePartitions = new ArrayList<>(); // Avoid listing and including any folders under the metafolder PathFilter filter = getExcludeMetaPathFilter(); FileStatus[] folders = fs.globStatus(new Path(basePath + "/*/*/*"), filter); for (FileStatus status : folders) { Path path = status.getPath(); - datePartitions.add(String.format("%s/%s/%s", path.getParent().getParent().getName(), - path.getParent().getName(), path.getName())); + datePartitions.add(String.format("%s/%s/%s", path.getParent().getParent().getName(), path.getParent().getName(), + path.getName())); } return datePartitions; } /** - * Given a base partition and a partition path, return - * relative path of partition path to the base path + * Given a base partition and a partition path, return relative path of partition path to the base path */ public static String getRelativePartitionPath(Path basePath, Path partitionPath) { basePath = Path.getPathWithoutSchemeAndAuthority(basePath); partitionPath = Path.getPathWithoutSchemeAndAuthority(partitionPath); String partitionFullPath = partitionPath.toString(); - int partitionStartIndex = partitionFullPath.indexOf( - basePath.getName(), + int partitionStartIndex = partitionFullPath.indexOf(basePath.getName(), basePath.getParent() == null ? 0 : basePath.getParent().toString().length()); // Partition-Path could be empty for non-partitioned tables - return partitionStartIndex + basePath.getName().length() == partitionFullPath.length() ? "" : - partitionFullPath.substring(partitionStartIndex + basePath.getName().length() + 1); + return partitionStartIndex + basePath.getName().length() == partitionFullPath.length() ? "" + : partitionFullPath.substring(partitionStartIndex + basePath.getName().length() + 1); } /** - * Obtain all the partition paths, that are present in this table, denoted by presence of {@link - * HoodiePartitionMetadata#HOODIE_PARTITION_METAFILE} + * Obtain all the partition paths, that are present in this table, denoted by presence of + * {@link HoodiePartitionMetadata#HOODIE_PARTITION_METAFILE} */ - public static List getAllFoldersWithPartitionMetaFile(FileSystem fs, String basePathStr) - throws IOException { + public static List getAllFoldersWithPartitionMetaFile(FileSystem fs, String basePathStr) throws IOException { final Path basePath = new Path(basePathStr); final List partitions = new ArrayList<>(); processFiles(fs, basePathStr, (locatedFileStatus) -> { @@ -221,17 +212,18 @@ public class FSUtils { } /** - * Recursively processes all files in the base-path. If excludeMetaFolder is set, the meta-folder and all its - * subdirs are skipped - * @param fs File System - * @param basePathStr Base-Path - * @param consumer Callback for processing + * Recursively processes all files in the base-path. If excludeMetaFolder is set, the meta-folder and all its subdirs + * are skipped + * + * @param fs File System + * @param basePathStr Base-Path + * @param consumer Callback for processing * @param excludeMetaFolder Exclude .hoodie folder * @throws IOException */ @VisibleForTesting - static void processFiles(FileSystem fs, String basePathStr, - Function consumer, boolean excludeMetaFolder) throws IOException { + static void processFiles(FileSystem fs, String basePathStr, Function consumer, + boolean excludeMetaFolder) throws IOException { PathFilter pathFilter = excludeMetaFolder ? getExcludeMetaPathFilter() : ALLOW_ALL_FILTER; FileStatus[] topLevelStatuses = fs.listStatus(new Path(basePathStr)); for (int i = 0; i < topLevelStatuses.length; i++) { @@ -254,8 +246,7 @@ public class FSUtils { } } - public static List getAllPartitionPaths(FileSystem fs, String basePathStr, - boolean assumeDatePartitioning) + public static List getAllPartitionPaths(FileSystem fs, String basePathStr, boolean assumeDatePartitioning) throws IOException { if (assumeDatePartitioning) { return getAllPartitionFoldersThreeLevelsDown(fs, basePathStr); @@ -304,8 +295,8 @@ public class FSUtils { } /** - * Get the first part of the file name in the log file. That will be the fileId. Log file do not - * have commitTime in the file name. + * Get the first part of the file name in the log file. That will be the fileId. Log file do not have commitTime in + * the file name. */ public static String getFileIdFromLogPath(Path path) { Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName()); @@ -326,8 +317,8 @@ public class FSUtils { } /** - * Get the first part of the file name in the log file. That will be the fileId. Log file do not - * have commitTime in the file name. + * Get the first part of the file name in the log file. That will be the fileId. Log file do not have commitTime in + * the file name. */ public static String getBaseCommitTimeFromLogPath(Path path) { Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName()); @@ -395,10 +386,11 @@ public class FSUtils { return Integer.parseInt(matcher.group(4)); } - public static String makeLogFileName(String fileId, String logFileExtension, - String baseCommitTime, int version, String writeToken) { - String suffix = (writeToken == null) ? String.format("%s_%s%s.%d",fileId, baseCommitTime, logFileExtension, version) - : String.format("%s_%s%s.%d_%s", fileId, baseCommitTime, logFileExtension, version, writeToken); + public static String makeLogFileName(String fileId, String logFileExtension, String baseCommitTime, int version, + String writeToken) { + String suffix = + (writeToken == null) ? String.format("%s_%s%s.%d", fileId, baseCommitTime, logFileExtension, version) + : String.format("%s_%s%s.%d_%s", fileId, baseCommitTime, logFileExtension, version, writeToken); return LOG_FILE_PREFIX + suffix; } @@ -420,12 +412,11 @@ public class FSUtils { /** * Get all the log files for the passed in FileId in the partition path */ - public static Stream getAllLogFiles(FileSystem fs, Path partitionPath, - final String fileId, final String logFileExtension, final String baseCommitTime) - throws IOException { - return Arrays.stream(fs.listStatus(partitionPath, - path -> path.getName().startsWith("." + fileId) && path.getName() - .contains(logFileExtension))) + public static Stream getAllLogFiles(FileSystem fs, Path partitionPath, final String fileId, + final String logFileExtension, final String baseCommitTime) throws IOException { + return Arrays + .stream(fs.listStatus(partitionPath, + path -> path.getName().startsWith("." + fileId) && path.getName().contains(logFileExtension))) .map(HoodieLogFile::new).filter(s -> s.getBaseCommitTime().equals(baseCommitTime)); } @@ -433,14 +424,12 @@ public class FSUtils { * Get the latest log version for the fileId in the partition path */ public static Option> getLatestLogVersion(FileSystem fs, Path partitionPath, - final String fileId, final String logFileExtension, final String baseCommitTime) - throws IOException { + final String fileId, final String logFileExtension, final String baseCommitTime) throws IOException { Option latestLogFile = - getLatestLogFile( - getAllLogFiles(fs, partitionPath, fileId, logFileExtension, baseCommitTime)); + getLatestLogFile(getAllLogFiles(fs, partitionPath, fileId, logFileExtension, baseCommitTime)); if (latestLogFile.isPresent()) { - return Option.of(Pair.of(latestLogFile.get().getLogVersion(), - getWriteTokenFromLogPath(latestLogFile.get().getPath()))); + return Option + .of(Pair.of(latestLogFile.get().getLogVersion(), getWriteTokenFromLogPath(latestLogFile.get().getPath()))); } return Option.empty(); } @@ -450,7 +439,7 @@ public class FSUtils { */ public static int computeNextLogVersion(FileSystem fs, Path partitionPath, final String fileId, final String logFileExtension, final String baseCommitTime) throws IOException { - Option> currentVersionWithWriteToken = + Option> currentVersionWithWriteToken = getLatestLogVersion(fs, partitionPath, fileId, logFileExtension, baseCommitTime); // handle potential overflow return (currentVersionWithWriteToken.isPresent()) ? currentVersionWithWriteToken.get().getKey() + 1 @@ -466,10 +455,9 @@ public class FSUtils { } /** - * When a file was opened and the task died without closing the stream, another task executor - * cannot open because the existing lease will be active. We will try to recover the lease, from - * HDFS. If a data node went down, it takes about 10 minutes for the lease to be rocovered. But if - * the client dies, this should be instant. + * When a file was opened and the task died without closing the stream, another task executor cannot open because the + * existing lease will be active. We will try to recover the lease, from HDFS. If a data node went down, it takes + * about 10 minutes for the lease to be rocovered. But if the client dies, this should be instant. */ public static boolean recoverDFSFileLease(final DistributedFileSystem dfs, final Path p) throws IOException, InterruptedException { @@ -489,44 +477,38 @@ public class FSUtils { return recovered; } - public static void deleteOlderCleanMetaFiles(FileSystem fs, String metaPath, - Stream instants) { - //TODO - this should be archived when archival is made general for all meta-data + public static void deleteOlderCleanMetaFiles(FileSystem fs, String metaPath, Stream instants) { + // TODO - this should be archived when archival is made general for all meta-data // skip MIN_CLEAN_TO_KEEP and delete rest instants.skip(MIN_CLEAN_TO_KEEP).map(s -> { try { return fs.delete(new Path(metaPath, s.getFileName()), false); } catch (IOException e) { - throw new HoodieIOException("Could not delete clean meta files" + s.getFileName(), - e); + throw new HoodieIOException("Could not delete clean meta files" + s.getFileName(), e); } }); } - public static void deleteOlderRollbackMetaFiles(FileSystem fs, String metaPath, - Stream instants) { - //TODO - this should be archived when archival is made general for all meta-data + public static void deleteOlderRollbackMetaFiles(FileSystem fs, String metaPath, Stream instants) { + // TODO - this should be archived when archival is made general for all meta-data // skip MIN_ROLLBACK_TO_KEEP and delete rest instants.skip(MIN_ROLLBACK_TO_KEEP).map(s -> { try { return fs.delete(new Path(metaPath, s.getFileName()), false); } catch (IOException e) { - throw new HoodieIOException( - "Could not delete rollback meta files " + s.getFileName(), e); + throw new HoodieIOException("Could not delete rollback meta files " + s.getFileName(), e); } }); } - public static void deleteOlderRestoreMetaFiles(FileSystem fs, String metaPath, - Stream instants) { - //TODO - this should be archived when archival is made general for all meta-data + public static void deleteOlderRestoreMetaFiles(FileSystem fs, String metaPath, Stream instants) { + // TODO - this should be archived when archival is made general for all meta-data // skip MIN_ROLLBACK_TO_KEEP and delete rest instants.skip(MIN_ROLLBACK_TO_KEEP).map(s -> { try { return fs.delete(new Path(metaPath, s.getFileName()), false); } catch (IOException e) { - throw new HoodieIOException( - "Could not delete restore meta files " + s.getFileName(), e); + throw new HoodieIOException("Could not delete restore meta files " + s.getFileName(), e); } }); } @@ -547,18 +529,18 @@ public class FSUtils { public static Path getPartitionPath(Path basePath, String partitionPath) { // FOr non-partitioned table, return only base-path - return ((partitionPath == null) || (partitionPath.isEmpty())) ? basePath : - new Path(basePath, partitionPath); + return ((partitionPath == null) || (partitionPath.isEmpty())) ? basePath : new Path(basePath, partitionPath); } /** * This is due to HUDI-140 GCS has a different behavior for detecting EOF during seek(). + * * @param inputStream FSDataInputStream * @return true if the inputstream or the wrapped one is of type GoogleHadoopFSInputStream */ public static boolean isGCSInputStream(FSDataInputStream inputStream) { return inputStream.getClass().getCanonicalName().equals("com.google.cloud.hadoop.fs.gcs.GoogleHadoopFSInputStream") || inputStream.getWrappedStream().getClass().getCanonicalName() - .equals("com.google.cloud.hadoop.fs.gcs.GoogleHadoopFSInputStream"); + .equals("com.google.cloud.hadoop.fs.gcs.GoogleHadoopFSInputStream"); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/FailSafeConsistencyGuard.java b/hudi-common/src/main/java/org/apache/hudi/common/util/FailSafeConsistencyGuard.java index bec07a8f5..2bcb901f3 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/FailSafeConsistencyGuard.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/FailSafeConsistencyGuard.java @@ -51,12 +51,11 @@ public class FailSafeConsistencyGuard implements ConsistencyGuard { @Override public void waitTillFileAppears(Path filePath) throws TimeoutException { - waitForFileVisibility(filePath, FileVisibility.APPEAR); + waitForFileVisibility(filePath, FileVisibility.APPEAR); } @Override - public void waitTillFileDisappears(Path filePath) - throws TimeoutException { + public void waitTillFileDisappears(Path filePath) throws TimeoutException { waitForFileVisibility(filePath, FileVisibility.DISAPPEAR); } @@ -72,13 +71,13 @@ public class FailSafeConsistencyGuard implements ConsistencyGuard { /** * Helper function to wait for all files belonging to single directory to appear + * * @param dirPath Dir Path * @param files Files to appear/disappear * @param event Appear/Disappear * @throws TimeoutException */ - public void waitForFilesVisibility(String dirPath, List files, FileVisibility event) - throws TimeoutException { + public void waitForFilesVisibility(String dirPath, List files, FileVisibility event) throws TimeoutException { Path dir = new Path(dirPath); List filesWithoutSchemeAndAuthority = files.stream().map(f -> Path.getPathWithoutSchemeAndAuthority(new Path(f))).map(p -> p.toString()) @@ -112,6 +111,7 @@ public class FailSafeConsistencyGuard implements ConsistencyGuard { /** * Helper to check of file visibility + * * @param filePath File Path * @param visibility Visibility * @return @@ -140,6 +140,7 @@ public class FailSafeConsistencyGuard implements ConsistencyGuard { /** * Helper function to wait till file either appears/disappears + * * @param filePath File Path * @param visibility * @throws TimeoutException @@ -166,6 +167,7 @@ public class FailSafeConsistencyGuard implements ConsistencyGuard { /** * Retries the predicate for condfigurable number of times till we the predicate returns success + * * @param predicate Predicate Function * @param timedOutMessage Timed-Out message for logging * @throws TimeoutException when retries are exhausted diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/FileIOUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/FileIOUtils.java index 908c780e2..94845586f 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/FileIOUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/FileIOUtils.java @@ -39,10 +39,7 @@ public class FileIOUtils { public static void deleteDirectory(File directory) throws IOException { if (directory.exists()) { - Files.walk(directory.toPath()) - .sorted(Comparator.reverseOrder()) - .map(Path::toFile) - .forEach(File::delete); + Files.walk(directory.toPath()).sorted(Comparator.reverseOrder()).map(Path::toFile).forEach(File::delete); directory.delete(); if (directory.exists()) { throw new IOException("Unable to delete directory " + directory); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieAvroUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieAvroUtils.java index 75b9b556a..76ba1fdde 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieAvroUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieAvroUtils.java @@ -56,9 +56,8 @@ public class HoodieAvroUtils { private static ThreadLocal reuseDecoder = ThreadLocal.withInitial(() -> null); // All metadata fields are optional strings. - private static final Schema METADATA_FIELD_SCHEMA = Schema.createUnion(Arrays.asList( - Schema.create(Schema.Type.NULL), - Schema.create(Schema.Type.STRING))); + private static final Schema METADATA_FIELD_SCHEMA = + Schema.createUnion(Arrays.asList(Schema.create(Schema.Type.NULL), Schema.create(Schema.Type.STRING))); private static final Schema RECORD_KEY_SCHEMA = initRecordKeySchema(); @@ -66,8 +65,7 @@ public class HoodieAvroUtils { * Convert a given avro record to bytes */ public static byte[] avroToBytes(GenericRecord record) throws IOException { - GenericDatumWriter writer = - new GenericDatumWriter<>(record.getSchema()); + GenericDatumWriter writer = new GenericDatumWriter<>(record.getSchema()); ByteArrayOutputStream out = new ByteArrayOutputStream(); BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(out, reuseEncoder.get()); reuseEncoder.set(encoder); @@ -101,16 +99,16 @@ public class HoodieAvroUtils { public static Schema addMetadataFields(Schema schema) { List parentFields = new ArrayList<>(); - Schema.Field commitTimeField = new Schema.Field(HoodieRecord.COMMIT_TIME_METADATA_FIELD, - METADATA_FIELD_SCHEMA, "", NullNode.getInstance()); - Schema.Field commitSeqnoField = new Schema.Field(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, - METADATA_FIELD_SCHEMA, "", NullNode.getInstance()); - Schema.Field recordKeyField = new Schema.Field(HoodieRecord.RECORD_KEY_METADATA_FIELD, - METADATA_FIELD_SCHEMA, "", NullNode.getInstance()); - Schema.Field partitionPathField = new Schema.Field(HoodieRecord.PARTITION_PATH_METADATA_FIELD, - METADATA_FIELD_SCHEMA, "", NullNode.getInstance()); - Schema.Field fileNameField = new Schema.Field(HoodieRecord.FILENAME_METADATA_FIELD, - METADATA_FIELD_SCHEMA, "", NullNode.getInstance()); + Schema.Field commitTimeField = + new Schema.Field(HoodieRecord.COMMIT_TIME_METADATA_FIELD, METADATA_FIELD_SCHEMA, "", NullNode.getInstance()); + Schema.Field commitSeqnoField = + new Schema.Field(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, METADATA_FIELD_SCHEMA, "", NullNode.getInstance()); + Schema.Field recordKeyField = + new Schema.Field(HoodieRecord.RECORD_KEY_METADATA_FIELD, METADATA_FIELD_SCHEMA, "", NullNode.getInstance()); + Schema.Field partitionPathField = + new Schema.Field(HoodieRecord.PARTITION_PATH_METADATA_FIELD, METADATA_FIELD_SCHEMA, "", NullNode.getInstance()); + Schema.Field fileNameField = + new Schema.Field(HoodieRecord.FILENAME_METADATA_FIELD, METADATA_FIELD_SCHEMA, "", NullNode.getInstance()); parentFields.add(commitTimeField); parentFields.add(commitSeqnoField); @@ -127,15 +125,14 @@ public class HoodieAvroUtils { } } - Schema mergedSchema = Schema - .createRecord(schema.getName(), schema.getDoc(), schema.getNamespace(), false); + Schema mergedSchema = Schema.createRecord(schema.getName(), schema.getDoc(), schema.getNamespace(), false); mergedSchema.setFields(parentFields); return mergedSchema; } private static Schema initRecordKeySchema() { - Schema.Field recordKeyField = new Schema.Field(HoodieRecord.RECORD_KEY_METADATA_FIELD, - METADATA_FIELD_SCHEMA, "", NullNode.getInstance()); + Schema.Field recordKeyField = + new Schema.Field(HoodieRecord.RECORD_KEY_METADATA_FIELD, METADATA_FIELD_SCHEMA, "", NullNode.getInstance()); Schema recordKeySchema = Schema.createRecord("HoodieRecordKey", "", "", false); recordKeySchema.setFields(Arrays.asList(recordKeyField)); return recordKeySchema; @@ -145,8 +142,8 @@ public class HoodieAvroUtils { return RECORD_KEY_SCHEMA; } - public static GenericRecord addHoodieKeyToRecord(GenericRecord record, String recordKey, - String partitionPath, String fileName) { + public static GenericRecord addHoodieKeyToRecord(GenericRecord record, String recordKey, String partitionPath, + String fileName) { record.put(HoodieRecord.FILENAME_METADATA_FIELD, fileName); record.put(HoodieRecord.PARTITION_PATH_METADATA_FIELD, partitionPath); record.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, recordKey); @@ -154,9 +151,9 @@ public class HoodieAvroUtils { } /** - * Add null fields to passed in schema. Caller is responsible for ensuring there is no duplicates. - * As different query engines have varying constraints regarding treating the case-sensitivity of fields, its best - * to let caller determine that. + * Add null fields to passed in schema. Caller is responsible for ensuring there is no duplicates. As different query + * engines have varying constraints regarding treating the case-sensitivity of fields, its best to let caller + * determine that. * * @param schema Passed in schema * @param newFieldNames Null Field names to be added @@ -176,8 +173,7 @@ public class HoodieAvroUtils { /** * Adds the Hoodie commit metadata into the provided Generic Record. */ - public static GenericRecord addCommitMetadataToRecord(GenericRecord record, String commitTime, - String commitSeqno) { + public static GenericRecord addCommitMetadataToRecord(GenericRecord record, String commitTime, String commitSeqno) { record.put(HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitTime); record.put(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, commitSeqno); return record; @@ -207,8 +203,7 @@ public class HoodieAvroUtils { } if (!GenericData.get().validate(newSchema, newRecord)) { throw new SchemaCompatabilityException( - "Unable to validate the rewritten record " + record + " against schema " - + newSchema); + "Unable to validate the rewritten record " + record + " against schema " + newSchema); } return newRecord; } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieRecordSizeEstimator.java b/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieRecordSizeEstimator.java index 0cdd243db..12d435fd5 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieRecordSizeEstimator.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieRecordSizeEstimator.java @@ -27,6 +27,7 @@ import org.apache.log4j.Logger; /** * Size Estimator for Hoodie record payload + * * @param */ public class HoodieRecordSizeEstimator implements SizeEstimator> { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieTimer.java b/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieTimer.java index ba1f4e088..d4206fccb 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieTimer.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieTimer.java @@ -23,9 +23,8 @@ import java.util.Deque; import org.apache.hudi.exception.HoodieException; /** - * Timing utility to help keep track of execution times of code blocks. This class helps to allow multiple - * timers started at the same time and automatically returns the execution time in the order in which the - * timers are stopped. + * Timing utility to help keep track of execution times of code blocks. This class helps to allow multiple timers + * started at the same time and automatically returns the execution time in the order in which the timers are stopped. */ public class HoodieTimer { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/LogReaderUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/LogReaderUtils.java index bb21cec57..0ec3a4bb8 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/LogReaderUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/LogReaderUtils.java @@ -49,8 +49,8 @@ public class LogReaderUtils { HoodieLogBlock block = reader.prev(); if (block instanceof HoodieAvroDataBlock && block != null) { HoodieAvroDataBlock lastBlock = (HoodieAvroDataBlock) block; - if (completedTimeline.containsOrBeforeTimelineStarts(lastBlock.getLogBlockHeader().get(HeaderMetadataType - .INSTANT_TIME))) { + if (completedTimeline + .containsOrBeforeTimelineStarts(lastBlock.getLogBlockHeader().get(HeaderMetadataType.INSTANT_TIME))) { writerSchema = Schema.parse(lastBlock.getLogBlockHeader().get(HeaderMetadataType.SCHEMA)); break; } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/NoOpConsistencyGuard.java b/hudi-common/src/main/java/org/apache/hudi/common/util/NoOpConsistencyGuard.java index f7c597658..463a1410c 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/NoOpConsistencyGuard.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/NoOpConsistencyGuard.java @@ -27,12 +27,10 @@ import org.apache.hadoop.fs.Path; public class NoOpConsistencyGuard implements ConsistencyGuard { @Override - public void waitTillFileAppears(Path filePath) { - } + public void waitTillFileAppears(Path filePath) {} @Override - public void waitTillFileDisappears(Path filePath) { - } + public void waitTillFileDisappears(Path filePath) {} @Override public void waitTillAllFilesAppear(String dirPath, List files) { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/ObjectSizeCalculator.java b/hudi-common/src/main/java/org/apache/hudi/common/util/ObjectSizeCalculator.java index 8fdf73703..6eb6842b7 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/ObjectSizeCalculator.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/ObjectSizeCalculator.java @@ -54,17 +54,13 @@ import java.util.List; import java.util.Set; /** - * Contains utility methods for calculating the memory usage of objects. It - * only works on the HotSpot JVM, and infers the actual memory layout (32 bit - * vs. 64 bit word size, compressed object pointers vs. uncompressed) from - * best available indicators. It can reliably detect a 32 bit vs. 64 bit JVM. - * It can only make an educated guess at whether compressed OOPs are used, - * though; specifically, it knows what the JVM's default choice of OOP - * compression would be based on HotSpot version and maximum heap sizes, but if - * the choice is explicitly overridden with the -XX:{+|-}UseCompressedOops command line - * switch, it can not detect - * this fact and will report incorrect sizes, as it will presume the default JVM - * behavior. + * Contains utility methods for calculating the memory usage of objects. It only works on the HotSpot JVM, and infers + * the actual memory layout (32 bit vs. 64 bit word size, compressed object pointers vs. uncompressed) from best + * available indicators. It can reliably detect a 32 bit vs. 64 bit JVM. It can only make an educated guess at whether + * compressed OOPs are used, though; specifically, it knows what the JVM's default choice of OOP compression would be + * based on HotSpot version and maximum heap sizes, but if the choice is explicitly overridden with the + * -XX:{+|-}UseCompressedOops command line switch, it can not detect this fact and will report incorrect sizes, + * as it will presume the default JVM behavior. * * @author Attila Szegedi */ @@ -104,8 +100,7 @@ public class ObjectSizeCalculator { int getReferenceSize(); /** - * Returns the quantum field size for a field owned by one of an object's ancestor superclasses - * in this JVM. + * Returns the quantum field size for a field owned by one of an object's ancestor superclasses in this JVM. * * @return the quantum field size for a superclass field. */ @@ -114,24 +109,18 @@ public class ObjectSizeCalculator { private static class CurrentLayout { - private static final MemoryLayoutSpecification SPEC = - getEffectiveMemoryLayoutSpecification(); + private static final MemoryLayoutSpecification SPEC = getEffectiveMemoryLayoutSpecification(); } /** - * Given an object, returns the total allocated size, in bytes, of the object - * and all other objects reachable from it. Attempts to to detect the current JVM memory layout, - * but may fail with {@link UnsupportedOperationException}; + * Given an object, returns the total allocated size, in bytes, of the object and all other objects reachable from it. + * Attempts to to detect the current JVM memory layout, but may fail with {@link UnsupportedOperationException}; * - * @param obj the object; can be null. Passing in a {@link java.lang.Class} object doesn't do - * anything special, it measures the size of all objects - * reachable through it (which will include its class loader, and by - * extension, all other Class objects loaded by - * the same loader, and all the parent class loaders). It doesn't provide the - * size of the static fields in the JVM class that the Class object - * represents. - * @return the total allocated size of the object and all other objects it - * retains. + * @param obj the object; can be null. Passing in a {@link java.lang.Class} object doesn't do anything special, it + * measures the size of all objects reachable through it (which will include its class loader, and by + * extension, all other Class objects loaded by the same loader, and all the parent class loaders). It doesn't + * provide the size of the static fields in the JVM class that the Class object represents. + * @return the total allocated size of the object and all other objects it retains. * @throws UnsupportedOperationException if the current vm memory layout cannot be detected. */ public static long getObjectSize(Object obj) throws UnsupportedOperationException { @@ -164,8 +153,7 @@ public class ObjectSizeCalculator { private long size; /** - * Creates an object size calculator that can calculate object sizes for a given - * {@code memoryLayoutSpecification}. + * Creates an object size calculator that can calculate object sizes for a given {@code memoryLayoutSpecification}. * * @param memoryLayoutSpecification a description of the JVM memory layout. */ @@ -179,24 +167,19 @@ public class ObjectSizeCalculator { } /** - * Given an object, returns the total allocated size, in bytes, of the object - * and all other objects reachable from it. + * Given an object, returns the total allocated size, in bytes, of the object and all other objects reachable from it. * - * @param obj the object; can be null. Passing in a {@link java.lang.Class} object doesn't do - * anything special, it measures the size of all objects - * reachable through it (which will include its class loader, and by - * extension, all other Class objects loaded by - * the same loader, and all the parent class loaders). It doesn't provide the - * size of the static fields in the JVM class that the Class object - * represents. - * @return the total allocated size of the object and all other objects it - * retains. + * @param obj the object; can be null. Passing in a {@link java.lang.Class} object doesn't do anything special, it + * measures the size of all objects reachable through it (which will include its class loader, and by + * extension, all other Class objects loaded by the same loader, and all the parent class loaders). It doesn't + * provide the size of the static fields in the JVM class that the Class object represents. + * @return the total allocated size of the object and all other objects it retains. */ public synchronized long calculateObjectSize(Object obj) { // Breadth-first traversal instead of naive depth-first with recursive // implementation, so we don't blow the stack traversing long linked lists. try { - for (; ; ) { + for (;;) { visit(obj); if (pending.isEmpty()) { return size; @@ -324,8 +307,7 @@ public class ObjectSizeCalculator { } this.fieldsSize = fieldsSize; this.objectSize = roundTo(objectHeaderSize + fieldsSize, objectPadding); - this.referenceFields = referenceFields.toArray( - new Field[referenceFields.size()]); + this.referenceFields = referenceFields.toArray(new Field[referenceFields.size()]); } void visit(Object obj, ObjectSizeCalculator calc) { @@ -338,8 +320,7 @@ public class ObjectSizeCalculator { try { calc.enqueue(f.get(obj)); } catch (IllegalAccessException e) { - final AssertionError ae = new AssertionError( - "Unexpected denial of access to " + f); + final AssertionError ae = new AssertionError("Unexpected denial of access to " + f); ae.initCause(e); throw ae; } @@ -360,17 +341,15 @@ public class ObjectSizeCalculator { if (type == long.class || type == double.class) { return 8; } - throw new AssertionError("Encountered unexpected primitive type " - + type.getName()); + throw new AssertionError("Encountered unexpected primitive type " + type.getName()); } @VisibleForTesting static MemoryLayoutSpecification getEffectiveMemoryLayoutSpecification() { final String vmName = System.getProperty("java.vm.name"); - if (vmName == null || !(vmName.startsWith("Java HotSpot(TM) ") - || vmName.startsWith("OpenJDK") || vmName.startsWith("TwitterJDK"))) { - throw new UnsupportedOperationException( - "ObjectSizeCalculator only supported on HotSpot VM"); + if (vmName == null || !(vmName.startsWith("Java HotSpot(TM) ") || vmName.startsWith("OpenJDK") + || vmName.startsWith("TwitterJDK"))) { + throw new UnsupportedOperationException("ObjectSizeCalculator only supported on HotSpot VM"); } final String dataModel = System.getProperty("sun.arch.data.model"); @@ -403,13 +382,12 @@ public class ObjectSizeCalculator { } }; } else if (!"64".equals(dataModel)) { - throw new UnsupportedOperationException("Unrecognized value '" - + dataModel + "' of sun.arch.data.model system property"); + throw new UnsupportedOperationException( + "Unrecognized value '" + dataModel + "' of sun.arch.data.model system property"); } final String strVmVersion = System.getProperty("java.vm.version"); - final int vmVersion = Integer.parseInt(strVmVersion.substring(0, - strVmVersion.indexOf('.'))); + final int vmVersion = Integer.parseInt(strVmVersion.substring(0, strVmVersion.indexOf('.'))); if (vmVersion >= 17) { long maxMemory = 0; for (MemoryPoolMXBean mp : ManagementFactory.getMemoryPoolMXBeans()) { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/Option.java b/hudi-common/src/main/java/org/apache/hudi/common/util/Option.java index e0fa6a429..ecc01fb1f 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/Option.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/Option.java @@ -54,13 +54,13 @@ public final class Option implements Serializable { } /** - * Returns an empty {@code Option} instance. No value is present for this Option. + * Returns an empty {@code Option} instance. No value is present for this Option. * * @param Type of the non-existent value * @return an empty {@code Option} * @apiNote Though it may be tempting to do so, avoid testing if an object is empty by comparing with {@code ==} - * against instances returned by {@code Option.empty()}. There is no guarantee that it is a singleton. Instead, use - * {@link #isPresent()}. + * against instances returned by {@code Option.empty()}. There is no guarantee that it is a singleton. + * Instead, use {@link #isPresent()}. */ public static Option empty() { @SuppressWarnings("unchecked") @@ -143,7 +143,7 @@ public final class Option implements Serializable { * * @param predicate a predicate to apply to the value, if present * @return an {@code Option} describing the value of this {@code Option} if a value is present and the value matches - * the given predicate, otherwise an empty {@code Option} + * the given predicate, otherwise an empty {@code Option} * @throws NullPointerException if the predicate is null */ public Option filter(Predicate predicate) { @@ -157,25 +157,27 @@ public final class Option implements Serializable { /** * If a value is present, apply the provided mapping function to it, and if the result is non-null, return an {@code - * Option} describing the result. Otherwise return an empty {@code Option}. + * Option} describing the result. Otherwise return an empty {@code Option}. * * @param The type of the result of the mapping function * @param mapper a mapping function to apply to the value, if present * @return an {@code Option} describing the result of applying a mapping function to the value of this {@code Option}, - * if a value is present, otherwise an empty {@code Option} + * if a value is present, otherwise an empty {@code Option} * @throws NullPointerException if the mapping function is null * @apiNote This method supports post-processing on optional values, without the need to explicitly check for a return - * status. For example, the following code traverses a stream of file names, selects one that has not yet been - * processed, and then opens that file, returning an {@code Option}: + * status. For example, the following code traverses a stream of file names, selects one that has not yet + * been processed, and then opens that file, returning an {@code Option}: * - *
    {@code
    +   *          
    +   * {@code
        *     Option fis =
        *         names.stream().filter(name -> !isProcessedYet(name))
        *                       .findFirst()
        *                       .map(name -> new FileInputStream(name));
    -   * }
    + * } + *
    * - * Here, {@code findFirst} returns an {@code Option}, and then {@code map} returns an {@code + * Here, {@code findFirst} returns an {@code Option}, and then {@code map} returns an {@code * Option} for the desired file if one exists. */ public Option map(Function mapper) { @@ -189,14 +191,14 @@ public final class Option implements Serializable { /** * If a value is present, apply the provided {@code Option}-bearing mapping function to it, return that result, - * otherwise return an empty {@code Option}. This method is similar to {@link #map(Function)}, but the provided - * mapper is one whose result is already an {@code Option}, and if invoked, {@code flatMap} does not wrap it with an + * otherwise return an empty {@code Option}. This method is similar to {@link #map(Function)}, but the provided mapper + * is one whose result is already an {@code Option}, and if invoked, {@code flatMap} does not wrap it with an * additional {@code Option}. * * @param The type parameter to the {@code Option} returned by * @param mapper a mapping function to apply to the value, if present the mapping function * @return the result of applying an {@code Option}-bearing mapping function to the value of this {@code Option}, if a - * value is present, otherwise an empty {@code Option} + * value is present, otherwise an empty {@code Option} * @throws NullPointerException if the mapping function is null or returns a null result */ public Option flatMap(Function> mapper) { @@ -238,7 +240,7 @@ public final class Option implements Serializable { * @throws X if there is no value present * @throws NullPointerException if no value is present and {@code exceptionSupplier} is null * @apiNote A method reference to the exception constructor with an empty argument list can be used as the supplier. - * For example, {@code IllegalStateException::new} + * For example, {@code IllegalStateException::new} */ public T orElseThrow(Supplier exceptionSupplier) throws X { if (value != null) { @@ -289,13 +291,11 @@ public final class Option implements Serializable { * * @return the string representation of this instance * @implSpec If a value is present the result must include its string representation in the result. Empty and present - * Optionals must be unambiguously differentiable. + * Optionals must be unambiguously differentiable. */ @Override public String toString() { - return value != null - ? String.format("Option[%s]", value) - : "Option.empty"; + return value != null ? String.format("Option[%s]", value) : "Option.empty"; } /** diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java index e4cb6606f..40d8d9f16 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java @@ -51,22 +51,22 @@ public class ParquetUtils { /** * Read the rowKey list from the given parquet file. * - * @param filePath The parquet file path. + * @param filePath The parquet file path. * @param configuration configuration to build fs object - * @return Set Set of row keys + * @return Set Set of row keys */ public static Set readRowKeysFromParquet(Configuration configuration, Path filePath) { return filterParquetRowKeys(configuration, filePath, new HashSet<>()); } /** - * Read the rowKey list matching the given filter, from the given parquet file. If the filter is empty, - * then this will return all the rowkeys. + * Read the rowKey list matching the given filter, from the given parquet file. If the filter is empty, then this will + * return all the rowkeys. * - * @param filePath The parquet file path. - * @param configuration configuration to build fs object - * @param filter record keys filter - * @return Set Set of row keys matching candidateRecordKeys + * @param filePath The parquet file path. + * @param configuration configuration to build fs object + * @param filter record keys filter + * @return Set Set of row keys matching candidateRecordKeys */ public static Set filterParquetRowKeys(Configuration configuration, Path filePath, Set filter) { Option filterFunction = Option.empty(); @@ -102,11 +102,9 @@ public class ParquetUtils { ParquetMetadata footer; try { // TODO(vc): Should we use the parallel reading version here? - footer = ParquetFileReader - .readFooter(FSUtils.getFs(parquetFilePath.toString(), conf).getConf(), parquetFilePath); + footer = ParquetFileReader.readFooter(FSUtils.getFs(parquetFilePath.toString(), conf).getConf(), parquetFilePath); } catch (IOException e) { - throw new HoodieIOException("Failed to read footer for parquet " + parquetFilePath, - e); + throw new HoodieIOException("Failed to read footer for parquet " + parquetFilePath, e); } return footer; } @@ -127,8 +125,8 @@ public class ParquetUtils { if (metadata.containsKey(footerName)) { footerVals.add(metadata.get(footerName)); } else { - throw new MetadataNotFoundException("Could not find index in Parquet footer. " - + "Looked for key " + footerName + " in " + parquetFilePath); + throw new MetadataNotFoundException( + "Could not find index in Parquet footer. " + "Looked for key " + footerName + " in " + parquetFilePath); } } return footerVals; @@ -141,21 +139,20 @@ public class ParquetUtils { /** * Read out the bloom filter from the parquet file meta data. */ - public static BloomFilter readBloomFilterFromParquetMetadata(Configuration configuration, - Path parquetFilePath) { - String footerVal = readParquetFooter(configuration, parquetFilePath, - HoodieAvroWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY).get(0); + public static BloomFilter readBloomFilterFromParquetMetadata(Configuration configuration, Path parquetFilePath) { + String footerVal = + readParquetFooter(configuration, parquetFilePath, HoodieAvroWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY) + .get(0); return new BloomFilter(footerVal); } public static String[] readMinMaxRecordKeys(Configuration configuration, Path parquetFilePath) { List minMaxKeys = readParquetFooter(configuration, parquetFilePath, - HoodieAvroWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER, - HoodieAvroWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER); + HoodieAvroWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER, HoodieAvroWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER); if (minMaxKeys.size() != 2) { - throw new HoodieException(String.format( - "Could not read min/max record key out of footer correctly from %s. read) : %s", - parquetFilePath, minMaxKeys)); + throw new HoodieException( + String.format("Could not read min/max record key out of footer correctly from %s. read) : %s", + parquetFilePath, minMaxKeys)); } return new String[] {minMaxKeys.get(0), minMaxKeys.get(1)}; } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/ReflectionUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/ReflectionUtils.java index 54b51530b..c9125e35a 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/ReflectionUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/ReflectionUtils.java @@ -56,14 +56,11 @@ public class ReflectionUtils { /** * Instantiate a given class with a generic record payload */ - public static T loadPayload(String recordPayloadClass, - Object[] payloadArgs, + public static T loadPayload(String recordPayloadClass, Object[] payloadArgs, Class... constructorArgTypes) { try { - return (T) getClass(recordPayloadClass).getConstructor(constructorArgTypes) - .newInstance(payloadArgs); - } catch (InstantiationException | IllegalAccessException - | InvocationTargetException | NoSuchMethodException e) { + return (T) getClass(recordPayloadClass).getConstructor(constructorArgTypes).newInstance(payloadArgs); + } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException e) { throw new HoodieException("Unable to instantiate payload class ", e); } } @@ -74,8 +71,7 @@ public class ReflectionUtils { public static Object loadClass(String clazz, Class[] constructorArgTypes, Object... constructorArgs) { try { return getClass(clazz).getConstructor(constructorArgTypes).newInstance(constructorArgs); - } catch (InstantiationException | IllegalAccessException - | InvocationTargetException | NoSuchMethodException e) { + } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException e) { throw new HoodieException("Unable to instantiate class ", e); } } @@ -84,13 +80,13 @@ public class ReflectionUtils { * Creates an instance of the given class. Constructor arg types are inferred. */ public static Object loadClass(String clazz, Object... constructorArgs) { - Class[] constructorArgTypes = Arrays.stream(constructorArgs) - .map(Object::getClass).toArray(Class[]::new); + Class[] constructorArgTypes = Arrays.stream(constructorArgs).map(Object::getClass).toArray(Class[]::new); return loadClass(clazz, constructorArgTypes, constructorArgs); } /** * Return stream of top level class names in the same class path as passed-in class + * * @param clazz */ public static Stream getTopLevelClassesInClasspath(Class clazz) { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/RocksDBDAO.java b/hudi-common/src/main/java/org/apache/hudi/common/util/RocksDBDAO.java index 2ae53f975..578f19382 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/RocksDBDAO.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/RocksDBDAO.java @@ -64,8 +64,8 @@ public class RocksDBDAO { public RocksDBDAO(String basePath, String rocksDBBasePath) { this.basePath = basePath; - this.rocksDBBasePath = String.format("%s/%s/%s", rocksDBBasePath, - this.basePath.replace("/", "_"), UUID.randomUUID().toString()); + this.rocksDBBasePath = + String.format("%s/%s/%s", rocksDBBasePath, this.basePath.replace("/", "_"), UUID.randomUUID().toString()); init(); } @@ -137,8 +137,8 @@ public class RocksDBDAO { managedColumnFamilies.add(getColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); } else { log.info("Loading column families :" + existing.stream().map(String::new).collect(Collectors.toList())); - managedColumnFamilies.addAll(existing.stream() - .map(RocksDBDAO::getColumnFamilyDescriptor).collect(Collectors.toList())); + managedColumnFamilies + .addAll(existing.stream().map(RocksDBDAO::getColumnFamilyDescriptor).collect(Collectors.toList())); } return managedColumnFamilies; } @@ -350,9 +350,8 @@ public class RocksDBDAO { } } - log.info("Prefix Search for (query=" + prefix + ") on " + columnFamilyName - + ". Total Time Taken (msec)=" + timer.endTimer() - + ". Serialization Time taken(micro)=" + timeTakenMicro + ", num entries=" + results.size()); + log.info("Prefix Search for (query=" + prefix + ") on " + columnFamilyName + ". Total Time Taken (msec)=" + + timer.endTimer() + ". Serialization Time taken(micro)=" + timeTakenMicro + ", num entries=" + results.size()); return results.stream(); } @@ -368,7 +367,7 @@ public class RocksDBDAO { log.info("Prefix DELETE (query=" + prefix + ") on " + columnFamilyName); final RocksIterator it = getRocksDB().newIterator(managedHandlesMap.get(columnFamilyName)); it.seek(prefix.getBytes()); - //Find first and last keys to be deleted + // Find first and last keys to be deleted String firstEntry = null; String lastEntry = null; while (it.isValid() && new String(it.key()).startsWith(prefix)) { @@ -384,9 +383,8 @@ public class RocksDBDAO { if (null != firstEntry) { try { // This will not delete the last entry - getRocksDB().deleteRange(managedHandlesMap.get(columnFamilyName), firstEntry.getBytes(), - lastEntry.getBytes()); - //Delete the last entry + getRocksDB().deleteRange(managedHandlesMap.get(columnFamilyName), firstEntry.getBytes(), lastEntry.getBytes()); + // Delete the last entry getRocksDB().delete(lastEntry.getBytes()); } catch (RocksDBException e) { log.error("Got exception performing range delete"); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/RocksDBSchemaHelper.java b/hudi-common/src/main/java/org/apache/hudi/common/util/RocksDBSchemaHelper.java index c14df29a2..437fe5b15 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/RocksDBSchemaHelper.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/RocksDBSchemaHelper.java @@ -28,27 +28,17 @@ import org.apache.hudi.common.table.HoodieTableMetaClient; /** * Helper class to generate Key and column names for rocksdb based view * - * For RocksDB, 3 colFamilies are used for storing file-system view for each dataset. - * (a) View - * (b) Partitions Cached - * (c) Pending Compactions - * - * - * View : Key : Store both slice and Data file stored. - * Slice : - * Key = "type=slice,part=,id=,instant=" - * Value = Serialized FileSlice - * Data File : - * Key = "type=df,part=,id=,instant=" - * Value = Serialized DataFile - * - * Partitions : - * Key = "part=" - * Value = Boolean - * + * For RocksDB, 3 colFamilies are used for storing file-system view for each dataset. (a) View (b) Partitions Cached (c) * Pending Compactions - * Key = "part=,id=" - * Value = Pair + * + * + * View : Key : Store both slice and Data file stored. Slice : Key = + * "type=slice,part=,id=,instant=" Value = Serialized FileSlice Data File : Key = + * "type=df,part=,id=,instant=" Value = Serialized DataFile + * + * Partitions : Key = "part=" Value = Boolean + * + * Pending Compactions Key = "part=,id=" Value = Pair */ public class RocksDBSchemaHelper { @@ -80,15 +70,15 @@ public class RocksDBSchemaHelper { } public String getKeyForSliceView(String partitionPath, String fileId, String instantTime) { - return String.format("type=slice,part=%s,id=%s,instant=%s",partitionPath, fileId, instantTime); + return String.format("type=slice,part=%s,id=%s,instant=%s", partitionPath, fileId, instantTime); } public String getPrefixForSliceViewByPartitionFile(String partitionPath, String fileId) { - return String.format("type=slice,part=%s,id=%s,instant=",partitionPath, fileId); + return String.format("type=slice,part=%s,id=%s,instant=", partitionPath, fileId); } public String getPrefixForDataFileViewByPartitionFile(String partitionPath, String fileId) { - return String.format("type=df,part=%s,id=%s,instant=",partitionPath, fileId); + return String.format("type=df,part=%s,id=%s,instant=", partitionPath, fileId); } public String getKeyForDataFileView(HoodieFileGroup fileGroup, FileSlice slice) { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/SerializationUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/SerializationUtils.java index d5bc6c56a..cc9d2f157 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/SerializationUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/SerializationUtils.java @@ -34,8 +34,7 @@ import org.objenesis.instantiator.ObjectInstantiator; /** - * {@link SerializationUtils} class internally uses {@link Kryo} serializer for serializing / - * deserializing objects. + * {@link SerializationUtils} class internally uses {@link Kryo} serializer for serializing / deserializing objects. */ public class SerializationUtils { @@ -44,10 +43,12 @@ public class SerializationUtils { ThreadLocal.withInitial(() -> new KryoSerializerInstance()); // Serialize - //----------------------------------------------------------------------- + // ----------------------------------------------------------------------- /** - *

    Serializes an {@code Object} to a byte array for storage/serialization.

    + *

    + * Serializes an {@code Object} to a byte array for storage/serialization. + *

    * * @param obj the object to serialize to bytes * @return a byte[] with the converted Serializable @@ -58,15 +59,18 @@ public class SerializationUtils { } // Deserialize - //----------------------------------------------------------------------- + // ----------------------------------------------------------------------- /** - *

    Deserializes a single {@code Object} from an array of bytes.

    + *

    + * Deserializes a single {@code Object} from an array of bytes. + *

    * - *

    If the call site incorrectly types the return value, a {@link ClassCastException} is thrown - * from the call site. Without Generics in this declaration, the call site must type cast and can - * cause the same ClassCastException. Note that in both cases, the ClassCastException is in the - * call site, not in this method.

    + *

    + * If the call site incorrectly types the return value, a {@link ClassCastException} is thrown from the call site. + * Without Generics in this declaration, the call site must type cast and can cause the same ClassCastException. Note + * that in both cases, the ClassCastException is in the call site, not in this method. + *

    * * @param the object type to be deserialized * @param objectData the serialized object, must not be null @@ -109,8 +113,8 @@ public class SerializationUtils { } /** - * This class has a no-arg constructor, suitable for use with reflection instantiation. - * For Details checkout com.twitter.chill.KryoBase. + * This class has a no-arg constructor, suitable for use with reflection instantiation. For Details checkout + * com.twitter.chill.KryoBase. */ private static class KryoInstantiator implements Serializable { @@ -153,8 +157,8 @@ public class SerializationUtils { final Constructor constructor = type.getConstructor(); constructor.setAccessible(true); return constructor.newInstance(); - } catch (NoSuchMethodException | IllegalAccessException - | InstantiationException | InvocationTargetException e) { + } catch (NoSuchMethodException | IllegalAccessException | InstantiationException + | InvocationTargetException e) { // ignore this exception. we will fall back to default instantiation strategy. } return super.getInstantiatorStrategy().newInstantiatorOf(type).newInstance(); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/SizeEstimator.java b/hudi-common/src/main/java/org/apache/hudi/common/util/SizeEstimator.java index 63fc9aaa3..57348b81f 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/SizeEstimator.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/SizeEstimator.java @@ -20,14 +20,14 @@ package org.apache.hudi.common.util; /** * An interface to estimate the size of payload in memory + * * @param */ public interface SizeEstimator { /** - * This method is used to estimate the size of a payload in memory. - * The default implementation returns the total allocated size, in bytes, of the object - * and all other objects reachable from it + * This method is used to estimate the size of a payload in memory. The default implementation returns the total + * allocated size, in bytes, of the object and all other objects reachable from it */ long sizeEstimate(T t); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/SpillableMapUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/SpillableMapUtils.java index f9d9cb7f0..3b22893f3 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/SpillableMapUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/SpillableMapUtils.java @@ -43,8 +43,7 @@ public class SpillableMapUtils { /** * |crc|timestamp|sizeOfKey|SizeOfValue|key|value| */ - private static FileEntry readInternal(RandomAccessFile file, long valuePosition, - int valueLength) throws IOException { + private static FileEntry readInternal(RandomAccessFile file, long valuePosition, int valueLength) throws IOException { file.seek(valuePosition); long crc = file.readLong(); long timestamp = file.readLong(); @@ -59,24 +58,22 @@ public class SpillableMapUtils { file.read(value, 0, valueSize); long crcOfReadValue = generateChecksum(value); if (!(crc == crcOfReadValue)) { - throw new HoodieCorruptedDataException("checksum of payload written to external disk does not match, " - + "data may be corrupted"); + throw new HoodieCorruptedDataException( + "checksum of payload written to external disk does not match, " + "data may be corrupted"); } return new FileEntry(crc, keySize, valueSize, key, value, timestamp); } /** - * Write Value and other metadata necessary to disk. Each entry has the following sequence of data

    + * Write Value and other metadata necessary to disk. Each entry has the following sequence of data + *

    * |crc|timestamp|sizeOfKey|SizeOfValue|key|value| */ - public static long spillToDisk(SizeAwareDataOutputStream outputStream, - FileEntry fileEntry) throws IOException { + public static long spillToDisk(SizeAwareDataOutputStream outputStream, FileEntry fileEntry) throws IOException { return spill(outputStream, fileEntry); } - private static long spill(SizeAwareDataOutputStream outputStream, - FileEntry fileEntry) - throws IOException { + private static long spill(SizeAwareDataOutputStream outputStream, FileEntry fileEntry) throws IOException { outputStream.writeLong(fileEntry.getCrc()); outputStream.writeLong(fileEntry.getTimestamp()); outputStream.writeInt(fileEntry.getSizeOfKey()); @@ -107,15 +104,10 @@ public class SpillableMapUtils { * Utility method to convert bytes to HoodieRecord using schema and payload class */ public static R convertToHoodieRecordPayload(GenericRecord rec, String payloadClazz) { - String recKey = rec.get(HoodieRecord.RECORD_KEY_METADATA_FIELD) - .toString(); - String partitionPath = - rec.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD) - .toString(); - HoodieRecord hoodieRecord = new HoodieRecord<>( - new HoodieKey(recKey, partitionPath), - ReflectionUtils - .loadPayload(payloadClazz, new Object[]{Option.of(rec)}, Option.class)); + String recKey = rec.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); + String partitionPath = rec.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString(); + HoodieRecord hoodieRecord = new HoodieRecord<>(new HoodieKey(recKey, partitionPath), + ReflectionUtils.loadPayload(payloadClazz, new Object[] {Option.of(rec)}, Option.class)); return (R) hoodieRecord; } @@ -123,10 +115,8 @@ public class SpillableMapUtils { * Utility method to convert bytes to HoodieRecord using schema and payload class */ public static R generateEmptyPayload(String recKey, String partitionPath, String payloadClazz) { - HoodieRecord hoodieRecord = new HoodieRecord<>( - new HoodieKey(recKey, partitionPath), - ReflectionUtils - .loadPayload(payloadClazz, new Object[]{Option.empty()}, Option.class)); + HoodieRecord hoodieRecord = new HoodieRecord<>(new HoodieKey(recKey, partitionPath), + ReflectionUtils.loadPayload(payloadClazz, new Object[] {Option.empty()}, Option.class)); return (R) hoodieRecord; } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/StringUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/StringUtils.java index a4169b6b2..a6eee4ab1 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/StringUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/StringUtils.java @@ -24,12 +24,14 @@ package org.apache.hudi.common.util; public class StringUtils { /** - *

    Joins the elements of the provided array into a single String - * containing the provided list of elements.

    + *

    + * Joins the elements of the provided array into a single String containing the provided list of elements. + *

    * - *

    No separator is added to the joined String. - * Null objects or empty strings within the array are represented by - * empty strings.

    + *

    + * No separator is added to the joined String. Null objects or empty strings within the array are represented by empty + * strings. + *

    * *
        * StringUtils.join(null)            = null
    @@ -56,7 +58,7 @@ public class StringUtils {
     
       public static String toHexString(byte[] bytes) {
         StringBuilder sb = new StringBuilder(bytes.length * 2);
    -    for (byte b: bytes) {
    +    for (byte b : bytes) {
           sb.append(String.format("%02x", b));
         }
         return sb.toString();
    diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/TimelineDiffHelper.java b/hudi-common/src/main/java/org/apache/hudi/common/util/TimelineDiffHelper.java
    index a28162550..2253c31e6 100644
    --- a/hudi-common/src/main/java/org/apache/hudi/common/util/TimelineDiffHelper.java
    +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/TimelineDiffHelper.java
    @@ -55,19 +55,19 @@ public class TimelineDiffHelper {
     
           // Check If any pending compaction is lost. If so, do not allow incremental timeline sync
           List> compactionInstants = getPendingCompactionTransitions(oldT, newT);
    -      List lostPendingCompactions =
    -          compactionInstants.stream().filter(instantPair -> instantPair.getValue() == null).map(Pair::getKey)
    -              .collect(Collectors.toList());
    +      List lostPendingCompactions = compactionInstants.stream()
    +          .filter(instantPair -> instantPair.getValue() == null).map(Pair::getKey).collect(Collectors.toList());
           if (!lostPendingCompactions.isEmpty()) {
             // If a compaction is unscheduled, fall back to complete refresh of fs view since some log files could have been
             // moved. Its unsafe to incrementally sync in that case.
    -        log.warn("Some pending compactions are no longer in new timeline (unscheduled ?)."
    -            + "They are :" + lostPendingCompactions);
    +        log.warn("Some pending compactions are no longer in new timeline (unscheduled ?)." + "They are :"
    +            + lostPendingCompactions);
             return TimelineDiffResult.UNSAFE_SYNC_RESULT;
           }
    -      List finishedCompactionInstants = compactionInstants.stream().filter(instantPair ->
    -          instantPair.getValue().getAction().equals(HoodieTimeline.COMMIT_ACTION)
    -              && instantPair.getValue().isCompleted()).map(Pair::getKey).collect(Collectors.toList());
    +      List finishedCompactionInstants = compactionInstants.stream()
    +          .filter(instantPair -> instantPair.getValue().getAction().equals(HoodieTimeline.COMMIT_ACTION)
    +              && instantPair.getValue().isCompleted())
    +          .map(Pair::getKey).collect(Collectors.toList());
     
           newT.getInstants().filter(instant -> !oldTimelineInstants.contains(instant)).forEach(newInstants::add);
           return new TimelineDiffResult(newInstants, finishedCompactionInstants, true);
    @@ -125,11 +125,8 @@ public class TimelineDiffHelper {
     
         @Override
         public String toString() {
    -      return "TimelineDiffResult{"
    -          + "newlySeenInstants=" + newlySeenInstants
    -          + ", finishedCompactionInstants=" + finishedCompactionInstants
    -          + ", canSyncIncrementally=" + canSyncIncrementally
    -          + '}';
    +      return "TimelineDiffResult{" + "newlySeenInstants=" + newlySeenInstants + ", finishedCompactionInstants="
    +          + finishedCompactionInstants + ", canSyncIncrementally=" + canSyncIncrementally + '}';
         }
       }
     }
    diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/DiskBasedMap.java b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/DiskBasedMap.java
    index 57496752a..4481770dc 100644
    --- a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/DiskBasedMap.java
    +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/DiskBasedMap.java
    @@ -83,9 +83,10 @@ public final class DiskBasedMap
     
       /**
        * RandomAcessFile is not thread-safe. This API opens a new file handle per thread and returns.
    +   * 
        * @return
        */
    -  private RandomAccessFile getRandomAccessFile()  {
    +  private RandomAccessFile getRandomAccessFile() {
         try {
           RandomAccessFile readHandle = randomAccessFile.get();
           if (readHandle == null) {
    @@ -109,9 +110,9 @@ public final class DiskBasedMap
           writeOnlyFile.getParentFile().mkdir();
         }
         writeOnlyFile.createNewFile();
    -    log.info(
    -        "Spilling to file location " + writeOnlyFile.getAbsolutePath() + " in host (" + InetAddress.getLocalHost()
    -            .getHostAddress() + ") with hostname (" + InetAddress.getLocalHost().getHostName() + ")");
    +    log.info("Spilling to file location " + writeOnlyFile.getAbsolutePath() + " in host ("
    +        + InetAddress.getLocalHost().getHostAddress() + ") with hostname (" + InetAddress.getLocalHost().getHostName()
    +        + ")");
         // Make sure file is deleted when JVM exits
         writeOnlyFile.deleteOnExit();
         addShutDownHook();
    @@ -200,8 +201,8 @@ public final class DiskBasedMap
     
       public static  R get(ValueMetadata entry, RandomAccessFile file) {
         try {
    -      return SerializationUtils.deserialize(SpillableMapUtils.readBytesFromDisk(file,
    -          entry.getOffsetOfValue(), entry.getSizeOfValue()));
    +      return SerializationUtils
    +          .deserialize(SpillableMapUtils.readBytesFromDisk(file, entry.getOffsetOfValue(), entry.getSizeOfValue()));
         } catch (IOException e) {
           throw new HoodieIOException("Unable to readFromDisk Hoodie Record from disk", e);
         }
    @@ -216,8 +217,8 @@ public final class DiskBasedMap
           this.valueMetadataMap.put(key,
               new DiskBasedMap.ValueMetadata(this.filePath, valueSize, filePosition.get(), timestamp));
           byte[] serializedKey = SerializationUtils.serialize(key);
    -      filePosition.set(SpillableMapUtils.spillToDisk(writeOnlyFileHandle,
    -          new FileEntry(SpillableMapUtils.generateChecksum(val),
    +      filePosition
    +          .set(SpillableMapUtils.spillToDisk(writeOnlyFileHandle, new FileEntry(SpillableMapUtils.generateChecksum(val),
                   serializedKey.length, valueSize, serializedKey, val, timestamp)));
         } catch (IOException io) {
           throw new HoodieIOException("Unable to store data in Disk Based map", io);
    @@ -258,8 +259,7 @@ public final class DiskBasedMap
     
       public Stream valueStream() {
         final RandomAccessFile file = getRandomAccessFile();
    -    return valueMetadataMap.values().stream().sorted().sequential()
    -        .map(valueMetaData -> (R)get(valueMetaData, file));
    +    return valueMetadataMap.values().stream().sorted().sequential().map(valueMetaData -> (R) get(valueMetaData, file));
       }
     
       @Override
    @@ -286,8 +286,7 @@ public final class DiskBasedMap
         // Current timestamp when the value was written to disk
         private Long timestamp;
     
    -    public FileEntry(long crc, int sizeOfKey, int sizeOfValue, byte[] key, byte[] value,
    -        long timestamp) {
    +    public FileEntry(long crc, int sizeOfKey, int sizeOfValue, byte[] key, byte[] value, long timestamp) {
           this.crc = crc;
           this.sizeOfKey = sizeOfKey;
           this.sizeOfValue = sizeOfValue;
    diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/ExternalSpillableMap.java b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/ExternalSpillableMap.java
    index 7e6ecff1d..7f44b1056 100644
    --- a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/ExternalSpillableMap.java
    +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/ExternalSpillableMap.java
    @@ -36,13 +36,19 @@ import org.apache.log4j.LogManager;
     import org.apache.log4j.Logger;
     
     /**
    - * An external map that spills content to disk when there is insufficient space for it to grow. 

    This map holds 2 - * types of data structures :

    (1) Key-Value pairs in a in-memory map (2) Key-ValueMetadata pairs in an in-memory map - * which keeps a marker to the values spilled to disk

    NOTE : Values are only appended to disk. If a remove() is - * called, the entry is marked removed from the in-memory key-valueMetadata map but it's values will be lying around in - * the temp file on disk until the file is cleaned.

    The setting of the spill threshold faces the following - * trade-off: If the spill threshold is too high, the in-memory map may occupy more memory than is available, resulting - * in OOM. However, if the spill threshold is too low, we spill frequently and incur unnecessary disk writes. + * An external map that spills content to disk when there is insufficient space for it to grow. + *

    + * This map holds 2 types of data structures : + *

    + * (1) Key-Value pairs in a in-memory map (2) Key-ValueMetadata pairs in an in-memory map which keeps a marker to the + * values spilled to disk + *

    + * NOTE : Values are only appended to disk. If a remove() is called, the entry is marked removed from the in-memory + * key-valueMetadata map but it's values will be lying around in the temp file on disk until the file is cleaned. + *

    + * The setting of the spill threshold faces the following trade-off: If the spill threshold is too high, the in-memory + * map may occupy more memory than is available, resulting in OOM. However, if the spill threshold is too low, we spill + * frequently and incur unnecessary disk writes. */ public class ExternalSpillableMap implements Map { @@ -70,14 +76,13 @@ public class ExternalSpillableMap keySizeEstimator, SizeEstimator valueSizeEstimator) throws IOException { + + public ExternalSpillableMap(Long maxInMemorySizeInBytes, String baseFilePath, SizeEstimator keySizeEstimator, + SizeEstimator valueSizeEstimator) throws IOException { this.inMemoryMap = new HashMap<>(); this.baseFilePath = baseFilePath; this.diskBasedMap = new DiskBasedMap<>(baseFilePath); - this.maxInMemorySizeInBytes = (long) Math - .floor(maxInMemorySizeInBytes * sizingFactorForInMemoryMap); + this.maxInMemorySizeInBytes = (long) Math.floor(maxInMemorySizeInBytes * sizingFactorForInMemoryMap); this.currentInMemoryMapSize = 0L; this.keySizeEstimator = keySizeEstimator; this.valueSizeEstimator = valueSizeEstimator; @@ -169,11 +174,9 @@ public class ExternalSpillableMap " + estimatedPayloadSize); - } else if (shouldEstimatePayloadSize - && inMemoryMap.size() % NUMBER_OF_RECORDS_TO_ESTIMATE_PAYLOAD_SIZE == 0) { + } else if (shouldEstimatePayloadSize && inMemoryMap.size() % NUMBER_OF_RECORDS_TO_ESTIMATE_PAYLOAD_SIZE == 0) { // Re-estimate the size of a record by calculating the size of the entire map containing // N entries and then dividing by the number of entries present (N). This helps to get a // correct estimation of the size of each record in the JVM. diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/ImmutablePair.java b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/ImmutablePair.java index 5d340d130..fd2117a5a 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/ImmutablePair.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/ImmutablePair.java @@ -20,14 +20,19 @@ package org.apache.hudi.common.util.collection; /** * (NOTE: Adapted from Apache commons-lang3) - *

    An immutable pair consisting of two {@code Object} elements.

    + *

    + * An immutable pair consisting of two {@code Object} elements. + *

    * - *

    Although the implementation is immutable, there is no restriction on the objects - * that may be stored. If mutable objects are stored in the pair, then the pair - * itself effectively becomes mutable. The class is also {@code final}, so a subclass - * can not add undesirable behaviour.

    + *

    + * Although the implementation is immutable, there is no restriction on the objects that may be stored. If mutable + * objects are stored in the pair, then the pair itself effectively becomes mutable. The class is also {@code final}, so + * a subclass can not add undesirable behaviour. + *

    * - *

    #ThreadSafe# if both paired objects are thread-safe

    + *

    + * #ThreadSafe# if both paired objects are thread-safe + *

    * * @param the left element type * @param the right element type @@ -49,10 +54,13 @@ public final class ImmutablePair extends Pair { public final R right; /** - *

    Obtains an immutable pair of from two objects inferring the generic types.

    + *

    + * Obtains an immutable pair of from two objects inferring the generic types. + *

    * - *

    This factory allows the pair to be created using inference to - * obtain the generic types.

    + *

    + * This factory allows the pair to be created using inference to obtain the generic types. + *

    * * @param the left element type * @param the right element type @@ -76,7 +84,7 @@ public final class ImmutablePair extends Pair { this.right = right; } - //----------------------------------------------------------------------- + // ----------------------------------------------------------------------- /** * {@inheritDoc} @@ -95,9 +103,13 @@ public final class ImmutablePair extends Pair { } /** - *

    Throws {@code UnsupportedOperationException}.

    + *

    + * Throws {@code UnsupportedOperationException}. + *

    * - *

    This pair is immutable, so this operation is not supported.

    + *

    + * This pair is immutable, so this operation is not supported. + *

    * * @param value the value to set * @return never diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/ImmutableTriple.java b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/ImmutableTriple.java index fc94ed241..f1732aa4b 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/ImmutableTriple.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/ImmutableTriple.java @@ -20,14 +20,19 @@ package org.apache.hudi.common.util.collection; /** * (NOTE: Adapted from Apache commons-lang3) - *

    An immutable triple consisting of three {@code Object} elements.

    + *

    + * An immutable triple consisting of three {@code Object} elements. + *

    * - *

    Although the implementation is immutable, there is no restriction on the objects - * that may be stored. If mutable objects are stored in the triple, then the triple - * itself effectively becomes mutable. The class is also {@code final}, so a subclass - * can not add undesirable behaviour.

    + *

    + * Although the implementation is immutable, there is no restriction on the objects that may be stored. If mutable + * objects are stored in the triple, then the triple itself effectively becomes mutable. The class is also + * {@code final}, so a subclass can not add undesirable behaviour. + *

    * - *

    #ThreadSafe# if all three objects are thread-safe

    + *

    + * #ThreadSafe# if all three objects are thread-safe + *

    * * @param the left element type * @param the middle element type @@ -54,10 +59,13 @@ public final class ImmutableTriple extends Triple { public final R right; /** - *

    Obtains an immutable triple of from three objects inferring the generic types.

    + *

    + * Obtains an immutable triple of from three objects inferring the generic types. + *

    * - *

    This factory allows the triple to be created using inference to - * obtain the generic types.

    + *

    + * This factory allows the triple to be created using inference to obtain the generic types. + *

    * * @param the left element type * @param the middle element type @@ -85,7 +93,7 @@ public final class ImmutableTriple extends Triple { this.right = right; } - //----------------------------------------------------------------------- + // ----------------------------------------------------------------------- /** * {@inheritDoc} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/LazyFileIterable.java b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/LazyFileIterable.java index 7ac39c16e..27ffff86d 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/LazyFileIterable.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/LazyFileIterable.java @@ -66,12 +66,9 @@ public class LazyFileIterable implements Iterable { readOnlyFileHandle.seek(0); // sort the map in increasing order of offset of value so disk seek is only in one(forward) direction - this.metadataIterator = map - .entrySet() - .stream() - .sorted( - (Map.Entry o1, Map.Entry o2) -> - o1.getValue().getOffsetOfValue().compareTo(o2.getValue().getOffsetOfValue())) + this.metadataIterator = map.entrySet().stream() + .sorted((Map.Entry o1, Map.Entry o2) -> o1 + .getValue().getOffsetOfValue().compareTo(o2.getValue().getOffsetOfValue())) .collect(Collectors.toList()).iterator(); this.addShutdownHook(); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/Pair.java b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/Pair.java index bcd800cd8..dec2644fc 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/Pair.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/Pair.java @@ -23,15 +23,20 @@ import java.util.Map; /** * (NOTE: Adapted from Apache commons-lang3) - *

    A pair consisting of two elements.

    + *

    + * A pair consisting of two elements. + *

    * - *

    This class is an abstract implementation defining the basic API. - * It refers to the elements as 'left' and 'right'. It also implements the - * {@code Map.Entry} interface where the key is 'left' and the value is 'right'.

    + *

    + * This class is an abstract implementation defining the basic API. It refers to the elements as 'left' and 'right'. It + * also implements the {@code Map.Entry} interface where the key is 'left' and the value is 'right'. + *

    * - *

    Subclass implementations may be mutable or immutable. - * However, there is no restriction on the type of the stored objects that may be stored. - * If mutable objects are stored in the pair, then the pair itself effectively becomes mutable.

    + *

    + * Subclass implementations may be mutable or immutable. However, there is no restriction on the type of the stored + * objects that may be stored. If mutable objects are stored in the pair, then the pair itself effectively becomes + * mutable. + *

    * * @param the left element type * @param the right element type @@ -44,10 +49,13 @@ public abstract class Pair implements Map.Entry, ComparableObtains an immutable pair of from two objects inferring the generic types.

    + *

    + * Obtains an immutable pair of from two objects inferring the generic types. + *

    * - *

    This factory allows the pair to be created using inference to - * obtain the generic types.

    + *

    + * This factory allows the pair to be created using inference to obtain the generic types. + *

    * * @param the left element type * @param the right element type @@ -59,31 +67,42 @@ public abstract class Pair implements Map.Entry, Comparable(left, right); } - //----------------------------------------------------------------------- + // ----------------------------------------------------------------------- /** - *

    Gets the left element from this pair.

    + *

    + * Gets the left element from this pair. + *

    * - *

    When treated as a key-value pair, this is the key.

    + *

    + * When treated as a key-value pair, this is the key. + *

    * * @return the left element, may be null */ public abstract L getLeft(); /** - *

    Gets the right element from this pair.

    + *

    + * Gets the right element from this pair. + *

    * - *

    When treated as a key-value pair, this is the value.

    + *

    + * When treated as a key-value pair, this is the value. + *

    * * @return the right element, may be null */ public abstract R getRight(); /** - *

    Gets the key from this pair.

    + *

    + * Gets the key from this pair. + *

    * - *

    This method implements the {@code Map.Entry} interface returning the - * left element as the key.

    + *

    + * This method implements the {@code Map.Entry} interface returning the left element as the key. + *

    * * @return the left element as the key, may be null */ @@ -93,10 +112,13 @@ public abstract class Pair implements Map.Entry, ComparableGets the value from this pair.

    + *

    + * Gets the value from this pair. + *

    * - *

    This method implements the {@code Map.Entry} interface returning the - * right element as the value.

    + *

    + * This method implements the {@code Map.Entry} interface returning the right element as the value. + *

    * * @return the right element as the value, may be null */ @@ -105,11 +127,12 @@ public abstract class Pair implements Map.Entry, ComparableCompares the pair based on the left element followed by the right element. - * The types must be {@code Comparable}.

    + *

    + * Compares the pair based on the left element followed by the right element. The types must be {@code Comparable}. + *

    * * @param other the other pair, not null * @return negative if this is less, zero if equal, positive if greater @@ -133,7 +156,9 @@ public abstract class Pair implements Map.Entry, ComparableCompares this pair to another based on the two elements.

    + *

    + * Compares this pair to another based on the two elements. + *

    * * @param obj the object to compare to, null returns false * @return true if the elements of the pair are equal @@ -145,27 +170,28 @@ public abstract class Pair implements Map.Entry, Comparable) { final Map.Entry other = (Map.Entry) obj; - return getKey().equals(other.getKey()) - && getValue().equals(other.getValue()); + return getKey().equals(other.getKey()) && getValue().equals(other.getValue()); } return false; } /** - *

    Returns a suitable hash code. - * The hash code follows the definition in {@code Map.Entry}.

    + *

    + * Returns a suitable hash code. The hash code follows the definition in {@code Map.Entry}. + *

    * * @return the hash code */ @Override public int hashCode() { // see Map.Entry API specification - return (getKey() == null ? 0 : getKey().hashCode()) - ^ (getValue() == null ? 0 : getValue().hashCode()); + return (getKey() == null ? 0 : getKey().hashCode()) ^ (getValue() == null ? 0 : getValue().hashCode()); } /** - *

    Returns a String representation of this pair using the format {@code ($left,$right)}.

    + *

    + * Returns a String representation of this pair using the format {@code ($left,$right)}. + *

    * * @return a string describing this object, not null */ @@ -175,12 +201,15 @@ public abstract class Pair implements Map.Entry, ComparableFormats the receiver using the given format.

    + *

    + * Formats the receiver using the given format. + *

    * - *

    This uses {@link java.util.Formattable} to perform the formatting. Two variables may - * be used to embed the left and right elements. Use {@code %1$s} for the left - * element (key) and {@code %2$s} for the right element (value). - * The default format used by {@code toString()} is {@code (%1$s,%2$s)}.

    + *

    + * This uses {@link java.util.Formattable} to perform the formatting. Two variables may be used to embed the left and + * right elements. Use {@code %1$s} for the left element (key) and {@code %2$s} for the right element (value). The + * default format used by {@code toString()} is {@code (%1$s,%2$s)}. + *

    * * @param format the format string, optionally containing {@code %1$s} and {@code %2$s}, not null * @return the formatted string, not null diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/RocksDBBasedMap.java b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/RocksDBBasedMap.java index 20d68788c..1bb5065ad 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/RocksDBBasedMap.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/RocksDBBasedMap.java @@ -41,7 +41,7 @@ public final class RocksDBBasedMap iterator() { - return getRocksDBDAO().prefixSearch(columnFamilyName, "") - .map(p -> (R)(p.getValue())).iterator(); + return getRocksDBDAO().prefixSearch(columnFamilyName, "").map(p -> (R) (p.getValue())).iterator(); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/Triple.java b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/Triple.java index 55bb63b00..60263bc34 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/Triple.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/Triple.java @@ -22,14 +22,20 @@ import java.io.Serializable; /** * (NOTE: Adapted from Apache commons-lang3) - *

    A triple consisting of three elements.

    + *

    + * A triple consisting of three elements. + *

    * - *

    This class is an abstract implementation defining the basic API. - * It refers to the elements as 'left', 'middle' and 'right'.

    + *

    + * This class is an abstract implementation defining the basic API. It refers to the elements as 'left', 'middle' and + * 'right'. + *

    * - *

    Subclass implementations may be mutable or immutable. - * However, there is no restriction on the type of the stored objects that may be stored. - * If mutable objects are stored in the triple, then the triple itself effectively becomes mutable.

    + *

    + * Subclass implementations may be mutable or immutable. However, there is no restriction on the type of the stored + * objects that may be stored. If mutable objects are stored in the triple, then the triple itself effectively becomes + * mutable. + *

    * * @param the left element type * @param the middle element type @@ -43,10 +49,13 @@ public abstract class Triple implements Comparable>, Se private static final long serialVersionUID = 1L; /** - *

    Obtains an immutable triple of from three objects inferring the generic types.

    + *

    + * Obtains an immutable triple of from three objects inferring the generic types. + *

    * - *

    This factory allows the triple to be created using inference to - * obtain the generic types.

    + *

    + * This factory allows the triple to be created using inference to obtain the generic types. + *

    * * @param the left element type * @param the middle element type @@ -60,35 +69,42 @@ public abstract class Triple implements Comparable>, Se return new ImmutableTriple(left, middle, right); } - //----------------------------------------------------------------------- + // ----------------------------------------------------------------------- /** - *

    Gets the left element from this triple.

    + *

    + * Gets the left element from this triple. + *

    * * @return the left element, may be null */ public abstract L getLeft(); /** - *

    Gets the middle element from this triple.

    + *

    + * Gets the middle element from this triple. + *

    * * @return the middle element, may be null */ public abstract M getMiddle(); /** - *

    Gets the right element from this triple.

    + *

    + * Gets the right element from this triple. + *

    * * @return the right element, may be null */ public abstract R getRight(); - //----------------------------------------------------------------------- + // ----------------------------------------------------------------------- /** - *

    Compares the triple based on the left element, followed by the middle element, - * finally the right element. - * The types must be {@code Comparable}.

    + *

    + * Compares the triple based on the left element, followed by the middle element, finally the right element. The types + * must be {@code Comparable}. + *

    * * @param other the other triple, not null * @return negative if this is less, zero if equal, positive if greater @@ -109,7 +125,9 @@ public abstract class Triple implements Comparable>, Se } /** - *

    Compares this triple to another based on the three elements.

    + *

    + * Compares this triple to another based on the three elements. + *

    * * @param obj the object to compare to, null returns false * @return true if the elements of the triple are equal @@ -122,27 +140,29 @@ public abstract class Triple implements Comparable>, Se } if (obj instanceof Triple) { final Triple other = (Triple) obj; - return getLeft().equals(other.getLeft()) - && getMiddle().equals(other.getMiddle()) + return getLeft().equals(other.getLeft()) && getMiddle().equals(other.getMiddle()) && getRight().equals(other.getRight()); } return false; } /** - *

    Returns a suitable hash code.

    + *

    + * Returns a suitable hash code. + *

    * * @return the hash code */ @Override public int hashCode() { - return (getLeft() == null ? 0 : getLeft().hashCode()) - ^ (getMiddle() == null ? 0 : getMiddle().hashCode()) + return (getLeft() == null ? 0 : getLeft().hashCode()) ^ (getMiddle() == null ? 0 : getMiddle().hashCode()) ^ (getRight() == null ? 0 : getRight().hashCode()); } /** - *

    Returns a String representation of this triple using the format {@code ($left,$middle,$right)}.

    + *

    + * Returns a String representation of this triple using the format {@code ($left,$middle,$right)}. + *

    * * @return a string describing this object, not null */ @@ -153,12 +173,15 @@ public abstract class Triple implements Comparable>, Se } /** - *

    Formats the receiver using the given format.

    + *

    + * Formats the receiver using the given format. + *

    * - *

    This uses {@link java.util.Formattable} to perform the formatting. Three variables may - * be used to embed the left and right elements. Use {@code %1$s} for the left - * element, {@code %2$s} for the middle and {@code %3$s} for the right element. - * The default format used by {@code toString()} is {@code (%1$s,%2$s,%3$s)}.

    + *

    + * This uses {@link java.util.Formattable} to perform the formatting. Three variables may be used to embed the left + * and right elements. Use {@code %1$s} for the left element, {@code %2$s} for the middle and {@code %3$s} for the + * right element. The default format used by {@code toString()} is {@code (%1$s,%2$s,%3$s)}. + *

    * * @param format the format string, optionally containing {@code %1$s}, {@code %2$s} and {@code %3$s}, not null * @return the formatted string, not null diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/queue/BoundedInMemoryExecutor.java b/hudi-common/src/main/java/org/apache/hudi/common/util/queue/BoundedInMemoryExecutor.java index e0614f8a0..835764a24 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/queue/BoundedInMemoryExecutor.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/queue/BoundedInMemoryExecutor.java @@ -37,9 +37,9 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; /** - * Executor which orchestrates concurrent producers and consumers communicating through a bounded in-memory queue. - * This class takes as input the size limit, queue producer(s), consumer and transformer - * and exposes API to orchestrate concurrent execution of these actors communicating through a central bounded queue + * Executor which orchestrates concurrent producers and consumers communicating through a bounded in-memory queue. This + * class takes as input the size limit, queue producer(s), consumer and transformer and exposes API to orchestrate + * concurrent execution of these actors communicating through a central bounded queue */ public class BoundedInMemoryExecutor { @@ -54,17 +54,13 @@ public class BoundedInMemoryExecutor { // Consumer private final Option> consumer; - public BoundedInMemoryExecutor(final long bufferLimitInBytes, - BoundedInMemoryQueueProducer producer, - Option> consumer, - final Function transformFunction) { + public BoundedInMemoryExecutor(final long bufferLimitInBytes, BoundedInMemoryQueueProducer producer, + Option> consumer, final Function transformFunction) { this(bufferLimitInBytes, Arrays.asList(producer), consumer, transformFunction, new DefaultSizeEstimator<>()); } - public BoundedInMemoryExecutor(final long bufferLimitInBytes, - List> producers, - Option> consumer, - final Function transformFunction, + public BoundedInMemoryExecutor(final long bufferLimitInBytes, List> producers, + Option> consumer, final Function transformFunction, final SizeEstimator sizeEstimator) { this.producers = producers; this.consumer = consumer; @@ -74,8 +70,7 @@ public class BoundedInMemoryExecutor { } /** - * Callback to implement environment specific behavior before executors (producers/consumer) - * run. + * Callback to implement environment specific behavior before executors (producers/consumer) run. */ public void preExecute() { // Do Nothing in general context @@ -118,20 +113,19 @@ public class BoundedInMemoryExecutor { */ private Future startConsumer() { return consumer.map(consumer -> { - return executorService.submit( - () -> { - logger.info("starting consumer thread"); - preExecute(); - try { - E result = consumer.consume(queue); - logger.info("Queue Consumption is done; notifying producer threads"); - return result; - } catch (Exception e) { - logger.error("error consuming records", e); - queue.markAsFailed(e); - throw e; - } - }); + return executorService.submit(() -> { + logger.info("starting consumer thread"); + preExecute(); + try { + E result = consumer.consume(queue); + logger.info("Queue Consumption is done; notifying producer threads"); + return result; + } catch (Exception e) { + logger.error("error consuming records", e); + queue.markAsFailed(e); + throw e; + } + }); }).orElse(CompletableFuture.completedFuture(null)); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/queue/BoundedInMemoryQueue.java b/hudi-common/src/main/java/org/apache/hudi/common/util/queue/BoundedInMemoryQueue.java index 0cc142dea..79a1920fb 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/queue/BoundedInMemoryQueue.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/queue/BoundedInMemoryQueue.java @@ -36,12 +36,12 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; /** - * Used for enqueueing input records. Queue limit is controlled by {@link #memoryLimit}. - * Unlike standard bounded queue implementations, this queue bounds the size by memory bytes occupied by its - * tenants. The standard implementation bounds by the number of entries in the queue. + * Used for enqueueing input records. Queue limit is controlled by {@link #memoryLimit}. Unlike standard bounded queue + * implementations, this queue bounds the size by memory bytes occupied by its tenants. The standard implementation + * bounds by the number of entries in the queue. * - * It internally samples every {@link #RECORD_SAMPLING_RATE}th record and adjusts number of records in - * queue accordingly. This is done to ensure that we don't OOM. + * It internally samples every {@link #RECORD_SAMPLING_RATE}th record and adjusts number of records in queue + * accordingly. This is done to ensure that we don't OOM. * * This queue supports multiple producer single consumer pattern. * @@ -65,8 +65,7 @@ public class BoundedInMemoryQueue implements Iterable { // used for sampling records with "RECORD_SAMPLING_RATE" frequency. public final AtomicLong samplingRecordCounter = new AtomicLong(-1); // internal queue for records. - private final LinkedBlockingQueue> queue = new - LinkedBlockingQueue<>(); + private final LinkedBlockingQueue> queue = new LinkedBlockingQueue<>(); // maximum amount of memory to be used for queueing records. private final long memoryLimit; // it holds the root cause of the exception in case either queueing records (consuming from @@ -96,24 +95,21 @@ public class BoundedInMemoryQueue implements Iterable { /** * Construct BoundedInMemoryQueue with default SizeEstimator * - * @param memoryLimit MemoryLimit in bytes + * @param memoryLimit MemoryLimit in bytes * @param transformFunction Transformer Function to convert input payload type to stored payload type */ public BoundedInMemoryQueue(final long memoryLimit, final Function transformFunction) { - this(memoryLimit, transformFunction, new DefaultSizeEstimator() { - }); + this(memoryLimit, transformFunction, new DefaultSizeEstimator() {}); } /** * Construct BoundedInMemoryQueue with passed in size estimator * - * @param memoryLimit MemoryLimit in bytes - * @param transformFunction Transformer Function to convert input payload type to stored payload type + * @param memoryLimit MemoryLimit in bytes + * @param transformFunction Transformer Function to convert input payload type to stored payload type * @param payloadSizeEstimator Payload Size Estimator */ - public BoundedInMemoryQueue( - final long memoryLimit, - final Function transformFunction, + public BoundedInMemoryQueue(final long memoryLimit, final Function transformFunction, final SizeEstimator payloadSizeEstimator) { this.memoryLimit = memoryLimit; this.transformFunction = transformFunction; @@ -127,9 +123,9 @@ public class BoundedInMemoryQueue implements Iterable { } /** - * Samples records with "RECORD_SAMPLING_RATE" frequency and computes average record size in bytes. It is used - * for determining how many maximum records to queue. Based on change in avg size it ma increase or decrease - * available permits. + * Samples records with "RECORD_SAMPLING_RATE" frequency and computes average record size in bytes. It is used for + * determining how many maximum records to queue. Based on change in avg size it ma increase or decrease available + * permits. * * @param payload Payload to size */ @@ -139,10 +135,10 @@ public class BoundedInMemoryQueue implements Iterable { } final long recordSizeInBytes = payloadSizeEstimator.sizeEstimate(payload); - final long newAvgRecordSizeInBytes = Math - .max(1, (avgRecordSizeInBytes * numSamples + recordSizeInBytes) / (numSamples + 1)); - final int newRateLimit = (int) Math - .min(RECORD_CACHING_LIMIT, Math.max(1, this.memoryLimit / newAvgRecordSizeInBytes)); + final long newAvgRecordSizeInBytes = + Math.max(1, (avgRecordSizeInBytes * numSamples + recordSizeInBytes) / (numSamples + 1)); + final int newRateLimit = + (int) Math.min(RECORD_CACHING_LIMIT, Math.max(1, this.memoryLimit / newAvgRecordSizeInBytes)); // If there is any change in number of records to cache then we will either release (if it increased) or acquire // (if it decreased) to adjust rate limiting to newly computed value. @@ -187,8 +183,8 @@ public class BoundedInMemoryQueue implements Iterable { } /** - * Reader interface but never exposed to outside world as this is a single consumer queue. - * Reading is done through a singleton iterator for this queue. + * Reader interface but never exposed to outside world as this is a single consumer queue. Reading is done through a + * singleton iterator for this queue. */ private Option readNextRecord() { if (this.isReadDone.get()) { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/queue/BoundedInMemoryQueueProducer.java b/hudi-common/src/main/java/org/apache/hudi/common/util/queue/BoundedInMemoryQueueProducer.java index 41ec245e9..ffe5ded1a 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/queue/BoundedInMemoryQueueProducer.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/queue/BoundedInMemoryQueueProducer.java @@ -19,8 +19,7 @@ package org.apache.hudi.common.util.queue; /** - * Producer for BoundedInMemoryQueue. Memory Bounded Buffer supports - * multiple producers single consumer pattern. + * Producer for BoundedInMemoryQueue. Memory Bounded Buffer supports multiple producers single consumer pattern. * * @param Input type for buffer items produced */ diff --git a/hudi-common/src/main/java/org/apache/hudi/config/DefaultHoodieConfig.java b/hudi-common/src/main/java/org/apache/hudi/config/DefaultHoodieConfig.java index 406606fe4..9939a67e7 100644 --- a/hudi-common/src/main/java/org/apache/hudi/config/DefaultHoodieConfig.java +++ b/hudi-common/src/main/java/org/apache/hudi/config/DefaultHoodieConfig.java @@ -32,15 +32,13 @@ public class DefaultHoodieConfig implements Serializable { this.props = props; } - public static void setDefaultOnCondition(Properties props, boolean condition, String propName, - String defaultValue) { + public static void setDefaultOnCondition(Properties props, boolean condition, String propName, String defaultValue) { if (condition) { props.setProperty(propName, defaultValue); } } - public static void setDefaultOnCondition(Properties props, boolean condition, - DefaultHoodieConfig config) { + public static void setDefaultOnCondition(Properties props, boolean condition, DefaultHoodieConfig config) { if (condition) { props.putAll(config.getProps()); } diff --git a/hudi-common/src/main/java/org/apache/hudi/exception/DatasetNotFoundException.java b/hudi-common/src/main/java/org/apache/hudi/exception/DatasetNotFoundException.java index 87dd549e1..711516a1d 100644 --- a/hudi-common/src/main/java/org/apache/hudi/exception/DatasetNotFoundException.java +++ b/hudi-common/src/main/java/org/apache/hudi/exception/DatasetNotFoundException.java @@ -23,7 +23,9 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; /** - *

    Exception thrown to indicate that a hoodie dataset was not found on the path provided

    + *

    + * Exception thrown to indicate that a hoodie dataset was not found on the path provided + *

    */ public class DatasetNotFoundException extends HoodieException { @@ -50,8 +52,7 @@ public class DatasetNotFoundException extends HoodieException { // if the base path is file:///, then we have a IllegalArgumentException throw new DatasetNotFoundException(metaPathDir.toString()); } catch (IOException e) { - throw new HoodieIOException( - "Could not check if dataset " + basePathDir + " is valid dataset", e); + throw new HoodieIOException("Could not check if dataset " + basePathDir + " is valid dataset", e); } } } diff --git a/hudi-common/src/main/java/org/apache/hudi/exception/HoodieCorruptedDataException.java b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieCorruptedDataException.java index 9cd48a5b5..e6fb8de39 100644 --- a/hudi-common/src/main/java/org/apache/hudi/exception/HoodieCorruptedDataException.java +++ b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieCorruptedDataException.java @@ -19,7 +19,8 @@ package org.apache.hudi.exception; /** - *

    Exception thrown when any data corruption happens when reading/writing from temporary disk + *

    + * Exception thrown when any data corruption happens when reading/writing from temporary disk *

    */ public class HoodieCorruptedDataException extends HoodieException { diff --git a/hudi-common/src/main/java/org/apache/hudi/exception/HoodieException.java b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieException.java index 580f804d0..2b86dc693 100644 --- a/hudi-common/src/main/java/org/apache/hudi/exception/HoodieException.java +++ b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieException.java @@ -21,9 +21,13 @@ package org.apache.hudi.exception; import java.io.Serializable; /** - *

    Exception thrown for Hoodie failures. The root of the exception hierarchy.

    Hoodie - * Write/Read clients will throw this exception if any of its operations fail. This is a runtime - * (unchecked) exception.

    + *

    + * Exception thrown for Hoodie failures. The root of the exception hierarchy. + *

    + *

    + * Hoodie Write/Read clients will throw this exception if any of its operations fail. This is a runtime (unchecked) + * exception. + *

    */ public class HoodieException extends RuntimeException implements Serializable { diff --git a/hudi-common/src/main/java/org/apache/hudi/exception/HoodieIOException.java b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieIOException.java index 438e0dc9c..edc62c5df 100644 --- a/hudi-common/src/main/java/org/apache/hudi/exception/HoodieIOException.java +++ b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieIOException.java @@ -21,7 +21,9 @@ package org.apache.hudi.exception; import java.io.IOException; /** - *

    Exception thrown for dataset IO-related failures.

    + *

    + * Exception thrown for dataset IO-related failures. + *

    */ public class HoodieIOException extends HoodieException { diff --git a/hudi-common/src/main/java/org/apache/hudi/exception/HoodieIndexException.java b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieIndexException.java index bc923d506..8deaaabaf 100644 --- a/hudi-common/src/main/java/org/apache/hudi/exception/HoodieIndexException.java +++ b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieIndexException.java @@ -19,7 +19,9 @@ package org.apache.hudi.exception; /** - *

    Exception thrown for HoodieIndex related errors.

    + *

    + * Exception thrown for HoodieIndex related errors. + *

    */ public class HoodieIndexException extends HoodieException { diff --git a/hudi-common/src/main/java/org/apache/hudi/exception/HoodieRecordMissingException.java b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieRecordMissingException.java index 1bc6c5716..b3345a786 100644 --- a/hudi-common/src/main/java/org/apache/hudi/exception/HoodieRecordMissingException.java +++ b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieRecordMissingException.java @@ -21,15 +21,15 @@ package org.apache.hudi.exception; import org.apache.hudi.common.model.HoodieRecord; /** - *

    Exception throws when indexing fails to locate the hoodie record. HoodieRecord current - * location and partition path does not match. This is an unrecoverable error

    + *

    + * Exception throws when indexing fails to locate the hoodie record. HoodieRecord current location and partition path + * does not match. This is an unrecoverable error + *

    */ public class HoodieRecordMissingException extends HoodieException { public HoodieRecordMissingException(HoodieRecord record) { - super( - "Record " + record.getRecordKey() + " with partition path " + record.getPartitionPath() - + " in current location " + record.getCurrentLocation() - + " is not found in the partition"); + super("Record " + record.getRecordKey() + " with partition path " + record.getPartitionPath() + + " in current location " + record.getCurrentLocation() + " is not found in the partition"); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/exception/InvalidDatasetException.java b/hudi-common/src/main/java/org/apache/hudi/exception/InvalidDatasetException.java index 943aa1594..2de239f4a 100644 --- a/hudi-common/src/main/java/org/apache/hudi/exception/InvalidDatasetException.java +++ b/hudi-common/src/main/java/org/apache/hudi/exception/InvalidDatasetException.java @@ -19,7 +19,9 @@ package org.apache.hudi.exception; /** - *

    Exception thrown to indicate that a hoodie dataset is invalid

    + *

    + * Exception thrown to indicate that a hoodie dataset is invalid + *

    */ public class InvalidDatasetException extends HoodieException { diff --git a/hudi-common/src/test/java/org/apache/hudi/common/HoodieCommonTestHarness.java b/hudi-common/src/test/java/org/apache/hudi/common/HoodieCommonTestHarness.java index 03aa5b719..fd087d0ba 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/HoodieCommonTestHarness.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/HoodieCommonTestHarness.java @@ -39,14 +39,14 @@ public class HoodieCommonTestHarness { /** * Initializes basePath. - */ + */ protected void initPath() { this.basePath = folder.getRoot().getAbsolutePath(); } /** - * Initializes an instance of {@link HoodieTableMetaClient} with a special table type - * specified by {@code getTableType()}. + * Initializes an instance of {@link HoodieTableMetaClient} with a special table type specified by + * {@code getTableType()}. * * @throws IOException */ @@ -67,8 +67,7 @@ public class HoodieCommonTestHarness { return new HoodieTableFileSystemView(metaClient, timeline, enableIncrementalTimelineSync); } - protected SyncableFileSystemView getFileSystemView(HoodieTableMetaClient metaClient) - throws IOException { + protected SyncableFileSystemView getFileSystemView(HoodieTableMetaClient metaClient) throws IOException { return getFileSystemView(metaClient, metaClient.getActiveTimeline().filterCompletedAndCompactionInstants()); } @@ -78,8 +77,8 @@ public class HoodieCommonTestHarness { } /** - * Gets a default {@link HoodieTableType#COPY_ON_WRITE} table type. - * Sub-classes can override this method to specify a new table type. + * Gets a default {@link HoodieTableType#COPY_ON_WRITE} table type. Sub-classes can override this method to specify a + * new table type. * * @return an instance of Hoodie table type. */ diff --git a/hudi-common/src/test/java/org/apache/hudi/common/minicluster/HdfsTestService.java b/hudi-common/src/test/java/org/apache/hudi/common/minicluster/HdfsTestService.java index 09eaa733d..3d48b11fe 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/minicluster/HdfsTestService.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/minicluster/HdfsTestService.java @@ -107,7 +107,7 @@ public class HdfsTestService { * exist. * * @param localDFSLocation The location on the local FS to hold the HDFS metadata and block data - * @param clean Specifies if we want to start a clean cluster + * @param clean Specifies if we want to start a clean cluster * @return Returns true if we should format a DFSCluster, otherwise false */ private static boolean shouldFormatDFSCluster(String localDFSLocation, boolean clean) { @@ -122,9 +122,9 @@ public class HdfsTestService { /** * Configure the DFS Cluster before launching it. * - * @param config The already created Hadoop configuration we'll further configure for HDFS + * @param config The already created Hadoop configuration we'll further configure for HDFS * @param localDFSLocation The location on the local filesystem where cluster data is stored - * @param bindIP An IP address we want to force the datanode and namenode to bind to. + * @param bindIP An IP address we want to force the datanode and namenode to bind to. * @return The updated Configuration object. */ private static Configuration configureDFSCluster(Configuration config, String localDFSLocation, String bindIP, diff --git a/hudi-common/src/test/java/org/apache/hudi/common/model/AvroBinaryTestPayload.java b/hudi-common/src/test/java/org/apache/hudi/common/model/AvroBinaryTestPayload.java index bd3c3e0e9..546bf1464 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/model/AvroBinaryTestPayload.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/model/AvroBinaryTestPayload.java @@ -49,8 +49,7 @@ public class AvroBinaryTestPayload implements HoodieRecordPayload { } @Override - public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) - throws IOException { + public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) throws IOException { return getInsertValue(schema); } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/model/HoodieTestUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/model/HoodieTestUtils.java index 64faf3b65..2a3ffab55 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/model/HoodieTestUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/model/HoodieTestUtils.java @@ -90,8 +90,7 @@ public class HoodieTestUtils { return new Configuration(); } - public static HoodieTableMetaClient init(String basePath) - throws IOException { + public static HoodieTableMetaClient init(String basePath) throws IOException { return init(basePath, HoodieTableType.COPY_ON_WRITE); } @@ -99,8 +98,7 @@ public class HoodieTestUtils { return init(getDefaultHadoopConf(), basePath, tableType); } - public static HoodieTableMetaClient init(Configuration hadoopConf, String basePath) - throws IOException { + public static HoodieTableMetaClient init(Configuration hadoopConf, String basePath) throws IOException { return init(hadoopConf, basePath, HoodieTableType.COPY_ON_WRITE); } @@ -121,7 +119,7 @@ public class HoodieTestUtils { for (String commitTime : commitTimes) { new File( basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline.makeCommitFileName(commitTime)) - .createNewFile(); + .createNewFile(); } } @@ -129,7 +127,7 @@ public class HoodieTestUtils { for (String commitTime : commitTimes) { new File( basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline.makeDeltaFileName(commitTime)) - .createNewFile(); + .createNewFile(); } } @@ -139,17 +137,16 @@ public class HoodieTestUtils { public static final void createInflightCommitFiles(String basePath, String... commitTimes) throws IOException { for (String commitTime : commitTimes) { - new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline.makeInflightCommitFileName( - commitTime)).createNewFile(); + new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + + HoodieTimeline.makeInflightCommitFileName(commitTime)).createNewFile(); } } public static final void createInflightCleanFiles(String basePath, Configuration configuration, String... commitTimes) throws IOException { for (String commitTime : commitTimes) { - Path commitFile = new Path((basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline - .makeInflightCleanerFileName( - commitTime))); + Path commitFile = new Path((basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + + HoodieTimeline.makeInflightCleanerFileName(commitTime))); FileSystem fs = FSUtils.getFs(basePath, configuration); FSDataOutputStream os = fs.create(commitFile, true); } @@ -181,8 +178,8 @@ public class HoodieTestUtils { public static final String createMarkerFile(String basePath, String partitionPath, String commitTime, String fileID) throws IOException { - String folderPath = basePath + "/" + HoodieTableMetaClient.TEMPFOLDER_NAME + "/" + commitTime + "/" - + partitionPath + "/"; + String folderPath = + basePath + "/" + HoodieTableMetaClient.TEMPFOLDER_NAME + "/" + commitTime + "/" + partitionPath + "/"; new File(folderPath).mkdirs(); File f = new File(folderPath + FSUtils.makeMarkerFile(commitTime, DEFAULT_WRITE_TOKEN, fileID)); f.createNewFile(); @@ -196,10 +193,8 @@ public class HoodieTestUtils { if (!makeDir) { throw new IOException("cannot create directory for path " + folderPath); } - boolean createFile = fs.createNewFile(new Path( - folderPath + FSUtils - .makeLogFileName(fileID, ".log", commitTime, version.orElse(DEFAULT_LOG_VERSION), - HoodieLogFormat.UNKNOWN_WRITE_TOKEN))); + boolean createFile = fs.createNewFile(new Path(folderPath + FSUtils.makeLogFileName(fileID, ".log", commitTime, + version.orElse(DEFAULT_LOG_VERSION), HoodieLogFormat.UNKNOWN_WRITE_TOKEN))); if (!createFile) { throw new IOException( StringUtils.format("cannot create data file for commit %s and fileId %s", commitTime, fileID)); @@ -210,9 +205,8 @@ public class HoodieTestUtils { public static final void createCompactionCommitFiles(FileSystem fs, String basePath, String... commitTimes) throws IOException { for (String commitTime : commitTimes) { - boolean createFile = fs.createNewFile(new Path( - basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline - .makeCommitFileName(commitTime))); + boolean createFile = fs.createNewFile(new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + + HoodieTimeline.makeCommitFileName(commitTime))); if (!createFile) { throw new IOException("cannot create commit file for commit " + commitTime); } @@ -222,15 +216,13 @@ public class HoodieTestUtils { public static final void createCompactionRequest(HoodieTableMetaClient metaClient, String instant, List> fileSliceList) throws IOException { HoodieCompactionPlan plan = CompactionUtils.buildFromFileSlices(fileSliceList, Option.empty(), Option.empty()); - HoodieInstant compactionInstant = - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, instant); + HoodieInstant compactionInstant = new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, instant); metaClient.getActiveTimeline().saveToCompactionRequested(compactionInstant, AvroUtils.serializeCompactionPlan(plan)); } public static final String getDataFilePath(String basePath, String partitionPath, String commitTime, String fileID) { - return basePath + "/" + partitionPath + "/" + FSUtils - .makeDataFileName(commitTime, DEFAULT_WRITE_TOKEN, fileID); + return basePath + "/" + partitionPath + "/" + FSUtils.makeDataFileName(commitTime, DEFAULT_WRITE_TOKEN, fileID); } public static final String getLogFilePath(String basePath, String partitionPath, String commitTime, String fileID, @@ -266,13 +258,13 @@ public class HoodieTestUtils { public static final boolean doesCommitExist(String basePath, String commitTime) { return new File( basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitTime + HoodieTimeline.COMMIT_EXTENSION) - .exists(); + .exists(); } public static final boolean doesInflightExist(String basePath, String commitTime) { return new File( basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitTime + HoodieTimeline.INFLIGHT_EXTENSION) - .exists(); + .exists(); } public static void createCleanFiles(String basePath, String commitTime, Configuration configuration) @@ -286,8 +278,8 @@ public class HoodieTestUtils { DEFAULT_PARTITION_PATHS[rand.nextInt(DEFAULT_PARTITION_PATHS.length)], new ArrayList<>(), new ArrayList<>(), new ArrayList<>(), commitTime); // Create the clean metadata - HoodieCleanMetadata cleanMetadata = AvroUtils.convertCleanMetadata(commitTime, Option.of(0L), - Arrays.asList(cleanStats)); + HoodieCleanMetadata cleanMetadata = + AvroUtils.convertCleanMetadata(commitTime, Option.of(0L), Arrays.asList(cleanStats)); // Write empty clean metadata os.write(AvroUtils.serializeCleanMetadata(cleanMetadata).get()); } finally { @@ -335,8 +327,8 @@ public class HoodieTestUtils { public static void writeRecordsToLogFiles(FileSystem fs, String basePath, Schema schema, List updatedRecords) { - Map> groupedUpdated = updatedRecords.stream().collect( - Collectors.groupingBy(HoodieRecord::getCurrentLocation)); + Map> groupedUpdated = + updatedRecords.stream().collect(Collectors.groupingBy(HoodieRecord::getCurrentLocation)); groupedUpdated.entrySet().forEach(s -> { HoodieRecordLocation location = s.getKey(); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieCommitMetadata.java b/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieCommitMetadata.java index 9c755fcbe..8b3eb38ef 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieCommitMetadata.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieCommitMetadata.java @@ -36,8 +36,8 @@ public class TestHoodieCommitMetadata { Assert.assertTrue(commitMetadata.getTotalLogFilesCompacted() > 0); String serializedCommitMetadata = commitMetadata.toJsonString(); - HoodieCommitMetadata metadata = HoodieCommitMetadata.fromJsonString(serializedCommitMetadata, - HoodieCommitMetadata.class); + HoodieCommitMetadata metadata = + HoodieCommitMetadata.fromJsonString(serializedCommitMetadata, HoodieCommitMetadata.class); // Make sure timing metrics are not written to instant file Assert.assertTrue(metadata.getTotalScanTime() == 0); Assert.assertTrue(metadata.getTotalLogFilesCompacted() > 0); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieRecord.java b/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieRecord.java index 408fedc3e..e9c32a160 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieRecord.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieRecord.java @@ -42,9 +42,9 @@ public class TestHoodieRecord { @Before public void setUp() throws Exception { final List indexedRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1); - final List hoodieRecords = indexedRecords.stream() - .map(r -> new HoodieRecord(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"), - new AvroBinaryTestPayload(Option.of((GenericRecord) r)))).collect(Collectors.toList()); + final List hoodieRecords = + indexedRecords.stream().map(r -> new HoodieRecord(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"), + new AvroBinaryTestPayload(Option.of((GenericRecord) r)))).collect(Collectors.toList()); hoodieRecord = hoodieRecords.get(0); } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/HoodieTableMetaClientTest.java b/hudi-common/src/test/java/org/apache/hudi/common/table/HoodieTableMetaClientTest.java index 62f8bf330..7adc49194 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/HoodieTableMetaClientTest.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/HoodieTableMetaClientTest.java @@ -58,8 +58,8 @@ public class HoodieTableMetaClientTest extends HoodieCommonTestHarness { @Test public void checkSerDe() throws IOException, ClassNotFoundException { // check if this object is serialized and de-serialized, we are able to read from the file system - HoodieTableMetaClient deseralizedMetaClient = HoodieTestUtils - .serializeDeserialize(metaClient, HoodieTableMetaClient.class); + HoodieTableMetaClient deseralizedMetaClient = + HoodieTestUtils.serializeDeserialize(metaClient, HoodieTableMetaClient.class); assertNotNull(deseralizedMetaClient); HoodieActiveTimeline commitTimeline = deseralizedMetaClient.getActiveTimeline(); HoodieInstant instant = new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, "1"); @@ -99,10 +99,9 @@ public class HoodieTableMetaClientTest extends HoodieCommonTestHarness { @Test public void checkArchiveCommitTimeline() throws IOException { Path archiveLogPath = HoodieArchivedTimeline.getArchiveLogPath(metaClient.getArchivePath()); - SequenceFile.Writer writer = SequenceFile - .createWriter(metaClient.getHadoopConf(), SequenceFile.Writer.file(archiveLogPath), - SequenceFile.Writer.keyClass(Text.class), - SequenceFile.Writer.valueClass(Text.class)); + SequenceFile.Writer writer = + SequenceFile.createWriter(metaClient.getHadoopConf(), SequenceFile.Writer.file(archiveLogPath), + SequenceFile.Writer.keyClass(Text.class), SequenceFile.Writer.valueClass(Text.class)); writer.append(new Text("1"), new Text("data1")); writer.append(new Text("2"), new Text("data2")); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/log/HoodieLogFormatAppendFailureTest.java b/hudi-common/src/test/java/org/apache/hudi/common/table/log/HoodieLogFormatAppendFailureTest.java index b7d592887..f0fcf71ba 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/log/HoodieLogFormatAppendFailureTest.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/log/HoodieLogFormatAppendFailureTest.java @@ -52,8 +52,8 @@ import org.junit.Test; /** * This class is intentionally using a different way of setting up the MiniDFSCluster and not relying on - * {@link MiniClusterUtil} to reproduce append() issue : https://issues.apache.org/jira/browse/HDFS-6325 - * Reference : https://issues.apache.org/jira/secure/attachment/12645053/HDFS-6325.patch + * {@link MiniClusterUtil} to reproduce append() issue : https://issues.apache.org/jira/browse/HDFS-6325 Reference : + * https://issues.apache.org/jira/secure/attachment/12645053/HDFS-6325.patch */ public class HoodieLogFormatAppendFailureTest { @@ -83,8 +83,8 @@ public class HoodieLogFormatAppendFailureTest { } @Test(timeout = 60000) - public void testFailedToGetAppendStreamFromHDFSNameNode() throws IOException, URISyntaxException, - InterruptedException, TimeoutException { + public void testFailedToGetAppendStreamFromHDFSNameNode() + throws IOException, URISyntaxException, InterruptedException, TimeoutException { // Use some fs like LocalFileSystem, that does not support appends String uuid = UUID.randomUUID().toString(); @@ -101,9 +101,8 @@ public class HoodieLogFormatAppendFailureTest { HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header); Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(testPath) - .withFileExtension(HoodieArchivedLogFile.ARCHIVE_EXTENSION).withFileId("commits" - + ".archive").overBaseCommit("") - .withFs(fs).build(); + .withFileExtension(HoodieArchivedLogFile.ARCHIVE_EXTENSION).withFileId("commits" + ".archive") + .overBaseCommit("").withFs(fs).build(); writer = writer.appendBlock(dataBlock); // get the current log file version to compare later @@ -134,8 +133,7 @@ public class HoodieLogFormatAppendFailureTest { // return a new writer with a bumped up logVersion writer = HoodieLogFormat.newWriterBuilder().onParentPath(testPath) .withFileExtension(HoodieArchivedLogFile.ARCHIVE_EXTENSION).withFileId("commits" + ".archive") - .overBaseCommit("") - .withFs(fs).build(); + .overBaseCommit("").withFs(fs).build(); // The log version should be different for this new writer Assert.assertFalse(writer.getLogFile().getLogVersion() == logFileVersion); } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/log/HoodieLogFormatTest.java b/hudi-common/src/test/java/org/apache/hudi/common/table/log/HoodieLogFormatTest.java index 6ce58505f..5d05d5211 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/log/HoodieLogFormatTest.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/log/HoodieLogFormatTest.java @@ -91,7 +91,7 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { @Parameterized.Parameters(name = "LogBlockReadMode") public static Collection data() { - return Arrays.asList(new Boolean[][]{{true}, {false}}); + return Arrays.asList(new Boolean[][] {{true}, {false}}); } @BeforeClass @@ -122,9 +122,9 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { @Test public void testEmptyLog() throws IOException, InterruptedException { - Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") - .overBaseCommit("100").withFs(fs).build(); + Writer writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); assertEquals("Just created this log, size should be 0", 0, writer.getCurrentSize()); assertTrue("Check all log files should start with a .", writer.getLogFile().getFileName().startsWith(".")); assertEquals("Version should be 1 for new log created", 1, writer.getLogFile().getLogVersion()); @@ -132,9 +132,9 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { @Test public void testBasicAppend() throws IOException, InterruptedException, URISyntaxException { - Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") - .overBaseCommit("100").withFs(fs).build(); + Writer writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); List records = SchemaTestUtil.generateTestRecords(0, 100); Map header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); @@ -150,9 +150,9 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { @Test public void testRollover() throws IOException, InterruptedException, URISyntaxException { - Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") - .overBaseCommit("100").withFs(fs).build(); + Writer writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); List records = SchemaTestUtil.generateTestRecords(0, 100); Map header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); @@ -165,9 +165,9 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { writer.close(); // Create a writer with the size threshold as the size we just wrote - so this has to roll - writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") - .withFs(fs).withSizeThreshold(size - 1).build(); + writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).withSizeThreshold(size - 1).build(); records = SchemaTestUtil.generateTestRecords(0, 100); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); dataBlock = new HoodieAvroDataBlock(records, header); @@ -194,11 +194,9 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { private void testConcurrentAppend(boolean logFileExists, boolean newLogFileFormat) throws Exception { HoodieLogFormat.WriterBuilder builder1 = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") - .overBaseCommit("100").withFs(fs); + .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100").withFs(fs); HoodieLogFormat.WriterBuilder builder2 = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") - .overBaseCommit("100").withFs(fs); + .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100").withFs(fs); if (newLogFileFormat && logFileExists) { // Assume there is an existing log-file with write token @@ -236,9 +234,9 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { @Test public void testMultipleAppend() throws IOException, URISyntaxException, InterruptedException { - Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") - .overBaseCommit("100").withFs(fs).build(); + Writer writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); List records = SchemaTestUtil.generateTestRecords(0, 100); Map header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); @@ -248,9 +246,9 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { long size1 = writer.getCurrentSize(); writer.close(); - writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") - .withFs(fs).build(); + writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); records = SchemaTestUtil.generateTestRecords(0, 100); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); dataBlock = new HoodieAvroDataBlock(records, header); @@ -262,9 +260,9 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { writer.close(); // Close and Open again and append 100 more records - writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") - .withFs(fs).build(); + writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); records = SchemaTestUtil.generateTestRecords(0, 100); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); dataBlock = new HoodieAvroDataBlock(records, header); @@ -285,40 +283,30 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { } /** - * This is actually a test on concurrent append and not recovery lease. - * Commenting this out. + * This is actually a test on concurrent append and not recovery lease. Commenting this out. * https://issues.apache.org/jira/browse/HUDI-117 */ /** - @Test - public void testLeaseRecovery() throws IOException, URISyntaxException, InterruptedException { - Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") - .overBaseCommit("100").withFs(fs).build(); - List records = SchemaTestUtil.generateTestRecords(0, 100); - Map header = Maps.newHashMap(); - header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); - header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header); - writer = writer.appendBlock(dataBlock); - long size1 = writer.getCurrentSize(); - // do not close this writer - this simulates a data note appending to a log dying without closing the file - // writer.close(); - - writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") - .withFs(fs).build(); - records = SchemaTestUtil.generateTestRecords(0, 100); - header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - dataBlock = new HoodieAvroDataBlock(records, header); - writer = writer.appendBlock(dataBlock); - long size2 = writer.getCurrentSize(); - assertTrue("We just wrote a new block - size2 should be > size1", size2 > size1); - assertEquals("Write should be auto-flushed. The size reported by FileStatus and the writer should match", size2, - fs.getFileStatus(writer.getLogFile().getPath()).getLen()); - writer.close(); - } - **/ + * @Test public void testLeaseRecovery() throws IOException, URISyntaxException, InterruptedException { Writer writer + * = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) + * .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") + * .overBaseCommit("100").withFs(fs).build(); List records = + * SchemaTestUtil.generateTestRecords(0, 100); Map header = + * Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); + * header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); HoodieAvroDataBlock + * dataBlock = new HoodieAvroDataBlock(records, header); writer = writer.appendBlock(dataBlock); long size1 = + * writer.getCurrentSize(); // do not close this writer - this simulates a data note appending to a log dying + * without closing the file // writer.close(); + * + * writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) + * .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") + * .withFs(fs).build(); records = SchemaTestUtil.generateTestRecords(0, 100); + * header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); dataBlock = new + * HoodieAvroDataBlock(records, header); writer = writer.appendBlock(dataBlock); long size2 = + * writer.getCurrentSize(); assertTrue("We just wrote a new block - size2 should be > size1", size2 > size1); + * assertEquals("Write should be auto-flushed. The size reported by FileStatus and the writer should match", + * size2, fs.getFileStatus(writer.getLogFile().getPath()).getLen()); writer.close(); } + **/ @Test public void testAppendNotSupported() throws IOException, URISyntaxException, InterruptedException { @@ -349,13 +337,13 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { @SuppressWarnings("unchecked") @Test public void testBasicWriteAndScan() throws IOException, URISyntaxException, InterruptedException { - Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") - .overBaseCommit("100").withFs(fs).build(); + Writer writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); Schema schema = getSimpleSchema(); List records = SchemaTestUtil.generateTestRecords(0, 100); - List copyOfRecords = records.stream().map( - record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); + List copyOfRecords = records.stream() + .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); Map header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); @@ -378,13 +366,13 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { @SuppressWarnings("unchecked") @Test public void testBasicAppendAndRead() throws IOException, URISyntaxException, InterruptedException { - Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") - .overBaseCommit("100").withFs(fs).build(); + Writer writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); List records1 = SchemaTestUtil.generateTestRecords(0, 100); Schema schema = getSimpleSchema(); - List copyOfRecords1 = records1.stream().map( - record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); + List copyOfRecords1 = records1.stream() + .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); Map header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); @@ -392,24 +380,24 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { writer = writer.appendBlock(dataBlock); writer.close(); - writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") - .withFs(fs).build(); + writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); List records2 = SchemaTestUtil.generateTestRecords(0, 100); - List copyOfRecords2 = records2.stream().map( - record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); + List copyOfRecords2 = records2.stream() + .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); dataBlock = new HoodieAvroDataBlock(records2, header); writer = writer.appendBlock(dataBlock); writer.close(); // Close and Open again and append 100 more records - writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") - .withFs(fs).build(); + writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); List records3 = SchemaTestUtil.generateTestRecords(0, 100); - List copyOfRecords3 = records3.stream().map( - record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); + List copyOfRecords3 = records3.stream() + .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); dataBlock = new HoodieAvroDataBlock(records3, header); writer = writer.appendBlock(dataBlock); @@ -445,9 +433,9 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { @SuppressWarnings("unchecked") @Test public void testBasicAppendAndScanMultipleFiles() throws IOException, URISyntaxException, InterruptedException { - Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withSizeThreshold(1024) - .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); + Writer writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withSizeThreshold(1024).withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); Map header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); @@ -459,8 +447,8 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { while (writer.getLogFile().getLogVersion() != 4) { logFiles.add(writer.getLogFile()); List records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100); - List copyOfRecords1 = records1.stream().map( - record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); + List copyOfRecords1 = records1.stream() + .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); allRecords.add(copyOfRecords1); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header); @@ -486,9 +474,9 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { @Test public void testAppendAndReadOnCorruptedLog() throws IOException, URISyntaxException, InterruptedException { - Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") - .overBaseCommit("100").withFs(fs).build(); + Writer writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); List records = SchemaTestUtil.generateTestRecords(0, 100); Map header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); @@ -521,7 +509,7 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { HoodieLogBlock block = reader.next(); assertEquals("The read block should be a corrupt block", HoodieLogBlockType.CORRUPT_BLOCK, block.getBlockType()); HoodieCorruptBlock corruptBlock = (HoodieCorruptBlock) block; - //assertEquals("", "something-random", new String(corruptBlock.getCorruptedBytes())); + // assertEquals("", "something-random", new String(corruptBlock.getCorruptedBytes())); assertFalse("There should be no more block left", reader.hasNext()); reader.close(); @@ -542,9 +530,9 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { outputStream.close(); // Should be able to append a new block - writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") - .withFs(fs).build(); + writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); records = SchemaTestUtil.generateTestRecords(0, 100); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); dataBlock = new HoodieAvroDataBlock(records, header); @@ -561,7 +549,7 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { block = reader.next(); assertEquals("The read block should be a corrupt block", HoodieLogBlockType.CORRUPT_BLOCK, block.getBlockType()); corruptBlock = (HoodieCorruptBlock) block; - //assertEquals("", "something-else-random", new String(corruptBlock.getCorruptedBytes())); + // assertEquals("", "something-else-random", new String(corruptBlock.getCorruptedBytes())); assertTrue("We should get the last block next", reader.hasNext()); reader.next(); assertFalse("We should have no more blocks left", reader.hasNext()); @@ -573,13 +561,13 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { public void testAvroLogRecordReaderBasic() throws IOException, URISyntaxException, InterruptedException { Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); // Set a small threshold so that every block is a new version - Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") - .overBaseCommit("100").withFs(fs).withSizeThreshold(500).build(); + Writer writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).withSizeThreshold(500).build(); // Write 1 List records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100); - List copyOfRecords1 = records1.stream().map( - record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); + List copyOfRecords1 = records1.stream() + .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); Map header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); @@ -589,26 +577,27 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { // Write 2 List records2 = SchemaTestUtil.generateHoodieTestRecords(0, 100); - List copyOfRecords2 = records2.stream().map( - record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); + List copyOfRecords2 = records2.stream() + .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); dataBlock = new HoodieAvroDataBlock(records2, header); writer = writer.appendBlock(dataBlock); writer.close(); - List allLogFiles = FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, - "100").map(s -> s.getPath().toString()).collect(Collectors.toList()); + List allLogFiles = + FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100") + .map(s -> s.getPath().toString()).collect(Collectors.toList()); - HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, - "100", 10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH); + HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, "100", + 10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH); assertEquals("", 200, scanner.getTotalLogRecords()); Set readKeys = new HashSet<>(200); scanner.forEach(s -> readKeys.add(s.getKey().getRecordKey())); assertEquals("Stream collect should return all 200 records", 200, readKeys.size()); copyOfRecords1.addAll(copyOfRecords2); - Set originalKeys = copyOfRecords1.stream() - .map(s -> ((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString()) - .collect(Collectors.toSet()); + Set originalKeys = + copyOfRecords1.stream().map(s -> ((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString()) + .collect(Collectors.toSet()); assertEquals("CompositeAvroLogReader should return 200 records from 2 versions", originalKeys, readKeys); } @@ -617,14 +606,14 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { throws IOException, URISyntaxException, InterruptedException { Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); // Set a small threshold so that every block is a new version - Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") - .overBaseCommit("100").withFs(fs).build(); + Writer writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); // Write 1 List records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100); - List copyOfRecords1 = records1.stream().map( - record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); + List copyOfRecords1 = records1.stream() + .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); Map header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); @@ -649,26 +638,27 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { // Write 3 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "102"); List records3 = SchemaTestUtil.generateHoodieTestRecords(0, 100); - List copyOfRecords3 = records3.stream().map( - record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); + List copyOfRecords3 = records3.stream() + .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); dataBlock = new HoodieAvroDataBlock(records3, header); writer = writer.appendBlock(dataBlock); writer.close(); - List allLogFiles = FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, - "100").map(s -> s.getPath().toString()).collect(Collectors.toList()); + List allLogFiles = + FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100") + .map(s -> s.getPath().toString()).collect(Collectors.toList()); - HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, - "102", 10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH); + HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, "102", + 10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH); assertEquals("We read 200 records from 2 write batches", 200, scanner.getTotalLogRecords()); Set readKeys = new HashSet<>(200); scanner.forEach(s -> readKeys.add(s.getKey().getRecordKey())); assertEquals("Stream collect should return all 200 records", 200, readKeys.size()); copyOfRecords1.addAll(copyOfRecords3); - Set originalKeys = copyOfRecords1.stream() - .map(s -> ((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString()) - .collect(Collectors.toSet()); + Set originalKeys = + copyOfRecords1.stream().map(s -> ((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString()) + .collect(Collectors.toSet()); assertEquals("CompositeAvroLogReader should return 200 records from 2 versions", originalKeys, readKeys); } @@ -677,14 +667,14 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { throws IOException, URISyntaxException, InterruptedException { Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); // Set a small threshold so that every block is a new version - Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") - .overBaseCommit("100").withFs(fs).build(); + Writer writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); // Write 1 List records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100); - List copyOfRecords1 = records1.stream().map( - record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); + List copyOfRecords1 = records1.stream() + .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); Map header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); @@ -718,35 +708,36 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE, String.valueOf(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal())); HoodieCommandBlock commandBlock = new HoodieCommandBlock(header); - writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") - .withFs(fs).build(); + writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); writer = writer.appendBlock(commandBlock); // Write 3 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "103"); List records3 = SchemaTestUtil.generateHoodieTestRecords(0, 100); - List copyOfRecords3 = records3.stream().map( - record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); + List copyOfRecords3 = records3.stream() + .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); dataBlock = new HoodieAvroDataBlock(records3, header); writer = writer.appendBlock(dataBlock); writer.close(); - List allLogFiles = FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, - "100").map(s -> s.getPath().toString()).collect(Collectors.toList()); + List allLogFiles = + FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100") + .map(s -> s.getPath().toString()).collect(Collectors.toList()); - HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, - "103", 10240L, true, false, bufferSize, BASE_OUTPUT_PATH); + HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, "103", + 10240L, true, false, bufferSize, BASE_OUTPUT_PATH); assertEquals("We would read 200 records", 200, scanner.getTotalLogRecords()); Set readKeys = new HashSet<>(200); scanner.forEach(s -> readKeys.add(s.getKey().getRecordKey())); assertEquals("Stream collect should return all 200 records", 200, readKeys.size()); copyOfRecords1.addAll(copyOfRecords3); - Set originalKeys = copyOfRecords1.stream() - .map(s -> ((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString()) - .collect(Collectors.toSet()); + Set originalKeys = + copyOfRecords1.stream().map(s -> ((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString()) + .collect(Collectors.toSet()); assertEquals("CompositeAvroLogReader should return 200 records from 2 versions", originalKeys, readKeys); } @@ -755,14 +746,14 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { throws IOException, URISyntaxException, InterruptedException { Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); // Set a small threshold so that every block is a new version - Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") - .overBaseCommit("100").withFs(fs).build(); + Writer writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); // Write 1 List records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100); - List copyOfRecords1 = records1.stream().map( - record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); + List copyOfRecords1 = records1.stream() + .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); Map header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); @@ -772,18 +763,19 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { // Write 2 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "101"); List records2 = SchemaTestUtil.generateHoodieTestRecords(0, 100); - List copyOfRecords2 = records2.stream().map( - record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); + List copyOfRecords2 = records2.stream() + .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); dataBlock = new HoodieAvroDataBlock(records2, header); writer = writer.appendBlock(dataBlock); copyOfRecords1.addAll(copyOfRecords2); - List originalKeys = copyOfRecords1.stream().map( - s -> ((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString()).collect(Collectors.toList()); + List originalKeys = + copyOfRecords1.stream().map(s -> ((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString()) + .collect(Collectors.toList()); // Delete 50 keys - List deletedKeys = copyOfRecords1.stream().map( - s -> (new HoodieKey(((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(), + List deletedKeys = copyOfRecords1.stream() + .map(s -> (new HoodieKey(((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(), ((GenericRecord) s).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString()))) .collect(Collectors.toList()).subList(0, 50); @@ -791,11 +783,12 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { HoodieDeleteBlock deleteBlock = new HoodieDeleteBlock(deletedKeys.toArray(new HoodieKey[50]), header); writer = writer.appendBlock(deleteBlock); - List allLogFiles = FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, - "100").map(s -> s.getPath().toString()).collect(Collectors.toList()); + List allLogFiles = + FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100") + .map(s -> s.getPath().toString()).collect(Collectors.toList()); - HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, - "102", 10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH); + HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, "102", + 10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH); assertEquals("We still would read 200 records", 200, scanner.getTotalLogRecords()); final List readKeys = new ArrayList<>(200); final List emptyPayloads = new ArrayList<>(); @@ -825,8 +818,8 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { writer = writer.appendBlock(commandBlock); readKeys.clear(); - scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, "101", - 10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH); + scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, "101", 10240L, readBlocksLazily, + false, bufferSize, BASE_OUTPUT_PATH); scanner.forEach(s -> readKeys.add(s.getKey().getRecordKey())); assertEquals("Stream collect should return all 200 records after rollback of delete", 200, readKeys.size()); } @@ -838,14 +831,14 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { // Write a Data block and Delete block with same InstantTime (written in same batch) Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); // Set a small threshold so that every block is a new version - Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") - .overBaseCommit("100").withFs(fs).build(); + Writer writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); // Write 1 List records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100); - List copyOfRecords1 = records1.stream().map( - record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); + List copyOfRecords1 = records1.stream() + .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); Map header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, "100"); @@ -860,13 +853,14 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { dataBlock = new HoodieAvroDataBlock(records2, header); writer = writer.appendBlock(dataBlock); - List originalKeys = copyOfRecords1.stream().map( - s -> ((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString()).collect(Collectors.toList()); + List originalKeys = + copyOfRecords1.stream().map(s -> ((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString()) + .collect(Collectors.toList()); // Delete 50 keys // Delete 50 keys - List deletedKeys = copyOfRecords1.stream().map( - s -> (new HoodieKey(((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(), + List deletedKeys = copyOfRecords1.stream() + .map(s -> (new HoodieKey(((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(), ((GenericRecord) s).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString()))) .collect(Collectors.toList()).subList(0, 50); @@ -887,12 +881,13 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { // Attempt 2 : Write another rollback blocks for a failed write writer = writer.appendBlock(commandBlock); - List allLogFiles = FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, - "100").map(s -> s.getPath().toString()).collect(Collectors.toList()); + List allLogFiles = + FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100") + .map(s -> s.getPath().toString()).collect(Collectors.toList()); // all data must be rolled back before merge - HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, - "100", 10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH); + HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, "100", + 10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH); assertEquals("We would have scanned 0 records because of rollback", 0, scanner.getTotalLogRecords()); final List readKeys = new ArrayList<>(); @@ -907,14 +902,14 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { // Write a Data block and Delete block with same InstantTime (written in same batch) Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); // Set a small threshold so that every block is a new version - Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") - .overBaseCommit("100").withFs(fs).build(); + Writer writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); // Write 1 List records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100); - List copyOfRecords1 = records1.stream().map( - record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); + List copyOfRecords1 = records1.stream() + .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); Map header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, "100"); @@ -922,13 +917,14 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header); writer = writer.appendBlock(dataBlock); - List originalKeys = copyOfRecords1.stream().map( - s -> ((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString()).collect(Collectors.toList()); + List originalKeys = + copyOfRecords1.stream().map(s -> ((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString()) + .collect(Collectors.toList()); // Delete 50 keys // Delete 50 keys - List deletedKeys = copyOfRecords1.stream().map( - s -> (new HoodieKey(((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(), + List deletedKeys = copyOfRecords1.stream() + .map(s -> (new HoodieKey(((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(), ((GenericRecord) s).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString()))) .collect(Collectors.toList()).subList(0, 50); HoodieDeleteBlock deleteBlock = new HoodieDeleteBlock(deletedKeys.toArray(new HoodieKey[50]), header); @@ -941,11 +937,12 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { writer = writer.appendBlock(commandBlock); writer = writer.appendBlock(commandBlock); - List allLogFiles = FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, - "100").map(s -> s.getPath().toString()).collect(Collectors.toList()); + List allLogFiles = + FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100") + .map(s -> s.getPath().toString()).collect(Collectors.toList()); - HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, - "100", 10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH); + HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, "100", + 10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH); assertEquals("We would read 0 records", 0, scanner.getTotalLogRecords()); } @@ -954,9 +951,9 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { throws IOException, URISyntaxException, InterruptedException { Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); // Set a small threshold so that every block is a new version - Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") - .overBaseCommit("100").withFs(fs).build(); + Writer writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); // Write 1 List records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100); @@ -973,11 +970,12 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { HoodieCommandBlock commandBlock = new HoodieCommandBlock(header); writer = writer.appendBlock(commandBlock); - List allLogFiles = FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, - "100").map(s -> s.getPath().toString()).collect(Collectors.toList()); + List allLogFiles = + FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100") + .map(s -> s.getPath().toString()).collect(Collectors.toList()); - HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, - "100", 10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH); + HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, "100", + 10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH); assertEquals("We still would read 100 records", 100, scanner.getTotalLogRecords()); final List readKeys = new ArrayList<>(100); scanner.forEach(s -> readKeys.add(s.getKey().getRecordKey())); @@ -991,14 +989,14 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { // Write a 3 Data blocs with same InstantTime (written in same batch) Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); // Set a small threshold so that every block is a new version - Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") - .overBaseCommit("100").withFs(fs).build(); + Writer writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); // Write 1 List records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100); - List copyOfRecords1 = records1.stream().map( - record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); + List copyOfRecords1 = records1.stream() + .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); Map header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, "100"); @@ -1008,13 +1006,14 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { writer = writer.appendBlock(dataBlock); writer = writer.appendBlock(dataBlock); - List originalKeys = copyOfRecords1.stream().map( - s -> ((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString()).collect(Collectors.toList()); + List originalKeys = + copyOfRecords1.stream().map(s -> ((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString()) + .collect(Collectors.toList()); // Delete 50 keys // Delete 50 keys - List deletedKeys = copyOfRecords1.stream().map( - s -> (new HoodieKey(((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(), + List deletedKeys = copyOfRecords1.stream() + .map(s -> (new HoodieKey(((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(), ((GenericRecord) s).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString()))) .collect(Collectors.toList()).subList(0, 50); HoodieDeleteBlock deleteBlock = new HoodieDeleteBlock(deletedKeys.toArray(new HoodieKey[50]), header); @@ -1027,11 +1026,12 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { HoodieCommandBlock commandBlock = new HoodieCommandBlock(header); writer = writer.appendBlock(commandBlock); - List allLogFiles = FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, - "100").map(s -> s.getPath().toString()).collect(Collectors.toList()); + List allLogFiles = + FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100") + .map(s -> s.getPath().toString()).collect(Collectors.toList()); - HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, - "101", 10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH); + HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, "101", + 10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH); assertEquals("We would read 0 records", 0, scanner.getTotalLogRecords()); } @@ -1042,9 +1042,9 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { // Write a 3 Data blocs with same InstantTime (written in same batch) Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); // Set a small threshold so that every block is a new version - Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") - .overBaseCommit("100").withFs(fs).build(); + Writer writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); // Write 1 List records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100); @@ -1083,9 +1083,9 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { outputStream.flush(); outputStream.close(); - writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") - .withFs(fs).build(); + writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); writer = writer.appendBlock(dataBlock); writer.close(); @@ -1103,9 +1103,9 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { outputStream.flush(); outputStream.close(); - writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") - .withFs(fs).build(); + writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); // Write 1 rollback block for the last commit instant header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "101"); header.put(HeaderMetadataType.TARGET_INSTANT_TIME, "100"); @@ -1115,24 +1115,25 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { writer = writer.appendBlock(commandBlock); writer.close(); - List allLogFiles = FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, - "100").map(s -> s.getPath().toString()).collect(Collectors.toList()); + List allLogFiles = + FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100") + .map(s -> s.getPath().toString()).collect(Collectors.toList()); - HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, - "101", 10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH); + HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, "101", + 10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH); assertEquals("We would read 0 records", 0, scanner.getTotalLogRecords()); } @SuppressWarnings("unchecked") @Test public void testBasicAppendAndReadInReverse() throws IOException, URISyntaxException, InterruptedException { - Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") - .overBaseCommit("100").withFs(fs).build(); + Writer writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); Schema schema = getSimpleSchema(); List records1 = SchemaTestUtil.generateTestRecords(0, 100); - List copyOfRecords1 = records1.stream().map( - record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); + List copyOfRecords1 = records1.stream() + .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); Map header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); @@ -1140,23 +1141,23 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { writer = writer.appendBlock(dataBlock); writer.close(); - writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") - .withFs(fs).build(); + writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); List records2 = SchemaTestUtil.generateTestRecords(0, 100); - List copyOfRecords2 = records2.stream().map( - record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); + List copyOfRecords2 = records2.stream() + .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); dataBlock = new HoodieAvroDataBlock(records2, header); writer = writer.appendBlock(dataBlock); writer.close(); // Close and Open again and append 100 more records - writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") - .withFs(fs).build(); + writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); List records3 = SchemaTestUtil.generateTestRecords(0, 100); - List copyOfRecords3 = records3.stream().map( - record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); + List copyOfRecords3 = records3.stream() + .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); dataBlock = new HoodieAvroDataBlock(records3, header); writer = writer.appendBlock(dataBlock); writer.close(); @@ -1195,9 +1196,9 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { @Test public void testAppendAndReadOnCorruptedLogInReverse() throws IOException, URISyntaxException, InterruptedException { - Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") - .overBaseCommit("100").withFs(fs).build(); + Writer writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); Schema schema = getSimpleSchema(); List records = SchemaTestUtil.generateTestRecords(0, 100); Map header = Maps.newHashMap(); @@ -1225,17 +1226,17 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { outputStream.close(); // Should be able to append a new block - writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") - .withFs(fs).build(); + writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); records = SchemaTestUtil.generateTestRecords(0, 100); dataBlock = new HoodieAvroDataBlock(records, header); writer = writer.appendBlock(dataBlock); writer.close(); // First round of reads - we should be able to read the first block and then EOF - HoodieLogFileReader reader = new HoodieLogFileReader(fs, writer.getLogFile(), schema, bufferSize, - readBlocksLazily, true); + HoodieLogFileReader reader = + new HoodieLogFileReader(fs, writer.getLogFile(), schema, bufferSize, readBlocksLazily, true); assertTrue("Last block should be available", reader.hasPrev()); HoodieLogBlock block = reader.prev(); @@ -1254,13 +1255,13 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { @SuppressWarnings("unchecked") @Test public void testBasicAppendAndTraverseInReverse() throws IOException, URISyntaxException, InterruptedException { - Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") - .overBaseCommit("100").withFs(fs).build(); + Writer writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); Schema schema = getSimpleSchema(); List records1 = SchemaTestUtil.generateTestRecords(0, 100); - List copyOfRecords1 = records1.stream().map( - record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); + List copyOfRecords1 = records1.stream() + .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); Map header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); @@ -1268,23 +1269,23 @@ public class HoodieLogFormatTest extends HoodieCommonTestHarness { writer = writer.appendBlock(dataBlock); writer.close(); - writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") - .withFs(fs).build(); + writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); List records2 = SchemaTestUtil.generateTestRecords(0, 100); - List copyOfRecords2 = records2.stream().map( - record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); + List copyOfRecords2 = records2.stream() + .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); dataBlock = new HoodieAvroDataBlock(records2, header); writer = writer.appendBlock(dataBlock); writer.close(); // Close and Open again and append 100 more records - writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") - .withFs(fs).build(); + writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); List records3 = SchemaTestUtil.generateTestRecords(0, 100); - List copyOfRecords3 = records3.stream().map( - record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); + List copyOfRecords3 = records3.stream() + .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); dataBlock = new HoodieAvroDataBlock(records3, header); writer = writer.appendBlock(dataBlock); writer.close(); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/string/HoodieActiveTimelineTest.java b/hudi-common/src/test/java/org/apache/hudi/common/table/string/HoodieActiveTimelineTest.java index 81933ab19..542fff285 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/string/HoodieActiveTimelineTest.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/string/HoodieActiveTimelineTest.java @@ -98,12 +98,10 @@ public class HoodieActiveTimelineTest extends HoodieCommonTestHarness { public void testTimelineOperations() throws Exception { timeline = new MockHoodieTimeline(Stream.of("01", "03", "05", "07", "09", "11", "13", "15", "17", "19"), Stream.of("21", "23")); - HoodieTestUtils.assertStreamEquals("", Stream.of("05", "07", "09", "11"), - timeline.getCommitTimeline().filterCompletedInstants().findInstantsInRange("04", "11").getInstants() - .map(HoodieInstant::getTimestamp)); - HoodieTestUtils.assertStreamEquals("", Stream.of("09", "11"), - timeline.getCommitTimeline().filterCompletedInstants().findInstantsAfter("07", 2).getInstants() - .map(HoodieInstant::getTimestamp)); + HoodieTestUtils.assertStreamEquals("", Stream.of("05", "07", "09", "11"), timeline.getCommitTimeline() + .filterCompletedInstants().findInstantsInRange("04", "11").getInstants().map(HoodieInstant::getTimestamp)); + HoodieTestUtils.assertStreamEquals("", Stream.of("09", "11"), timeline.getCommitTimeline().filterCompletedInstants() + .findInstantsAfter("07", 2).getInstants().map(HoodieInstant::getTimestamp)); assertFalse(timeline.empty()); assertFalse(timeline.getCommitTimeline().filterInflightsExcludingCompaction().empty()); assertEquals("", 12, timeline.countInstants()); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/string/MockHoodieTimeline.java b/hudi-common/src/test/java/org/apache/hudi/common/table/string/MockHoodieTimeline.java index e30b61a4a..72df1c02b 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/string/MockHoodieTimeline.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/string/MockHoodieTimeline.java @@ -31,8 +31,9 @@ public class MockHoodieTimeline extends HoodieActiveTimeline { public MockHoodieTimeline(Stream completed, Stream inflights) throws IOException { super(); - this.setInstants(Stream.concat(completed.map(s -> new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, s)), - inflights.map(s -> new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, s))) + this.setInstants(Stream + .concat(completed.map(s -> new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, s)), + inflights.map(s -> new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, s))) .sorted(Comparator.comparing(new Function() { @Override public String apply(HoodieInstant hoodieInstant) { diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/view/HoodieTableFileSystemViewTest.java b/hudi-common/src/test/java/org/apache/hudi/common/table/view/HoodieTableFileSystemViewTest.java index 85af25e65..a439ada89 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/view/HoodieTableFileSystemViewTest.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/view/HoodieTableFileSystemViewTest.java @@ -96,8 +96,8 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { } /** - * Test case for view generation on a file group where - * the only file-slice does not have data-file. This is the case where upserts directly go to log-files + * Test case for view generation on a file group where the only file-slice does not have data-file. This is the case + * where upserts directly go to log-files */ @Test public void testViewForFileSlicesWithNoBaseFile() throws Exception { @@ -113,10 +113,10 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { String instantTime1 = "1"; String deltaInstantTime1 = "2"; String deltaInstantTime2 = "3"; - String fileName1 = FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, - instantTime1, 0, TEST_WRITE_TOKEN); - String fileName2 = FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, - instantTime1, 1, TEST_WRITE_TOKEN); + String fileName1 = + FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, instantTime1, 0, TEST_WRITE_TOKEN); + String fileName2 = + FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, instantTime1, 1, TEST_WRITE_TOKEN); new File(basePath + "/" + partitionPath + "/" + fileName1).createNewFile(); new File(basePath + "/" + partitionPath + "/" + fileName2).createNewFile(); HoodieActiveTimeline commitTimeline = metaClient.getActiveTimeline(); @@ -144,8 +144,8 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { assertEquals("Log File Order check", fileName1, logFiles.get(1).getFileName()); // Check Merged File Slices API - fileSliceList = rtView.getLatestMergedFileSlicesBeforeOrOn(partitionPath, deltaInstantTime2) - .collect(Collectors.toList()); + fileSliceList = + rtView.getLatestMergedFileSlicesBeforeOrOn(partitionPath, deltaInstantTime2).collect(Collectors.toList()); assertEquals(1, fileSliceList.size()); fileSlice = fileSliceList.get(0); assertEquals("File-Id must be set correctly", fileId, fileSlice.getFileId()); @@ -198,41 +198,40 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { /** * Returns all file-slices including uncommitted ones. + * * @param partitionPath * @return */ private Stream getAllRawFileSlices(String partitionPath) { - return fsView.getAllFileGroups(partitionPath) - .map(group -> group.getAllFileSlicesIncludingInflight()) + return fsView.getAllFileGroups(partitionPath).map(group -> group.getAllFileSlicesIncludingInflight()) .flatMap(sliceList -> sliceList); } /** - * Returns latest raw file-slices including uncommitted ones. + * Returns latest raw file-slices including uncommitted ones. + * * @param partitionPath * @return */ public Stream getLatestRawFileSlices(String partitionPath) { - return fsView.getAllFileGroups(partitionPath) - .map(fileGroup -> fileGroup.getLatestFileSlicesIncludingInflight()) - .filter(fileSliceOpt -> fileSliceOpt.isPresent()) - .map(Option::get); + return fsView.getAllFileGroups(partitionPath).map(fileGroup -> fileGroup.getLatestFileSlicesIncludingInflight()) + .filter(fileSliceOpt -> fileSliceOpt.isPresent()).map(Option::get); } /** * Helper method to test Views in the presence of concurrent compaction - * @param skipCreatingDataFile if set, first File Slice will not have data-file set. This would - * simulate inserts going directly to log files - * @param isCompactionInFlight if set, compaction was inflight (running) when view was tested first time, - * otherwise compaction was in requested state - * @param expTotalFileSlices Total number of file-slices across file-groups in the partition path - * @param expTotalDataFiles Total number of data-files across file-groups in the partition path + * + * @param skipCreatingDataFile if set, first File Slice will not have data-file set. This would simulate inserts going + * directly to log files + * @param isCompactionInFlight if set, compaction was inflight (running) when view was tested first time, otherwise + * compaction was in requested state + * @param expTotalFileSlices Total number of file-slices across file-groups in the partition path + * @param expTotalDataFiles Total number of data-files across file-groups in the partition path * @param includeInvalidAndInflight Whether view includes inflight and invalid file-groups. * @throws Exception */ - protected void testViewForFileSlicesWithAsyncCompaction(boolean skipCreatingDataFile, - boolean isCompactionInFlight, int expTotalFileSlices, int expTotalDataFiles, - boolean includeInvalidAndInflight) throws Exception { + protected void testViewForFileSlicesWithAsyncCompaction(boolean skipCreatingDataFile, boolean isCompactionInFlight, + int expTotalFileSlices, int expTotalDataFiles, boolean includeInvalidAndInflight) throws Exception { String partitionPath = "2016/05/01"; new File(basePath + "/" + partitionPath).mkdirs(); String fileId = UUID.randomUUID().toString(); @@ -247,10 +246,10 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { dataFileName = FSUtils.makeDataFileName(instantTime1, TEST_WRITE_TOKEN, fileId); new File(basePath + "/" + partitionPath + "/" + dataFileName).createNewFile(); } - String fileName1 = FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, - instantTime1, 0, TEST_WRITE_TOKEN); - String fileName2 = FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, - instantTime1, 1, TEST_WRITE_TOKEN); + String fileName1 = + FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, instantTime1, 0, TEST_WRITE_TOKEN); + String fileName2 = + FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, instantTime1, 1, TEST_WRITE_TOKEN); new File(basePath + "/" + partitionPath + "/" + fileName1).createNewFile(); new File(basePath + "/" + partitionPath + "/" + fileName2).createNewFile(); HoodieActiveTimeline commitTimeline = metaClient.getActiveTimeline(); @@ -268,8 +267,8 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { String compactDataFileName = FSUtils.makeDataFileName(compactionRequestedTime, TEST_WRITE_TOKEN, fileId); List> partitionFileSlicesPairs = new ArrayList<>(); partitionFileSlicesPairs.add(Pair.of(partitionPath, fileSlices.get(0))); - HoodieCompactionPlan compactionPlan = CompactionUtils.buildFromFileSlices(partitionFileSlicesPairs, - Option.empty(), Option.empty()); + HoodieCompactionPlan compactionPlan = + CompactionUtils.buildFromFileSlices(partitionFileSlicesPairs, Option.empty(), Option.empty()); HoodieInstant compactionInstant = null; if (isCompactionInFlight) { // Create a Data-file but this should be skipped by view @@ -297,10 +296,10 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { String deltaInstantTime5 = "6"; List allInstantTimes = Arrays.asList(instantTime1, deltaInstantTime1, deltaInstantTime2, compactionRequestedTime, deltaInstantTime4, deltaInstantTime5); - String fileName3 = FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, - compactionRequestedTime, 0, TEST_WRITE_TOKEN); - String fileName4 = FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, - compactionRequestedTime, 1, TEST_WRITE_TOKEN); + String fileName3 = + FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, compactionRequestedTime, 0, TEST_WRITE_TOKEN); + String fileName4 = + FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, compactionRequestedTime, 1, TEST_WRITE_TOKEN); new File(basePath + "/" + partitionPath + "/" + fileName3).createNewFile(); new File(basePath + "/" + partitionPath + "/" + fileName4).createNewFile(); HoodieInstant deltaInstant4 = new HoodieInstant(true, HoodieTimeline.DELTA_COMMIT_ACTION, deltaInstantTime4); @@ -318,8 +317,8 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { } /** Merge API Tests **/ - List fileSliceList = rtView.getLatestMergedFileSlicesBeforeOrOn(partitionPath, deltaInstantTime5) - .collect(Collectors.toList()); + List fileSliceList = + rtView.getLatestMergedFileSlicesBeforeOrOn(partitionPath, deltaInstantTime5).collect(Collectors.toList()); assertEquals("Expect file-slice to be merged", 1, fileSliceList.size()); FileSlice fileSlice = fileSliceList.get(0); assertEquals(fileId, fileSlice.getFileId()); @@ -337,8 +336,8 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { assertEquals("Log File Order check", fileName2, logFiles.get(2).getFileName()); assertEquals("Log File Order check", fileName1, logFiles.get(3).getFileName()); - fileSliceList = rtView.getLatestFileSlicesBeforeOrOn(partitionPath, deltaInstantTime5, true) - .collect(Collectors.toList()); + fileSliceList = + rtView.getLatestFileSlicesBeforeOrOn(partitionPath, deltaInstantTime5, true).collect(Collectors.toList()); assertEquals("Expect only one file-id", 1, fileSliceList.size()); fileSlice = fileSliceList.get(0); assertEquals(fileId, fileSlice.getFileId()); @@ -350,7 +349,7 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { assertEquals("Log File Order check", fileName4, logFiles.get(0).getFileName()); assertEquals("Log File Order check", fileName3, logFiles.get(1).getFileName()); - /** Data Files API tests */ + /** Data Files API tests */ dataFiles = roView.getLatestDataFiles().collect(Collectors.toList()); if (skipCreatingDataFile) { assertEquals("Expect no data file to be returned", 0, dataFiles.size()); @@ -411,8 +410,9 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { inflightDeltaInstantTime, 0, TEST_WRITE_TOKEN); new File(basePath + "/" + partitionPath + "/" + inflightLogFileName).createNewFile(); // Mark instant as inflight - commitTimeline.saveToInflight(new HoodieInstant(State.INFLIGHT, HoodieTimeline.DELTA_COMMIT_ACTION, - inflightDeltaInstantTime), Option.empty()); + commitTimeline.saveToInflight( + new HoodieInstant(State.INFLIGHT, HoodieTimeline.DELTA_COMMIT_ACTION, inflightDeltaInstantTime), + Option.empty()); refreshFsView(); List allRawFileSlices = getAllRawFileSlices(partitionPath).collect(Collectors.toList()); @@ -424,8 +424,8 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { }).collect(Collectors.toList()); if (includeInvalidAndInflight) { - assertEquals("Inflight/Orphan data-file is also expected", 2 - + (isCompactionInFlight ? 1 : 0) + (skipCreatingDataFile ? 0 : 1), dataFiles.size()); + assertEquals("Inflight/Orphan data-file is also expected", + 2 + (isCompactionInFlight ? 1 : 0) + (skipCreatingDataFile ? 0 : 1), dataFiles.size()); Set fileNames = dataFiles.stream().map(HoodieDataFile::getFileName).collect(Collectors.toSet()); assertTrue("Expect orphan data-file to be present", fileNames.contains(orphanDataFileName)); assertTrue("Expect inflight data-file to be present", fileNames.contains(inflightDataFileName)); @@ -438,8 +438,8 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { } fileSliceList = getLatestRawFileSlices(partitionPath).collect(Collectors.toList()); - assertEquals("Expect both inflight and orphan file-slice to be included", - includeInvalidAndInflight ? 5 : 1, fileSliceList.size()); + assertEquals("Expect both inflight and orphan file-slice to be included", includeInvalidAndInflight ? 5 : 1, + fileSliceList.size()); Map fileSliceMap = fileSliceList.stream().collect(Collectors.toMap(FileSlice::getFileId, r -> r)); FileSlice orphanFileSliceWithDataFile = fileSliceMap.get(orphanFileId1); @@ -465,8 +465,7 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { orphanFileSliceWithLogFile.getDataFile().isPresent()); logFiles = orphanFileSliceWithLogFile.getLogFiles().collect(Collectors.toList()); assertEquals("Orphan File Slice with log-file check data-file", 1, logFiles.size()); - assertEquals("Orphan File Slice with log-file check data-file", orphanLogFileName, - logFiles.get(0).getFileName()); + assertEquals("Orphan File Slice with log-file check data-file", orphanLogFileName, logFiles.get(0).getFileName()); assertEquals("Inflight File Slice with log-file check base-commit", inflightDeltaInstantTime, inflightFileSliceWithLogFile.getBaseInstantTime()); assertFalse("Inflight File Slice with log-file check data-file", @@ -495,46 +494,42 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { fileSliceList = rtView.getLatestFileSlices(partitionPath).collect(Collectors.toList()); log.info("FILESLICE LIST=" + fileSliceList); - dataFiles = fileSliceList.stream().map(FileSlice::getDataFile) - .filter(Option::isPresent).map(Option::get).collect(Collectors.toList()); + dataFiles = fileSliceList.stream().map(FileSlice::getDataFile).filter(Option::isPresent).map(Option::get) + .collect(Collectors.toList()); assertEquals("Expect only one data-files in latest view as there is only one file-group", 1, dataFiles.size()); assertEquals("Data Filename must match", compactDataFileName, dataFiles.get(0).getFileName()); assertEquals("Only one latest file-slice in the partition", 1, fileSliceList.size()); fileSlice = fileSliceList.get(0); assertEquals("Check file-Id is set correctly", fileId, fileSlice.getFileId()); - assertEquals("Check data-filename is set correctly", - compactDataFileName, fileSlice.getDataFile().get().getFileName()); - assertEquals("Ensure base-instant is now compaction request instant", - compactionRequestedTime, fileSlice.getBaseInstantTime()); + assertEquals("Check data-filename is set correctly", compactDataFileName, + fileSlice.getDataFile().get().getFileName()); + assertEquals("Ensure base-instant is now compaction request instant", compactionRequestedTime, + fileSlice.getBaseInstantTime()); logFiles = fileSlice.getLogFiles().collect(Collectors.toList()); assertEquals("Only log-files after compaction request shows up", 2, logFiles.size()); assertEquals("Log File Order check", fileName4, logFiles.get(0).getFileName()); assertEquals("Log File Order check", fileName3, logFiles.get(1).getFileName()); - /** Data Files API tests */ + /** Data Files API tests */ dataFiles = roView.getLatestDataFiles().collect(Collectors.toList()); assertEquals("Expect only one data-file to be sent", 1, dataFiles.size()); dataFiles.stream().forEach(df -> { - assertEquals("Expect data-file created by compaction be returned", df.getCommitTime(), - compactionRequestedTime); + assertEquals("Expect data-file created by compaction be returned", df.getCommitTime(), compactionRequestedTime); }); dataFiles = roView.getLatestDataFiles(partitionPath).collect(Collectors.toList()); assertEquals("Expect only one data-file to be sent", 1, dataFiles.size()); dataFiles.stream().forEach(df -> { - assertEquals("Expect data-file created by compaction be returned", df.getCommitTime(), - compactionRequestedTime); + assertEquals("Expect data-file created by compaction be returned", df.getCommitTime(), compactionRequestedTime); }); dataFiles = roView.getLatestDataFilesBeforeOrOn(partitionPath, deltaInstantTime5).collect(Collectors.toList()); assertEquals("Expect only one data-file to be sent", 1, dataFiles.size()); dataFiles.stream().forEach(df -> { - assertEquals("Expect data-file created by compaction be returned", df.getCommitTime(), - compactionRequestedTime); + assertEquals("Expect data-file created by compaction be returned", df.getCommitTime(), compactionRequestedTime); }); dataFiles = roView.getLatestDataFilesInRange(allInstantTimes).collect(Collectors.toList()); assertEquals("Expect only one data-file to be sent", 1, dataFiles.size()); dataFiles.stream().forEach(df -> { - assertEquals("Expect data-file created by compaction be returned", df.getCommitTime(), - compactionRequestedTime); + assertEquals("Expect data-file created by compaction be returned", df.getCommitTime(), compactionRequestedTime); }); assertEquals("Total number of file-slices in partitions matches expected", expTotalFileSlices, @@ -542,8 +537,8 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { assertEquals("Total number of data-files in partitions matches expected", expTotalDataFiles, roView.getAllDataFiles(partitionPath).count()); // file-groups includes inflight/invalid file-ids - assertEquals("Total number of file-groups in partitions matches expected", - 5, fsView.getAllFileGroups(partitionPath).count()); + assertEquals("Total number of file-groups in partitions matches expected", 5, + fsView.getAllFileGroups(partitionPath).count()); } @Test @@ -552,44 +547,39 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { new File(basePath + "/" + partitionPath).mkdirs(); String fileId = UUID.randomUUID().toString(); - assertFalse("No commit, should not find any data file", - roView.getLatestDataFiles(partitionPath).filter(dfile -> dfile.getFileId().equals(fileId)).findFirst() - .isPresent()); + assertFalse("No commit, should not find any data file", roView.getLatestDataFiles(partitionPath) + .filter(dfile -> dfile.getFileId().equals(fileId)).findFirst().isPresent()); // Only one commit, but is not safe String commitTime1 = "1"; String fileName1 = FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId); new File(basePath + "/" + partitionPath + "/" + fileName1).createNewFile(); refreshFsView(); - assertFalse("No commit, should not find any data file", - roView.getLatestDataFiles(partitionPath).filter(dfile -> dfile.getFileId().equals(fileId)).findFirst() - .isPresent()); + assertFalse("No commit, should not find any data file", roView.getLatestDataFiles(partitionPath) + .filter(dfile -> dfile.getFileId().equals(fileId)).findFirst().isPresent()); // Make this commit safe HoodieActiveTimeline commitTimeline = metaClient.getActiveTimeline(); HoodieInstant instant1 = new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, commitTime1); commitTimeline.saveAsComplete(instant1, Option.empty()); refreshFsView(); - assertEquals("", fileName1, - roView.getLatestDataFiles(partitionPath).filter(dfile -> dfile.getFileId().equals(fileId)).findFirst().get() - .getFileName()); + assertEquals("", fileName1, roView.getLatestDataFiles(partitionPath) + .filter(dfile -> dfile.getFileId().equals(fileId)).findFirst().get().getFileName()); // Do another commit, but not safe String commitTime2 = "2"; String fileName2 = FSUtils.makeDataFileName(commitTime2, TEST_WRITE_TOKEN, fileId); new File(basePath + "/" + partitionPath + "/" + fileName2).createNewFile(); refreshFsView(); - assertEquals("", fileName1, - roView.getLatestDataFiles(partitionPath).filter(dfile -> dfile.getFileId().equals(fileId)).findFirst().get() - .getFileName()); + assertEquals("", fileName1, roView.getLatestDataFiles(partitionPath) + .filter(dfile -> dfile.getFileId().equals(fileId)).findFirst().get().getFileName()); // Make it safe HoodieInstant instant2 = new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, commitTime2); commitTimeline.saveAsComplete(instant2, Option.empty()); refreshFsView(); - assertEquals("", fileName2, - roView.getLatestDataFiles(partitionPath).filter(dfile -> dfile.getFileId().equals(fileId)).findFirst().get() - .getFileName()); + assertEquals("", fileName2, roView.getLatestDataFiles(partitionPath) + .filter(dfile -> dfile.getFileId().equals(fileId)).findFirst().get().getFileName()); } @Test @@ -614,19 +604,23 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId1)).createNewFile(); new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, TEST_WRITE_TOKEN, fileId1)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, - commitTime4, 0, TEST_WRITE_TOKEN)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, - commitTime4, 1, TEST_WRITE_TOKEN)).createNewFile(); + new File(fullPartitionPath + + FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime4, 0, TEST_WRITE_TOKEN)) + .createNewFile(); + new File(fullPartitionPath + + FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime4, 1, TEST_WRITE_TOKEN)) + .createNewFile(); new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId2)).createNewFile(); new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime2, TEST_WRITE_TOKEN, fileId2)).createNewFile(); new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId2)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeLogFileName(fileId2, - HoodieLogFile.DELTA_EXTENSION, commitTime3, 0, TEST_WRITE_TOKEN)).createNewFile(); + new File(fullPartitionPath + + FSUtils.makeLogFileName(fileId2, HoodieLogFile.DELTA_EXTENSION, commitTime3, 0, TEST_WRITE_TOKEN)) + .createNewFile(); new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId3)).createNewFile(); new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, TEST_WRITE_TOKEN, fileId3)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeLogFileName(fileId4, HoodieLogFile.DELTA_EXTENSION, - commitTime4, 0, TEST_WRITE_TOKEN)).createNewFile(); + new File(fullPartitionPath + + FSUtils.makeLogFileName(fileId4, HoodieLogFile.DELTA_EXTENSION, commitTime4, 0, TEST_WRITE_TOKEN)) + .createNewFile(); // Create commit/clean files new File(basePath + "/.hoodie/" + cleanTime1 + ".clean").createNewFile(); @@ -638,7 +632,7 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { testStreamLatestVersionInPartition(isLatestFileSliceOnly, fullPartitionPath, commitTime1, commitTime2, commitTime3, commitTime4, fileId1, fileId2, fileId3, fileId4); - // Now create a scenario where archiving deleted commits (1,2, and 3) but retained cleaner clean1. Now clean1 is + // Now create a scenario where archiving deleted commits (1,2, and 3) but retained cleaner clean1. Now clean1 is // the lowest commit time. Scenario for HUDI-162 - Here clean is the earliest action in active timeline new File(basePath + "/.hoodie/" + commitTime1 + ".commit").delete(); new File(basePath + "/.hoodie/" + commitTime2 + ".commit").delete(); @@ -659,15 +653,15 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { // Check files as of lastest commit. List allSlices = rtView.getAllFileSlices("2016/05/01").collect(Collectors.toList()); assertEquals(isLatestFileSliceOnly ? 4 : 8, allSlices.size()); - Map fileSliceMap = allSlices.stream().collect( - Collectors.groupingBy(slice -> slice.getFileId(), Collectors.counting())); + Map fileSliceMap = + allSlices.stream().collect(Collectors.groupingBy(slice -> slice.getFileId(), Collectors.counting())); assertEquals(isLatestFileSliceOnly ? 1 : 2, fileSliceMap.get(fileId1).longValue()); assertEquals(isLatestFileSliceOnly ? 1 : 3, fileSliceMap.get(fileId2).longValue()); assertEquals(isLatestFileSliceOnly ? 1 : 2, fileSliceMap.get(fileId3).longValue()); assertEquals(1, fileSliceMap.get(fileId4).longValue()); - List dataFileList = roView.getLatestDataFilesBeforeOrOn("2016/05/01", commitTime4) - .collect(Collectors.toList()); + List dataFileList = + roView.getLatestDataFilesBeforeOrOn("2016/05/01", commitTime4).collect(Collectors.toList()); assertEquals(3, dataFileList.size()); Set filenames = Sets.newHashSet(); for (HoodieDataFile status : dataFileList) { @@ -679,24 +673,23 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { filenames = Sets.newHashSet(); List logFilesList = rtView.getLatestFileSlicesBeforeOrOn("2016/05/01", commitTime4, true) - .map(slice -> slice.getLogFiles()).flatMap(logFileList -> logFileList) - .collect(Collectors.toList()); + .map(slice -> slice.getLogFiles()).flatMap(logFileList -> logFileList).collect(Collectors.toList()); assertEquals(logFilesList.size(), 4); for (HoodieLogFile logFile : logFilesList) { filenames.add(logFile.getFileName()); } - assertTrue(filenames.contains(FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, - commitTime4, 0, TEST_WRITE_TOKEN))); - assertTrue(filenames.contains(FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, - commitTime4, 1, TEST_WRITE_TOKEN))); - assertTrue(filenames.contains(FSUtils.makeLogFileName(fileId2, HoodieLogFile.DELTA_EXTENSION, - commitTime3, 0, TEST_WRITE_TOKEN))); - assertTrue(filenames.contains(FSUtils.makeLogFileName(fileId4, HoodieLogFile.DELTA_EXTENSION, - commitTime4, 0, TEST_WRITE_TOKEN))); + assertTrue(filenames + .contains(FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime4, 0, TEST_WRITE_TOKEN))); + assertTrue(filenames + .contains(FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime4, 1, TEST_WRITE_TOKEN))); + assertTrue(filenames + .contains(FSUtils.makeLogFileName(fileId2, HoodieLogFile.DELTA_EXTENSION, commitTime3, 0, TEST_WRITE_TOKEN))); + assertTrue(filenames + .contains(FSUtils.makeLogFileName(fileId4, HoodieLogFile.DELTA_EXTENSION, commitTime4, 0, TEST_WRITE_TOKEN))); // Reset the max commit time - List dataFiles = roView.getLatestDataFilesBeforeOrOn("2016/05/01", commitTime3) - .collect(Collectors.toList()); + List dataFiles = + roView.getLatestDataFilesBeforeOrOn("2016/05/01", commitTime3).collect(Collectors.toList()); filenames = Sets.newHashSet(); for (HoodieDataFile status : dataFiles) { filenames.add(status.getFileName()); @@ -711,9 +704,8 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId2))); } - logFilesList = - rtView.getLatestFileSlicesBeforeOrOn("2016/05/01", commitTime3, true).map(slice -> slice.getLogFiles()) - .flatMap(logFileList -> logFileList).collect(Collectors.toList()); + logFilesList = rtView.getLatestFileSlicesBeforeOrOn("2016/05/01", commitTime3, true) + .map(slice -> slice.getLogFiles()).flatMap(logFileList -> logFileList).collect(Collectors.toList()); assertEquals(logFilesList.size(), 1); assertTrue(logFilesList.get(0).getFileName() .equals(FSUtils.makeLogFileName(fileId2, HoodieLogFile.DELTA_EXTENSION, commitTime3, 0, TEST_WRITE_TOKEN))); @@ -806,15 +798,17 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { String fileId3 = UUID.randomUUID().toString(); new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId1)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, - commitTime1, 0, TEST_WRITE_TOKEN)).createNewFile(); + new File(fullPartitionPath + + FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime1, 0, TEST_WRITE_TOKEN)) + .createNewFile(); new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId1)).createNewFile(); new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId2)).createNewFile(); new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime2, TEST_WRITE_TOKEN, fileId2)).createNewFile(); new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId2)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeLogFileName(fileId2, HoodieLogFile.DELTA_EXTENSION, - commitTime3, 0, TEST_WRITE_TOKEN)).createNewFile(); + new File(fullPartitionPath + + FSUtils.makeLogFileName(fileId2, HoodieLogFile.DELTA_EXTENSION, commitTime3, 0, TEST_WRITE_TOKEN)) + .createNewFile(); new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId3)).createNewFile(); new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, TEST_WRITE_TOKEN, fileId3)).createNewFile(); @@ -832,8 +826,8 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { // Populate view for partition roView.getAllDataFiles("2016/05/01/"); - List dataFiles = roView.getLatestDataFilesInRange(Lists.newArrayList(commitTime2, commitTime3)) - .collect(Collectors.toList()); + List dataFiles = + roView.getLatestDataFilesInRange(Lists.newArrayList(commitTime2, commitTime3)).collect(Collectors.toList()); assertEquals(isLatestFileSliceOnly ? 2 : 3, dataFiles.size()); Set filenames = Sets.newHashSet(); for (HoodieDataFile status : dataFiles) { @@ -846,8 +840,8 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId3))); } - List slices = rtView.getLatestFileSliceInRange(Lists.newArrayList(commitTime3, commitTime4)) - .collect(Collectors.toList()); + List slices = + rtView.getLatestFileSliceInRange(Lists.newArrayList(commitTime3, commitTime4)).collect(Collectors.toList()); assertEquals(3, slices.size()); for (FileSlice slice : slices) { if (slice.getFileId().equals(fileId1)) { @@ -902,8 +896,8 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { assertEquals(7, statuses.length); refreshFsView(); - List dataFiles = roView.getLatestDataFilesBeforeOrOn(partitionPath, commitTime2) - .collect(Collectors.toList()); + List dataFiles = + roView.getLatestDataFilesBeforeOrOn(partitionPath, commitTime2).collect(Collectors.toList()); if (!isLatestFileSliceOnly) { assertEquals(2, dataFiles.size()); Set filenames = Sets.newHashSet(); @@ -935,31 +929,31 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { String fileId2 = UUID.randomUUID().toString(); String fileId3 = UUID.randomUUID().toString(); - new File(fullPartitionPath + "/" + FSUtils.makeDataFileName(commitTime1, - TEST_WRITE_TOKEN, fileId1)).createNewFile(); - new File(fullPartitionPath + "/" - + FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, - commitTime1, 0, TEST_WRITE_TOKEN)).createNewFile(); - new File(fullPartitionPath + "/" - + FSUtils.makeDataFileName(commitTime4, TEST_WRITE_TOKEN, fileId1)).createNewFile(); - new File(fullPartitionPath + "/" - + FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, - commitTime4, 0, TEST_WRITE_TOKEN)).createNewFile(); - - new File(fullPartitionPath + "/" - + FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId2)).createNewFile(); - new File(fullPartitionPath + "/" - + FSUtils.makeDataFileName(commitTime2, TEST_WRITE_TOKEN, fileId2)).createNewFile(); - new File(fullPartitionPath + "/" - + FSUtils.makeLogFileName(fileId2, HoodieLogFile.DELTA_EXTENSION, commitTime2, 0, TEST_WRITE_TOKEN)) + new File(fullPartitionPath + "/" + FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId1)) .createNewFile(); new File(fullPartitionPath + "/" - + FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId2)).createNewFile(); + + FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime1, 0, TEST_WRITE_TOKEN)) + .createNewFile(); + new File(fullPartitionPath + "/" + FSUtils.makeDataFileName(commitTime4, TEST_WRITE_TOKEN, fileId1)) + .createNewFile(); + new File(fullPartitionPath + "/" + + FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime4, 0, TEST_WRITE_TOKEN)) + .createNewFile(); + new File(fullPartitionPath + "/" + FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId2)) + .createNewFile(); + new File(fullPartitionPath + "/" + FSUtils.makeDataFileName(commitTime2, TEST_WRITE_TOKEN, fileId2)) + .createNewFile(); new File(fullPartitionPath + "/" - + FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId3)).createNewFile(); - new File(fullPartitionPath + "/" - + FSUtils.makeDataFileName(commitTime4, TEST_WRITE_TOKEN, fileId3)).createNewFile(); + + FSUtils.makeLogFileName(fileId2, HoodieLogFile.DELTA_EXTENSION, commitTime2, 0, TEST_WRITE_TOKEN)) + .createNewFile(); + new File(fullPartitionPath + "/" + FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId2)) + .createNewFile(); + + new File(fullPartitionPath + "/" + FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId3)) + .createNewFile(); + new File(fullPartitionPath + "/" + FSUtils.makeDataFileName(commitTime4, TEST_WRITE_TOKEN, fileId3)) + .createNewFile(); new File(basePath + "/.hoodie/" + commitTime1 + ".commit").createNewFile(); new File(basePath + "/.hoodie/" + commitTime2 + ".commit").createNewFile(); @@ -1017,11 +1011,11 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { String partitionPath2 = "2016/05/02"; String partitionPath3 = "2016/05/03"; - String fullPartitionPath1 = basePath + "/" + partitionPath1 + "/"; + String fullPartitionPath1 = basePath + "/" + partitionPath1 + "/"; new File(fullPartitionPath1).mkdirs(); - String fullPartitionPath2 = basePath + "/" + partitionPath2 + "/"; + String fullPartitionPath2 = basePath + "/" + partitionPath2 + "/"; new File(fullPartitionPath2).mkdirs(); - String fullPartitionPath3 = basePath + "/" + partitionPath3 + "/"; + String fullPartitionPath3 = basePath + "/" + partitionPath3 + "/"; new File(fullPartitionPath3).mkdirs(); String instantTime1 = "1"; String deltaInstantTime1 = "2"; @@ -1031,16 +1025,13 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { String dataFileName = FSUtils.makeDataFileName(instantTime1, TEST_WRITE_TOKEN, fileId); new File(fullPartitionPath1 + dataFileName).createNewFile(); - String fileName1 = FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, - instantTime1, 0, TEST_WRITE_TOKEN); - new File(fullPartitionPath1 + fileName1) - .createNewFile(); + String fileName1 = + FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, instantTime1, 0, TEST_WRITE_TOKEN); + new File(fullPartitionPath1 + fileName1).createNewFile(); new File(fullPartitionPath2 + FSUtils.makeDataFileName(instantTime1, TEST_WRITE_TOKEN, fileId)).createNewFile(); - new File(fullPartitionPath2 + fileName1) - .createNewFile(); + new File(fullPartitionPath2 + fileName1).createNewFile(); new File(fullPartitionPath3 + FSUtils.makeDataFileName(instantTime1, TEST_WRITE_TOKEN, fileId)).createNewFile(); - new File(fullPartitionPath3 + fileName1) - .createNewFile(); + new File(fullPartitionPath3 + fileName1).createNewFile(); HoodieActiveTimeline commitTimeline = metaClient.getActiveTimeline(); HoodieInstant instant1 = new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, instantTime1); @@ -1052,9 +1043,8 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { commitTimeline.saveAsComplete(deltaInstant3, Option.empty()); // Now we list all partitions - FileStatus[] statuses = metaClient.getFs().listStatus(new Path[] { - new Path(fullPartitionPath1), new Path(fullPartitionPath2), new Path(fullPartitionPath3) - }); + FileStatus[] statuses = metaClient.getFs().listStatus( + new Path[] {new Path(fullPartitionPath1), new Path(fullPartitionPath2), new Path(fullPartitionPath3)}); assertEquals(6, statuses.length); refreshFsView(); Arrays.asList(partitionPath1, partitionPath2, partitionPath3).forEach(p -> fsView.getAllFileGroups(p).count()); @@ -1064,8 +1054,8 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { Assert.assertEquals("Expected number of file-groups", 3, groups.size()); Assert.assertEquals("Partitions must be different for file-groups", 3, groups.stream().map(HoodieFileGroup::getPartitionPath).collect(Collectors.toSet()).size()); - Set fileIds = groups.stream().map(HoodieFileGroup::getFileGroupId) - .map(HoodieFileGroupId::getFileId).collect(Collectors.toSet()); + Set fileIds = groups.stream().map(HoodieFileGroup::getFileGroupId).map(HoodieFileGroupId::getFileId) + .collect(Collectors.toSet()); Assert.assertEquals("File Id must be same", 1, fileIds.size()); Assert.assertTrue("Expected FileId", fileIds.contains(fileId)); @@ -1080,8 +1070,8 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { String compactionRequestedTime = "2"; String compactDataFileName = FSUtils.makeDataFileName(compactionRequestedTime, TEST_WRITE_TOKEN, fileId); - HoodieCompactionPlan compactionPlan = CompactionUtils.buildFromFileSlices(partitionFileSlicesPairs, - Option.empty(), Option.empty()); + HoodieCompactionPlan compactionPlan = + CompactionUtils.buildFromFileSlices(partitionFileSlicesPairs, Option.empty(), Option.empty()); // Create a Data-file for some of the partitions but this should be skipped by view new File(basePath + "/" + partitionPath1 + "/" + compactDataFileName).createNewFile(); @@ -1099,10 +1089,10 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { String deltaInstantTime5 = "6"; List allInstantTimes = Arrays.asList(instantTime1, deltaInstantTime1, deltaInstantTime2, compactionRequestedTime, deltaInstantTime4, deltaInstantTime5); - String fileName3 = FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, - compactionRequestedTime, 0, TEST_WRITE_TOKEN); - String fileName4 = FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, - compactionRequestedTime, 1, TEST_WRITE_TOKEN); + String fileName3 = + FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, compactionRequestedTime, 0, TEST_WRITE_TOKEN); + String fileName4 = + FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, compactionRequestedTime, 1, TEST_WRITE_TOKEN); new File(basePath + "/" + partitionPath1 + "/" + fileName3).createNewFile(); new File(basePath + "/" + partitionPath1 + "/" + fileName4).createNewFile(); new File(basePath + "/" + partitionPath2 + "/" + fileName3).createNewFile(); @@ -1126,8 +1116,8 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { /** Merge API Tests **/ Arrays.asList(partitionPath1, partitionPath2, partitionPath3).stream().forEach(partitionPath -> { - List fileSliceList = rtView.getLatestMergedFileSlicesBeforeOrOn(partitionPath, deltaInstantTime5) - .collect(Collectors.toList()); + List fileSliceList = + rtView.getLatestMergedFileSlicesBeforeOrOn(partitionPath, deltaInstantTime5).collect(Collectors.toList()); assertEquals("Expect file-slice to be merged", 1, fileSliceList.size()); FileSlice fileSlice = fileSliceList.get(0); assertEquals(fileId, fileSlice.getFileId()); @@ -1140,8 +1130,8 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { assertEquals("Log File Order check", fileName3, logFiles.get(1).getFileName()); assertEquals("Log File Order check", fileName1, logFiles.get(2).getFileName()); - fileSliceList = rtView.getLatestFileSlicesBeforeOrOn(partitionPath, deltaInstantTime5, true) - .collect(Collectors.toList()); + fileSliceList = + rtView.getLatestFileSlicesBeforeOrOn(partitionPath, deltaInstantTime5, true).collect(Collectors.toList()); assertEquals("Expect only one file-id", 1, fileSliceList.size()); fileSlice = fileSliceList.get(0); assertEquals(fileId, fileSlice.getFileId()); @@ -1154,8 +1144,8 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { assertEquals("Log File Order check", fileName3, logFiles.get(1).getFileName()); // Check getLatestFileSlicesBeforeOrOn excluding fileIds in pending compaction - fileSliceList = rtView.getLatestFileSlicesBeforeOrOn(partitionPath, deltaInstantTime5, false) - .collect(Collectors.toList()); + fileSliceList = + rtView.getLatestFileSlicesBeforeOrOn(partitionPath, deltaInstantTime5, false).collect(Collectors.toList()); assertEquals("Expect empty list as file-id is in pending compaction", 0, fileSliceList.size()); }); @@ -1168,7 +1158,7 @@ public class HoodieTableFileSystemViewTest extends HoodieCommonTestHarness { Assert.assertTrue(partitionsInCompaction.contains(partitionPath3)); Set fileIdsInCompaction = fsView.getPendingCompactionOperations().map(Pair::getValue) - .map(CompactionOperation::getFileId).collect(Collectors.toSet()); + .map(CompactionOperation::getFileId).collect(Collectors.toSet()); Assert.assertEquals(1, fileIdsInCompaction.size()); Assert.assertTrue(fileIdsInCompaction.contains(fileId)); } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/view/IncrementalFSViewSyncTest.java b/hudi-common/src/test/java/org/apache/hudi/common/table/view/IncrementalFSViewSyncTest.java index 713563031..e211ef5fe 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/view/IncrementalFSViewSyncTest.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/view/IncrementalFSViewSyncTest.java @@ -75,8 +75,7 @@ public class IncrementalFSViewSyncTest extends HoodieCommonTestHarness { private static String TEST_WRITE_TOKEN = "1-0-1"; - private final List partitions = Arrays.asList("2018/01/01", "2018/01/02", - "2019/03/01"); + private final List partitions = Arrays.asList("2018/01/01", "2018/01/02", "2019/03/01"); private final List fileIdsPerPartition = IntStream.range(0, 10).mapToObj(x -> UUID.randomUUID().toString()).collect(Collectors.toList()); @@ -110,8 +109,7 @@ public class IncrementalFSViewSyncTest extends HoodieCommonTestHarness { unscheduleCompaction(view, "14", "13", "11"); // Add one more delta instant - instantsToFiles.putAll( - testMultipleWriteSteps(view, Arrays.asList("15"), true, "11")); + instantsToFiles.putAll(testMultipleWriteSteps(view, Arrays.asList("15"), true, "11")); // Schedule Compaction again scheduleCompaction(view, "16"); @@ -120,37 +118,31 @@ public class IncrementalFSViewSyncTest extends HoodieCommonTestHarness { testMultipleWriteSteps(view, Arrays.asList("16"), false, "16", 2); // Run 2 more ingest - instantsToFiles.putAll( - testMultipleWriteSteps(view, Arrays.asList("17", "18"), true, "16", 2)); + instantsToFiles.putAll(testMultipleWriteSteps(view, Arrays.asList("17", "18"), true, "16", 2)); // Schedule Compaction again scheduleCompaction(view, "19"); // Run one more ingestion after pending compaction. THis will be 3rd slice - instantsToFiles.putAll( - testMultipleWriteSteps(view, Arrays.asList("20"), true, "19", 3)); + instantsToFiles.putAll(testMultipleWriteSteps(view, Arrays.asList("20"), true, "19", 3)); // Clean first slice testCleans(view, Arrays.asList("21"), new ImmutableMap.Builder>().put("11", Arrays.asList("12", "13", "15")).build(), - instantsToFiles, - Arrays.asList("11")); + instantsToFiles, Arrays.asList("11")); // Add one more ingestion instant. This should be 2nd slice now - instantsToFiles.putAll( - testMultipleWriteSteps(view, Arrays.asList("22"), true, "19", 2)); + instantsToFiles.putAll(testMultipleWriteSteps(view, Arrays.asList("22"), true, "19", 2)); // Restore last ingestion testRestore(view, Arrays.asList("23"), true, new HashMap<>(), Arrays.asList("22"), "24", false); // Run one more ingestion. THis is still 2nd slice - instantsToFiles.putAll( - testMultipleWriteSteps(view, Arrays.asList("24"), true, "19", 2)); + instantsToFiles.putAll(testMultipleWriteSteps(view, Arrays.asList("24"), true, "19", 2)); // Finish Compaction - instantsToFiles.putAll( - testMultipleWriteSteps(view, Arrays.asList("19"), false, "19", 2, - Arrays.asList(new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, "24")))); + instantsToFiles.putAll(testMultipleWriteSteps(view, Arrays.asList("19"), false, "19", 2, + Arrays.asList(new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, "24")))); } @Test @@ -183,8 +175,8 @@ public class IncrementalFSViewSyncTest extends HoodieCommonTestHarness { Map> instantsToFiles = testMultipleWriteSteps(view, Arrays.asList("12", "13", "14")); // restore instants in reverse order till we rollback all - testRestore(view, Arrays.asList("15", "16", "17"), false, instantsToFiles, - Arrays.asList("14", "13", "12"), "17", true); + testRestore(view, Arrays.asList("15", "16", "17"), false, instantsToFiles, Arrays.asList("14", "13", "12"), "17", + true); // Add 5 non-empty ingestions back-to-back instantsToFiles = testMultipleWriteSteps(view, Arrays.asList("18", "19", "20")); @@ -207,8 +199,7 @@ public class IncrementalFSViewSyncTest extends HoodieCommonTestHarness { * Case where incremental syncing is catching up on more than one ingestion at a time */ // Run 1 ingestion on MOR table (1 delta commits). View1 is now sync up to this point - instantsToFiles = - testMultipleWriteSteps(view1, Arrays.asList("11"), true, "11"); + instantsToFiles = testMultipleWriteSteps(view1, Arrays.asList("11"), true, "11"); SyncableFileSystemView view2 = getFileSystemView(new HoodieTableMetaClient(metaClient.getHadoopConf(), metaClient.getBasePath())); @@ -258,14 +249,13 @@ public class IncrementalFSViewSyncTest extends HoodieCommonTestHarness { */ testCleans(view2, Arrays.asList("19"), new ImmutableMap.Builder>().put("11", Arrays.asList("12", "13", "14")).build(), - instantsToFiles, - Arrays.asList("11")); + instantsToFiles, Arrays.asList("11")); scheduleCompaction(view2, "20"); instantsToFiles.putAll(testMultipleWriteSteps(view2, Arrays.asList("21", "22"), true, "20", 2)); // Compaction testMultipleWriteSteps(view2, Arrays.asList("20"), false, "20", 2, Arrays.asList(new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, "22"))); - //Run one more round of ingestion + // Run one more round of ingestion instantsToFiles.putAll(testMultipleWriteSteps(view2, Arrays.asList("23", "24"), true, "20", 2)); view1.sync(); areViewsConsistent(view1, view2, partitions.size() * fileIdsPerPartition.size() * 2); @@ -319,8 +309,8 @@ public class IncrementalFSViewSyncTest extends HoodieCommonTestHarness { * @param cleanedInstants List of cleaned instants */ private void testCleans(SyncableFileSystemView view, List newCleanerInstants, - Map> deltaInstantMap, - Map> instantsToFiles, List cleanedInstants) { + Map> deltaInstantMap, Map> instantsToFiles, + List cleanedInstants) { Assert.assertEquals(newCleanerInstants.size(), cleanedInstants.size()); long initialFileSlices = partitions.stream().mapToLong(p -> view.getAllFileSlices(p).count()).findAny().getAsLong(); long exp = initialFileSlices; @@ -371,17 +361,16 @@ public class IncrementalFSViewSyncTest extends HoodieCommonTestHarness { * @param emptyRestoreInstant Restore instant at which dataset becomes empty */ private void testRestore(SyncableFileSystemView view, List newRestoreInstants, boolean isDeltaCommit, - Map> instantsToFiles, List rolledBackInstants, - String emptyRestoreInstant, boolean isRestore) - throws IOException { + Map> instantsToFiles, List rolledBackInstants, String emptyRestoreInstant, + boolean isRestore) throws IOException { Assert.assertEquals(newRestoreInstants.size(), rolledBackInstants.size()); long initialFileSlices = partitions.stream().mapToLong(p -> view.getAllFileSlices(p).count()).findAny().getAsLong(); IntStream.range(0, newRestoreInstants.size()).forEach(idx -> { String instant = rolledBackInstants.get(idx); try { performRestore(view, instant, instantsToFiles.get(instant), newRestoreInstants.get(idx), isRestore); - final long expTotalFileSlicesPerPartition = isDeltaCommit ? initialFileSlices : - initialFileSlices - ((idx + 1) * fileIdsPerPartition.size()); + final long expTotalFileSlicesPerPartition = + isDeltaCommit ? initialFileSlices : initialFileSlices - ((idx + 1) * fileIdsPerPartition.size()); view.sync(); Assert.assertTrue(view.getLastInstant().isPresent()); log.info("Last Instant is :" + view.getLastInstant().get()); @@ -424,14 +413,12 @@ public class IncrementalFSViewSyncTest extends HoodieCommonTestHarness { throws IOException { Map> partititonToFiles = deleteFiles(files); List cleanStats = partititonToFiles.entrySet().stream().map(e -> { - return new HoodieCleanStat(HoodieCleaningPolicy.KEEP_LATEST_COMMITS, - e.getKey(), e.getValue(), e.getValue(), new ArrayList<>(), - Integer.toString(Integer.parseInt(instant) + 1)); + return new HoodieCleanStat(HoodieCleaningPolicy.KEEP_LATEST_COMMITS, e.getKey(), e.getValue(), e.getValue(), + new ArrayList<>(), Integer.toString(Integer.parseInt(instant) + 1)); }).collect(Collectors.toList()); HoodieCleanMetadata cleanMetadata = AvroUtils.convertCleanMetadata(cleanInstant, Option.empty(), cleanStats); - metaClient.getActiveTimeline().saveAsComplete( - new HoodieInstant(true, HoodieTimeline.CLEAN_ACTION, cleanInstant), + metaClient.getActiveTimeline().saveAsComplete(new HoodieInstant(true, HoodieTimeline.CLEAN_ACTION, cleanInstant), AvroUtils.serializeCleanMetadata(cleanMetadata)); } @@ -453,15 +440,14 @@ public class IncrementalFSViewSyncTest extends HoodieCommonTestHarness { List rollbacks = new ArrayList<>(); rollbacks.add(instant); - HoodieRollbackMetadata rollbackMetadata = AvroUtils - .convertRollbackMetadata(rollbackInstant, Option.empty(), rollbacks, rollbackStats); + HoodieRollbackMetadata rollbackMetadata = + AvroUtils.convertRollbackMetadata(rollbackInstant, Option.empty(), rollbacks, rollbackStats); if (isRestore) { HoodieRestoreMetadata metadata = new HoodieRestoreMetadata(); List rollbackM = new ArrayList<>(); rollbackM.add(rollbackMetadata); - metadata.setHoodieRestoreMetadata( - new ImmutableMap.Builder().put(rollbackInstant, rollbackM).build()); + metadata.setHoodieRestoreMetadata(new ImmutableMap.Builder().put(rollbackInstant, rollbackM).build()); List rollbackInstants = new ArrayList<>(); rollbackInstants.add(rollbackInstant); metadata.setInstantsToRollback(rollbackInstants); @@ -507,14 +493,13 @@ public class IncrementalFSViewSyncTest extends HoodieCommonTestHarness { * @param instantTime COmpaction Instant Time */ private void scheduleCompaction(SyncableFileSystemView view, String instantTime) throws IOException { - List> slices = partitions.stream().flatMap(p -> view.getLatestFileSlices(p) - .map(s -> Pair.of(p, s))).collect(Collectors.toList()); + List> slices = partitions.stream() + .flatMap(p -> view.getLatestFileSlices(p).map(s -> Pair.of(p, s))).collect(Collectors.toList()); long initialExpTotalFileSlices = partitions.stream().mapToLong(p -> view.getAllFileSlices(p).count()).sum(); HoodieCompactionPlan plan = CompactionUtils.buildFromFileSlices(slices, Option.empty(), Option.empty()); - HoodieInstant compactionInstant = - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, instantTime); + HoodieInstant compactionInstant = new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, instantTime); metaClient.getActiveTimeline().saveToCompactionRequested(compactionInstant, AvroUtils.serializeCompactionPlan(plan)); @@ -526,8 +511,8 @@ public class IncrementalFSViewSyncTest extends HoodieCommonTestHarness { Assert.assertFalse(fs.getDataFile().isPresent()); }); view.getLatestMergedFileSlicesBeforeOrOn(p, instantTime).forEach(fs -> { - Assert.assertTrue(HoodieTimeline.compareTimestamps(instantTime, fs.getBaseInstantTime(), - HoodieTimeline.GREATER)); + Assert + .assertTrue(HoodieTimeline.compareTimestamps(instantTime, fs.getBaseInstantTime(), HoodieTimeline.GREATER)); Assert.assertEquals(p, fs.getPartitionPath()); }); }); @@ -586,11 +571,11 @@ public class IncrementalFSViewSyncTest extends HoodieCommonTestHarness { * @return List of new file created */ private Map> testMultipleWriteSteps(SyncableFileSystemView view, List instants, - boolean deltaCommit, String baseInstantForDeltaCommit, int begin) - throws IOException { + boolean deltaCommit, String baseInstantForDeltaCommit, int begin) throws IOException { return testMultipleWriteSteps(view, instants, deltaCommit, baseInstantForDeltaCommit, begin, - instants.stream().map(i -> new HoodieInstant(State.COMPLETED, - deltaCommit ? HoodieTimeline.DELTA_COMMIT_ACTION : HoodieTimeline.COMMIT_ACTION, i)) + instants.stream() + .map(i -> new HoodieInstant(State.COMPLETED, + deltaCommit ? HoodieTimeline.DELTA_COMMIT_ACTION : HoodieTimeline.COMMIT_ACTION, i)) .collect(Collectors.toList())); } @@ -628,13 +613,14 @@ public class IncrementalFSViewSyncTest extends HoodieCommonTestHarness { log.info("Adding instant=" + instant); HoodieInstant lastInstant = lastInstants.get(idx); // Add a non-empty ingestion to COW table - List filePaths = addInstant(metaClient, instant, deltaCommit, - deltaCommit ? baseInstantForDeltaCommit : instant); + List filePaths = + addInstant(metaClient, instant, deltaCommit, deltaCommit ? baseInstantForDeltaCommit : instant); view.sync(); Assert.assertTrue(view.getLastInstant().isPresent()); Assert.assertEquals(lastInstant.getTimestamp(), view.getLastInstant().get().getTimestamp()); Assert.assertEquals(State.COMPLETED, view.getLastInstant().get().getState()); - Assert.assertEquals("Expected Last=" + lastInstant + ", Found Instants=" + Assert.assertEquals( + "Expected Last=" + lastInstant + ", Found Instants=" + view.getTimeline().getInstants().collect(Collectors.toList()), lastInstant.getAction(), view.getLastInstant().get().getAction()); partitions.forEach(p -> Assert.assertEquals(fileIdsPerPartition.size(), view.getLatestFileSlices(p).count())); @@ -676,53 +662,50 @@ public class IncrementalFSViewSyncTest extends HoodieCommonTestHarness { */ private void areViewsConsistent(SyncableFileSystemView view1, SyncableFileSystemView view2, long expectedTotalFileSlices) { - //Timeline check + // Timeline check HoodieTimeline timeline1 = view1.getTimeline(); HoodieTimeline timeline2 = view2.getTimeline(); Assert.assertEquals(view1.getLastInstant(), view2.getLastInstant()); Iterators.elementsEqual(timeline1.getInstants().iterator(), timeline2.getInstants().iterator()); - //View Checks - Map fileGroupsMap1 = - partitions.stream().flatMap(p -> view1.getAllFileGroups(p)) - .collect(Collectors.toMap(fg -> fg.getFileGroupId(), fg -> fg)); - Map fileGroupsMap2 = - partitions.stream().flatMap(p -> view2.getAllFileGroups(p)) - .collect(Collectors.toMap(fg -> fg.getFileGroupId(), fg -> fg)); + // View Checks + Map fileGroupsMap1 = partitions.stream().flatMap(p -> view1.getAllFileGroups(p)) + .collect(Collectors.toMap(fg -> fg.getFileGroupId(), fg -> fg)); + Map fileGroupsMap2 = partitions.stream().flatMap(p -> view2.getAllFileGroups(p)) + .collect(Collectors.toMap(fg -> fg.getFileGroupId(), fg -> fg)); Assert.assertEquals(fileGroupsMap1.keySet(), fileGroupsMap2.keySet()); - long gotSlicesCount = - fileGroupsMap1.keySet().stream().map(k -> Pair.of(fileGroupsMap1.get(k), fileGroupsMap2.get(k))) - .mapToLong(e -> { - HoodieFileGroup fg1 = e.getKey(); - HoodieFileGroup fg2 = e.getValue(); - Assert.assertEquals(fg1.getFileGroupId(), fg2.getFileGroupId()); - List slices1 = fg1.getAllRawFileSlices().collect(Collectors.toList()); - List slices2 = fg2.getAllRawFileSlices().collect(Collectors.toList()); - Assert.assertEquals(slices1.size(), slices2.size()); - IntStream.range(0, slices1.size()).mapToObj(idx -> Pair.of(slices1.get(idx), slices2.get(idx))) - .forEach(e2 -> { - FileSlice slice1 = e2.getKey(); - FileSlice slice2 = e2.getValue(); - Assert.assertEquals(slice1.getBaseInstantTime(), slice2.getBaseInstantTime()); - Assert.assertEquals(slice1.getFileId(), slice2.getFileId()); - Assert.assertEquals(slice1.getDataFile().isPresent(), slice2.getDataFile().isPresent()); - if (slice1.getDataFile().isPresent()) { - HoodieDataFile df1 = slice1.getDataFile().get(); - HoodieDataFile df2 = slice2.getDataFile().get(); - Assert.assertEquals(df1.getCommitTime(), df2.getCommitTime()); - Assert.assertEquals(df1.getFileId(), df2.getFileId()); - Assert.assertEquals(df1.getFileName(), df2.getFileName()); - Assert.assertEquals(Path.getPathWithoutSchemeAndAuthority(new Path(df1.getPath())), - Path.getPathWithoutSchemeAndAuthority(new Path(df2.getPath()))); - } - List logPaths1 = slice1.getLogFiles() - .map(lf -> Path.getPathWithoutSchemeAndAuthority(lf.getPath())).collect(Collectors.toList()); - List logPaths2 = slice2.getLogFiles() - .map(lf -> Path.getPathWithoutSchemeAndAuthority(lf.getPath())).collect(Collectors.toList()); - Assert.assertEquals(logPaths1, logPaths2); - }); - return slices1.size(); - }).sum(); + long gotSlicesCount = fileGroupsMap1.keySet().stream() + .map(k -> Pair.of(fileGroupsMap1.get(k), fileGroupsMap2.get(k))).mapToLong(e -> { + HoodieFileGroup fg1 = e.getKey(); + HoodieFileGroup fg2 = e.getValue(); + Assert.assertEquals(fg1.getFileGroupId(), fg2.getFileGroupId()); + List slices1 = fg1.getAllRawFileSlices().collect(Collectors.toList()); + List slices2 = fg2.getAllRawFileSlices().collect(Collectors.toList()); + Assert.assertEquals(slices1.size(), slices2.size()); + IntStream.range(0, slices1.size()).mapToObj(idx -> Pair.of(slices1.get(idx), slices2.get(idx))) + .forEach(e2 -> { + FileSlice slice1 = e2.getKey(); + FileSlice slice2 = e2.getValue(); + Assert.assertEquals(slice1.getBaseInstantTime(), slice2.getBaseInstantTime()); + Assert.assertEquals(slice1.getFileId(), slice2.getFileId()); + Assert.assertEquals(slice1.getDataFile().isPresent(), slice2.getDataFile().isPresent()); + if (slice1.getDataFile().isPresent()) { + HoodieDataFile df1 = slice1.getDataFile().get(); + HoodieDataFile df2 = slice2.getDataFile().get(); + Assert.assertEquals(df1.getCommitTime(), df2.getCommitTime()); + Assert.assertEquals(df1.getFileId(), df2.getFileId()); + Assert.assertEquals(df1.getFileName(), df2.getFileName()); + Assert.assertEquals(Path.getPathWithoutSchemeAndAuthority(new Path(df1.getPath())), + Path.getPathWithoutSchemeAndAuthority(new Path(df2.getPath()))); + } + List logPaths1 = slice1.getLogFiles() + .map(lf -> Path.getPathWithoutSchemeAndAuthority(lf.getPath())).collect(Collectors.toList()); + List logPaths2 = slice2.getLogFiles() + .map(lf -> Path.getPathWithoutSchemeAndAuthority(lf.getPath())).collect(Collectors.toList()); + Assert.assertEquals(logPaths1, logPaths2); + }); + return slices1.size(); + }).sum(); Assert.assertEquals(expectedTotalFileSlices, gotSlicesCount); // Pending Compaction Operations Check @@ -732,14 +715,14 @@ public class IncrementalFSViewSyncTest extends HoodieCommonTestHarness { } private List addInstant(HoodieTableMetaClient metaClient, String instant, boolean deltaCommit, - String baseInstant) - throws IOException { + String baseInstant) throws IOException { List> writeStats = partitions.stream().flatMap(p -> { return fileIdsPerPartition.stream().map(f -> { try { File file = new File(basePath + "/" + p + "/" - + (deltaCommit ? FSUtils.makeLogFileName(f, ".log", baseInstant, - Integer.parseInt(instant), TEST_WRITE_TOKEN) : FSUtils.makeDataFileName(instant, TEST_WRITE_TOKEN, f))); + + (deltaCommit + ? FSUtils.makeLogFileName(f, ".log", baseInstant, Integer.parseInt(instant), TEST_WRITE_TOKEN) + : FSUtils.makeDataFileName(instant, TEST_WRITE_TOKEN, f))); file.createNewFile(); HoodieWriteStat w = new HoodieWriteStat(); w.setFileId(f); @@ -753,10 +736,10 @@ public class IncrementalFSViewSyncTest extends HoodieCommonTestHarness { HoodieCommitMetadata metadata = new HoodieCommitMetadata(); writeStats.forEach(e -> metadata.addWriteStat(e.getKey(), e.getValue())); - metaClient.getActiveTimeline().saveAsComplete( - new HoodieInstant(true, deltaCommit ? HoodieTimeline.DELTA_COMMIT_ACTION : HoodieTimeline.COMMIT_ACTION, - instant), - Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8))); + metaClient.getActiveTimeline() + .saveAsComplete(new HoodieInstant(true, + deltaCommit ? HoodieTimeline.DELTA_COMMIT_ACTION : HoodieTimeline.COMMIT_ACTION, instant), + Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8))); // Delete pending compaction if present metaClient.getFs().delete(new Path(metaClient.getMetaPath(), new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, instant).getFileName())); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/view/RocksDBBasedIncrementalFSViewSyncTest.java b/hudi-common/src/test/java/org/apache/hudi/common/table/view/RocksDBBasedIncrementalFSViewSyncTest.java index 1f3c60026..ee9a5215f 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/view/RocksDBBasedIncrementalFSViewSyncTest.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/view/RocksDBBasedIncrementalFSViewSyncTest.java @@ -28,8 +28,7 @@ public class RocksDBBasedIncrementalFSViewSyncTest extends IncrementalFSViewSync @Override protected SyncableFileSystemView getFileSystemView(HoodieTableMetaClient metaClient, HoodieTimeline timeline) throws IOException { - return new RocksDbBasedFileSystemView(metaClient, timeline, - FileSystemViewStorageConfig.newBuilder().withRocksDBPath(folder.newFolder().getAbsolutePath()) - .withIncrementalTimelineSync(true).build()); + return new RocksDbBasedFileSystemView(metaClient, timeline, FileSystemViewStorageConfig.newBuilder() + .withRocksDBPath(folder.newFolder().getAbsolutePath()).withIncrementalTimelineSync(true).build()); } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/view/RocksDbBasedFileSystemViewTest.java b/hudi-common/src/test/java/org/apache/hudi/common/table/view/RocksDbBasedFileSystemViewTest.java index 68c8dee33..1ef73fa97 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/view/RocksDbBasedFileSystemViewTest.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/view/RocksDbBasedFileSystemViewTest.java @@ -26,7 +26,6 @@ public class RocksDbBasedFileSystemViewTest extends HoodieTableFileSystemViewTes protected SyncableFileSystemView getFileSystemView(HoodieTimeline timeline) throws IOException { return new RocksDbBasedFileSystemView(metaClient, timeline, - FileSystemViewStorageConfig.newBuilder().withRocksDBPath(folder.newFolder().getAbsolutePath()) - .build()); + FileSystemViewStorageConfig.newBuilder().withRocksDBPath(folder.newFolder().getAbsolutePath()).build()); } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/view/SpillableMapBasedFileSystemViewTest.java b/hudi-common/src/test/java/org/apache/hudi/common/table/view/SpillableMapBasedFileSystemViewTest.java index db2daf6f5..3b4181ce2 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/view/SpillableMapBasedFileSystemViewTest.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/view/SpillableMapBasedFileSystemViewTest.java @@ -24,8 +24,8 @@ import org.apache.hudi.common.table.SyncableFileSystemView; public class SpillableMapBasedFileSystemViewTest extends HoodieTableFileSystemViewTest { protected SyncableFileSystemView getFileSystemView(HoodieTimeline timeline) { - return new SpillableMapBasedFileSystemView(metaClient, timeline, - FileSystemViewStorageConfig.newBuilder().withStorageType(FileSystemViewStorageType.SPILLABLE_DISK) - .withMaxMemoryForView(0L).build()); // pure disk base View + return new SpillableMapBasedFileSystemView(metaClient, timeline, FileSystemViewStorageConfig.newBuilder() + // pure disk base View + .withStorageType(FileSystemViewStorageType.SPILLABLE_DISK).withMaxMemoryForView(0L).build()); } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/CompactionTestUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/util/CompactionTestUtils.java index d320a7697..56a40c29d 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/CompactionTestUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/CompactionTestUtils.java @@ -53,8 +53,7 @@ public class CompactionTestUtils { private static String TEST_WRITE_TOKEN = "1-0-1"; public static Map> setupAndValidateCompactionOperations( - HoodieTableMetaClient metaClient, boolean inflight, - int numEntriesInPlan1, int numEntriesInPlan2, + HoodieTableMetaClient metaClient, boolean inflight, int numEntriesInPlan1, int numEntriesInPlan2, int numEntriesInPlan3, int numEntriesInPlan4) throws IOException { HoodieCompactionPlan plan1 = createCompactionPlan(metaClient, "000", "001", numEntriesInPlan1, true, true); HoodieCompactionPlan plan2 = createCompactionPlan(metaClient, "002", "003", numEntriesInPlan2, false, true); @@ -78,17 +77,16 @@ public class CompactionTestUtils { createDeltaCommit(metaClient, "004"); createDeltaCommit(metaClient, "006"); - Map baseInstantsToCompaction = - new ImmutableMap.Builder().put("000", "001").put("002", "003") - .put("004", "005").put("006", "007").build(); + Map baseInstantsToCompaction = new ImmutableMap.Builder().put("000", "001") + .put("002", "003").put("004", "005").put("006", "007").build(); List expectedNumEntries = Arrays.asList(numEntriesInPlan1, numEntriesInPlan2, numEntriesInPlan3, numEntriesInPlan4); - List plans = new ImmutableList.Builder() - .add(plan1, plan2, plan3, plan4).build(); + List plans = + new ImmutableList.Builder().add(plan1, plan2, plan3, plan4).build(); IntStream.range(0, 4).boxed().forEach(idx -> { if (expectedNumEntries.get(idx) > 0) { - Assert.assertEquals("check if plan " + idx + " has exp entries", - expectedNumEntries.get(idx).longValue(), plans.get(idx).getOperations().size()); + Assert.assertEquals("check if plan " + idx + " has exp entries", expectedNumEntries.get(idx).longValue(), + plans.get(idx).getOperations().size()); } else { Assert.assertNull("Plan " + idx + " has null ops", plans.get(idx).getOperations()); } @@ -108,39 +106,37 @@ public class CompactionTestUtils { public static Map> generateExpectedCompactionOperations( List plans, Map baseInstantsToCompaction) { - return plans.stream() - .flatMap(plan -> { - if (plan.getOperations() != null) { - return plan.getOperations().stream().map(op -> Pair.of( - new HoodieFileGroupId(op.getPartitionPath(), op.getFileId()), + return plans.stream().flatMap(plan -> { + if (plan.getOperations() != null) { + return plan.getOperations().stream() + .map(op -> Pair.of(new HoodieFileGroupId(op.getPartitionPath(), op.getFileId()), Pair.of(baseInstantsToCompaction.get(op.getBaseInstantTime()), op))); - } - return Stream.empty(); - }).collect(Collectors.toMap(Pair::getKey, Pair::getValue)); + } + return Stream.empty(); + }).collect(Collectors.toMap(Pair::getKey, Pair::getValue)); } - public static void scheduleCompaction(HoodieTableMetaClient metaClient, - String instantTime, HoodieCompactionPlan compactionPlan) throws IOException { + public static void scheduleCompaction(HoodieTableMetaClient metaClient, String instantTime, + HoodieCompactionPlan compactionPlan) throws IOException { metaClient.getActiveTimeline().saveToCompactionRequested( new HoodieInstant(State.REQUESTED, COMPACTION_ACTION, instantTime), AvroUtils.serializeCompactionPlan(compactionPlan)); } public static void createDeltaCommit(HoodieTableMetaClient metaClient, String instantTime) throws IOException { - metaClient.getActiveTimeline().saveAsComplete( - new HoodieInstant(State.INFLIGHT, DELTA_COMMIT_ACTION, instantTime), Option.empty()); + metaClient.getActiveTimeline().saveAsComplete(new HoodieInstant(State.INFLIGHT, DELTA_COMMIT_ACTION, instantTime), + Option.empty()); } public static void scheduleInflightCompaction(HoodieTableMetaClient metaClient, String instantTime, HoodieCompactionPlan compactionPlan) throws IOException { scheduleCompaction(metaClient, instantTime, compactionPlan); - metaClient.getActiveTimeline().transitionCompactionRequestedToInflight( - new HoodieInstant(State.REQUESTED, COMPACTION_ACTION, instantTime)); + metaClient.getActiveTimeline() + .transitionCompactionRequestedToInflight(new HoodieInstant(State.REQUESTED, COMPACTION_ACTION, instantTime)); } public static HoodieCompactionPlan createCompactionPlan(HoodieTableMetaClient metaClient, String instantId, - String compactionInstantId, int numFileIds, boolean createDataFile, - boolean deltaCommitsAfterCompactionRequests) { + String compactionInstantId, int numFileIds, boolean createDataFile, boolean deltaCommitsAfterCompactionRequests) { List ops = IntStream.range(0, numFileIds).boxed().map(idx -> { try { String fileId = UUID.randomUUID().toString(); @@ -153,15 +149,13 @@ public class CompactionTestUtils { instantId, fileId, Option.of(2)); FileSlice slice = new FileSlice(DEFAULT_PARTITION_PATHS[0], instantId, fileId); if (createDataFile) { - slice.setDataFile(new TestHoodieDataFile(metaClient.getBasePath() + "/" + DEFAULT_PARTITION_PATHS[0] - + "/" + FSUtils.makeDataFileName(instantId, TEST_WRITE_TOKEN, fileId))); + slice.setDataFile(new TestHoodieDataFile(metaClient.getBasePath() + "/" + DEFAULT_PARTITION_PATHS[0] + "/" + + FSUtils.makeDataFileName(instantId, TEST_WRITE_TOKEN, fileId))); } - String logFilePath1 = HoodieTestUtils - .getLogFilePath(metaClient.getBasePath(), DEFAULT_PARTITION_PATHS[0], instantId, fileId, - Option.of(1)); - String logFilePath2 = HoodieTestUtils - .getLogFilePath(metaClient.getBasePath(), DEFAULT_PARTITION_PATHS[0], instantId, fileId, - Option.of(2)); + String logFilePath1 = HoodieTestUtils.getLogFilePath(metaClient.getBasePath(), DEFAULT_PARTITION_PATHS[0], + instantId, fileId, Option.of(1)); + String logFilePath2 = HoodieTestUtils.getLogFilePath(metaClient.getBasePath(), DEFAULT_PARTITION_PATHS[0], + instantId, fileId, Option.of(2)); slice.addLogFile(new HoodieLogFile(new Path(logFilePath1))); slice.addLogFile(new HoodieLogFile(new Path(logFilePath2))); HoodieCompactionOperation op = diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/SchemaTestUtil.java b/hudi-common/src/test/java/org/apache/hudi/common/util/SchemaTestUtil.java index 4b62f8152..7fbf78244 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/SchemaTestUtil.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/SchemaTestUtil.java @@ -109,11 +109,8 @@ public class SchemaTestUtil { public static List generateHoodieTestRecords(int from, int limit, Schema schema) throws IOException, URISyntaxException { List records = generateTestRecords(from, limit); - return records.stream() - .map(s -> HoodieAvroUtils.rewriteRecord((GenericRecord) s, schema)) - .map(p -> convertToHoodieRecords(p, - UUID.randomUUID().toString(), "000/00/00")).collect( - Collectors.toList()); + return records.stream().map(s -> HoodieAvroUtils.rewriteRecord((GenericRecord) s, schema)) + .map(p -> convertToHoodieRecords(p, UUID.randomUUID().toString(), "000/00/00")).collect(Collectors.toList()); } private static HoodieRecord convertToHoodieRecords(IndexedRecord iRecord, String key, String partitionPath) { @@ -124,14 +121,12 @@ public class SchemaTestUtil { public static List updateHoodieTestRecords(List oldRecordKeys, List newRecords, String commitTime) throws IOException, URISyntaxException { - return newRecords.stream() - .map(p -> { - ((GenericRecord) p).put(HoodieRecord.RECORD_KEY_METADATA_FIELD, oldRecordKeys.remove(0)); - ((GenericRecord) p).put(HoodieRecord.PARTITION_PATH_METADATA_FIELD, "0000/00/00"); - ((GenericRecord) p).put(HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitTime); - return p; - }).collect( - Collectors.toList()); + return newRecords.stream().map(p -> { + ((GenericRecord) p).put(HoodieRecord.RECORD_KEY_METADATA_FIELD, oldRecordKeys.remove(0)); + ((GenericRecord) p).put(HoodieRecord.PARTITION_PATH_METADATA_FIELD, "0000/00/00"); + ((GenericRecord) p).put(HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitTime); + return p; + }).collect(Collectors.toList()); } @@ -139,28 +134,21 @@ public class SchemaTestUtil { throws IOException, URISyntaxException { List iRecords = generateTestRecords(from, limit); - return iRecords - .stream() - .map(r -> new HoodieRecord<>(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"), - new HoodieAvroPayload(Option.of((GenericRecord) r)))).collect(Collectors.toList()); + return iRecords.stream().map(r -> new HoodieRecord<>(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"), + new HoodieAvroPayload(Option.of((GenericRecord) r)))).collect(Collectors.toList()); } public static List updateHoodieTestRecordsWithoutHoodieMetadata(List oldRecords, - Schema schema, - String fieldNameToUpdate, String newValue) - throws IOException, URISyntaxException { - return oldRecords - .stream() - .map(r -> { - try { - GenericRecord rec = (GenericRecord) r.getData().getInsertValue(schema).get(); - rec.put(fieldNameToUpdate, newValue); - return new HoodieRecord<>(r.getKey(), - new HoodieAvroPayload(Option.of(rec))); - } catch (IOException io) { - throw new HoodieIOException("unable to get data from hoodie record", io); - } - }).collect(Collectors.toList()); + Schema schema, String fieldNameToUpdate, String newValue) throws IOException, URISyntaxException { + return oldRecords.stream().map(r -> { + try { + GenericRecord rec = (GenericRecord) r.getData().getInsertValue(schema).get(); + rec.put(fieldNameToUpdate, newValue); + return new HoodieRecord<>(r.getKey(), new HoodieAvroPayload(Option.of(rec))); + } catch (IOException io) { + throw new HoodieIOException("unable to get data from hoodie record", io); + } + }).collect(Collectors.toList()); } public static Schema getEvolvedSchema() throws IOException { diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/SpillableMapTestUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/util/SpillableMapTestUtils.java index c0b2d8f25..20515816c 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/SpillableMapTestUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/SpillableMapTestUtils.java @@ -37,19 +37,17 @@ public class SpillableMapTestUtils { public static List upsertRecords(List iRecords, Map> records) { List recordKeys = new ArrayList<>(); - iRecords - .stream() - .forEach(r -> { - String key = ((GenericRecord) r).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); - String partitionPath = ((GenericRecord) r).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString(); - recordKeys.add(key); - HoodieRecord record = new HoodieRecord<>(new HoodieKey(key, partitionPath), - new HoodieAvroPayload(Option.of((GenericRecord) r))); - record.unseal(); - record.setCurrentLocation(new HoodieRecordLocation("DUMMY_COMMIT_TIME", "DUMMY_FILE_ID")); - record.seal(); - records.put(key, record); - }); + iRecords.stream().forEach(r -> { + String key = ((GenericRecord) r).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); + String partitionPath = ((GenericRecord) r).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString(); + recordKeys.add(key); + HoodieRecord record = + new HoodieRecord<>(new HoodieKey(key, partitionPath), new HoodieAvroPayload(Option.of((GenericRecord) r))); + record.unseal(); + record.setCurrentLocation(new HoodieRecordLocation("DUMMY_COMMIT_TIME", "DUMMY_FILE_ID")); + record.seal(); + records.put(key, record); + }); return recordKeys; } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/TestCompactionUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/util/TestCompactionUtils.java index fa4f4af09..3bb46fbcc 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/TestCompactionUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/TestCompactionUtils.java @@ -51,9 +51,7 @@ public class TestCompactionUtils extends HoodieCommonTestHarness { private static String TEST_WRITE_TOKEN = "1-0-1"; private static final Map metrics = - new ImmutableMap.Builder() - .put("key1", 1.0) - .put("key2", 3.0).build(); + new ImmutableMap.Builder().put("key1", 1.0).put("key2", 3.0).build(); private Function, Map> metricsCaptureFn = (partitionFileSlice) -> metrics; @Before @@ -64,37 +62,34 @@ public class TestCompactionUtils extends HoodieCommonTestHarness { @Test public void testBuildFromFileSlice() { // Empty File-Slice with no data and log files - FileSlice emptyFileSlice = new FileSlice(DEFAULT_PARTITION_PATHS[0],"000", "empty1"); - HoodieCompactionOperation op = CompactionUtils.buildFromFileSlice( - DEFAULT_PARTITION_PATHS[0], emptyFileSlice, Option.of(metricsCaptureFn)); + FileSlice emptyFileSlice = new FileSlice(DEFAULT_PARTITION_PATHS[0], "000", "empty1"); + HoodieCompactionOperation op = + CompactionUtils.buildFromFileSlice(DEFAULT_PARTITION_PATHS[0], emptyFileSlice, Option.of(metricsCaptureFn)); testFileSliceCompactionOpEquality(emptyFileSlice, op, DEFAULT_PARTITION_PATHS[0]); // File Slice with data-file but no log files - FileSlice noLogFileSlice = new FileSlice(DEFAULT_PARTITION_PATHS[0],"000", "noLog1"); + FileSlice noLogFileSlice = new FileSlice(DEFAULT_PARTITION_PATHS[0], "000", "noLog1"); noLogFileSlice.setDataFile(new TestHoodieDataFile("/tmp/noLog_1_000.parquet")); - op = CompactionUtils.buildFromFileSlice( - DEFAULT_PARTITION_PATHS[0], noLogFileSlice, Option.of(metricsCaptureFn)); + op = CompactionUtils.buildFromFileSlice(DEFAULT_PARTITION_PATHS[0], noLogFileSlice, Option.of(metricsCaptureFn)); testFileSliceCompactionOpEquality(noLogFileSlice, op, DEFAULT_PARTITION_PATHS[0]); - //File Slice with no data-file but log files present - FileSlice noDataFileSlice = new FileSlice(DEFAULT_PARTITION_PATHS[0],"000", "noData1"); - noDataFileSlice.addLogFile(new HoodieLogFile(new Path( - FSUtils.makeLogFileName("noData1", ".log", "000", 1, TEST_WRITE_TOKEN)))); - noDataFileSlice.addLogFile(new HoodieLogFile(new Path( - FSUtils.makeLogFileName("noData1", ".log", "000", 2, TEST_WRITE_TOKEN)))); - op = CompactionUtils.buildFromFileSlice( - DEFAULT_PARTITION_PATHS[0], noDataFileSlice, Option.of(metricsCaptureFn)); + // File Slice with no data-file but log files present + FileSlice noDataFileSlice = new FileSlice(DEFAULT_PARTITION_PATHS[0], "000", "noData1"); + noDataFileSlice.addLogFile( + new HoodieLogFile(new Path(FSUtils.makeLogFileName("noData1", ".log", "000", 1, TEST_WRITE_TOKEN)))); + noDataFileSlice.addLogFile( + new HoodieLogFile(new Path(FSUtils.makeLogFileName("noData1", ".log", "000", 2, TEST_WRITE_TOKEN)))); + op = CompactionUtils.buildFromFileSlice(DEFAULT_PARTITION_PATHS[0], noDataFileSlice, Option.of(metricsCaptureFn)); testFileSliceCompactionOpEquality(noDataFileSlice, op, DEFAULT_PARTITION_PATHS[0]); - //File Slice with data-file and log files present - FileSlice fileSlice = new FileSlice(DEFAULT_PARTITION_PATHS[0],"000", "noData1"); + // File Slice with data-file and log files present + FileSlice fileSlice = new FileSlice(DEFAULT_PARTITION_PATHS[0], "000", "noData1"); fileSlice.setDataFile(new TestHoodieDataFile("/tmp/noLog_1_000.parquet")); - fileSlice.addLogFile(new HoodieLogFile(new Path( - FSUtils.makeLogFileName("noData1", ".log", "000", 1, TEST_WRITE_TOKEN)))); - fileSlice.addLogFile(new HoodieLogFile(new Path( - FSUtils.makeLogFileName("noData1", ".log", "000", 2, TEST_WRITE_TOKEN)))); - op = CompactionUtils.buildFromFileSlice( - DEFAULT_PARTITION_PATHS[0], fileSlice, Option.of(metricsCaptureFn)); + fileSlice.addLogFile( + new HoodieLogFile(new Path(FSUtils.makeLogFileName("noData1", ".log", "000", 1, TEST_WRITE_TOKEN)))); + fileSlice.addLogFile( + new HoodieLogFile(new Path(FSUtils.makeLogFileName("noData1", ".log", "000", 2, TEST_WRITE_TOKEN)))); + op = CompactionUtils.buildFromFileSlice(DEFAULT_PARTITION_PATHS[0], fileSlice, Option.of(metricsCaptureFn)); testFileSliceCompactionOpEquality(fileSlice, op, DEFAULT_PARTITION_PATHS[0]); } @@ -102,23 +97,23 @@ public class TestCompactionUtils extends HoodieCommonTestHarness { * Generate input for compaction plan tests */ private Pair>, HoodieCompactionPlan> buildCompactionPlan() { - FileSlice emptyFileSlice = new FileSlice(DEFAULT_PARTITION_PATHS[0],"000", "empty1"); - FileSlice fileSlice = new FileSlice(DEFAULT_PARTITION_PATHS[0],"000", "noData1"); + FileSlice emptyFileSlice = new FileSlice(DEFAULT_PARTITION_PATHS[0], "000", "empty1"); + FileSlice fileSlice = new FileSlice(DEFAULT_PARTITION_PATHS[0], "000", "noData1"); fileSlice.setDataFile(new TestHoodieDataFile("/tmp/noLog_1_000.parquet")); - fileSlice.addLogFile(new HoodieLogFile(new Path( - FSUtils.makeLogFileName("noData1", ".log", "000", 1, TEST_WRITE_TOKEN)))); - fileSlice.addLogFile(new HoodieLogFile(new Path( - FSUtils.makeLogFileName("noData1", ".log", "000", 2, TEST_WRITE_TOKEN)))); - FileSlice noLogFileSlice = new FileSlice(DEFAULT_PARTITION_PATHS[0],"000", "noLog1"); + fileSlice.addLogFile( + new HoodieLogFile(new Path(FSUtils.makeLogFileName("noData1", ".log", "000", 1, TEST_WRITE_TOKEN)))); + fileSlice.addLogFile( + new HoodieLogFile(new Path(FSUtils.makeLogFileName("noData1", ".log", "000", 2, TEST_WRITE_TOKEN)))); + FileSlice noLogFileSlice = new FileSlice(DEFAULT_PARTITION_PATHS[0], "000", "noLog1"); noLogFileSlice.setDataFile(new TestHoodieDataFile("/tmp/noLog_1_000.parquet")); - FileSlice noDataFileSlice = new FileSlice(DEFAULT_PARTITION_PATHS[0],"000", "noData1"); - noDataFileSlice.addLogFile(new HoodieLogFile(new Path( - FSUtils.makeLogFileName("noData1", ".log", "000", 1, TEST_WRITE_TOKEN)))); - noDataFileSlice.addLogFile(new HoodieLogFile(new Path( - FSUtils.makeLogFileName("noData1", ".log", "000", 2, TEST_WRITE_TOKEN)))); + FileSlice noDataFileSlice = new FileSlice(DEFAULT_PARTITION_PATHS[0], "000", "noData1"); + noDataFileSlice.addLogFile( + new HoodieLogFile(new Path(FSUtils.makeLogFileName("noData1", ".log", "000", 1, TEST_WRITE_TOKEN)))); + noDataFileSlice.addLogFile( + new HoodieLogFile(new Path(FSUtils.makeLogFileName("noData1", ".log", "000", 2, TEST_WRITE_TOKEN)))); List fileSliceList = Arrays.asList(emptyFileSlice, noDataFileSlice, fileSlice, noLogFileSlice); - List> input = fileSliceList.stream().map(f -> Pair.of(DEFAULT_PARTITION_PATHS[0], f)) - .collect(Collectors.toList()); + List> input = + fileSliceList.stream().map(f -> Pair.of(DEFAULT_PARTITION_PATHS[0], f)).collect(Collectors.toList()); return Pair.of(input, CompactionUtils.buildFromFileSlices(input, Option.empty(), Option.of(metricsCaptureFn))); } @@ -134,14 +129,13 @@ public class TestCompactionUtils extends HoodieCommonTestHarness { Pair>, HoodieCompactionPlan> inputAndPlan = buildCompactionPlan(); HoodieCompactionPlan plan = inputAndPlan.getRight(); List originalOps = plan.getOperations(); - List regeneratedOps = - originalOps.stream().map(op -> { - // Convert to CompactionOperation - return CompactionUtils.buildCompactionOperation(op); - }).map(op2 -> { - // Convert back to HoodieCompactionOperation and check for equality - return CompactionUtils.buildHoodieCompactionOperation(op2); - }).collect(Collectors.toList()); + List regeneratedOps = originalOps.stream().map(op -> { + // Convert to CompactionOperation + return CompactionUtils.buildCompactionOperation(op); + }).map(op2 -> { + // Convert back to HoodieCompactionOperation and check for equality + return CompactionUtils.buildHoodieCompactionOperation(op2); + }).collect(Collectors.toList()); Assert.assertTrue("Transformation did get tested", originalOps.size() > 0); Assert.assertEquals("All fields set correctly in transformations", originalOps, regeneratedOps); } @@ -194,24 +188,22 @@ public class TestCompactionUtils extends HoodieCommonTestHarness { } /** - * Validates if generated compaction plan matches with input file-slices + * Validates if generated compaction plan matches with input file-slices * * @param input File Slices with partition-path - * @param plan Compaction Plan + * @param plan Compaction Plan */ - private void testFileSlicesCompactionPlanEquality(List> input, - HoodieCompactionPlan plan) { + private void testFileSlicesCompactionPlanEquality(List> input, HoodieCompactionPlan plan) { Assert.assertEquals("All file-slices present", input.size(), plan.getOperations().size()); - IntStream.range(0, input.size()).boxed().forEach(idx -> - testFileSliceCompactionOpEquality(input.get(idx).getValue(), plan.getOperations().get(idx), - input.get(idx).getKey())); + IntStream.range(0, input.size()).boxed().forEach(idx -> testFileSliceCompactionOpEquality(input.get(idx).getValue(), + plan.getOperations().get(idx), input.get(idx).getKey())); } /** * Validates if generated compaction operation matches with input file slice and partition path * - * @param slice File Slice - * @param op HoodieCompactionOperation + * @param slice File Slice + * @param op HoodieCompactionOperation * @param expPartitionPath Partition path */ private void testFileSliceCompactionOpEquality(FileSlice slice, HoodieCompactionOperation op, diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/TestDFSPropertiesConfiguration.java b/hudi-common/src/test/java/org/apache/hudi/common/util/TestDFSPropertiesConfiguration.java index 36d37a686..71499c639 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/TestDFSPropertiesConfiguration.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/TestDFSPropertiesConfiguration.java @@ -53,25 +53,18 @@ public class TestDFSPropertiesConfiguration { // create some files. Path filePath = new Path(dfsBasePath + "/t1.props"); - writePropertiesFile(filePath, new String[]{ - "", "#comment", "abc",// to be ignored - "int.prop=123", "double.prop=113.4", "string.prop=str", "boolean.prop=true", "long.prop=1354354354" - }); + writePropertiesFile(filePath, new String[] {"", "#comment", "abc", // to be ignored + "int.prop=123", "double.prop=113.4", "string.prop=str", "boolean.prop=true", "long.prop=1354354354"}); filePath = new Path(dfsBasePath + "/t2.props"); - writePropertiesFile(filePath, new String[]{ - "string.prop=ignored", "include=t1.props" - }); + writePropertiesFile(filePath, new String[] {"string.prop=ignored", "include=t1.props"}); filePath = new Path(dfsBasePath + "/t3.props"); - writePropertiesFile(filePath, new String[]{ - "double.prop=838.3", "include = t2.props", "double.prop=243.4", "string.prop=t3.value" - }); + writePropertiesFile(filePath, + new String[] {"double.prop=838.3", "include = t2.props", "double.prop=243.4", "string.prop=t3.value"}); filePath = new Path(dfsBasePath + "/t4.props"); - writePropertiesFile(filePath, new String[]{ - "double.prop=838.3", "include = t4.props" - }); + writePropertiesFile(filePath, new String[] {"double.prop=838.3", "include = t4.props"}); } @AfterClass @@ -98,7 +91,9 @@ public class TestDFSPropertiesConfiguration { try { props.getString("invalid.key"); fail("Should error out here."); - } catch (IllegalArgumentException iae) { /* ignore */ } + } catch (IllegalArgumentException iae) { + // ignore + } assertEquals(123, props.getInteger("int.prop")); assertEquals(113.4, props.getDouble("double.prop"), 0.001); @@ -133,6 +128,8 @@ public class TestDFSPropertiesConfiguration { try { new DFSPropertiesConfiguration(dfs, new Path(dfsBasePath + "/t4.props")); fail("Should error out on a self-included file."); - } catch (IllegalStateException ise) { /* ignore */ } + } catch (IllegalStateException ise) { + // ignore + } } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/TestFSUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/util/TestFSUtils.java index 17efd3c1c..1b0eb6350 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/TestFSUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/TestFSUtils.java @@ -74,16 +74,16 @@ public class TestFSUtils extends HoodieCommonTestHarness { @Test /** - * Tests if process Files return only paths excluding marker directories - * Cleaner, Rollback and compaction-scheduling logic was recursively processing all subfolders including that - * of ".hoodie" when looking for partition-paths. This causes a race when they try to list all folders (recursively) - * but the marker directory (that of compaction inside of ".hoodie" folder) is deleted underneath by compactor. - * This code tests the fix by ensuring ".hoodie" and their subfolders are never processed. + * Tests if process Files return only paths excluding marker directories Cleaner, Rollback and compaction-scheduling + * logic was recursively processing all subfolders including that of ".hoodie" when looking for partition-paths. This + * causes a race when they try to list all folders (recursively) but the marker directory (that of compaction inside + * of ".hoodie" folder) is deleted underneath by compactor. This code tests the fix by ensuring ".hoodie" and their + * subfolders are never processed. */ public void testProcessFiles() throws Exception { // All directories including marker dirs. - List folders = Arrays.asList("2016/04/15", "2016/05/16", ".hoodie/.temp/2/2016/04/15", - ".hoodie/.temp/2/2016/05/16"); + List folders = + Arrays.asList("2016/04/15", "2016/05/16", ".hoodie/.temp/2/2016/04/15", ".hoodie/.temp/2/2016/05/16"); folders.stream().forEach(f -> { try { metaClient.getFs().mkdirs(new Path(new Path(basePath), f)); @@ -93,12 +93,9 @@ public class TestFSUtils extends HoodieCommonTestHarness { }); // Files inside partitions and marker directories - List files = Arrays.asList( - "2016/04/15/1_1-0-1_20190528120000.parquet", - "2016/05/16/2_1-0-1_20190528120000.parquet", - ".hoodie/.temp/2/2016/05/16/2_1-0-1_20190528120000.parquet", - ".hoodie/.temp/2/2016/04/15/1_1-0-1_20190528120000.parquet" - ); + List files = Arrays.asList("2016/04/15/1_1-0-1_20190528120000.parquet", + "2016/05/16/2_1-0-1_20190528120000.parquet", ".hoodie/.temp/2/2016/05/16/2_1-0-1_20190528120000.parquet", + ".hoodie/.temp/2/2016/04/15/1_1-0-1_20190528120000.parquet"); files.stream().forEach(f -> { try { @@ -115,8 +112,8 @@ public class TestFSUtils extends HoodieCommonTestHarness { return true; }, true); - Assert.assertTrue("Hoodie MetaFolder MUST be skipped but got :" + collected, collected.stream() - .noneMatch(s -> s.contains(HoodieTableMetaClient.METAFOLDER_NAME))); + Assert.assertTrue("Hoodie MetaFolder MUST be skipped but got :" + collected, + collected.stream().noneMatch(s -> s.contains(HoodieTableMetaClient.METAFOLDER_NAME))); // Check if only files are listed Assert.assertEquals(2, collected.size()); @@ -127,8 +124,8 @@ public class TestFSUtils extends HoodieCommonTestHarness { return true; }, false); - Assert.assertFalse("Hoodie MetaFolder will be present :" + collected2, collected2.stream() - .noneMatch(s -> s.contains(HoodieTableMetaClient.METAFOLDER_NAME))); + Assert.assertFalse("Hoodie MetaFolder will be present :" + collected2, + collected2.stream().noneMatch(s -> s.contains(HoodieTableMetaClient.METAFOLDER_NAME))); // Check if only files are listed including hoodie.properties Assert.assertEquals("Collected=" + collected2, 5, collected2.size()); } @@ -166,7 +163,7 @@ public class TestFSUtils extends HoodieCommonTestHarness { public void testGetRelativePartitionPath() { Path basePath = new Path("/test/apache"); Path partitionPath = new Path("/test/apache/hudi/sub"); - assertEquals("hudi/sub",FSUtils.getRelativePartitionPath(basePath, partitionPath)); + assertEquals("hudi/sub", FSUtils.getRelativePartitionPath(basePath, partitionPath)); } @Test @@ -232,9 +229,8 @@ public class TestFSUtils extends HoodieCommonTestHarness { String log1Ver0 = makeOldLogFileName("file1", ".log", "1", 0); String log1Ver1 = makeOldLogFileName("file1", ".log", "1", 1); String log1base2 = makeOldLogFileName("file1", ".log", "2", 0); - List logFiles = - Arrays.asList(log1base2, log1Ver1, log1Ver0).stream() - .map(f -> new HoodieLogFile(f)).collect(Collectors.toList()); + List logFiles = Arrays.asList(log1base2, log1Ver1, log1Ver0).stream().map(f -> new HoodieLogFile(f)) + .collect(Collectors.toList()); logFiles.sort(HoodieLogFile.getLogFileComparator()); assertEquals(log1Ver0, logFiles.get(0).getFileName()); assertEquals(log1Ver1, logFiles.get(1).getFileName()); @@ -265,11 +261,8 @@ public class TestFSUtils extends HoodieCommonTestHarness { assertEquals(log1base2W1, logFiles.get(5).getFileName()); } - public static String makeOldLogFileName(String fileId, String logFileExtension, - String baseCommitTime, int version) { - Pattern oldLogFilePattern = - Pattern.compile("\\.(.*)_(.*)\\.(.*)\\.([0-9]*)(\\.([0-9]*))"); - return "." + String - .format("%s_%s%s.%d", fileId, baseCommitTime, logFileExtension, version); + public static String makeOldLogFileName(String fileId, String logFileExtension, String baseCommitTime, int version) { + Pattern oldLogFilePattern = Pattern.compile("\\.(.*)_(.*)\\.(.*)\\.([0-9]*)(\\.([0-9]*))"); + return "." + String.format("%s_%s%s.%d", fileId, baseCommitTime, logFileExtension, version); } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/TestHoodieAvroUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/util/TestHoodieAvroUtils.java index 002100a67..a0dc79bf6 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/TestHoodieAvroUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/TestHoodieAvroUtils.java @@ -28,10 +28,9 @@ import org.junit.Test; public class TestHoodieAvroUtils { private static String EXAMPLE_SCHEMA = "{\"type\": \"record\"," + "\"name\": \"testrec\"," + "\"fields\": [ " - + "{\"name\": \"timestamp\",\"type\": \"double\"}," - + "{\"name\": \"_row_key\", \"type\": \"string\"}," - + "{\"name\": \"non_pii_col\", \"type\": \"string\"}," - + "{\"name\": \"pii_col\", \"type\": \"string\", \"column_category\": \"user_profile\"}]}"; + + "{\"name\": \"timestamp\",\"type\": \"double\"}," + "{\"name\": \"_row_key\", \"type\": \"string\"}," + + "{\"name\": \"non_pii_col\", \"type\": \"string\"}," + + "{\"name\": \"pii_col\", \"type\": \"string\", \"column_category\": \"user_profile\"}]}"; @Test public void testPropsPresent() { diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/TestParquetUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/util/TestParquetUtils.java index eee49bc60..807204346 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/TestParquetUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/TestParquetUtils.java @@ -66,8 +66,8 @@ public class TestParquetUtils extends HoodieCommonTestHarness { Collections.sort(rowKeys); assertEquals("Did not read back the expected list of keys", rowKeys, rowKeysInFile); - BloomFilter filterInFile = ParquetUtils.readBloomFilterFromParquetMetadata(HoodieTestUtils.getDefaultHadoopConf(), - new Path(filePath)); + BloomFilter filterInFile = + ParquetUtils.readBloomFilterFromParquetMetadata(HoodieTestUtils.getDefaultHadoopConf(), new Path(filePath)); for (String rowKey : rowKeys) { assertTrue("key should be found in bloom filter", filterInFile.mightContain(rowKey)); } @@ -89,9 +89,8 @@ public class TestParquetUtils extends HoodieCommonTestHarness { writeParquetFile(filePath, rowKeys); // Read and verify - Set filtered = ParquetUtils.filterParquetRowKeys(HoodieTestUtils.getDefaultHadoopConf(), - new Path(filePath), - filter); + Set filtered = + ParquetUtils.filterParquetRowKeys(HoodieTestUtils.getDefaultHadoopConf(), new Path(filePath), filter); assertEquals("Filtered count does not match", filter.size(), filtered.size()); @@ -100,13 +99,12 @@ public class TestParquetUtils extends HoodieCommonTestHarness { } } - private void writeParquetFile(String filePath, - List rowKeys) throws Exception { + private void writeParquetFile(String filePath, List rowKeys) throws Exception { // Write out a parquet file Schema schema = HoodieAvroUtils.getRecordKeySchema(); BloomFilter filter = new BloomFilter(1000, 0.0001); - HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, - filter); + HoodieAvroWriteSupport writeSupport = + new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter); ParquetWriter writer = new ParquetWriter(new Path(filePath), writeSupport, CompressionCodecName.GZIP, 120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE); for (String rowKey : rowKeys) { diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/TestRecord.java b/hudi-common/src/test/java/org/apache/hudi/common/util/TestRecord.java index b0c4a14f2..e077f63e4 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/TestRecord.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/TestRecord.java @@ -88,13 +88,10 @@ public class TestRecord implements Serializable { this.favoriteFloatNumber = (float) ((recordNumber + commitHashCode) / 1024.0); this.favoriteDoubleNumber = (recordNumber + commitHashCode) / 1024.0; this.tags = new HashMap<>(); - this.tags.put("mapItem1", - new TestMapItemRecord("item" + recordNumber, "item" + recordNumber + commitTimeSuffix)); - this.tags.put("mapItem2", - new TestMapItemRecord("item2" + recordNumber, "item2" + recordNumber + commitTimeSuffix)); + this.tags.put("mapItem1", new TestMapItemRecord("item" + recordNumber, "item" + recordNumber + commitTimeSuffix)); + this.tags.put("mapItem2", new TestMapItemRecord("item2" + recordNumber, "item2" + recordNumber + commitTimeSuffix)); this.testNestedRecord = new TestNestedRecord(false, "UserId" + recordNumber + commitTimeSuffix); - this.stringArray = new String[]{"stringArray0" + commitTimeSuffix, - "stringArray1" + commitTimeSuffix}; + this.stringArray = new String[] {"stringArray0" + commitTimeSuffix, "stringArray1" + commitTimeSuffix}; } public String toJsonString() throws IOException { diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/TestRocksDBManager.java b/hudi-common/src/test/java/org/apache/hudi/common/util/TestRocksDBManager.java index 40cb06b4a..18695d2d5 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/TestRocksDBManager.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/TestRocksDBManager.java @@ -42,7 +42,7 @@ public class TestRocksDBManager { @BeforeClass public static void setUpClass() { dbManager = new RocksDBDAO("/dummy/path", - FileSystemViewStorageConfig.newBuilder().build().newBuilder().build().getRocksdbBasePath()); + FileSystemViewStorageConfig.newBuilder().build().newBuilder().build().getRocksdbBasePath()); } @AfterClass @@ -121,8 +121,8 @@ public class TestRocksDBManager { prefixes.stream().forEach(prefix -> { List> gotPayloads = dbManager.prefixSearch(family, prefix).collect(Collectors.toList()); - Assert.assertEquals("Size check for prefix (" + prefix + ") and family (" + family + ")", - 0, gotPayloads.size()); + Assert.assertEquals("Size check for prefix (" + prefix + ") and family (" + family + ")", 0, + gotPayloads.size()); }); }); @@ -170,10 +170,8 @@ public class TestRocksDBManager { return false; } Payload payload = (Payload) o; - return Objects.equals(prefix, payload.prefix) - && Objects.equals(key, payload.key) - && Objects.equals(val, payload.val) - && Objects.equals(family, payload.family); + return Objects.equals(prefix, payload.prefix) && Objects.equals(key, payload.key) + && Objects.equals(val, payload.val) && Objects.equals(family, payload.family); } @Override diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/TestSerializationUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/util/TestSerializationUtils.java index d97535def..c79f524d6 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/TestSerializationUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/TestSerializationUtils.java @@ -71,7 +71,7 @@ public class TestSerializationUtils { @Override public boolean equals(Object obj) { - if (!(obj instanceof NonSerializableClass)) { + if (!(obj instanceof NonSerializableClass)) { return false; } final NonSerializableClass other = (NonSerializableClass) obj; diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestDiskBasedMap.java b/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestDiskBasedMap.java index 091a9dc4c..5b9d40069 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestDiskBasedMap.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestDiskBasedMap.java @@ -79,8 +79,7 @@ public class TestDiskBasedMap extends HoodieCommonTestHarness { @Test public void testSimpleInsertWithoutHoodieMetadata() throws IOException, URISyntaxException { DiskBasedMap records = new DiskBasedMap<>(basePath); - List hoodieRecords = SchemaTestUtil - .generateHoodieTestRecordsWithoutHoodieMetadata(0, 1000); + List hoodieRecords = SchemaTestUtil.generateHoodieTestRecordsWithoutHoodieMetadata(0, 1000); Set recordKeys = new HashSet<>(); // insert generated records into the map hoodieRecords.stream().forEach(r -> { @@ -113,12 +112,10 @@ public class TestDiskBasedMap extends HoodieCommonTestHarness { assertTrue(fileSize > 0); // generate updates from inserts - List updatedRecords = - SchemaTestUtil - .updateHoodieTestRecords(recordKeys, SchemaTestUtil.generateHoodieTestRecords(0, 100), - HoodieActiveTimeline.createNewCommitTime()); - String newCommitTime = ((GenericRecord) updatedRecords.get(0)) - .get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(); + List updatedRecords = SchemaTestUtil.updateHoodieTestRecords(recordKeys, + SchemaTestUtil.generateHoodieTestRecords(0, 100), HoodieActiveTimeline.createNewCommitTime()); + String newCommitTime = + ((GenericRecord) updatedRecords.get(0)).get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(); // perform upserts recordKeys = SpillableMapTestUtils.upsertRecords(updatedRecords, records); @@ -133,8 +130,8 @@ public class TestDiskBasedMap extends HoodieCommonTestHarness { assert recordKeys.contains(rec.getRecordKey()); try { IndexedRecord indexedRecord = (IndexedRecord) rec.getData().getInsertValue(schema).get(); - String latestCommitTime = ((GenericRecord) indexedRecord) - .get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(); + String latestCommitTime = + ((GenericRecord) indexedRecord).get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(); assertEquals(latestCommitTime, newCommitTime); } catch (IOException io) { throw new UncheckedIOException(io); @@ -149,15 +146,14 @@ public class TestDiskBasedMap extends HoodieCommonTestHarness { // Test sizeEstimator without hoodie metadata fields List hoodieRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1, schema); - long payloadSize = SpillableMapUtils.computePayloadSize(hoodieRecords.remove(0), - new HoodieRecordSizeEstimator(schema)); + long payloadSize = + SpillableMapUtils.computePayloadSize(hoodieRecords.remove(0), new HoodieRecordSizeEstimator(schema)); assertTrue(payloadSize > 0); // Test sizeEstimator with hoodie metadata fields schema = HoodieAvroUtils.addMetadataFields(schema); hoodieRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1, schema); - payloadSize = SpillableMapUtils.computePayloadSize(hoodieRecords.remove(0), - new HoodieRecordSizeEstimator(schema)); + payloadSize = SpillableMapUtils.computePayloadSize(hoodieRecords.remove(0), new HoodieRecordSizeEstimator(schema)); assertTrue(payloadSize > 0); // Following tests payloads without an Avro Schema in the Record @@ -165,24 +161,21 @@ public class TestDiskBasedMap extends HoodieCommonTestHarness { // Test sizeEstimator without hoodie metadata fields and without schema object in the payload schema = SchemaTestUtil.getSimpleSchema(); List indexedRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1); - hoodieRecords = indexedRecords.stream() - .map(r -> new HoodieRecord(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"), + hoodieRecords = + indexedRecords.stream().map(r -> new HoodieRecord(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"), new AvroBinaryTestPayload(Option.of((GenericRecord) r)))).collect(Collectors.toList()); - payloadSize = SpillableMapUtils.computePayloadSize(hoodieRecords.remove(0), - new HoodieRecordSizeEstimator(schema)); + payloadSize = SpillableMapUtils.computePayloadSize(hoodieRecords.remove(0), new HoodieRecordSizeEstimator(schema)); assertTrue(payloadSize > 0); // Test sizeEstimator with hoodie metadata fields and without schema object in the payload - final Schema simpleSchemaWithMetadata = HoodieAvroUtils - .addMetadataFields(SchemaTestUtil.getSimpleSchema()); + final Schema simpleSchemaWithMetadata = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema()); indexedRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1); hoodieRecords = indexedRecords.stream() .map(r -> new HoodieRecord(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"), - new AvroBinaryTestPayload(Option - .of(HoodieAvroUtils.rewriteRecord((GenericRecord) r, simpleSchemaWithMetadata))))) + new AvroBinaryTestPayload( + Option.of(HoodieAvroUtils.rewriteRecord((GenericRecord) r, simpleSchemaWithMetadata))))) .collect(Collectors.toList()); - payloadSize = SpillableMapUtils.computePayloadSize(hoodieRecords.remove(0), - new HoodieRecordSizeEstimator(schema)); + payloadSize = SpillableMapUtils.computePayloadSize(hoodieRecords.remove(0), new HoodieRecordSizeEstimator(schema)); assertTrue(payloadSize > 0); } @@ -195,8 +188,7 @@ public class TestDiskBasedMap extends HoodieCommonTestHarness { // Test sizeEstimatorPerformance with simpleSchema Schema schema = SchemaTestUtil.getSimpleSchema(); List hoodieRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1, schema); - HoodieRecordSizeEstimator sizeEstimator = - new HoodieRecordSizeEstimator(schema); + HoodieRecordSizeEstimator sizeEstimator = new HoodieRecordSizeEstimator(schema); HoodieRecord record = hoodieRecords.remove(0); long startTime = System.currentTimeMillis(); SpillableMapUtils.computePayloadSize(record, sizeEstimator); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestExternalSpillableMap.java b/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestExternalSpillableMap.java index 1f5d74a0c..c190f1288 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestExternalSpillableMap.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestExternalSpillableMap.java @@ -64,8 +64,7 @@ public class TestExternalSpillableMap extends HoodieCommonTestHarness { Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema()); String payloadClazz = HoodieAvroPayload.class.getName(); ExternalSpillableMap> records = - new ExternalSpillableMap<>(16L, basePath, - new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema)); //16B + new ExternalSpillableMap<>(16L, basePath, new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema)); // 16B List iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100); List recordKeys = SpillableMapTestUtils.upsertRecords(iRecords, records); @@ -86,8 +85,7 @@ public class TestExternalSpillableMap extends HoodieCommonTestHarness { String payloadClazz = HoodieAvroPayload.class.getName(); ExternalSpillableMap> records = - new ExternalSpillableMap<>(16L, basePath, - new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema)); //16B + new ExternalSpillableMap<>(16L, basePath, new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema)); // 16B List iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100); List recordKeys = SpillableMapTestUtils.upsertRecords(iRecords, records); @@ -124,8 +122,7 @@ public class TestExternalSpillableMap extends HoodieCommonTestHarness { String payloadClazz = HoodieAvroPayload.class.getName(); ExternalSpillableMap> records = - new ExternalSpillableMap<>(16L, basePath, - new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema)); //16B + new ExternalSpillableMap<>(16L, basePath, new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema)); // 16B List iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100); // insert a bunch of records so that values spill to disk too @@ -178,9 +175,8 @@ public class TestExternalSpillableMap extends HoodieCommonTestHarness { public void simpleTestWithException() throws IOException, URISyntaxException { Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema()); - ExternalSpillableMap> records = - new ExternalSpillableMap<>(16L, failureOutputPath, - new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema)); //16B + ExternalSpillableMap> records = new ExternalSpillableMap<>(16L, + failureOutputPath, new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema)); // 16B List iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100); List recordKeys = SpillableMapTestUtils.upsertRecords(iRecords, records); @@ -198,8 +194,7 @@ public class TestExternalSpillableMap extends HoodieCommonTestHarness { String payloadClazz = HoodieAvroPayload.class.getName(); ExternalSpillableMap> records = - new ExternalSpillableMap<>(16L, basePath, - new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema)); //16B + new ExternalSpillableMap<>(16L, basePath, new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema)); // 16B List recordKeys = new ArrayList<>(); // Ensure we spill to disk @@ -218,8 +213,8 @@ public class TestExternalSpillableMap extends HoodieCommonTestHarness { List keysToBeUpdated = new ArrayList<>(); keysToBeUpdated.add(key); // Update the commitTime for this record - List updatedRecords = SchemaTestUtil - .updateHoodieTestRecords(keysToBeUpdated, recordsToUpdate, newCommitTime); + List updatedRecords = + SchemaTestUtil.updateHoodieTestRecords(keysToBeUpdated, recordsToUpdate, newCommitTime); // Upsert this updated record SpillableMapTestUtils.upsertRecords(updatedRecords, records); GenericRecord gRecord = (GenericRecord) records.get(key).getData().getInsertValue(schema).get(); @@ -251,8 +246,7 @@ public class TestExternalSpillableMap extends HoodieCommonTestHarness { String payloadClazz = HoodieAvroPayload.class.getName(); ExternalSpillableMap> records = - new ExternalSpillableMap<>(16L, basePath, - new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema)); //16B + new ExternalSpillableMap<>(16L, basePath, new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema)); // 16B List recordKeys = new ArrayList<>(); // Ensure we spill to disk @@ -290,8 +284,8 @@ public class TestExternalSpillableMap extends HoodieCommonTestHarness { key = recordKeys.get(recordKeys.size() - 1); record = records.get(key); // Get the field we want to update - fieldName = schema.getFields().stream().filter(field -> field.schema().getType() == Schema.Type.STRING) - .findAny().get().name(); + fieldName = schema.getFields().stream().filter(field -> field.schema().getType() == Schema.Type.STRING).findAny() + .get().name(); // Use a new value to update this field newValue = "update2"; recordsToUpdate = new ArrayList<>(); @@ -311,6 +305,5 @@ public class TestExternalSpillableMap extends HoodieCommonTestHarness { // TODO : come up with a performance eval test for spillableMap @Test - public void testLargeInsertUpsert() { - } + public void testLargeInsertUpsert() {} } diff --git a/hudi-hadoop-mr/pom.xml b/hudi-hadoop-mr/pom.xml index e95a1dea1..6bc3c8e42 100644 --- a/hudi-hadoop-mr/pom.xml +++ b/hudi-hadoop-mr/pom.xml @@ -25,6 +25,10 @@ hudi-hadoop-mr + + ${project.parent.basedir} + + diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java index 5e08da6f8..5dde86a30 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java @@ -51,10 +51,9 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; /** - * HoodieInputFormat which understands the Hoodie File Structure and filters files based on the - * Hoodie Mode. If paths that does not correspond to a hoodie dataset then they are passed in as is - * (as what FileInputFormat.listStatus() would do). The JobConf could have paths from multipe - * Hoodie/Non-Hoodie datasets + * HoodieInputFormat which understands the Hoodie File Structure and filters files based on the Hoodie Mode. If paths + * that does not correspond to a hoodie dataset then they are passed in as is (as what FileInputFormat.listStatus() + * would do). The JobConf could have paths from multipe Hoodie/Non-Hoodie datasets */ @UseFileSplitsFromInputFormat public class HoodieParquetInputFormat extends MapredParquetInputFormat implements Configurable { @@ -86,22 +85,19 @@ public class HoodieParquetInputFormat extends MapredParquetInputFormat implement String mode = HoodieHiveUtil.readMode(Job.getInstance(job), tableName); // Get all commits, delta commits, compactions, as all of them produce a base parquet file // today - HoodieTimeline timeline = metadata.getActiveTimeline().getCommitsTimeline() - .filterCompletedInstants(); - ReadOptimizedView roView = new HoodieTableFileSystemView(metadata, - timeline, statuses); + HoodieTimeline timeline = metadata.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); + ReadOptimizedView roView = new HoodieTableFileSystemView(metadata, timeline, statuses); if (HoodieHiveUtil.INCREMENTAL_SCAN_MODE.equals(mode)) { // this is of the form commitTs_partition_sequenceNumber - String lastIncrementalTs = HoodieHiveUtil - .readStartCommitTime(Job.getInstance(job), tableName); + String lastIncrementalTs = HoodieHiveUtil.readStartCommitTime(Job.getInstance(job), tableName); // Total number of commits to return in this batch. Set this to -1 to get all the commits. Integer maxCommits = HoodieHiveUtil.readMaxCommits(Job.getInstance(job), tableName); LOG.info("Last Incremental timestamp was set as " + lastIncrementalTs); - List commitsToReturn = timeline.findInstantsAfter(lastIncrementalTs, maxCommits) - .getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList()); - List filteredFiles = roView.getLatestDataFilesInRange(commitsToReturn) - .collect(Collectors.toList()); + List commitsToReturn = timeline.findInstantsAfter(lastIncrementalTs, maxCommits).getInstants() + .map(HoodieInstant::getTimestamp).collect(Collectors.toList()); + List filteredFiles = + roView.getLatestDataFilesInRange(commitsToReturn).collect(Collectors.toList()); for (HoodieDataFile filteredFile : filteredFiles) { LOG.info("Processing incremental hoodie file - " + filteredFile.getPath()); filteredFile = checkFileStatus(filteredFile); @@ -110,8 +106,7 @@ public class HoodieParquetInputFormat extends MapredParquetInputFormat implement LOG.info("Total paths to process after hoodie incremental filter " + filteredFiles.size()); } else { // filter files on the latest commit found - List filteredFiles = roView.getLatestDataFiles() - .collect(Collectors.toList()); + List filteredFiles = roView.getLatestDataFiles().collect(Collectors.toList()); LOG.info("Total paths to process after hoodie filter " + filteredFiles.size()); for (HoodieDataFile filteredFile : filteredFiles) { if (LOG.isDebugEnabled()) { @@ -127,10 +122,9 @@ public class HoodieParquetInputFormat extends MapredParquetInputFormat implement } /** - * Checks the file status for a race condition which can set the file size to 0. 1. - * HiveInputFormat does super.listStatus() and gets back a FileStatus[] 2. Then it creates the - * HoodieTableMetaClient for the paths listed. 3. Generation of splits looks at FileStatus size to - * create splits, which skips this file + * Checks the file status for a race condition which can set the file size to 0. 1. HiveInputFormat does + * super.listStatus() and gets back a FileStatus[] 2. Then it creates the HoodieTableMetaClient for the paths listed. + * 3. Generation of splits looks at FileStatus size to create splits, which skips this file */ private HoodieDataFile checkFileStatus(HoodieDataFile dataFile) throws IOException { Path dataPath = dataFile.getFileStatus().getPath(); @@ -146,24 +140,22 @@ public class HoodieParquetInputFormat extends MapredParquetInputFormat implement } } - private Map> groupFileStatus(FileStatus[] fileStatuses) - throws IOException { + private Map> groupFileStatus(FileStatus[] fileStatuses) throws IOException { // This assumes the paths for different tables are grouped together Map> grouped = new HashMap<>(); HoodieTableMetaClient metadata = null; String nonHoodieBasePath = null; for (FileStatus status : fileStatuses) { if (!status.getPath().getName().endsWith(".parquet")) { - //FIXME(vc): skip non parquet files for now. This wont be needed once log file name start + // FIXME(vc): skip non parquet files for now. This wont be needed once log file name start // with "." continue; } - if ((metadata == null && nonHoodieBasePath == null) || (metadata == null && !status.getPath() - .toString().contains(nonHoodieBasePath)) || (metadata != null && !status.getPath() - .toString().contains(metadata.getBasePath()))) { + if ((metadata == null && nonHoodieBasePath == null) + || (metadata == null && !status.getPath().toString().contains(nonHoodieBasePath)) + || (metadata != null && !status.getPath().toString().contains(metadata.getBasePath()))) { try { - metadata = getTableMetaClient(status.getPath().getFileSystem(conf), - status.getPath().getParent()); + metadata = getTableMetaClient(status.getPath().getFileSystem(conf), status.getPath().getParent()); nonHoodieBasePath = null; } catch (DatasetNotFoundException | InvalidDatasetException e) { LOG.info("Handling a non-hoodie path " + status.getPath()); @@ -188,29 +180,28 @@ public class HoodieParquetInputFormat extends MapredParquetInputFormat implement } @Override - public RecordReader getRecordReader(final InputSplit split, - final JobConf job, final Reporter reporter) throws IOException { + public RecordReader getRecordReader(final InputSplit split, final JobConf job, + final Reporter reporter) throws IOException { // TODO enable automatic predicate pushdown after fixing issues - // FileSplit fileSplit = (FileSplit) split; - // HoodieTableMetadata metadata = getTableMetadata(fileSplit.getPath().getParent()); - // String tableName = metadata.getTableName(); - // String mode = HoodieHiveUtil.readMode(job, tableName); + // FileSplit fileSplit = (FileSplit) split; + // HoodieTableMetadata metadata = getTableMetadata(fileSplit.getPath().getParent()); + // String tableName = metadata.getTableName(); + // String mode = HoodieHiveUtil.readMode(job, tableName); - // if (HoodieHiveUtil.INCREMENTAL_SCAN_MODE.equals(mode)) { - // FilterPredicate predicate = constructHoodiePredicate(job, tableName, split); - // LOG.info("Setting parquet predicate push down as " + predicate); - // ParquetInputFormat.setFilterPredicate(job, predicate); - //clearOutExistingPredicate(job); - // } + // if (HoodieHiveUtil.INCREMENTAL_SCAN_MODE.equals(mode)) { + // FilterPredicate predicate = constructHoodiePredicate(job, tableName, split); + // LOG.info("Setting parquet predicate push down as " + predicate); + // ParquetInputFormat.setFilterPredicate(job, predicate); + // clearOutExistingPredicate(job); + // } return super.getRecordReader(split, job, reporter); } /** - * Read the table metadata from a data path. This assumes certain hierarchy of files which should - * be changed once a better way is figured out to pass in the hoodie meta directory + * Read the table metadata from a data path. This assumes certain hierarchy of files which should be changed once a + * better way is figured out to pass in the hoodie meta directory */ - protected static HoodieTableMetaClient getTableMetaClient(FileSystem fs, Path dataPath) - throws IOException { + protected static HoodieTableMetaClient getTableMetaClient(FileSystem fs, Path dataPath) throws IOException { int levels = HoodieHiveUtil.DEFAULT_LEVELS_TO_BASEPATH; if (HoodiePartitionMetadata.hasPartitionMetadata(fs, dataPath)) { HoodiePartitionMetadata metadata = new HoodiePartitionMetadata(fs, dataPath); diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieROTablePathFilter.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieROTablePathFilter.java index aaac77989..7b6e7eec7 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieROTablePathFilter.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieROTablePathFilter.java @@ -37,22 +37,22 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; /** - * Given a path is a part of - Hoodie dataset = accepts ONLY the latest version of each path - - * Non-Hoodie dataset = then always accept + * Given a path is a part of - Hoodie dataset = accepts ONLY the latest version of each path - Non-Hoodie dataset = then + * always accept *

    - * We can set this filter, on a query engine's Hadoop Config and if it respects path filters, then - * you should be able to query both hoodie and non-hoodie datasets as you would normally do. + * We can set this filter, on a query engine's Hadoop Config and if it respects path filters, then you should be able to + * query both hoodie and non-hoodie datasets as you would normally do. *

    - * hadoopConf.setClass("mapreduce.input.pathFilter.class", org.apache.hudi.hadoop - * .HoodieROTablePathFilter.class, org.apache.hadoop.fs.PathFilter.class) + * hadoopConf.setClass("mapreduce.input.pathFilter.class", org.apache.hudi.hadoop .HoodieROTablePathFilter.class, + * org.apache.hadoop.fs.PathFilter.class) */ public class HoodieROTablePathFilter implements PathFilter, Serializable { private static final transient Logger LOG = LogManager.getLogger(HoodieROTablePathFilter.class); /** - * Its quite common, to have all files from a given partition path be passed into accept(), cache - * the check for hoodie metadata for known partition paths and the latest versions of files + * Its quite common, to have all files from a given partition path be passed into accept(), cache the check for hoodie + * metadata for known partition paths and the latest versions of files */ private HashMap> hoodiePathCache; @@ -135,19 +135,16 @@ public class HoodieROTablePathFilter implements PathFilter, Serializable { if (baseDir != null) { try { - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), - baseDir.toString()); + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), baseDir.toString()); HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, - metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(), - fs.listStatus(folder)); - List latestFiles = fsView.getLatestDataFiles() - .collect(Collectors.toList()); + metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(), fs.listStatus(folder)); + List latestFiles = fsView.getLatestDataFiles().collect(Collectors.toList()); // populate the cache if (!hoodiePathCache.containsKey(folder.toString())) { hoodiePathCache.put(folder.toString(), new HashSet<>()); } - LOG.info("Based on hoodie metadata from base path: " + baseDir.toString() + ", caching " - + latestFiles.size() + " files under " + folder); + LOG.info("Based on hoodie metadata from base path: " + baseDir.toString() + ", caching " + latestFiles.size() + + " files under " + folder); for (HoodieDataFile lfile : latestFiles) { hoodiePathCache.get(folder.toString()).add(new Path(lfile.getPath())); } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/SafeParquetRecordReaderWrapper.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/SafeParquetRecordReaderWrapper.java index d35c6be25..f4db128b6 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/SafeParquetRecordReaderWrapper.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/SafeParquetRecordReaderWrapper.java @@ -25,12 +25,11 @@ import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapred.RecordReader; /** - * Record Reader for parquet. Records read from this reader is safe to be - * buffered for concurrent processing. + * Record Reader for parquet. Records read from this reader is safe to be buffered for concurrent processing. * - * In concurrent producer/consumer pattern, where the record is read and buffered by one thread and processed in - * another thread, we need to ensure new instance of ArrayWritable is buffered. ParquetReader createKey/Value is unsafe - * as it gets reused for subsequent fetch. This wrapper makes ParquetReader safe for this use-case. + * In concurrent producer/consumer pattern, where the record is read and buffered by one thread and processed in another + * thread, we need to ensure new instance of ArrayWritable is buffered. ParquetReader createKey/Value is unsafe as it + * gets reused for subsequent fetch. This wrapper makes ParquetReader safe for this use-case. */ public class SafeParquetRecordReaderWrapper implements RecordReader { @@ -62,11 +61,9 @@ public class SafeParquetRecordReaderWrapper implements RecordReader *

    - * CombineHiveInputFormat is a parameterized InputFormat which looks at the path - * name and determine the correct InputFormat for that path name from - * mapredPlan.pathToPartitionInfo(). It can be used to read files with different + * CombineHiveInputFormat is a parameterized InputFormat which looks at the path name and determine the correct + * InputFormat for that path name from mapredPlan.pathToPartitionInfo(). It can be used to read files with different * input format in the same map-reduce job. * * NOTE : This class is implemented to work with Hive 2.x + @@ -116,19 +115,15 @@ public class HoodieCombineHiveInputFormat call() throws Exception { Set nonCombinablePathIndices = new HashSet(); for (int i = 0; i < length; i++) { - PartitionDesc part = - HiveFileFormatUtils.getPartitionDescFromPathRecursively( - pathToPartitionInfo, paths[i + start], - IOPrepareCache.get().allocatePartitionDescMap()); + PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(pathToPartitionInfo, + paths[i + start], IOPrepareCache.get().allocatePartitionDescMap()); // Use HiveInputFormat if any of the paths is not splittable Class inputFormatClass = part.getInputFileFormatClass(); - InputFormat inputFormat = - getInputFormatFromCache(inputFormatClass, conf); + InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, conf); if (inputFormat instanceof AvoidSplitCombination && ((AvoidSplitCombination) inputFormat).shouldSkipCombine(paths[i + start], conf)) { if (LOG.isDebugEnabled()) { - LOG.debug("The path [" + paths[i + start] - + "] is being parked for HiveInputFormat.getSplits"); + LOG.debug("The path [" + paths[i + start] + "] is being parked for HiveInputFormat.getSplits"); } nonCombinablePathIndices.add(i + start); } @@ -138,10 +133,9 @@ public class HoodieCombineHiveInputFormat pathToPartitionInfo; public CombineHiveInputSplit() throws IOException { - this(ShimLoader.getHadoopShims().getCombineFileInputFormat() - .getInputSplitShim()); + this(ShimLoader.getHadoopShims().getCombineFileInputFormat().getInputSplitShim()); } public CombineHiveInputSplit(CombineFileSplit inputSplitShim) throws IOException { this(inputSplitShim.getJob(), inputSplitShim); } - public CombineHiveInputSplit(JobConf job, CombineFileSplit inputSplitShim) - throws IOException { + public CombineHiveInputSplit(JobConf job, CombineFileSplit inputSplitShim) throws IOException { this(job, inputSplitShim, null); } @@ -176,9 +168,8 @@ public class HoodieCombineHiveInputFormat 0) { - PartitionDesc part = HiveFileFormatUtils - .getPartitionDescFromPathRecursively(this.pathToPartitionInfo, - ipaths[0], IOPrepareCache.get().getPartitionDescMap()); + PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(this.pathToPartitionInfo, + ipaths[0], IOPrepareCache.get().getPartitionDescMap()); inputFormatClassName = part.getInputFileFormatClass().getName(); } } @@ -307,9 +298,8 @@ public class HoodieCombineHiveInputFormat> opList, - String inputFormatClassName, + public CombinePathInputFormat(List> opList, String inputFormatClassName, String deserializerClassName) { this.opList = opList; this.inputFormatClassName = inputFormatClassName; @@ -340,10 +329,9 @@ public class HoodieCombineHiveInputFormat pathToPartitionInfo) + private InputSplit[] getCombineSplits(JobConf job, int numSplits, Map pathToPartitionInfo) throws IOException { init(job); Map> pathToAliases = mrwork.getPathToAliases(); - Map> aliasToWork = - mrwork.getAliasToWork(); + Map> aliasToWork = mrwork.getAliasToWork(); /** MOD - Initialize a custom combine input format shim that will call listStatus on the custom inputFormat **/ - HoodieCombineHiveInputFormat.HoodieCombineFileInputFormatShim - combine = new HoodieCombineHiveInputFormat.HoodieCombineFileInputFormatShim(); + HoodieCombineHiveInputFormat.HoodieCombineFileInputFormatShim combine = + new HoodieCombineHiveInputFormat.HoodieCombineFileInputFormatShim(); InputSplit[] splits = null; if (combine == null) { @@ -385,13 +371,12 @@ public class HoodieCombineHiveInputFormat inpDirs = new ArrayList(); List inpFiles = new ArrayList(); - Map poolMap = - new HashMap(); + Map poolMap = new HashMap(); Set poolSet = new HashSet(); for (Path path : paths) { - PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively( - pathToPartitionInfo, path, IOPrepareCache.get().allocatePartitionDescMap()); + PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(pathToPartitionInfo, path, + IOPrepareCache.get().allocatePartitionDescMap()); TableDesc tableDesc = part.getTableDesc(); if ((tableDesc != null) && tableDesc.isNonNative()) { return super.getSplits(job, numSplits); @@ -418,7 +403,7 @@ public class HoodieCombineHiveInputFormat> opList = null; if (!mrwork.isMapperCannotSpanPartns()) { - //if mapper can span partitions, make sure a splits does not contain multiple + // if mapper can span partitions, make sure a splits does not contain multiple // opList + inputFormatClassName + deserializerClassName combination // This is done using the Map of CombinePathInputFormat to PathFilter - opList = HiveFileFormatUtils.doGetWorksFromPath( - pathToAliases, aliasToWork, filterPath); + opList = HiveFileFormatUtils.doGetWorksFromPath(pathToAliases, aliasToWork, filterPath); CombinePathInputFormat combinePathInputFormat = new CombinePathInputFormat(opList, inputFormatClassName, deserializerClassName); f = poolMap.get(combinePathInputFormat); if (f == null) { f = new CombineFilter(filterPath); - LOG.info("CombineHiveInputSplit creating pool for " - + path + "; using filter path " + filterPath); + LOG.info("CombineHiveInputSplit creating pool for " + path + "; using filter path " + filterPath); combine.createPool(job, f); poolMap.put(combinePathInputFormat, f); } else { - LOG.info("CombineHiveInputSplit: pool is already created for " - + path + "; using filter path " + filterPath); + LOG.info("CombineHiveInputSplit: pool is already created for " + path + "; using filter path " + filterPath); f.addPath(filterPath); } } else { @@ -470,8 +452,8 @@ public class HoodieCombineHiveInputFormat iss = new ArrayList(); if (!mrwork.isMapperCannotSpanPartns()) { - //mapper can span partitions - //combine into as few as one split, subject to the PathFilters set + // mapper can span partitions + // combine into as few as one split, subject to the PathFilters set // using combine.createPool. iss = Arrays.asList(combine.getSplits(job, 1)); } else { @@ -507,8 +489,8 @@ public class HoodieCombineHiveInputFormat getNonCombinablePathIndices(JobConf job, Path[] paths, int numThreads) throws ExecutionException, InterruptedException { - LOG.info("Total number of paths: " + paths.length - + ", launching " + numThreads + " threads to check non-combinable ones."); + LOG.info("Total number of paths: " + paths.length + ", launching " + numThreads + + " threads to check non-combinable ones."); int numPathPerThread = (int) Math.ceil((double) paths.length / numThreads); ExecutorService executor = Executors.newFixedThreadPool(numThreads); @@ -517,8 +499,7 @@ public class HoodieCombineHiveInputFormat nonCombinablePathIndices = new HashSet(); for (Future> future : futureList) { @@ -572,15 +553,13 @@ public class HoodieCombineHiveInputFormat 0) { - FileInputFormat.setInputPaths(job, - nonCombinablePaths.toArray(new Path[nonCombinablePaths.size()])); + FileInputFormat.setInputPaths(job, nonCombinablePaths.toArray(new Path[nonCombinablePaths.size()])); InputSplit[] splits = super.getSplits(job, numSplits); for (InputSplit split : splits) { result.add(split); @@ -589,10 +568,9 @@ public class HoodieCombineHiveInputFormat 0) { - FileInputFormat.setInputPaths(job, - combinablePaths.toArray(new Path[combinablePaths.size()])); - Map pathToPartitionInfo = this.pathToPartitionInfo != null - ? this.pathToPartitionInfo : Utilities.getMapWork(job).getPathToPartitionInfo(); + FileInputFormat.setInputPaths(job, combinablePaths.toArray(new Path[combinablePaths.size()])); + Map pathToPartitionInfo = this.pathToPartitionInfo != null ? this.pathToPartitionInfo + : Utilities.getMapWork(job).getPathToPartitionInfo(); InputSplit[] splits = getCombineSplits(job, numSplits, pathToPartitionInfo); for (InputSplit split : splits) { result.add(split); @@ -614,8 +592,8 @@ public class HoodieCombineHiveInputFormat iss, Path... path) throws IOException { + private void processPaths(JobConf job, CombineFileInputFormatShim combine, List iss, Path... path) + throws IOException { JobConf currJob = new JobConf(job); FileInputFormat.setInputPaths(currJob, path); iss.addAll(Arrays.asList(combine.getSplits(currJob, 1))); @@ -645,19 +623,17 @@ public class HoodieCombineHiveInputFormat - * First, splits are grouped by alias they are for. If one split serves more than one - * alias or not for any sampled alias, we just directly add it to returned list. - * Then we find a list of exclusive splits for every alias to be sampled. - * For each alias, we start from position of seedNumber%totalNumber, and keep add - * splits until the total size hits percentage. + * First, splits are grouped by alias they are for. If one split serves more than one alias or not for any sampled + * alias, we just directly add it to returned list. Then we find a list of exclusive splits for every alias to be + * sampled. For each alias, we start from position of seedNumber%totalNumber, and keep add splits until the total size + * hits percentage. * * @return the sampled splits */ private List sampleSplits(List splits) { HashMap nameToSamples = mrwork.getNameToSplitSample(); List retLists = new ArrayList(); - Map> aliasToSplitList = - new HashMap>(); + Map> aliasToSplitList = new HashMap>(); Map> pathToAliases = mrwork.getPathToAliases(); Map> pathToAliasesNoScheme = removeScheme(pathToAliases); @@ -667,14 +643,13 @@ public class HoodieCombineHiveInputFormat l = HiveFileFormatUtils.doGetAliasesFromPath( - schemeless ? pathToAliasesNoScheme : pathToAliases, path); + List l = + HiveFileFormatUtils.doGetAliasesFromPath(schemeless ? pathToAliasesNoScheme : pathToAliases, path); // a path for a split unqualified the split from being sampled if: // 1. it serves more than one alias // 2. the alias it serves is not sampled // 3. it serves different alias than another path for the same split - if (l.size() != 1 || !nameToSamples.containsKey(l.get(0)) - || (alias != null && l.get(0) != alias)) { + if (l.size() != 1 || !nameToSamples.containsKey(l.get(0)) || (alias != null && l.get(0) != alias)) { alias = null; break; } @@ -739,12 +714,10 @@ public class HoodieCombineHiveInputFormat extends CombineFileInputFormat implements org.apache.hadoop.hive.shims.HadoopShims.CombineFileInputFormatShim { @@ -832,8 +801,7 @@ public class HoodieCombineHiveInputFormat getRecordReader(InputSplit split, JobConf job, Reporter reporter) - throws IOException { + public RecordReader getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException { throw new IOException("CombineFileInputFormat.getRecordReader not needed."); } @@ -866,8 +833,7 @@ public class HoodieCombineHiveInputFormat( - Arrays.asList(input.listStatus(new JobConf(job.getConfiguration())))); + result = new ArrayList(Arrays.asList(input.listStatus(new JobConf(job.getConfiguration())))); } else { result = super.listStatus(job); } @@ -903,14 +869,12 @@ public class HoodieCombineHiveInputFormat 0) { - inputSplitShims.add( - new HadoopShimsSecure.InputSplitShim(job, split.getPaths(), split.getStartOffsets(), - split.getLengths(), split.getLocations())); + inputSplitShims.add(new HadoopShimsSecure.InputSplitShim(job, split.getPaths(), split.getStartOffsets(), + split.getLengths(), split.getLocations())); } } - return (CombineFileSplit[]) inputSplitShims - .toArray(new HadoopShimsSecure.InputSplitShim[inputSplitShims.size()]); + return (CombineFileSplit[]) inputSplitShims.toArray(new HadoopShimsSecure.InputSplitShim[inputSplitShims.size()]); } public HadoopShimsSecure.InputSplitShim getInputSplitShim() throws IOException { diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/AbstractRealtimeRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/AbstractRealtimeRecordReader.java index 93a608ca6..ee62ecd79 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/AbstractRealtimeRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/AbstractRealtimeRecordReader.java @@ -58,8 +58,7 @@ import org.apache.parquet.hadoop.ParquetFileReader; import org.apache.parquet.schema.MessageType; /** - * Record Reader implementation to merge fresh avro data with base parquet data, to support real - * time queries. + * Record Reader implementation to merge fresh avro data with base parquet data, to support real time queries. */ public abstract class AbstractRealtimeRecordReader { @@ -69,8 +68,7 @@ public abstract class AbstractRealtimeRecordReader { // used to choose a trade off between IO vs Memory when performing compaction process // Depending on outputfile size and memory provided, choose true to avoid OOM for large file // size + small memory - public static final String COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP = - "compaction.lazy.block.read.enabled"; + public static final String COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP = "compaction.lazy.block.read.enabled"; public static final String DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED = "true"; // Property to set the max memory for dfs inputstream buffer size @@ -104,8 +102,7 @@ public abstract class AbstractRealtimeRecordReader { baseFileSchema = readSchema(jobConf, split.getPath()); init(); } catch (IOException e) { - throw new HoodieIOException( - "Could not create HoodieRealtimeRecordReader on path " + this.split.getPath(), e); + throw new HoodieIOException("Could not create HoodieRealtimeRecordReader on path " + this.split.getPath(), e); } } @@ -116,8 +113,8 @@ public abstract class AbstractRealtimeRecordReader { } /** - * Reads the schema from the parquet file. This is different from ParquetUtils as it uses the - * twitter parquet to support hive 1.1.0 + * Reads the schema from the parquet file. This is different from ParquetUtils as it uses the twitter parquet to + * support hive 1.1.0 */ private static MessageType readSchema(Configuration conf, Path parquetFilePath) { try { @@ -157,19 +154,19 @@ public abstract class AbstractRealtimeRecordReader { } /** - * Given a comma separated list of field names and positions at which they appear on Hive, return - * a ordered list of field names, that can be passed onto storage. + * Given a comma separated list of field names and positions at which they appear on Hive, return a ordered list of + * field names, that can be passed onto storage. */ private static List orderFields(String fieldNameCsv, String fieldOrderCsv, List partitioningFields) { String[] fieldOrders = fieldOrderCsv.split(","); - List fieldNames = Arrays.stream(fieldNameCsv.split(",")) - .filter(fn -> !partitioningFields.contains(fn)).collect(Collectors.toList()); + List fieldNames = Arrays.stream(fieldNameCsv.split(",")).filter(fn -> !partitioningFields.contains(fn)) + .collect(Collectors.toList()); // Hive does not provide ids for partitioning fields, so check for lengths excluding that. if (fieldNames.size() != fieldOrders.length) { - throw new HoodieException(String - .format("Error ordering fields for storage read. #fieldNames: %d, #fieldPositions: %d", + throw new HoodieException( + String.format("Error ordering fields for storage read. #fieldNames: %d, #fieldPositions: %d", fieldNames.size(), fieldOrders.length)); } TreeMap orderedFieldMap = new TreeMap<>(); @@ -180,18 +177,17 @@ public abstract class AbstractRealtimeRecordReader { } /** - * Generate a reader schema off the provided writeSchema, to just project out the provided - * columns + * Generate a reader schema off the provided writeSchema, to just project out the provided columns */ public static Schema generateProjectionSchema(Schema writeSchema, List fieldNames) { /** - * Avro & Presto field names seems to be case sensitive (support fields differing only in case) - * whereas Hive/Impala/SparkSQL(default) are case-insensitive. Spark allows this to be configurable - * using spark.sql.caseSensitive=true + * Avro & Presto field names seems to be case sensitive (support fields differing only in case) whereas + * Hive/Impala/SparkSQL(default) are case-insensitive. Spark allows this to be configurable using + * spark.sql.caseSensitive=true * - * For a RT table setup with no delta-files (for a latest file-slice) -> we translate parquet schema to Avro - * Here the field-name case is dependent on parquet schema. Hive (1.x/2.x/CDH) translate column projections - * to lower-cases + * For a RT table setup with no delta-files (for a latest file-slice) -> we translate parquet schema to Avro Here + * the field-name case is dependent on parquet schema. Hive (1.x/2.x/CDH) translate column projections to + * lower-cases * */ List projectedFields = new ArrayList<>(); @@ -201,16 +197,14 @@ public abstract class AbstractRealtimeRecordReader { Schema.Field field = schemaFieldsMap.get(fn.toLowerCase()); if (field == null) { throw new HoodieException("Field " + fn + " not found in log schema. Query cannot proceed! " - + "Derived Schema Fields: " - + new ArrayList<>(schemaFieldsMap.keySet())); + + "Derived Schema Fields: " + new ArrayList<>(schemaFieldsMap.keySet())); } else { - projectedFields - .add(new Schema.Field(field.name(), field.schema(), field.doc(), field.defaultValue())); + projectedFields.add(new Schema.Field(field.name(), field.schema(), field.doc(), field.defaultValue())); } } - Schema projectedSchema = Schema - .createRecord(writeSchema.getName(), writeSchema.getDoc(), writeSchema.getNamespace(), writeSchema.isError()); + Schema projectedSchema = Schema.createRecord(writeSchema.getName(), writeSchema.getDoc(), + writeSchema.getNamespace(), writeSchema.isError()); projectedSchema.setFields(projectedFields); return projectedSchema; } @@ -295,16 +289,16 @@ public abstract class AbstractRealtimeRecordReader { } /** - * Hive implementation of ParquetRecordReader results in partition columns not present in the original parquet file - * to also be part of the projected schema. Hive expects the record reader implementation to return the row in its + * Hive implementation of ParquetRecordReader results in partition columns not present in the original parquet file to + * also be part of the projected schema. Hive expects the record reader implementation to return the row in its * entirety (with un-projected column having null values). As we use writerSchema for this, make sure writer schema * also includes partition columns * * @param schema Schema to be changed */ private static Schema addPartitionFields(Schema schema, List partitioningFields) { - final Set firstLevelFieldNames = schema.getFields().stream().map(Field::name) - .map(String::toLowerCase).collect(Collectors.toSet()); + final Set firstLevelFieldNames = + schema.getFields().stream().map(Field::name).map(String::toLowerCase).collect(Collectors.toSet()); List fieldsToAdd = partitioningFields.stream().map(String::toLowerCase) .filter(x -> !firstLevelFieldNames.contains(x)).collect(Collectors.toList()); @@ -313,12 +307,12 @@ public abstract class AbstractRealtimeRecordReader { /** * Goes through the log files in reverse order and finds the schema from the last available data block. If not, falls - * back to the schema from the latest parquet file. Finally, sets the partition column and projection fields into - * the job conf. + * back to the schema from the latest parquet file. Finally, sets the partition column and projection fields into the + * job conf. */ private void init() throws IOException { - Schema schemaFromLogFile = LogReaderUtils - .readLatestSchemaFromLogFiles(split.getBasePath(), split.getDeltaFilePaths(), jobConf); + Schema schemaFromLogFile = + LogReaderUtils.readLatestSchemaFromLogFiles(split.getBasePath(), split.getDeltaFilePaths(), jobConf); if (schemaFromLogFile == null) { writerSchema = new AvroSchemaConverter().convert(baseFileSchema); LOG.debug("Writer Schema From Parquet => " + writerSchema.getFields()); @@ -332,10 +326,8 @@ public abstract class AbstractRealtimeRecordReader { partitionFields.length() > 0 ? Arrays.stream(partitionFields.split(",")).collect(Collectors.toList()) : new ArrayList<>(); writerSchema = addPartitionFields(writerSchema, partitioningFields); - List projectionFields = orderFields( - jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR), - jobConf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR), - partitioningFields); + List projectionFields = orderFields(jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR), + jobConf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR), partitioningFields); // TODO(vc): In the future, the reader schema should be updated based on log files & be able // to null out fields not present before readerSchema = generateProjectionSchema(writerSchema, projectionFields); @@ -353,8 +345,8 @@ public abstract class AbstractRealtimeRecordReader { public long getMaxCompactionMemoryInBytes() { // jobConf.getMemoryForMapTask() returns in MB - return (long) Math.ceil(Double - .valueOf(jobConf.get(COMPACTION_MEMORY_FRACTION_PROP, DEFAULT_COMPACTION_MEMORY_FRACTION)) - * jobConf.getMemoryForMapTask() * 1024 * 1024L); + return (long) Math + .ceil(Double.valueOf(jobConf.get(COMPACTION_MEMORY_FRACTION_PROP, DEFAULT_COMPACTION_MEMORY_FRACTION)) + * jobConf.getMemoryForMapTask() * 1024 * 1024L); } } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java index 1359b15f3..d37ae2ab9 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java @@ -77,19 +77,18 @@ public class HoodieParquetRealtimeInputFormat extends HoodieParquetInputFormat i @Override public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { - Stream fileSplits = Arrays.stream(super.getSplits(job, numSplits)) - .map(is -> (FileSplit) is); + Stream fileSplits = Arrays.stream(super.getSplits(job, numSplits)).map(is -> (FileSplit) is); // obtain all unique parent folders for splits - Map> partitionsToParquetSplits = fileSplits - .collect(Collectors.groupingBy(split -> split.getPath().getParent())); + Map> partitionsToParquetSplits = + fileSplits.collect(Collectors.groupingBy(split -> split.getPath().getParent())); // TODO(vc): Should we handle also non-hoodie splits here? Map metaClientMap = new HashMap<>(); - Map partitionsToMetaClient = partitionsToParquetSplits.keySet() - .stream().collect(Collectors.toMap(Function.identity(), p -> { + Map partitionsToMetaClient = + partitionsToParquetSplits.keySet().stream().collect(Collectors.toMap(Function.identity(), p -> { // find if we have a metaclient already for this partition. - Option matchingBasePath = Option.fromJavaOptional(metaClientMap.keySet().stream() - .filter(basePath -> p.toString().startsWith(basePath)).findFirst()); + Option matchingBasePath = Option.fromJavaOptional( + metaClientMap.keySet().stream().filter(basePath -> p.toString().startsWith(basePath)).findFirst()); if (matchingBasePath.isPresent()) { return metaClientMap.get(matchingBasePath.get()); } @@ -109,23 +108,20 @@ public class HoodieParquetRealtimeInputFormat extends HoodieParquetInputFormat i partitionsToParquetSplits.keySet().stream().forEach(partitionPath -> { // for each partition path obtain the data & log file groupings, then map back to inputsplits HoodieTableMetaClient metaClient = partitionsToMetaClient.get(partitionPath); - HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, - metaClient.getActiveTimeline()); - String relPartitionPath = FSUtils - .getRelativePartitionPath(new Path(metaClient.getBasePath()), partitionPath); + HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, metaClient.getActiveTimeline()); + String relPartitionPath = FSUtils.getRelativePartitionPath(new Path(metaClient.getBasePath()), partitionPath); try { // Both commit and delta-commits are included - pick the latest completed one Option latestCompletedInstant = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().lastInstant(); - Stream latestFileSlices = latestCompletedInstant.map(instant -> - fsView.getLatestMergedFileSlicesBeforeOrOn(relPartitionPath, instant.getTimestamp())) + Stream latestFileSlices = latestCompletedInstant + .map(instant -> fsView.getLatestMergedFileSlicesBeforeOrOn(relPartitionPath, instant.getTimestamp())) .orElse(Stream.empty()); // subgroup splits again by file id & match with log files. - Map> groupedInputSplits = partitionsToParquetSplits - .get(partitionPath).stream() + Map> groupedInputSplits = partitionsToParquetSplits.get(partitionPath).stream() .collect(Collectors.groupingBy(split -> FSUtils.getFileId(split.getPath().getName()))); latestFileSlices.forEach(fileSlice -> { List dataFileSplits = groupedInputSplits.get(fileSlice.getFileId()); @@ -135,21 +131,18 @@ public class HoodieParquetRealtimeInputFormat extends HoodieParquetInputFormat i .map(logFile -> logFile.getPath().toString()).collect(Collectors.toList()); // Get the maxCommit from the last delta or compaction or commit - when // bootstrapped from COW table - String maxCommitTime = metaClient.getActiveTimeline().getTimelineOfActions( - Sets.newHashSet(HoodieTimeline.COMMIT_ACTION, HoodieTimeline.ROLLBACK_ACTION, - HoodieTimeline.DELTA_COMMIT_ACTION)).filterCompletedInstants().lastInstant() - .get().getTimestamp(); - rtSplits.add( - new HoodieRealtimeFileSplit(split, metaClient.getBasePath(), logFilePaths, - maxCommitTime)); + String maxCommitTime = metaClient + .getActiveTimeline().getTimelineOfActions(Sets.newHashSet(HoodieTimeline.COMMIT_ACTION, + HoodieTimeline.ROLLBACK_ACTION, HoodieTimeline.DELTA_COMMIT_ACTION)) + .filterCompletedInstants().lastInstant().get().getTimestamp(); + rtSplits.add(new HoodieRealtimeFileSplit(split, metaClient.getBasePath(), logFilePaths, maxCommitTime)); } catch (IOException e) { throw new HoodieIOException("Error creating hoodie real time split ", e); } }); }); } catch (Exception e) { - throw new HoodieException("Error obtaining data file/log file grouping: " + partitionPath, - e); + throw new HoodieException("Error obtaining data file/log file grouping: " + partitionPath, e); } }); LOG.info("Returning a total splits of " + rtSplits.size()); @@ -167,8 +160,7 @@ public class HoodieParquetRealtimeInputFormat extends HoodieParquetInputFormat i /** * Add a field to the existing fields projected */ - private static Configuration addProjectionField(Configuration conf, String fieldName, - int fieldIndex) { + private static Configuration addProjectionField(Configuration conf, String fieldName, int fieldIndex) { String readColNames = conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, ""); String readColIds = conf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, ""); @@ -186,8 +178,7 @@ public class HoodieParquetRealtimeInputFormat extends HoodieParquetInputFormat i conf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, readColNamesPrefix + fieldName); conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, readColIdsPrefix + fieldIndex); if (LOG.isDebugEnabled()) { - LOG.debug(String.format( - "Adding extra column " + fieldName + ", to enable log merging cols (%s) ids (%s) ", + LOG.debug(String.format("Adding extra column " + fieldName + ", to enable log merging cols (%s) ids (%s) ", conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR), conf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR))); } @@ -197,22 +188,21 @@ public class HoodieParquetRealtimeInputFormat extends HoodieParquetInputFormat i private static synchronized Configuration addRequiredProjectionFields(Configuration configuration) { // Need this to do merge records in HoodieRealtimeRecordReader - configuration = addProjectionField(configuration, HoodieRecord.RECORD_KEY_METADATA_FIELD, - HOODIE_RECORD_KEY_COL_POS); - configuration = addProjectionField(configuration, HoodieRecord.COMMIT_TIME_METADATA_FIELD, - HOODIE_COMMIT_TIME_COL_POS); - configuration = addProjectionField(configuration, HoodieRecord.PARTITION_PATH_METADATA_FIELD, - HOODIE_PARTITION_PATH_COL_POS); + configuration = + addProjectionField(configuration, HoodieRecord.RECORD_KEY_METADATA_FIELD, HOODIE_RECORD_KEY_COL_POS); + configuration = + addProjectionField(configuration, HoodieRecord.COMMIT_TIME_METADATA_FIELD, HOODIE_COMMIT_TIME_COL_POS); + configuration = + addProjectionField(configuration, HoodieRecord.PARTITION_PATH_METADATA_FIELD, HOODIE_PARTITION_PATH_COL_POS); return configuration; } @Override - public RecordReader getRecordReader(final InputSplit split, - final JobConf job, final Reporter reporter) throws IOException { + public RecordReader getRecordReader(final InputSplit split, final JobConf job, + final Reporter reporter) throws IOException { - LOG.info("Before adding Hoodie columns, Projections :" + job - .get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR) + ", Ids :" - + job.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR)); + LOG.info("Before adding Hoodie columns, Projections :" + job.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR) + + ", Ids :" + job.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR)); // Hive (across all versions) fails for queries like select count(`_hoodie_commit_time`) from table; // In this case, the projection fields gets removed. Looking at HiveInputFormat implementation, in some cases @@ -222,13 +212,11 @@ public class HoodieParquetRealtimeInputFormat extends HoodieParquetInputFormat i // TO fix this, hoodie columns are appended late at the time record-reader gets built instead of construction time. this.conf = addRequiredProjectionFields(job); - LOG.info("Creating record reader with readCols :" + job - .get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR) + ", Ids :" - + job.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR)); + LOG.info("Creating record reader with readCols :" + job.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR) + + ", Ids :" + job.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR)); // sanity check Preconditions.checkArgument(split instanceof HoodieRealtimeFileSplit, - "HoodieRealtimeRecordReader can only work on HoodieRealtimeFileSplit and not with " - + split); + "HoodieRealtimeRecordReader can only work on HoodieRealtimeFileSplit and not with " + split); return new HoodieRealtimeRecordReader((HoodieRealtimeFileSplit) split, job, super.getRecordReader(split, job, reporter)); diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeFileSplit.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeFileSplit.java index df75b00cd..2fd5afad3 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeFileSplit.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeFileSplit.java @@ -41,10 +41,9 @@ public class HoodieRealtimeFileSplit extends FileSplit { super(); } - public HoodieRealtimeFileSplit(FileSplit baseSplit, String basePath, List deltaLogFiles, - String maxCommitTime) throws IOException { - super(baseSplit.getPath(), baseSplit.getStart(), baseSplit.getLength(), - baseSplit.getLocations()); + public HoodieRealtimeFileSplit(FileSplit baseSplit, String basePath, List deltaLogFiles, String maxCommitTime) + throws IOException { + super(baseSplit.getPath(), baseSplit.getStart(), baseSplit.getLength(), baseSplit.getLocations()); this.deltaFilePaths = deltaLogFiles; this.maxCommitTime = maxCommitTime; this.basePath = basePath; @@ -100,11 +99,7 @@ public class HoodieRealtimeFileSplit extends FileSplit { @Override public String toString() { - return "HoodieRealtimeFileSplit{" - + "DataPath=" + getPath() - + ", deltaFilePaths=" + deltaFilePaths - + ", maxCommitTime='" + maxCommitTime + '\'' - + ", basePath='" + basePath + '\'' - + '}'; + return "HoodieRealtimeFileSplit{" + "DataPath=" + getPath() + ", deltaFilePaths=" + deltaFilePaths + + ", maxCommitTime='" + maxCommitTime + '\'' + ", basePath='" + basePath + '\'' + '}'; } } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeRecordReader.java index 5e095d868..fb11d3979 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeRecordReader.java @@ -28,8 +28,8 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; /** - * Realtime Record Reader which can do compacted (merge-on-read) record reading or - * unmerged reading (parquet and log files read in parallel) based on job configuration. + * Realtime Record Reader which can do compacted (merge-on-read) record reading or unmerged reading (parquet and log + * files read in parallel) based on job configuration. */ public class HoodieRealtimeRecordReader implements RecordReader { @@ -52,8 +52,8 @@ public class HoodieRealtimeRecordReader implements RecordReader { +class RealtimeCompactedRecordReader extends AbstractRealtimeRecordReader + implements RecordReader { private static final Logger LOG = LogManager.getLogger(AbstractRealtimeRecordReader.class); @@ -51,19 +51,18 @@ class RealtimeCompactedRecordReader extends AbstractRealtimeRecordReader impleme } /** - * Goes through the log files and populates a map with latest version of each key logged, since - * the base split was written. + * Goes through the log files and populates a map with latest version of each key logged, since the base split was + * written. */ private HoodieMergedLogRecordScanner getMergedLogRecordScanner() throws IOException { // NOTE: HoodieCompactedLogRecordScanner will not return records for an in-flight commit // but can return records for completed commits > the commit we are trying to read (if using // readCommit() API) - return new HoodieMergedLogRecordScanner( - FSUtils.getFs(split.getPath().toString(), jobConf), split.getBasePath(), + return new HoodieMergedLogRecordScanner(FSUtils.getFs(split.getPath().toString(), jobConf), split.getBasePath(), split.getDeltaFilePaths(), usesCustomPayload ? getWriterSchema() : getReaderSchema(), split.getMaxCommitTime(), getMaxCompactionMemoryInBytes(), - Boolean.valueOf(jobConf.get(COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP, - DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED)), + Boolean + .valueOf(jobConf.get(COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP, DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED)), false, jobConf.getInt(MAX_DFS_STREAM_BUFFER_SIZE_PROP, DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE), jobConf.get(SPILLABLE_MAP_BASE_PATH_PROP, DEFAULT_SPILLABLE_MAP_BASE_PATH)); } @@ -80,8 +79,7 @@ class RealtimeCompactedRecordReader extends AbstractRealtimeRecordReader impleme // TODO(VC): Right now, we assume all records in log, have a matching base record. (which // would be true until we have a way to index logs too) // return from delta records map if we have some match. - String key = arrayWritable.get()[HoodieParquetRealtimeInputFormat.HOODIE_RECORD_KEY_COL_POS] - .toString(); + String key = arrayWritable.get()[HoodieParquetRealtimeInputFormat.HOODIE_RECORD_KEY_COL_POS].toString(); if (deltaRecordMap.containsKey(key)) { // TODO(NA): Invoke preCombine here by converting arrayWritable to Avro. This is required since the // deltaRecord may not be a full record and needs values of columns from the parquet @@ -106,8 +104,8 @@ class RealtimeCompactedRecordReader extends AbstractRealtimeRecordReader impleme ArrayWritable aWritable = (ArrayWritable) avroToArrayWritable(recordToReturn, getWriterSchema()); Writable[] replaceValue = aWritable.get(); if (LOG.isDebugEnabled()) { - LOG.debug(String.format("key %s, base values: %s, log values: %s", key, - arrayWritableToString(arrayWritable), arrayWritableToString(aWritable))); + LOG.debug(String.format("key %s, base values: %s, log values: %s", key, arrayWritableToString(arrayWritable), + arrayWritableToString(aWritable))); } Writable[] originalValue = arrayWritable.get(); try { diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeUnmergedRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeUnmergedRecordReader.java index 82f81b90e..f7a51e820 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeUnmergedRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeUnmergedRecordReader.java @@ -38,8 +38,8 @@ import org.apache.hudi.common.util.queue.IteratorBasedQueueProducer; import org.apache.hudi.hadoop.RecordReaderValueIterator; import org.apache.hudi.hadoop.SafeParquetRecordReaderWrapper; -class RealtimeUnmergedRecordReader extends AbstractRealtimeRecordReader implements - RecordReader { +class RealtimeUnmergedRecordReader extends AbstractRealtimeRecordReader + implements RecordReader { // Log Record unmerged scanner private final HoodieUnMergedLogRecordScanner logRecordScanner; @@ -60,8 +60,8 @@ class RealtimeUnmergedRecordReader extends AbstractRealtimeRecordReader implemen * Construct a Unmerged record reader that parallely consumes both parquet and log records and buffers for upstream * clients to consume * - * @param split File split - * @param job Job Configuration + * @param split File split + * @param job Job Configuration * @param realReader Parquet Reader */ public RealtimeUnmergedRecordReader(HoodieRealtimeFileSplit split, JobConf job, @@ -74,12 +74,11 @@ class RealtimeUnmergedRecordReader extends AbstractRealtimeRecordReader implemen Option.empty(), x -> x, new DefaultSizeEstimator<>()); // Consumer of this record reader this.iterator = this.executor.getQueue().iterator(); - this.logRecordScanner = new HoodieUnMergedLogRecordScanner( - FSUtils.getFs(split.getPath().toString(), jobConf), split.getBasePath(), - split.getDeltaFilePaths(), getReaderSchema(), split.getMaxCommitTime(), Boolean.valueOf(jobConf - .get(COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP, DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED)), - false, jobConf.getInt(MAX_DFS_STREAM_BUFFER_SIZE_PROP, DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE), - record -> { + this.logRecordScanner = new HoodieUnMergedLogRecordScanner(FSUtils.getFs(split.getPath().toString(), jobConf), + split.getBasePath(), split.getDeltaFilePaths(), getReaderSchema(), split.getMaxCommitTime(), + Boolean + .valueOf(jobConf.get(COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP, DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED)), + false, jobConf.getInt(MAX_DFS_STREAM_BUFFER_SIZE_PROP, DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE), record -> { // convert Hoodie log record to Hadoop AvroWritable and buffer GenericRecord rec = (GenericRecord) record.getData().getInsertValue(getReaderSchema()).get(); ArrayWritable aWritable = (ArrayWritable) avroToArrayWritable(rec, getWriterSchema()); @@ -125,7 +124,7 @@ class RealtimeUnmergedRecordReader extends AbstractRealtimeRecordReader implemen @Override public long getPos() throws IOException { - //TODO: vb - No logical way to represent parallel stream pos in a single long. + // TODO: vb - No logical way to represent parallel stream pos in a single long. // Should we just return invalid (-1). Where is it used ? return 0; } diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/HoodieInputFormatTest.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/HoodieInputFormatTest.java index 73915b7b9..6fd5e3a88 100644 --- a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/HoodieInputFormatTest.java +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/HoodieInputFormatTest.java @@ -84,18 +84,14 @@ public class HoodieInputFormatTest { // Before the commit files = inputFormat.listStatus(jobConf); assertEquals(10, files.length); - ensureFilesInCommit( - "Commit 200 has not been committed. We should not see files from this commit", files, "200", - 0); + ensureFilesInCommit("Commit 200 has not been committed. We should not see files from this commit", files, "200", 0); InputFormatTestUtil.commit(basePath, "200"); files = inputFormat.listStatus(jobConf); assertEquals(10, files.length); - ensureFilesInCommit( - "5 files have been updated to commit 200. We should see 5 files from commit 200 and 5 " - + "files from 100 commit", files, "200", 5); - ensureFilesInCommit( - "5 files have been updated to commit 200. We should see 5 files from commit 100 and 5 " - + "files from 200 commit", files, "100", 5); + ensureFilesInCommit("5 files have been updated to commit 200. We should see 5 files from commit 200 and 5 " + + "files from 100 commit", files, "200", 5); + ensureFilesInCommit("5 files have been updated to commit 200. We should see 5 files from commit 100 and 5 " + + "files from 200 commit", files, "100", 5); } @Test @@ -110,9 +106,8 @@ public class HoodieInputFormatTest { InputFormatTestUtil.setupIncremental(jobConf, "100", 1); FileStatus[] files = inputFormat.listStatus(jobConf); - assertEquals( - "We should exclude commit 100 when returning incremental pull with start commit time as " - + "100", 0, files.length); + assertEquals("We should exclude commit 100 when returning incremental pull with start commit time as " + "100", 0, + files.length); } @Test @@ -140,43 +135,31 @@ public class HoodieInputFormatTest { InputFormatTestUtil.setupIncremental(jobConf, "100", 1); FileStatus[] files = inputFormat.listStatus(jobConf); - assertEquals("Pulling 1 commit from 100, should get us the 5 files committed at 200", 5, - files.length); - ensureFilesInCommit("Pulling 1 commit from 100, should get us the 5 files committed at 200", - files, "200", 5); + assertEquals("Pulling 1 commit from 100, should get us the 5 files committed at 200", 5, files.length); + ensureFilesInCommit("Pulling 1 commit from 100, should get us the 5 files committed at 200", files, "200", 5); InputFormatTestUtil.setupIncremental(jobConf, "100", 3); files = inputFormat.listStatus(jobConf); - assertEquals( - "Pulling 3 commits from 100, should get us the 3 files from 400 commit, 1 file from 300 " - + "commit and 1 file from 200 commit", 5, files.length); - ensureFilesInCommit("Pulling 3 commits from 100, should get us the 3 files from 400 commit", - files, "400", 3); - ensureFilesInCommit("Pulling 3 commits from 100, should get us the 1 files from 300 commit", - files, "300", 1); - ensureFilesInCommit("Pulling 3 commits from 100, should get us the 1 files from 200 commit", - files, "200", 1); + assertEquals("Pulling 3 commits from 100, should get us the 3 files from 400 commit, 1 file from 300 " + + "commit and 1 file from 200 commit", 5, files.length); + ensureFilesInCommit("Pulling 3 commits from 100, should get us the 3 files from 400 commit", files, "400", 3); + ensureFilesInCommit("Pulling 3 commits from 100, should get us the 1 files from 300 commit", files, "300", 1); + ensureFilesInCommit("Pulling 3 commits from 100, should get us the 1 files from 200 commit", files, "200", 1); InputFormatTestUtil.setupIncremental(jobConf, "100", HoodieHiveUtil.MAX_COMMIT_ALL); files = inputFormat.listStatus(jobConf); - assertEquals( - "Pulling all commits from 100, should get us the 1 file from each of 200,300,400,500,400 " - + "commits", 5, files.length); - ensureFilesInCommit("Pulling all commits from 100, should get us the 1 files from 600 commit", - files, "600", 1); - ensureFilesInCommit("Pulling all commits from 100, should get us the 1 files from 500 commit", - files, "500", 1); - ensureFilesInCommit("Pulling all commits from 100, should get us the 1 files from 400 commit", - files, "400", 1); - ensureFilesInCommit("Pulling all commits from 100, should get us the 1 files from 300 commit", - files, "300", 1); - ensureFilesInCommit("Pulling all commits from 100, should get us the 1 files from 200 commit", - files, "200", 1); + assertEquals("Pulling all commits from 100, should get us the 1 file from each of 200,300,400,500,400 " + "commits", + 5, files.length); + ensureFilesInCommit("Pulling all commits from 100, should get us the 1 files from 600 commit", files, "600", 1); + ensureFilesInCommit("Pulling all commits from 100, should get us the 1 files from 500 commit", files, "500", 1); + ensureFilesInCommit("Pulling all commits from 100, should get us the 1 files from 400 commit", files, "400", 1); + ensureFilesInCommit("Pulling all commits from 100, should get us the 1 files from 300 commit", files, "300", 1); + ensureFilesInCommit("Pulling all commits from 100, should get us the 1 files from 200 commit", files, "200", 1); } - //TODO enable this after enabling predicate pushdown + // TODO enable this after enabling predicate pushdown public void testPredicatePushDown() throws IOException { // initial commit Schema schema = InputFormatTestUtil.readSchema("/sample1.avsc"); @@ -186,8 +169,7 @@ public class HoodieInputFormatTest { // Add the paths FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); // check whether we have 10 records at this point - ensureRecordsInCommit("We need to have 10 records at this point for commit " + commit1, commit1, - 10, 10); + ensureRecordsInCommit("We need to have 10 records at this point for commit " + commit1, commit1, 10, 10); // update 2 records in the original parquet file and save it as commit 200 String commit2 = "20160629193623"; @@ -196,27 +178,23 @@ public class HoodieInputFormatTest { InputFormatTestUtil.setupIncremental(jobConf, commit1, 1); // check whether we have 2 records at this point - ensureRecordsInCommit( - "We need to have 2 records that was modified at commit " + commit2 + " and no more", - commit2, 2, 2); + ensureRecordsInCommit("We need to have 2 records that was modified at commit " + commit2 + " and no more", commit2, + 2, 2); // Make sure we have the 10 records if we roll back the stattime InputFormatTestUtil.setupIncremental(jobConf, "0", 2); - ensureRecordsInCommit( - "We need to have 8 records that was modified at commit " + commit1 + " and no more", - commit1, 8, 10); - ensureRecordsInCommit( - "We need to have 2 records that was modified at commit " + commit2 + " and no more", - commit2, 2, 10); + ensureRecordsInCommit("We need to have 8 records that was modified at commit " + commit1 + " and no more", commit1, + 8, 10); + ensureRecordsInCommit("We need to have 2 records that was modified at commit " + commit2 + " and no more", commit2, + 2, 10); } - private void ensureRecordsInCommit(String msg, String commit, int expectedNumberOfRecordsInCommit, - int totalExpected) throws IOException { + private void ensureRecordsInCommit(String msg, String commit, int expectedNumberOfRecordsInCommit, int totalExpected) + throws IOException { int actualCount = 0; int totalCount = 0; InputSplit[] splits = inputFormat.getSplits(jobConf, 1); for (InputSplit split : splits) { - RecordReader recordReader = inputFormat - .getRecordReader(split, jobConf, null); + RecordReader recordReader = inputFormat.getRecordReader(split, jobConf, null); NullWritable key = recordReader.createKey(); ArrayWritable writable = recordReader.createValue(); @@ -234,8 +212,7 @@ public class HoodieInputFormatTest { assertEquals(msg, totalExpected, totalCount); } - public static void ensureFilesInCommit(String msg, FileStatus[] files, String commit, - int expected) { + public static void ensureFilesInCommit(String msg, FileStatus[] files, String commit, int expected) { int count = 0; for (FileStatus file : files) { String commitTs = FSUtils.getCommitTime(file.getPath().getName()); diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/InputFormatTestUtil.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/InputFormatTestUtil.java index 5543193b3..2932a15b6 100644 --- a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/InputFormatTestUtil.java +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/InputFormatTestUtil.java @@ -44,21 +44,20 @@ public class InputFormatTestUtil { private static String TEST_WRITE_TOKEN = "1-0-1"; - public static File prepareDataset(TemporaryFolder basePath, int numberOfFiles, - String commitNumber) throws IOException { + public static File prepareDataset(TemporaryFolder basePath, int numberOfFiles, String commitNumber) + throws IOException { basePath.create(); HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.getRoot().toString()); File partitionPath = basePath.newFolder("2016", "05", "01"); for (int i = 0; i < numberOfFiles; i++) { - File dataFile = new File(partitionPath, - FSUtils.makeDataFileName(commitNumber, TEST_WRITE_TOKEN, "fileid" + i)); + File dataFile = new File(partitionPath, FSUtils.makeDataFileName(commitNumber, TEST_WRITE_TOKEN, "fileid" + i)); dataFile.createNewFile(); } return partitionPath; } - public static void simulateUpdates(File directory, final String originalCommit, - int numberOfFilesUpdated, String newCommit, boolean randomize) throws IOException { + public static void simulateUpdates(File directory, final String originalCommit, int numberOfFilesUpdated, + String newCommit, boolean randomize) throws IOException { List dataFiles = Arrays.asList(directory.listFiles(new FilenameFilter() { @Override public boolean accept(File dir, String name) { @@ -69,8 +68,7 @@ public class InputFormatTestUtil { if (randomize) { Collections.shuffle(dataFiles); } - List toUpdateList = dataFiles - .subList(0, Math.min(numberOfFilesUpdated, dataFiles.size())); + List toUpdateList = dataFiles.subList(0, Math.min(numberOfFilesUpdated, dataFiles.size())); for (File file : toUpdateList) { String fileId = FSUtils.getFileId(file.getName()); File dataFile = new File(directory, FSUtils.makeDataFileName(newCommit, TEST_WRITE_TOKEN, fileId)); @@ -88,18 +86,17 @@ public class InputFormatTestUtil { new File(basePath.getRoot().toString() + "/.hoodie/", commitNumber + ".deltacommit").createNewFile(); } - public static void setupIncremental(JobConf jobConf, String startCommit, - int numberOfCommitsToPull) { - String modePropertyName = String - .format(HoodieHiveUtil.HOODIE_CONSUME_MODE_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME); + public static void setupIncremental(JobConf jobConf, String startCommit, int numberOfCommitsToPull) { + String modePropertyName = + String.format(HoodieHiveUtil.HOODIE_CONSUME_MODE_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME); jobConf.set(modePropertyName, HoodieHiveUtil.INCREMENTAL_SCAN_MODE); - String startCommitTimestampName = String - .format(HoodieHiveUtil.HOODIE_START_COMMIT_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME); + String startCommitTimestampName = + String.format(HoodieHiveUtil.HOODIE_START_COMMIT_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME); jobConf.set(startCommitTimestampName, startCommit); - String maxCommitPulls = String - .format(HoodieHiveUtil.HOODIE_MAX_COMMIT_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME); + String maxCommitPulls = + String.format(HoodieHiveUtil.HOODIE_MAX_COMMIT_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME); jobConf.setInt(maxCommitPulls, numberOfCommitsToPull); } @@ -107,8 +104,8 @@ public class InputFormatTestUtil { return new Schema.Parser().parse(InputFormatTestUtil.class.getResourceAsStream(location)); } - public static File prepareParquetDataset(TemporaryFolder basePath, Schema schema, - int numberOfFiles, int numberOfRecords, String commitNumber) throws IOException { + public static File prepareParquetDataset(TemporaryFolder basePath, Schema schema, int numberOfFiles, + int numberOfRecords, String commitNumber) throws IOException { basePath.create(); HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.getRoot().toString()); File partitionPath = basePath.newFolder("2016", "05", "01"); @@ -117,8 +114,8 @@ public class InputFormatTestUtil { } - public static File prepareSimpleParquetDataset(TemporaryFolder basePath, Schema schema, - int numberOfFiles, int numberOfRecords, String commitNumber) throws Exception { + public static File prepareSimpleParquetDataset(TemporaryFolder basePath, Schema schema, int numberOfFiles, + int numberOfRecords, String commitNumber) throws Exception { basePath.create(); HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.getRoot().toString()); File partitionPath = basePath.newFolder("2016", "05", "01"); @@ -126,8 +123,8 @@ public class InputFormatTestUtil { return partitionPath; } - public static File prepareNonPartitionedParquetDataset(TemporaryFolder baseDir, Schema schema, - int numberOfFiles, int numberOfRecords, String commitNumber) throws IOException { + public static File prepareNonPartitionedParquetDataset(TemporaryFolder baseDir, Schema schema, int numberOfFiles, + int numberOfRecords, String commitNumber) throws IOException { baseDir.create(); HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), baseDir.getRoot().toString()); File basePath = baseDir.getRoot(); @@ -135,17 +132,15 @@ public class InputFormatTestUtil { return basePath; } - private static void createData(Schema schema, - File partitionPath, int numberOfFiles, int numberOfRecords, String commitNumber) - throws IOException { + private static void createData(Schema schema, File partitionPath, int numberOfFiles, int numberOfRecords, + String commitNumber) throws IOException { AvroParquetWriter parquetWriter; for (int i = 0; i < numberOfFiles; i++) { String fileId = FSUtils.makeDataFileName(commitNumber, TEST_WRITE_TOKEN, "fileid" + i); File dataFile = new File(partitionPath, fileId); parquetWriter = new AvroParquetWriter(new Path(dataFile.getAbsolutePath()), schema); try { - for (GenericRecord record : generateAvroRecords(schema, numberOfRecords, commitNumber, - fileId)) { + for (GenericRecord record : generateAvroRecords(schema, numberOfRecords, commitNumber, fileId)) { parquetWriter.write(record); } } finally { @@ -154,9 +149,8 @@ public class InputFormatTestUtil { } } - private static void createSimpleData(Schema schema, - File partitionPath, int numberOfFiles, int numberOfRecords, String commitNumber) - throws Exception { + private static void createSimpleData(Schema schema, File partitionPath, int numberOfFiles, int numberOfRecords, + String commitNumber) throws Exception { AvroParquetWriter parquetWriter; for (int i = 0; i < numberOfFiles; i++) { String fileId = FSUtils.makeDataFileName(commitNumber, "1", "fileid" + i); @@ -179,8 +173,8 @@ public class InputFormatTestUtil { } } - private static Iterable generateAvroRecords(Schema schema, - int numberOfRecords, String commitTime, String fileId) throws IOException { + private static Iterable generateAvroRecords(Schema schema, int numberOfRecords, + String commitTime, String fileId) throws IOException { List records = new ArrayList<>(numberOfRecords); for (int i = 0; i < numberOfRecords; i++) { records.add(SchemaTestUtil.generateAvroRecordFromJson(schema, i, commitTime, fileId)); @@ -198,17 +192,14 @@ public class InputFormatTestUtil { })[0]; String fileId = FSUtils.getFileId(fileToUpdate.getName()); File dataFile = new File(directory, FSUtils.makeDataFileName(newCommit, TEST_WRITE_TOKEN, fileId)); - AvroParquetWriter parquetWriter = new AvroParquetWriter(new Path(dataFile.getAbsolutePath()), - schema); + AvroParquetWriter parquetWriter = new AvroParquetWriter(new Path(dataFile.getAbsolutePath()), schema); try { - for (GenericRecord record : generateAvroRecords(schema, totalNumberOfRecords, originalCommit, - fileId)) { + for (GenericRecord record : generateAvroRecords(schema, totalNumberOfRecords, originalCommit, fileId)) { if (numberOfRecordsToUpdate > 0) { // update this record record.put(HoodieRecord.COMMIT_TIME_METADATA_FIELD, newCommit); String oldSeqNo = (String) record.get(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD); - record.put(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, - oldSeqNo.replace(originalCommit, newCommit)); + record.put(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, oldSeqNo.replace(originalCommit, newCommit)); numberOfRecordsToUpdate--; } parquetWriter.write(record); diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieROTablePathFilter.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieROTablePathFilter.java index 4eff6074a..32c4ba806 100644 --- a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieROTablePathFilter.java +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieROTablePathFilter.java @@ -36,12 +36,12 @@ import org.junit.rules.TemporaryFolder; /** */ public class TestHoodieROTablePathFilter extends HoodieCommonTestHarness { - + @Before public void setUp() throws Exception { initMetaClient(); } - + @Test public void testHoodiePaths() throws IOException { // Create a temp folder as the base path @@ -61,28 +61,26 @@ public class TestHoodieROTablePathFilter extends HoodieCommonTestHarness { Path partitionPath = new Path("file://" + basePath + File.separator + "2017/01/01"); assertTrue("Directories should be accepted", pathFilter.accept(partitionPath)); - assertTrue(pathFilter.accept(new Path( - "file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "001", "f1")))); - assertFalse(pathFilter.accept(new Path( - "file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "001", "f2")))); - assertTrue(pathFilter.accept(new Path( - "file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "001", "f3")))); - assertTrue(pathFilter.accept(new Path( - "file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "002", "f2")))); - assertFalse(pathFilter.accept(new Path( - "file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "003", "f3")))); + assertTrue( + pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "001", "f1")))); + assertFalse( + pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "001", "f2")))); + assertTrue( + pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "001", "f3")))); + assertTrue( + pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "002", "f2")))); + assertFalse( + pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "003", "f3")))); assertFalse(pathFilter.accept(new Path("file:///" + HoodieTestUtils.getCommitFilePath(basePath, "001")))); assertFalse(pathFilter.accept(new Path("file:///" + HoodieTestUtils.getCommitFilePath(basePath, "002")))); - assertFalse(pathFilter.accept(new Path("file:///" - + HoodieTestUtils.getInflightCommitFilePath(basePath, "003")))); - assertFalse(pathFilter.accept(new Path("file:///" - + HoodieTestUtils.getRequestedCompactionFilePath(basePath, "004")))); - assertFalse(pathFilter.accept(new Path("file:///" + basePath + "/" - + HoodieTableMetaClient.METAFOLDER_NAME + "/"))); + assertFalse(pathFilter.accept(new Path("file:///" + HoodieTestUtils.getInflightCommitFilePath(basePath, "003")))); + assertFalse( + pathFilter.accept(new Path("file:///" + HoodieTestUtils.getRequestedCompactionFilePath(basePath, "004")))); + assertFalse(pathFilter.accept(new Path("file:///" + basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/"))); assertFalse(pathFilter.accept(new Path("file:///" + basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME))); - assertFalse(pathFilter.accept(new Path( - "file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "003", "f3")))); + assertFalse( + pathFilter.accept(new Path("file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "003", "f3")))); } diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestRecordReaderValueIterator.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestRecordReaderValueIterator.java index c5f9b824b..3f3f05ebb 100644 --- a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestRecordReaderValueIterator.java +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestRecordReaderValueIterator.java @@ -33,16 +33,9 @@ public class TestRecordReaderValueIterator { @Test public void testValueIterator() { - String[] values = new String[]{ - "hoodie", - "efficient", - "new project", - "realtime", - "spark", - "dataset", - }; - List> entries = IntStream.range(0, values.length) - .boxed().map(idx -> Pair.of(idx, values[idx])).collect(Collectors.toList()); + String[] values = new String[] {"hoodie", "efficient", "new project", "realtime", "spark", "dataset",}; + List> entries = + IntStream.range(0, values.length).boxed().map(idx -> Pair.of(idx, values[idx])).collect(Collectors.toList()); TestRecordReader reader = new TestRecordReader(entries); RecordReaderValueIterator itr = new RecordReaderValueIterator(reader); for (int i = 0; i < values.length; i++) { diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeRecordReaderTest.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeRecordReaderTest.java index 4fad4ae27..1b666288d 100644 --- a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeRecordReaderTest.java +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeRecordReaderTest.java @@ -91,28 +91,22 @@ public class HoodieRealtimeRecordReaderTest { @Rule public TemporaryFolder basePath = new TemporaryFolder(); - private Writer writeLogFile(File partitionDir, Schema schema, String fileId, - String baseCommit, String newCommit, int numberOfRecords) - throws InterruptedException, IOException { + private Writer writeLogFile(File partitionDir, Schema schema, String fileId, String baseCommit, String newCommit, + int numberOfRecords) throws InterruptedException, IOException { return writeDataBlockToLogFile(partitionDir, schema, fileId, baseCommit, newCommit, numberOfRecords, 0, 0); } - private Writer writeRollback(File partitionDir, Schema schema, String fileId, - String baseCommit, String newCommit, String rolledBackInstant, int logVersion) - throws InterruptedException, IOException { - Writer writer = HoodieLogFormat.newWriterBuilder() - .onParentPath(new Path(partitionDir.getPath())) - .withFileId(fileId).overBaseCommit(baseCommit) - .withFs(fs) - .withLogVersion(logVersion) - .withLogWriteToken("1-0-1") + private Writer writeRollback(File partitionDir, Schema schema, String fileId, String baseCommit, String newCommit, + String rolledBackInstant, int logVersion) throws InterruptedException, IOException { + Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(new Path(partitionDir.getPath())).withFileId(fileId) + .overBaseCommit(baseCommit).withFs(fs).withLogVersion(logVersion).withLogWriteToken("1-0-1") .withFileExtension(HoodieLogFile.DELTA_EXTENSION).build(); // generate metadata Map header = Maps.newHashMap(); header.put(HeaderMetadataType.INSTANT_TIME, newCommit); header.put(HeaderMetadataType.TARGET_INSTANT_TIME, rolledBackInstant); - header.put(HeaderMetadataType.COMMAND_BLOCK_TYPE, String.valueOf(HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK - .ordinal())); + header.put(HeaderMetadataType.COMMAND_BLOCK_TYPE, + String.valueOf(HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal())); // if update belongs to an existing log file writer = writer.appendBlock(new HoodieCommandBlock(header)); return writer; @@ -121,12 +115,9 @@ public class HoodieRealtimeRecordReaderTest { private HoodieLogFormat.Writer writeDataBlockToLogFile(File partitionDir, Schema schema, String fileId, String baseCommit, String newCommit, int numberOfRecords, int offset, int logVersion) throws InterruptedException, IOException { - HoodieLogFormat.Writer writer = HoodieLogFormat.newWriterBuilder() - .onParentPath(new Path(partitionDir.getPath())) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId(fileId) - .withLogVersion(logVersion) - .withLogWriteToken("1-0-1") - .overBaseCommit(baseCommit).withFs(fs).build(); + HoodieLogFormat.Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(new Path(partitionDir.getPath())) + .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId(fileId).withLogVersion(logVersion) + .withLogWriteToken("1-0-1").overBaseCommit(baseCommit).withFs(fs).build(); List records = new ArrayList<>(); for (int i = offset; i < offset + numberOfRecords; i++) { records.add(SchemaTestUtil.generateAvroRecordFromJson(schema, i, newCommit, "fileid0")); @@ -141,19 +132,17 @@ public class HoodieRealtimeRecordReaderTest { } private HoodieLogFormat.Writer writeRollbackBlockToLogFile(File partitionDir, Schema schema, String fileId, - String baseCommit, String newCommit, String oldCommit, int logVersion) - throws InterruptedException, IOException { - HoodieLogFormat.Writer writer = HoodieLogFormat.newWriterBuilder() - .onParentPath(new Path(partitionDir.getPath())) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId(fileId) - .overBaseCommit(baseCommit).withLogVersion(logVersion).withFs(fs).build(); + String baseCommit, String newCommit, String oldCommit, int logVersion) throws InterruptedException, IOException { + HoodieLogFormat.Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(new Path(partitionDir.getPath())) + .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId(fileId).overBaseCommit(baseCommit) + .withLogVersion(logVersion).withFs(fs).build(); Map header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, newCommit); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, oldCommit); - header.put(HeaderMetadataType.COMMAND_BLOCK_TYPE, String.valueOf(HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK - .ordinal())); + header.put(HeaderMetadataType.COMMAND_BLOCK_TYPE, + String.valueOf(HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal())); HoodieCommandBlock rollbackBlock = new HoodieCommandBlock(header); writer = writer.appendBlock(rollbackBlock); return writer; @@ -172,12 +161,10 @@ public class HoodieRealtimeRecordReaderTest { public void testReader(boolean partitioned) throws Exception { // initial commit Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema()); - HoodieTestUtils.init(hadoopConf, basePath.getRoot().getAbsolutePath(), - HoodieTableType.MERGE_ON_READ); + HoodieTestUtils.init(hadoopConf, basePath.getRoot().getAbsolutePath(), HoodieTableType.MERGE_ON_READ); String baseInstant = "100"; - File partitionDir = - partitioned ? InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, 100, baseInstant) - : InputFormatTestUtil.prepareNonPartitionedParquetDataset(basePath, schema, 1, 100, baseInstant); + File partitionDir = partitioned ? InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, 100, baseInstant) + : InputFormatTestUtil.prepareNonPartitionedParquetDataset(basePath, schema, 1, 100, baseInstant); InputFormatTestUtil.commit(basePath, baseInstant); // Add the paths FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); @@ -187,9 +174,9 @@ public class HoodieRealtimeRecordReaderTest { logVersionsWithAction.add(Pair.of(HoodieTimeline.DELTA_COMMIT_ACTION, 2)); // TODO: HUDI-154 Once Hive 2.x PR (PR-674) is merged, enable this change // logVersionsWithAction.add(Pair.of(HoodieTimeline.ROLLBACK_ACTION, 3)); - FileSlice fileSlice = new FileSlice(partitioned ? FSUtils.getRelativePartitionPath(new Path( - basePath.getRoot().getAbsolutePath()), new Path(partitionDir.getAbsolutePath())) : "default", - baseInstant, "fileid0"); + FileSlice fileSlice = + new FileSlice(partitioned ? FSUtils.getRelativePartitionPath(new Path(basePath.getRoot().getAbsolutePath()), + new Path(partitionDir.getAbsolutePath())) : "default", baseInstant, "fileid0"); logVersionsWithAction.stream().forEach(logVersionWithAction -> { try { // update files or generate new log file @@ -197,55 +184,53 @@ public class HoodieRealtimeRecordReaderTest { String action = logVersionWithAction.getKey(); int baseInstantTs = Integer.parseInt(baseInstant); String instantTime = String.valueOf(baseInstantTs + logVersion); - String latestInstant = action.equals(HoodieTimeline.ROLLBACK_ACTION) - ? String.valueOf(baseInstantTs + logVersion - 2) : instantTime; + String latestInstant = + action.equals(HoodieTimeline.ROLLBACK_ACTION) ? String.valueOf(baseInstantTs + logVersion - 2) + : instantTime; HoodieLogFormat.Writer writer = null; if (action.equals(HoodieTimeline.ROLLBACK_ACTION)) { - writer = writeRollback(partitionDir, schema, "fileid0", baseInstant, - instantTime, String.valueOf(baseInstantTs + logVersion - 1), logVersion); + writer = writeRollback(partitionDir, schema, "fileid0", baseInstant, instantTime, + String.valueOf(baseInstantTs + logVersion - 1), logVersion); } else { - writer = writeDataBlockToLogFile(partitionDir, schema, "fileid0", baseInstant, - instantTime, 100, 0, logVersion); + writer = + writeDataBlockToLogFile(partitionDir, schema, "fileid0", baseInstant, instantTime, 100, 0, logVersion); } long size = writer.getCurrentSize(); writer.close(); assertTrue("block - size should be > 0", size > 0); - //create a split with baseFile (parquet file written earlier) and new log file(s) + // create a split with baseFile (parquet file written earlier) and new log file(s) fileSlice.addLogFile(writer.getLogFile()); HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit( - new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + baseInstant + ".parquet"), 0, 1, - jobConf), basePath.getRoot().getPath(), - fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()).map(h -> h.getPath().toString()) - .collect(Collectors.toList()), instantTime); + new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + baseInstant + ".parquet"), 0, 1, jobConf), + basePath.getRoot().getPath(), fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()) + .map(h -> h.getPath().toString()).collect(Collectors.toList()), + instantTime); - //create a RecordReader to be used by HoodieRealtimeRecordReader - RecordReader reader = - new MapredParquetInputFormat().getRecordReader( - new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), - jobConf, null); + // create a RecordReader to be used by HoodieRealtimeRecordReader + RecordReader reader = new MapredParquetInputFormat().getRecordReader( + new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), jobConf, null); JobConf jobConf = new JobConf(); List fields = schema.getFields(); String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(",")); - String postions = fields.stream().map(f -> String.valueOf(f.pos())) - .collect(Collectors.joining(",")); + String postions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(",")); jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names); jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions); if (partitioned) { jobConf.set("partition_columns", "datestr"); } - //validate record reader compaction + // validate record reader compaction HoodieRealtimeRecordReader recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader); - //use reader to read base Parquet File and log file, merge in flight and return latest commit - //here all 100 records should be updated, see above + // use reader to read base Parquet File and log file, merge in flight and return latest commit + // here all 100 records should be updated, see above NullWritable key = recordReader.createKey(); ArrayWritable value = recordReader.createValue(); while (recordReader.next(key, value)) { Writable[] values = value.get(); - //check if the record written is with latest commit, here "101" + // check if the record written is with latest commit, here "101" Assert.assertEquals(latestInstant, values[0].toString()); key = recordReader.createKey(); value = recordReader.createValue(); @@ -263,53 +248,48 @@ public class HoodieRealtimeRecordReaderTest { public void testUnMergedReader() throws Exception { // initial commit Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema()); - HoodieTestUtils.init(hadoopConf, basePath.getRoot().getAbsolutePath(), - HoodieTableType.MERGE_ON_READ); + HoodieTestUtils.init(hadoopConf, basePath.getRoot().getAbsolutePath(), HoodieTableType.MERGE_ON_READ); String commitTime = "100"; final int numRecords = 1000; final int firstBatchLastRecordKey = numRecords - 1; final int secondBatchLastRecordKey = 2 * numRecords - 1; - File partitionDir = InputFormatTestUtil - .prepareParquetDataset(basePath, schema, 1, numRecords, commitTime); + File partitionDir = InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, numRecords, commitTime); InputFormatTestUtil.commit(basePath, commitTime); // Add the paths FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); // insert new records to log file String newCommitTime = "101"; - HoodieLogFormat.Writer writer = writeDataBlockToLogFile(partitionDir, schema, "fileid0", commitTime, - newCommitTime, numRecords, numRecords, 0); + HoodieLogFormat.Writer writer = + writeDataBlockToLogFile(partitionDir, schema, "fileid0", commitTime, newCommitTime, numRecords, numRecords, 0); long size = writer.getCurrentSize(); writer.close(); assertTrue("block - size should be > 0", size > 0); - //create a split with baseFile (parquet file written earlier) and new log file(s) + // create a split with baseFile (parquet file written earlier) and new log file(s) String logFilePath = writer.getLogFile().getPath().toString(); HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit( - new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + commitTime + ".parquet"), 0, 1, - jobConf), basePath.getRoot().getPath(), Arrays.asList(logFilePath), newCommitTime); + new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + commitTime + ".parquet"), 0, 1, jobConf), + basePath.getRoot().getPath(), Arrays.asList(logFilePath), newCommitTime); - //create a RecordReader to be used by HoodieRealtimeRecordReader - RecordReader reader = - new MapredParquetInputFormat().getRecordReader( - new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), - jobConf, null); + // create a RecordReader to be used by HoodieRealtimeRecordReader + RecordReader reader = new MapredParquetInputFormat().getRecordReader( + new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), jobConf, null); JobConf jobConf = new JobConf(); List fields = schema.getFields(); String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(",")); - String postions = fields.stream().map(f -> String.valueOf(f.pos())) - .collect(Collectors.joining(",")); + String postions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(",")); jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names); jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions); jobConf.set("partition_columns", "datestr"); // Enable merge skipping. jobConf.set("hoodie.realtime.merge.skip", "true"); - //validate unmerged record reader + // validate unmerged record reader RealtimeUnmergedRecordReader recordReader = new RealtimeUnmergedRecordReader(split, jobConf, reader); - //use reader to read base Parquet File and log file - //here all records should be present. Also ensure log records are in order. + // use reader to read base Parquet File and log file + // here all records should be present. Also ensure log records are in order. NullWritable key = recordReader.createKey(); ArrayWritable value = recordReader.createValue(); int numRecordsAtCommit1 = 0; @@ -347,43 +327,38 @@ public class HoodieRealtimeRecordReaderTest { public void testReaderWithNestedAndComplexSchema() throws Exception { // initial commit Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getComplexEvolvedSchema()); - HoodieTestUtils.init(hadoopConf, basePath.getRoot().getAbsolutePath(), - HoodieTableType.MERGE_ON_READ); + HoodieTestUtils.init(hadoopConf, basePath.getRoot().getAbsolutePath(), HoodieTableType.MERGE_ON_READ); String commitTime = "100"; int numberOfRecords = 100; int numberOfLogRecords = numberOfRecords / 2; - File partitionDir = InputFormatTestUtil - .prepareParquetDataset(basePath, schema, 1, numberOfRecords, commitTime); + File partitionDir = InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, numberOfRecords, commitTime); InputFormatTestUtil.commit(basePath, commitTime); // Add the paths FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); // update files or generate new log file String newCommitTime = "101"; - HoodieLogFormat.Writer writer = writeLogFile(partitionDir, schema, "fileid0", commitTime, - newCommitTime, numberOfLogRecords); + HoodieLogFormat.Writer writer = + writeLogFile(partitionDir, schema, "fileid0", commitTime, newCommitTime, numberOfLogRecords); long size = writer.getCurrentSize(); writer.close(); assertTrue("block - size should be > 0", size > 0); InputFormatTestUtil.deltaCommit(basePath, newCommitTime); - //create a split with baseFile (parquet file written earlier) and new log file(s) + // create a split with baseFile (parquet file written earlier) and new log file(s) String logFilePath = writer.getLogFile().getPath().toString(); HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit( - new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + commitTime + ".parquet"), 0, 1, - jobConf), basePath.getRoot().getPath(), Arrays.asList(logFilePath), newCommitTime); + new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + commitTime + ".parquet"), 0, 1, jobConf), + basePath.getRoot().getPath(), Arrays.asList(logFilePath), newCommitTime); - //create a RecordReader to be used by HoodieRealtimeRecordReader - RecordReader reader = - new MapredParquetInputFormat().getRecordReader( - new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), - jobConf, null); + // create a RecordReader to be used by HoodieRealtimeRecordReader + RecordReader reader = new MapredParquetInputFormat().getRecordReader( + new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), jobConf, null); JobConf jobConf = new JobConf(); List fields = schema.getFields(); String names = fields.stream().map(f -> f.name()).collect(Collectors.joining(",")); - String positions = fields.stream().map(f -> String.valueOf(f.pos())) - .collect(Collectors.joining(",")); + String positions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(",")); jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names); jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, positions); jobConf.set("partition_columns", "datestr"); @@ -401,7 +376,7 @@ public class HoodieRealtimeRecordReaderTest { ++numRecordsRead; Writable[] values = value.get(); String recordCommitTime; - //check if the record written is with latest commit, here "101" + // check if the record written is with latest commit, here "101" if (numRecordsRead > numberOfLogRecords) { recordCommitTime = commitTime; } else { @@ -414,12 +389,10 @@ public class HoodieRealtimeRecordReaderTest { value = recordReader.createValue(); // Assert type STRING - Assert.assertEquals("test value for field: field1", values[5].toString(), - "field" + currentRecordNo); + Assert.assertEquals("test value for field: field1", values[5].toString(), "field" + currentRecordNo); Assert.assertEquals("test value for field: field2", values[6].toString(), "field" + currentRecordNo + recordCommitTimeSuffix); - Assert.assertEquals("test value for field: name", values[7].toString(), - "name" + currentRecordNo); + Assert.assertEquals("test value for field: name", values[7].toString(), "name" + currentRecordNo); // Assert type INT IntWritable intWritable = (IntWritable) values[8]; @@ -459,20 +432,17 @@ public class HoodieRealtimeRecordReaderTest { Assert.assertEquals("test value for field: tags[\"mapItem2\"].item1", ((ArrayWritable) mapItemValue2value).get()[0].toString(), "item2" + currentRecordNo); Assert.assertEquals("test value for field: tags[\"mapItem1\"].item2", - ((ArrayWritable) mapItemValue1value).get()[1].toString(), - "item" + currentRecordNo + recordCommitTimeSuffix); + ((ArrayWritable) mapItemValue1value).get()[1].toString(), "item" + currentRecordNo + recordCommitTimeSuffix); Assert.assertEquals("test value for field: tags[\"mapItem2\"].item2", - ((ArrayWritable) mapItemValue2value).get()[1].toString(), - "item2" + currentRecordNo + recordCommitTimeSuffix); + ((ArrayWritable) mapItemValue2value).get()[1].toString(), "item2" + currentRecordNo + recordCommitTimeSuffix); // Assert type RECORD ArrayWritable recordItem = (ArrayWritable) values[13]; Writable[] nestedRecord = recordItem.get(); - Assert.assertEquals("test value for field: testNestedRecord.isAdmin", - ((BooleanWritable) nestedRecord[0]).get(), false); - Assert - .assertEquals("test value for field: testNestedRecord.userId", nestedRecord[1].toString(), - "UserId" + currentRecordNo + recordCommitTimeSuffix); + Assert.assertEquals("test value for field: testNestedRecord.isAdmin", ((BooleanWritable) nestedRecord[0]).get(), + false); + Assert.assertEquals("test value for field: testNestedRecord.userId", nestedRecord[1].toString(), + "UserId" + currentRecordNo + recordCommitTimeSuffix); // Assert type ARRAY ArrayWritable arrayValue = (ArrayWritable) values[14]; @@ -489,13 +459,12 @@ public class HoodieRealtimeRecordReaderTest { // initial commit List logFilePaths = new ArrayList<>(); Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema()); - HoodieTestUtils.init(hadoopConf, basePath.getRoot().getAbsolutePath(), - HoodieTableType.MERGE_ON_READ); + HoodieTestUtils.init(hadoopConf, basePath.getRoot().getAbsolutePath(), HoodieTableType.MERGE_ON_READ); String commitTime = "100"; int numberOfRecords = 100; int numberOfLogRecords = numberOfRecords / 2; - File partitionDir = InputFormatTestUtil - .prepareSimpleParquetDataset(basePath, schema, 1, numberOfRecords, commitTime); + File partitionDir = + InputFormatTestUtil.prepareSimpleParquetDataset(basePath, schema, 1, numberOfRecords, commitTime); InputFormatTestUtil.commit(basePath, commitTime); // Add the paths FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); @@ -504,8 +473,8 @@ public class HoodieRealtimeRecordReaderTest { // update files and generate new log file but don't commit schema = SchemaTestUtil.getComplexEvolvedSchema(); String newCommitTime = "101"; - HoodieLogFormat.Writer writer = writeDataBlockToLogFile(partitionDir, schema, "fileid0", commitTime, - newCommitTime, numberOfLogRecords, 0, 1); + HoodieLogFormat.Writer writer = + writeDataBlockToLogFile(partitionDir, schema, "fileid0", commitTime, newCommitTime, numberOfLogRecords, 0, 1); long size = writer.getCurrentSize(); logFilePaths.add(writer.getLogFile().getPath().toString()); writer.close(); @@ -513,23 +482,20 @@ public class HoodieRealtimeRecordReaderTest { // write rollback for the previous block in new log file version newCommitTime = "102"; - writer = writeRollbackBlockToLogFile(partitionDir, schema, "fileid0", commitTime, - newCommitTime, "101", 1); + writer = writeRollbackBlockToLogFile(partitionDir, schema, "fileid0", commitTime, newCommitTime, "101", 1); logFilePaths.add(writer.getLogFile().getPath().toString()); writer.close(); assertTrue("block - size should be > 0", size > 0); InputFormatTestUtil.deltaCommit(basePath, newCommitTime); - //create a split with baseFile (parquet file written earlier) and new log file(s) + // create a split with baseFile (parquet file written earlier) and new log file(s) HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit( - new FileSplit(new Path(partitionDir + "/fileid0_1_" + commitTime + ".parquet"), 0, 1, - jobConf), basePath.getRoot().getPath(), logFilePaths, newCommitTime); + new FileSplit(new Path(partitionDir + "/fileid0_1_" + commitTime + ".parquet"), 0, 1, jobConf), + basePath.getRoot().getPath(), logFilePaths, newCommitTime); - //create a RecordReader to be used by HoodieRealtimeRecordReader - RecordReader reader = - new MapredParquetInputFormat().getRecordReader( - new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), - jobConf, null); + // create a RecordReader to be used by HoodieRealtimeRecordReader + RecordReader reader = new MapredParquetInputFormat().getRecordReader( + new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), jobConf, null); JobConf jobConf = new JobConf(); List fields = schema.getFields(); @@ -537,8 +503,7 @@ public class HoodieRealtimeRecordReaderTest { // Try to read all the fields passed by the new schema String names = fields.stream().map(f -> f.name()).collect(Collectors.joining(",")); - String positions = fields.stream().map(f -> String.valueOf(f.pos())) - .collect(Collectors.joining(",")); + String positions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(",")); jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names); jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, positions); jobConf.set("partition_columns", "datestr"); @@ -554,8 +519,7 @@ public class HoodieRealtimeRecordReaderTest { // Try to read all the fields passed by the new schema names = firstSchemaFields.stream().map(f -> f.name()).collect(Collectors.joining(",")); - positions = firstSchemaFields.stream().map(f -> String.valueOf(f.pos())) - .collect(Collectors.joining(",")); + positions = firstSchemaFields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(",")); jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names); jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, positions); jobConf.set("partition_columns", "datestr"); diff --git a/hudi-hive/pom.xml b/hudi-hive/pom.xml index 0af1bfe4e..c552b7052 100644 --- a/hudi-hive/pom.xml +++ b/hudi-hive/pom.xml @@ -26,6 +26,10 @@ hudi-hive jar + + ${project.parent.basedir} + + diff --git a/hudi-hive/src/main/java/org/apache/hudi/hive/HiveSyncConfig.java b/hudi-hive/src/main/java/org/apache/hudi/hive/HiveSyncConfig.java index 2fc3a2ee4..cd700c3a0 100644 --- a/hudi-hive/src/main/java/org/apache/hudi/hive/HiveSyncConfig.java +++ b/hudi-hive/src/main/java/org/apache/hudi/hive/HiveSyncConfig.java @@ -28,8 +28,7 @@ import java.util.List; */ public class HiveSyncConfig implements Serializable { - @Parameter(names = { - "--database"}, description = "name of the target database in Hive", required = true) + @Parameter(names = {"--database"}, description = "name of the target database in Hive", required = true) public String databaseName; @Parameter(names = {"--table"}, description = "name of the target table in Hive", required = true) @@ -44,33 +43,25 @@ public class HiveSyncConfig implements Serializable { @Parameter(names = {"--jdbc-url"}, description = "Hive jdbc connect url", required = true) public String jdbcUrl; - @Parameter(names = { - "--base-path"}, description = "Basepath of hoodie dataset to sync", required = true) + @Parameter(names = {"--base-path"}, description = "Basepath of hoodie dataset to sync", required = true) public String basePath; @Parameter(names = "--partitioned-by", description = "Fields in the schema partitioned by") public List partitionFields = new ArrayList<>(); - @Parameter(names = "--partition-value-extractor", description = "Class which implements " - + "PartitionValueExtractor " - + "to extract the partition " - + "values from HDFS path") - public String partitionValueExtractorClass = SlashEncodedDayPartitionValueExtractor.class - .getName(); + @Parameter(names = "--partition-value-extractor", description = "Class which implements " + "PartitionValueExtractor " + + "to extract the partition " + "values from HDFS path") + public String partitionValueExtractorClass = SlashEncodedDayPartitionValueExtractor.class.getName(); - @Parameter(names = { - "--assume-date-partitioning"}, description = "Assume standard yyyy/mm/dd partitioning, this" - + " exists to support " - + "backward compatibility. If" - + " you use hoodie 0.3.x, do " - + "not set this parameter") + @Parameter(names = {"--assume-date-partitioning"}, description = "Assume standard yyyy/mm/dd partitioning, this" + + " exists to support " + "backward compatibility. If" + " you use hoodie 0.3.x, do " + "not set this parameter") public Boolean assumeDatePartitioning = false; - @Parameter(names = { - "--use-pre-apache-input-format"}, description = "Use InputFormat under com.uber.hoodie package " - + "instead of org.apache.hudi package. Use this when you are in the process of migrating from " - + "com.uber.hoodie to org.apache.hudi. Stop using this after you migrated the table definition to " - + "org.apache.hudi input format.") + @Parameter(names = {"--use-pre-apache-input-format"}, + description = "Use InputFormat under com.uber.hoodie package " + + "instead of org.apache.hudi package. Use this when you are in the process of migrating from " + + "com.uber.hoodie to org.apache.hudi. Stop using this after you migrated the table definition to " + + "org.apache.hudi input format.") public Boolean usePreApacheInputFormat = false; @Parameter(names = {"--use-jdbc"}, description = "Hive jdbc connect url") @@ -96,19 +87,10 @@ public class HiveSyncConfig implements Serializable { @Override public String toString() { - return "HiveSyncConfig{" - + "databaseName='" + databaseName + '\'' - + ", tableName='" + tableName + '\'' - + ", hiveUser='" + hiveUser + '\'' - + ", hivePass='" + hivePass + '\'' - + ", jdbcUrl='" + jdbcUrl + '\'' - + ", basePath='" + basePath + '\'' - + ", partitionFields=" + partitionFields - + ", partitionValueExtractorClass='" + partitionValueExtractorClass + '\'' - + ", assumeDatePartitioning=" + assumeDatePartitioning - + ", usePreApacheInputFormat=" + usePreApacheInputFormat - + ", useJdbc=" + useJdbc - + ", help=" + help - + '}'; + return "HiveSyncConfig{" + "databaseName='" + databaseName + '\'' + ", tableName='" + tableName + '\'' + + ", hiveUser='" + hiveUser + '\'' + ", hivePass='" + hivePass + '\'' + ", jdbcUrl='" + jdbcUrl + '\'' + + ", basePath='" + basePath + '\'' + ", partitionFields=" + partitionFields + ", partitionValueExtractorClass='" + + partitionValueExtractorClass + '\'' + ", assumeDatePartitioning=" + assumeDatePartitioning + + ", usePreApacheInputFormat=" + usePreApacheInputFormat + ", useJdbc=" + useJdbc + ", help=" + help + '}'; } } diff --git a/hudi-hive/src/main/java/org/apache/hudi/hive/HiveSyncTool.java b/hudi-hive/src/main/java/org/apache/hudi/hive/HiveSyncTool.java index 02afff622..fa2c8b6c5 100644 --- a/hudi-hive/src/main/java/org/apache/hudi/hive/HiveSyncTool.java +++ b/hudi-hive/src/main/java/org/apache/hudi/hive/HiveSyncTool.java @@ -43,11 +43,10 @@ import org.apache.parquet.schema.MessageType; /** * Tool to sync a hoodie HDFS dataset with a hive metastore table. Either use it as a api - * HiveSyncTool.syncHoodieTable(HiveSyncConfig) or as a command line java -cp hoodie-hive.jar - * HiveSyncTool [args] + * HiveSyncTool.syncHoodieTable(HiveSyncConfig) or as a command line java -cp hoodie-hive.jar HiveSyncTool [args] *

    - * This utility will get the schema from the latest commit and will sync hive table schema Also this - * will sync the partitions incrementally (all the partitions modified since the last commit) + * This utility will get the schema from the latest commit and will sync hive table schema Also this will sync the + * partitions incrementally (all the partitions modified since the last commit) */ @SuppressWarnings("WeakerAccess") public class HiveSyncTool { @@ -68,12 +67,12 @@ public class HiveSyncTool { syncHoodieTable(false); break; case MERGE_ON_READ: - //sync a RO table for MOR + // sync a RO table for MOR syncHoodieTable(false); String originalTableName = cfg.tableName; - //TODO : Make realtime table registration optional using a config param + // TODO : Make realtime table registration optional using a config param cfg.tableName = cfg.tableName + SUFFIX_REALTIME_TABLE; - //sync a RT table for MOR + // sync a RT table for MOR syncHoodieTable(true); cfg.tableName = originalTableName; break; @@ -85,8 +84,8 @@ public class HiveSyncTool { } private void syncHoodieTable(boolean isRealTime) throws ClassNotFoundException { - LOG.info("Trying to sync hoodie table " + cfg.tableName + " with base path " - + hoodieHiveClient.getBasePath() + " of type " + hoodieHiveClient.getTableType()); + LOG.info("Trying to sync hoodie table " + cfg.tableName + " with base path " + hoodieHiveClient.getBasePath() + + " of type " + hoodieHiveClient.getTableType()); // Check if the necessary table exists boolean tableExists = hoodieHiveClient.doesTableExist(); @@ -102,8 +101,7 @@ public class HiveSyncTool { lastCommitTimeSynced = hoodieHiveClient.getLastCommitTimeSynced(); } LOG.info("Last commit time synced was found to be " + lastCommitTimeSynced.orElse("null")); - List writtenPartitionsSince = hoodieHiveClient - .getPartitionsWrittenToSince(lastCommitTimeSynced); + List writtenPartitionsSince = hoodieHiveClient.getPartitionsWrittenToSince(lastCommitTimeSynced); LOG.info("Storage partitions scan complete. Found " + writtenPartitionsSince.size()); // Sync the partitions if needed syncPartitions(writtenPartitionsSince); @@ -113,8 +111,8 @@ public class HiveSyncTool { } /** - * Get the latest schema from the last commit and check if its in sync with the hive table schema. - * If not, evolves the table schema. + * Get the latest schema from the last commit and check if its in sync with the hive table schema. If not, evolves the + * table schema. * * @param tableExists - does table exist * @param schema - extracted schema @@ -129,8 +127,8 @@ public class HiveSyncTool { String inputFormatClassName = cfg.usePreApacheInputFormat ? com.uber.hoodie.hadoop.HoodieInputFormat.class.getName() : HoodieParquetInputFormat.class.getName(); - hoodieHiveClient.createTable(schema, inputFormatClassName, - MapredParquetOutputFormat.class.getName(), ParquetHiveSerDe.class.getName()); + hoodieHiveClient.createTable(schema, inputFormatClassName, MapredParquetOutputFormat.class.getName(), + ParquetHiveSerDe.class.getName()); } else { // Custom serde will not work with ALTER TABLE REPLACE COLUMNS // https://github.com/apache/hive/blob/release-1.1.0/ql/src/java/org/apache/hadoop/hive @@ -138,14 +136,13 @@ public class HiveSyncTool { String inputFormatClassName = cfg.usePreApacheInputFormat ? com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat.class.getName() : HoodieParquetRealtimeInputFormat.class.getName(); - hoodieHiveClient.createTable(schema, inputFormatClassName, - MapredParquetOutputFormat.class.getName(), ParquetHiveSerDe.class.getName()); + hoodieHiveClient.createTable(schema, inputFormatClassName, MapredParquetOutputFormat.class.getName(), + ParquetHiveSerDe.class.getName()); } } else { // Check if the dataset schema has evolved Map tableSchema = hoodieHiveClient.getTableSchema(); - SchemaDifference schemaDiff = SchemaUtil.getSchemaDifference(schema, tableSchema, - cfg.partitionFields); + SchemaDifference schemaDiff = SchemaUtil.getSchemaDifference(schema, tableSchema, cfg.partitionFields); if (!schemaDiff.isEmpty()) { LOG.info("Schema difference found for " + cfg.tableName); hoodieHiveClient.updateTableDefinition(schema); @@ -157,14 +154,14 @@ public class HiveSyncTool { /** - * Syncs the list of storage parititions passed in (checks if the partition is in hive, if not - * adds it or if the partition path does not match, it updates the partition path) + * Syncs the list of storage parititions passed in (checks if the partition is in hive, if not adds it or if the + * partition path does not match, it updates the partition path) */ private void syncPartitions(List writtenPartitionsSince) { try { List hivePartitions = hoodieHiveClient.scanTablePartitions(); - List partitionEvents = hoodieHiveClient.getPartitionEvents(hivePartitions, - writtenPartitionsSince); + List partitionEvents = + hoodieHiveClient.getPartitionEvents(hivePartitions, writtenPartitionsSince); List newPartitions = filterPartitions(partitionEvents, PartitionEventType.ADD); LOG.info("New Partitions " + newPartitions); hoodieHiveClient.addPartitionsToTable(newPartitions); diff --git a/hudi-hive/src/main/java/org/apache/hudi/hive/HoodieHiveClient.java b/hudi-hive/src/main/java/org/apache/hudi/hive/HoodieHiveClient.java index 771f45acd..d1882e21d 100644 --- a/hudi-hive/src/main/java/org/apache/hudi/hive/HoodieHiveClient.java +++ b/hudi-hive/src/main/java/org/apache/hudi/hive/HoodieHiveClient.java @@ -112,12 +112,11 @@ public class HoodieHiveClient { } try { - this.partitionValueExtractor = (PartitionValueExtractor) Class.forName( - cfg.partitionValueExtractorClass).newInstance(); + this.partitionValueExtractor = + (PartitionValueExtractor) Class.forName(cfg.partitionValueExtractorClass).newInstance(); } catch (Exception e) { throw new HoodieHiveSyncException( - "Failed to initialize PartitionValueExtractor class " + cfg.partitionValueExtractorClass, - e); + "Failed to initialize PartitionValueExtractor class " + cfg.partitionValueExtractorClass, e); } activeTimeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); @@ -157,28 +156,26 @@ public class HoodieHiveClient { private String constructAddPartitions(List partitions) { StringBuilder alterSQL = new StringBuilder("ALTER TABLE "); - alterSQL.append(syncConfig.databaseName).append(".").append(syncConfig.tableName) - .append(" ADD IF NOT EXISTS "); + alterSQL.append(syncConfig.databaseName).append(".").append(syncConfig.tableName).append(" ADD IF NOT EXISTS "); for (String partition : partitions) { String partitionClause = getPartitionClause(partition); String fullPartitionPath = FSUtils.getPartitionPath(syncConfig.basePath, partition).toString(); - alterSQL.append(" PARTITION (").append(partitionClause).append(") LOCATION '") - .append(fullPartitionPath).append("' "); + alterSQL.append(" PARTITION (").append(partitionClause).append(") LOCATION '").append(fullPartitionPath) + .append("' "); } return alterSQL.toString(); } /** * Generate Hive Partition from partition values + * * @param partition Partition path * @return */ private String getPartitionClause(String partition) { - List partitionValues = partitionValueExtractor - .extractPartitionValuesInPath(partition); + List partitionValues = partitionValueExtractor.extractPartitionValuesInPath(partition); Preconditions.checkArgument(syncConfig.partitionFields.size() == partitionValues.size(), - "Partition key parts " + syncConfig.partitionFields - + " does not match with partition values " + partitionValues + "Partition key parts " + syncConfig.partitionFields + " does not match with partition values " + partitionValues + ". Check partition strategy. "); List partBuilder = new ArrayList<>(); for (int i = 0; i < syncConfig.partitionFields.size(); i++) { @@ -204,17 +201,16 @@ public class HoodieHiveClient { } /** - * Iterate over the storage partitions and find if there are any new partitions that need to be - * added or updated. Generate a list of PartitionEvent based on the changes required. + * Iterate over the storage partitions and find if there are any new partitions that need to be added or updated. + * Generate a list of PartitionEvent based on the changes required. */ - List getPartitionEvents(List tablePartitions, - List partitionStoragePartitions) { + List getPartitionEvents(List tablePartitions, List partitionStoragePartitions) { Map paths = Maps.newHashMap(); for (Partition tablePartition : tablePartitions) { List hivePartitionValues = tablePartition.getValues(); Collections.sort(hivePartitionValues); - String fullTablePartitionPath = Path.getPathWithoutSchemeAndAuthority( - new Path(tablePartition.getSd().getLocation())).toUri().getPath(); + String fullTablePartitionPath = + Path.getPathWithoutSchemeAndAuthority(new Path(tablePartition.getSd().getLocation())).toUri().getPath(); paths.put(String.join(", ", hivePartitionValues), fullTablePartitionPath); } @@ -222,8 +218,7 @@ public class HoodieHiveClient { for (String storagePartition : partitionStoragePartitions) { String fullStoragePartitionPath = FSUtils.getPartitionPath(syncConfig.basePath, storagePartition).toString(); // Check if the partition values or if hdfs path is the same - List storagePartitionValues = partitionValueExtractor - .extractPartitionValuesInPath(storagePartition); + List storagePartitionValues = partitionValueExtractor.extractPartitionValuesInPath(storagePartition); Collections.sort(storagePartitionValues); if (!storagePartitionValues.isEmpty()) { String storageValue = String.join(", ", storagePartitionValues); @@ -250,11 +245,9 @@ public class HoodieHiveClient { String newSchemaStr = SchemaUtil.generateSchemaString(newSchema, syncConfig.partitionFields); // Cascade clause should not be present for non-partitioned tables String cascadeClause = syncConfig.partitionFields.size() > 0 ? " cascade" : ""; - StringBuilder sqlBuilder = new StringBuilder("ALTER TABLE ").append("`") - .append(syncConfig.databaseName).append(".") - .append(syncConfig.tableName).append("`") - .append(" REPLACE COLUMNS(").append(newSchemaStr).append(" )") - .append(cascadeClause); + StringBuilder sqlBuilder = new StringBuilder("ALTER TABLE ").append("`").append(syncConfig.databaseName) + .append(".").append(syncConfig.tableName).append("`").append(" REPLACE COLUMNS(").append(newSchemaStr) + .append(" )").append(cascadeClause); LOG.info("Updating table definition with " + sqlBuilder); updateHiveSQL(sqlBuilder.toString()); } catch (IOException e) { @@ -262,12 +255,10 @@ public class HoodieHiveClient { } } - void createTable(MessageType storageSchema, String inputFormatClass, String outputFormatClass, - String serdeClass) { + void createTable(MessageType storageSchema, String inputFormatClass, String outputFormatClass, String serdeClass) { try { - String createSQLQuery = SchemaUtil - .generateCreateDDL(storageSchema, syncConfig, inputFormatClass, - outputFormatClass, serdeClass); + String createSQLQuery = + SchemaUtil.generateCreateDDL(storageSchema, syncConfig, inputFormatClass, outputFormatClass, serdeClass); LOG.info("Creating table with " + createSQLQuery); updateHiveSQL(createSQLQuery); } catch (IOException e) { @@ -288,8 +279,7 @@ public class HoodieHiveClient { ResultSet result = null; try { DatabaseMetaData databaseMetaData = connection.getMetaData(); - result = databaseMetaData - .getColumns(null, syncConfig.databaseName, syncConfig.tableName, null); + result = databaseMetaData.getColumns(null, syncConfig.databaseName, syncConfig.tableName, null); while (result.next()) { String columnName = result.getString(4); String columnType = result.getString(6); @@ -302,8 +292,7 @@ public class HoodieHiveClient { } return schema; } catch (SQLException e) { - throw new HoodieHiveSyncException("Failed to get table schema for " + syncConfig.tableName, - e); + throw new HoodieHiveSyncException("Failed to get table schema for " + syncConfig.tableName, e); } finally { closeQuietly(result, null); } @@ -318,11 +307,11 @@ public class HoodieHiveClient { // get the Schema of the table. final long start = System.currentTimeMillis(); Table table = this.client.getTable(syncConfig.databaseName, syncConfig.tableName); - Map partitionKeysMap = table.getPartitionKeys().stream() - .collect(Collectors.toMap(f -> f.getName(), f -> f.getType().toUpperCase())); + Map partitionKeysMap = + table.getPartitionKeys().stream().collect(Collectors.toMap(f -> f.getName(), f -> f.getType().toUpperCase())); - Map columnsMap = table.getSd().getCols().stream() - .collect(Collectors.toMap(f -> f.getName(), f -> f.getType().toUpperCase())); + Map columnsMap = + table.getSd().getCols().stream().collect(Collectors.toMap(f -> f.getName(), f -> f.getType().toUpperCase())); Map schema = new HashMap<>(); schema.putAll(columnsMap); @@ -336,9 +325,8 @@ public class HoodieHiveClient { } /** - * Gets the schema for a hoodie dataset. Depending on the type of table, read from any file - * written in the latest commit. We will assume that the schema has not changed within a single - * atomic write. + * Gets the schema for a hoodie dataset. Depending on the type of table, read from any file written in the latest + * commit. We will assume that the schema has not changed within a single atomic write. * * @return Parquet schema for this dataset */ @@ -349,57 +337,49 @@ public class HoodieHiveClient { case COPY_ON_WRITE: // If this is COW, get the last commit and read the schema from a file written in the // last commit - HoodieInstant lastCommit = activeTimeline.lastInstant().orElseThrow( - () -> new InvalidDatasetException(syncConfig.basePath)); - HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes( - activeTimeline.getInstantDetails(lastCommit).get(), HoodieCommitMetadata.class); - String filePath = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values() - .stream().findAny().orElseThrow(() -> new IllegalArgumentException( - "Could not find any data file written for commit " + lastCommit - + ", could not get schema for dataset " + metaClient.getBasePath() - + ", Metadata :" + commitMetadata)); + HoodieInstant lastCommit = + activeTimeline.lastInstant().orElseThrow(() -> new InvalidDatasetException(syncConfig.basePath)); + HoodieCommitMetadata commitMetadata = HoodieCommitMetadata + .fromBytes(activeTimeline.getInstantDetails(lastCommit).get(), HoodieCommitMetadata.class); + String filePath = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny() + .orElseThrow(() -> new IllegalArgumentException("Could not find any data file written for commit " + + lastCommit + ", could not get schema for dataset " + metaClient.getBasePath() + ", Metadata :" + + commitMetadata)); return readSchemaFromDataFile(new Path(filePath)); case MERGE_ON_READ: // If this is MOR, depending on whether the latest commit is a delta commit or // compaction commit // Get a datafile written and get the schema from that file - Option lastCompactionCommit = metaClient.getActiveTimeline() - .getCommitTimeline() - .filterCompletedInstants() - .lastInstant(); + Option lastCompactionCommit = + metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants().lastInstant(); LOG.info("Found the last compaction commit as " + lastCompactionCommit); Option lastDeltaCommit; if (lastCompactionCommit.isPresent()) { - lastDeltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline() - .filterCompletedInstants() - .findInstantsAfter(lastCompactionCommit.get().getTimestamp(), - Integer.MAX_VALUE).lastInstant(); + lastDeltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().filterCompletedInstants() + .findInstantsAfter(lastCompactionCommit.get().getTimestamp(), Integer.MAX_VALUE).lastInstant(); } else { - lastDeltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline() - .filterCompletedInstants().lastInstant(); + lastDeltaCommit = + metaClient.getActiveTimeline().getDeltaCommitTimeline().filterCompletedInstants().lastInstant(); } LOG.info("Found the last delta commit " + lastDeltaCommit); if (lastDeltaCommit.isPresent()) { HoodieInstant lastDeltaInstant = lastDeltaCommit.get(); // read from the log file wrote - commitMetadata = HoodieCommitMetadata.fromBytes( - activeTimeline.getInstantDetails(lastDeltaInstant).get(), HoodieCommitMetadata.class); + commitMetadata = HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(lastDeltaInstant).get(), + HoodieCommitMetadata.class); Pair filePathWithFormat = - commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values() - .stream().filter(s -> s.contains(HoodieLogFile.DELTA_EXTENSION)) - .findAny().map(f -> Pair.of(f, HoodieFileFormat.HOODIE_LOG)) - .orElseGet(() -> { + commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream() + .filter(s -> s.contains(HoodieLogFile.DELTA_EXTENSION)).findAny() + .map(f -> Pair.of(f, HoodieFileFormat.HOODIE_LOG)).orElseGet(() -> { // No Log files in Delta-Commit. Check if there are any parquet files return commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream() .filter(s -> s.contains((metaClient.getTableConfig().getROFileFormat().getFileExtension()))) - .findAny() - .map(f -> Pair.of(f, HoodieFileFormat.PARQUET)).orElseThrow(() -> { - return new IllegalArgumentException( - "Could not find any data file written for commit " + lastDeltaInstant - + ", could not get schema for dataset " + metaClient.getBasePath() - + ", CommitMetadata :" + commitMetadata); + .findAny().map(f -> Pair.of(f, HoodieFileFormat.PARQUET)).orElseThrow(() -> { + return new IllegalArgumentException("Could not find any data file written for commit " + + lastDeltaInstant + ", could not get schema for dataset " + metaClient.getBasePath() + + ", CommitMetadata :" + commitMetadata); }); }); switch (filePathWithFormat.getRight()) { @@ -419,8 +399,7 @@ public class HoodieHiveClient { throw new InvalidDatasetException(syncConfig.basePath); } } catch (IOException e) { - throw new HoodieHiveSyncException("Failed to get dataset schema for " + syncConfig.tableName, - e); + throw new HoodieHiveSyncException("Failed to get dataset schema for " + syncConfig.tableName, e); } } @@ -428,20 +407,16 @@ public class HoodieHiveClient { * Read schema from a data file from the last compaction commit done. */ @SuppressWarnings("OptionalUsedAsFieldOrParameterType") - private MessageType readSchemaFromLastCompaction(Option lastCompactionCommitOpt) - throws IOException { - HoodieInstant lastCompactionCommit = lastCompactionCommitOpt.orElseThrow( - () -> new HoodieHiveSyncException( - "Could not read schema from last compaction, no compaction commits found on path " - + syncConfig.basePath)); + private MessageType readSchemaFromLastCompaction(Option lastCompactionCommitOpt) throws IOException { + HoodieInstant lastCompactionCommit = lastCompactionCommitOpt.orElseThrow(() -> new HoodieHiveSyncException( + "Could not read schema from last compaction, no compaction commits found on path " + syncConfig.basePath)); // Read from the compacted file wrote - HoodieCommitMetadata compactionMetadata = HoodieCommitMetadata.fromBytes( - activeTimeline.getInstantDetails(lastCompactionCommit).get(), HoodieCommitMetadata.class); - String filePath = compactionMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values() - .stream().findAny().orElseThrow(() -> new IllegalArgumentException( - "Could not find any data file written for compaction " + lastCompactionCommit - + ", could not get schema for dataset " + metaClient.getBasePath())); + HoodieCommitMetadata compactionMetadata = HoodieCommitMetadata + .fromBytes(activeTimeline.getInstantDetails(lastCompactionCommit).get(), HoodieCommitMetadata.class); + String filePath = compactionMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny() + .orElseThrow(() -> new IllegalArgumentException("Could not find any data file written for compaction " + + lastCompactionCommit + ", could not get schema for dataset " + metaClient.getBasePath())); return readSchemaFromDataFile(new Path(filePath)); } @@ -449,8 +424,8 @@ public class HoodieHiveClient { * Read the schema from the log file on path */ @SuppressWarnings("OptionalUsedAsFieldOrParameterType") - private MessageType readSchemaFromLogFile(Option lastCompactionCommitOpt, - Path path) throws IOException { + private MessageType readSchemaFromLogFile(Option lastCompactionCommitOpt, Path path) + throws IOException { MessageType messageType = SchemaUtil.readSchemaFromLogFile(fs, path); // Fall back to read the schema from last compaction if (messageType == null) { @@ -469,8 +444,8 @@ public class HoodieHiveClient { throw new IllegalArgumentException( "Failed to read schema from data file " + parquetFilePath + ". File does not exist."); } - ParquetMetadata fileFooter = ParquetFileReader.readFooter(fs.getConf(), parquetFilePath, - ParquetMetadataConverter.NO_FILTER); + ParquetMetadata fileFooter = + ParquetFileReader.readFooter(fs.getConf(), parquetFilePath, ParquetMetadataConverter.NO_FILTER); return fileFooter.getFileMetaData().getSchema(); } @@ -481,8 +456,7 @@ public class HoodieHiveClient { try { return client.tableExists(syncConfig.databaseName, syncConfig.tableName); } catch (TException e) { - throw new HoodieHiveSyncException("Failed to check if table exists " + syncConfig.tableName, - e); + throw new HoodieHiveSyncException("Failed to check if table exists " + syncConfig.tableName, e); } } @@ -623,11 +597,9 @@ public class HoodieHiveClient { // Get the last commit time from the TBLproperties try { Table database = client.getTable(syncConfig.databaseName, syncConfig.tableName); - return Option.ofNullable( - database.getParameters().getOrDefault(HOODIE_LAST_COMMIT_TIME_SYNC, null)); + return Option.ofNullable(database.getParameters().getOrDefault(HOODIE_LAST_COMMIT_TIME_SYNC, null)); } catch (Exception e) { - throw new HoodieHiveSyncException( - "Failed to get the last commit time synced from the database", e); + throw new HoodieHiveSyncException("Failed to get the last commit time synced from the database", e); } } @@ -650,26 +622,21 @@ public class HoodieHiveClient { if (!lastCommitTimeSynced.isPresent()) { LOG.info("Last commit time synced is not known, listing all partitions in " + syncConfig.basePath + ",FS :" + fs); try { - return FSUtils.getAllPartitionPaths(fs, syncConfig.basePath, - syncConfig.assumeDatePartitioning); + return FSUtils.getAllPartitionPaths(fs, syncConfig.basePath, syncConfig.assumeDatePartitioning); } catch (IOException e) { throw new HoodieIOException("Failed to list all partitions in " + syncConfig.basePath, e); } } else { - LOG.info("Last commit time synced is " + lastCommitTimeSynced.get() - + ", Getting commits since then"); + LOG.info("Last commit time synced is " + lastCommitTimeSynced.get() + ", Getting commits since then"); - HoodieTimeline timelineToSync = activeTimeline.findInstantsAfter(lastCommitTimeSynced.get(), - Integer.MAX_VALUE); + HoodieTimeline timelineToSync = activeTimeline.findInstantsAfter(lastCommitTimeSynced.get(), Integer.MAX_VALUE); return timelineToSync.getInstants().map(s -> { try { return HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(s).get(), HoodieCommitMetadata.class); } catch (IOException e) { - throw new HoodieIOException( - "Failed to get partitions written since " + lastCommitTimeSynced, e); + throw new HoodieIOException("Failed to get partitions written since " + lastCommitTimeSynced, e); } - }).flatMap(s -> s.getPartitionToWriteStats().keySet().stream()).distinct() - .collect(Collectors.toList()); + }).flatMap(s -> s.getPartitionToWriteStats().keySet().stream()).distinct().collect(Collectors.toList()); } } @@ -685,8 +652,7 @@ public class HoodieHiveClient { table.putToParameters(HOODIE_LAST_COMMIT_TIME_SYNC, lastCommitSynced); client.alter_table(syncConfig.databaseName, syncConfig.tableName, table); } catch (Exception e) { - throw new HoodieHiveSyncException( - "Failed to get update last commit time synced to " + lastCommitSynced, e); + throw new HoodieHiveSyncException("Failed to get update last commit time synced to " + lastCommitSynced, e); } } @@ -697,8 +663,7 @@ public class HoodieHiveClient { static class PartitionEvent { public enum PartitionEventType { - ADD, - UPDATE + ADD, UPDATE } PartitionEventType eventType; @@ -717,4 +682,4 @@ public class HoodieHiveClient { return new PartitionEvent(PartitionEventType.UPDATE, storagePartition); } } -} \ No newline at end of file +} diff --git a/hudi-hive/src/main/java/org/apache/hudi/hive/MultiPartKeysValueExtractor.java b/hudi-hive/src/main/java/org/apache/hudi/hive/MultiPartKeysValueExtractor.java index 9701e5b4f..d23250a21 100644 --- a/hudi-hive/src/main/java/org/apache/hudi/hive/MultiPartKeysValueExtractor.java +++ b/hudi-hive/src/main/java/org/apache/hudi/hive/MultiPartKeysValueExtractor.java @@ -34,8 +34,7 @@ public class MultiPartKeysValueExtractor implements PartitionValueExtractor { return Arrays.stream(splits).map(s -> { if (s.contains("=")) { String[] moreSplit = s.split("="); - Preconditions.checkArgument(moreSplit.length == 2, - "Partition Field (" + s + ") not in expected format"); + Preconditions.checkArgument(moreSplit.length == 2, "Partition Field (" + s + ") not in expected format"); return moreSplit[1]; } return s; diff --git a/hudi-hive/src/main/java/org/apache/hudi/hive/NonPartitionedExtractor.java b/hudi-hive/src/main/java/org/apache/hudi/hive/NonPartitionedExtractor.java index 860044e7a..e122bbd82 100644 --- a/hudi-hive/src/main/java/org/apache/hudi/hive/NonPartitionedExtractor.java +++ b/hudi-hive/src/main/java/org/apache/hudi/hive/NonPartitionedExtractor.java @@ -24,7 +24,7 @@ import java.util.List; /** * Extractor for Non-partitioned hive tables */ -public class NonPartitionedExtractor implements PartitionValueExtractor { +public class NonPartitionedExtractor implements PartitionValueExtractor { @Override public List extractPartitionValuesInPath(String partitionPath) { diff --git a/hudi-hive/src/main/java/org/apache/hudi/hive/PartitionValueExtractor.java b/hudi-hive/src/main/java/org/apache/hudi/hive/PartitionValueExtractor.java index 76a44d723..f4820e316 100644 --- a/hudi-hive/src/main/java/org/apache/hudi/hive/PartitionValueExtractor.java +++ b/hudi-hive/src/main/java/org/apache/hudi/hive/PartitionValueExtractor.java @@ -22,12 +22,10 @@ import java.io.Serializable; import java.util.List; /** - * HDFS Path contain hive partition values for the keys it is partitioned on. This mapping is not - * straight forward and requires a pluggable implementation to extract the partition value from HDFS - * path. + * HDFS Path contain hive partition values for the keys it is partitioned on. This mapping is not straight forward and + * requires a pluggable implementation to extract the partition value from HDFS path. *

    - * e.g. Hive table partitioned by datestr=yyyy-mm-dd and hdfs path - * /app/hoodie/dataset1/YYYY=[yyyy]/MM=[mm]/DD=[dd] + * e.g. Hive table partitioned by datestr=yyyy-mm-dd and hdfs path /app/hoodie/dataset1/YYYY=[yyyy]/MM=[mm]/DD=[dd] */ public interface PartitionValueExtractor extends Serializable { diff --git a/hudi-hive/src/main/java/org/apache/hudi/hive/SchemaDifference.java b/hudi-hive/src/main/java/org/apache/hudi/hive/SchemaDifference.java index c1f2291b3..a6594e720 100644 --- a/hudi-hive/src/main/java/org/apache/hudi/hive/SchemaDifference.java +++ b/hudi-hive/src/main/java/org/apache/hudi/hive/SchemaDifference.java @@ -38,9 +38,8 @@ public class SchemaDifference { private final Map updateColumnTypes; private final Map addColumnTypes; - private SchemaDifference(MessageType storageSchema, Map tableSchema, - List deleteColumns, Map updateColumnTypes, - Map addColumnTypes) { + private SchemaDifference(MessageType storageSchema, Map tableSchema, List deleteColumns, + Map updateColumnTypes, Map addColumnTypes) { this.storageSchema = storageSchema; this.tableSchema = tableSchema; this.deleteColumns = ImmutableList.copyOf(deleteColumns); @@ -62,9 +61,8 @@ public class SchemaDifference { @Override public String toString() { - return Objects.toStringHelper(this).add("deleteColumns", deleteColumns) - .add("updateColumnTypes", updateColumnTypes).add("addColumnTypes", addColumnTypes) - .toString(); + return Objects.toStringHelper(this).add("deleteColumns", deleteColumns).add("updateColumnTypes", updateColumnTypes) + .add("addColumnTypes", addColumnTypes).toString(); } public static Builder newBuilder(MessageType storageSchema, Map tableSchema) { @@ -107,8 +105,7 @@ public class SchemaDifference { } public SchemaDifference build() { - return new SchemaDifference(storageSchema, tableSchema, deleteColumns, updateColumnTypes, - addColumnTypes); + return new SchemaDifference(storageSchema, tableSchema, deleteColumns, updateColumnTypes, addColumnTypes); } } } diff --git a/hudi-hive/src/main/java/org/apache/hudi/hive/SlashEncodedDayPartitionValueExtractor.java b/hudi-hive/src/main/java/org/apache/hudi/hive/SlashEncodedDayPartitionValueExtractor.java index 1b772ab37..36e3f1bc0 100644 --- a/hudi-hive/src/main/java/org/apache/hudi/hive/SlashEncodedDayPartitionValueExtractor.java +++ b/hudi-hive/src/main/java/org/apache/hudi/hive/SlashEncodedDayPartitionValueExtractor.java @@ -25,9 +25,8 @@ import org.joda.time.format.DateTimeFormat; import org.joda.time.format.DateTimeFormatter; /** - * HDFS Path contain hive partition values for the keys it is partitioned on. This mapping is not - * straight forward and requires a pluggable implementation to extract the partition value from HDFS - * path. + * HDFS Path contain hive partition values for the keys it is partitioned on. This mapping is not straight forward and + * requires a pluggable implementation to extract the partition value from HDFS path. *

    * This implementation extracts datestr=yyyy-mm-dd from path of type /yyyy/mm/dd */ @@ -51,8 +50,7 @@ public class SlashEncodedDayPartitionValueExtractor implements PartitionValueExt // partition path is expected to be in this format yyyy/mm/dd String[] splits = partitionPath.split("/"); if (splits.length != 3) { - throw new IllegalArgumentException( - "Partition path " + partitionPath + " is not in the form yyyy/mm/dd "); + throw new IllegalArgumentException("Partition path " + partitionPath + " is not in the form yyyy/mm/dd "); } // Get the partition part and remove the / as well at the end int year = Integer.parseInt(splits[0]); diff --git a/hudi-hive/src/main/java/org/apache/hudi/hive/util/ColumnNameXLator.java b/hudi-hive/src/main/java/org/apache/hudi/hive/util/ColumnNameXLator.java index a3584b108..35a9dcfb8 100644 --- a/hudi-hive/src/main/java/org/apache/hudi/hive/util/ColumnNameXLator.java +++ b/hudi-hive/src/main/java/org/apache/hudi/hive/util/ColumnNameXLator.java @@ -28,8 +28,8 @@ public class ColumnNameXLator { public static String translateNestedColumn(String colName) { Map.Entry entry; - for (Iterator ic = xformMap.entrySet().iterator(); ic.hasNext(); - colName = colName.replaceAll((String) entry.getKey(), (String) entry.getValue())) { + for (Iterator ic = xformMap.entrySet().iterator(); ic.hasNext(); colName = + colName.replaceAll((String) entry.getKey(), (String) entry.getValue())) { entry = (Map.Entry) ic.next(); } diff --git a/hudi-hive/src/main/java/org/apache/hudi/hive/util/SchemaUtil.java b/hudi-hive/src/main/java/org/apache/hudi/hive/util/SchemaUtil.java index 289f2f70e..72ff69a99 100644 --- a/hudi-hive/src/main/java/org/apache/hudi/hive/util/SchemaUtil.java +++ b/hudi-hive/src/main/java/org/apache/hudi/hive/util/SchemaUtil.java @@ -56,8 +56,8 @@ public class SchemaUtil { /** * Get the schema difference between the storage schema and hive table schema */ - public static SchemaDifference getSchemaDifference(MessageType storageSchema, - Map tableSchema, List partitionKeys) { + public static SchemaDifference getSchemaDifference(MessageType storageSchema, Map tableSchema, + List partitionKeys) { Map newTableSchema; try { newTableSchema = convertParquetSchemaToHiveSchema(storageSchema); @@ -65,16 +65,13 @@ public class SchemaUtil { throw new HoodieHiveSyncException("Failed to convert parquet schema to hive schema", e); } LOG.info("Getting schema difference for " + tableSchema + "\r\n\r\n" + newTableSchema); - SchemaDifference.Builder schemaDiffBuilder = SchemaDifference - .newBuilder(storageSchema, tableSchema); + SchemaDifference.Builder schemaDiffBuilder = SchemaDifference.newBuilder(storageSchema, tableSchema); Set tableColumns = Sets.newHashSet(); for (Map.Entry field : tableSchema.entrySet()) { String fieldName = field.getKey().toLowerCase(); String tickSurroundedFieldName = tickSurround(fieldName); - if (!isFieldExistsInSchema(newTableSchema, tickSurroundedFieldName) && !partitionKeys - .contains( - fieldName)) { + if (!isFieldExistsInSchema(newTableSchema, tickSurroundedFieldName) && !partitionKeys.contains(fieldName)) { schemaDiffBuilder.deleteTableColumn(fieldName); } else { // check type @@ -85,8 +82,7 @@ public class SchemaUtil { continue; } // We will log this and continue. Hive schema is a superset of all parquet schemas - LOG.warn( - "Ignoring table column " + fieldName + " as its not present in the parquet schema"); + LOG.warn("Ignoring table column " + fieldName + " as its not present in the parquet schema"); continue; } tableColumnType = tableColumnType.replaceAll("\\s+", ""); @@ -99,12 +95,10 @@ public class SchemaUtil { // check for incremental datasets, the schema type change is allowed as per evolution // rules if (!isSchemaTypeUpdateAllowed(tableColumnType, expectedType)) { - throw new HoodieHiveSyncException( - "Could not convert field Type from " + tableColumnType + " to " + expectedType - + " for field " + fieldName); + throw new HoodieHiveSyncException("Could not convert field Type from " + tableColumnType + " to " + + expectedType + " for field " + fieldName); } - schemaDiffBuilder.updateTableColumn(fieldName, - getExpectedType(newTableSchema, tickSurroundedFieldName)); + schemaDiffBuilder.updateTableColumn(fieldName, getExpectedType(newTableSchema, tickSurroundedFieldName)); } } tableColumns.add(tickSurroundedFieldName); @@ -129,8 +123,7 @@ public class SchemaUtil { return null; } - private static boolean isFieldExistsInSchema(Map newTableSchema, - String fieldName) { + private static boolean isFieldExistsInSchema(Map newTableSchema, String fieldName) { for (String entry : newTableSchema.keySet()) { if (entry.toLowerCase().equals(fieldName)) { return true; @@ -146,8 +139,7 @@ public class SchemaUtil { * @param messageType : Parquet Schema * @return : Hive Table schema read from parquet file MAP[String,String] */ - public static Map convertParquetSchemaToHiveSchema(MessageType messageType) - throws IOException { + public static Map convertParquetSchemaToHiveSchema(MessageType messageType) throws IOException { Map schema = Maps.newLinkedHashMap(); List parquetFields = messageType.getFields(); for (Type parquetType : parquetFields) { @@ -173,8 +165,8 @@ public class SchemaUtil { private static String convertField(final Type parquetType) { StringBuilder field = new StringBuilder(); if (parquetType.isPrimitive()) { - final PrimitiveType.PrimitiveTypeName parquetPrimitiveTypeName = parquetType.asPrimitiveType() - .getPrimitiveTypeName(); + final PrimitiveType.PrimitiveTypeName parquetPrimitiveTypeName = + parquetType.asPrimitiveType().getPrimitiveTypeName(); final OriginalType originalType = parquetType.getOriginalType(); if (originalType == OriginalType.DECIMAL) { final DecimalMetadata decimalMetadata = parquetType.asPrimitiveType().getDecimalMetadata(); @@ -182,53 +174,51 @@ public class SchemaUtil { .append(decimalMetadata.getScale()).append(")").toString(); } // TODO - fix the method naming here - return parquetPrimitiveTypeName - .convert(new PrimitiveType.PrimitiveTypeNameConverter() { - @Override - public String convertBOOLEAN(PrimitiveType.PrimitiveTypeName primitiveTypeName) { - return "boolean"; - } + return parquetPrimitiveTypeName.convert(new PrimitiveType.PrimitiveTypeNameConverter() { + @Override + public String convertBOOLEAN(PrimitiveType.PrimitiveTypeName primitiveTypeName) { + return "boolean"; + } - @Override - public String convertINT32(PrimitiveType.PrimitiveTypeName primitiveTypeName) { - return "int"; - } + @Override + public String convertINT32(PrimitiveType.PrimitiveTypeName primitiveTypeName) { + return "int"; + } - @Override - public String convertINT64(PrimitiveType.PrimitiveTypeName primitiveTypeName) { - return "bigint"; - } + @Override + public String convertINT64(PrimitiveType.PrimitiveTypeName primitiveTypeName) { + return "bigint"; + } - @Override - public String convertINT96(PrimitiveType.PrimitiveTypeName primitiveTypeName) { - return "timestamp-millis"; - } + @Override + public String convertINT96(PrimitiveType.PrimitiveTypeName primitiveTypeName) { + return "timestamp-millis"; + } - @Override - public String convertFLOAT(PrimitiveType.PrimitiveTypeName primitiveTypeName) { - return "float"; - } + @Override + public String convertFLOAT(PrimitiveType.PrimitiveTypeName primitiveTypeName) { + return "float"; + } - @Override - public String convertDOUBLE(PrimitiveType.PrimitiveTypeName primitiveTypeName) { - return "double"; - } + @Override + public String convertDOUBLE(PrimitiveType.PrimitiveTypeName primitiveTypeName) { + return "double"; + } - @Override - public String convertFIXED_LEN_BYTE_ARRAY( - PrimitiveType.PrimitiveTypeName primitiveTypeName) { - return "binary"; - } + @Override + public String convertFIXED_LEN_BYTE_ARRAY(PrimitiveType.PrimitiveTypeName primitiveTypeName) { + return "binary"; + } - @Override - public String convertBINARY(PrimitiveType.PrimitiveTypeName primitiveTypeName) { - if (originalType == OriginalType.UTF8 || originalType == OriginalType.ENUM) { - return "string"; - } else { - return "binary"; - } - } - }); + @Override + public String convertBINARY(PrimitiveType.PrimitiveTypeName primitiveTypeName) { + if (originalType == OriginalType.UTF8 || originalType == OriginalType.ENUM) { + return "string"; + } else { + return "binary"; + } + } + }); } else { GroupType parquetGroupType = parquetType.asGroupType(); OriginalType originalType = parquetGroupType.getOriginalType(); @@ -244,8 +234,7 @@ public class SchemaUtil { } return createHiveArray(elementType, parquetGroupType.getName()); case MAP: - if (parquetGroupType.getFieldCount() != 1 || parquetGroupType.getType(0) - .isPrimitive()) { + if (parquetGroupType.getFieldCount() != 1 || parquetGroupType.getType(0).isPrimitive()) { throw new UnsupportedOperationException("Invalid map type " + parquetGroupType); } GroupType mapKeyValType = parquetGroupType.getType(0).asGroupType(); @@ -255,11 +244,10 @@ public class SchemaUtil { throw new UnsupportedOperationException("Invalid map type " + parquetGroupType); } Type keyType = mapKeyValType.getType(0); - if (!keyType.isPrimitive() || !keyType.asPrimitiveType().getPrimitiveTypeName() - .equals(PrimitiveType.PrimitiveTypeName.BINARY) + if (!keyType.isPrimitive() + || !keyType.asPrimitiveType().getPrimitiveTypeName().equals(PrimitiveType.PrimitiveTypeName.BINARY) || !keyType.getOriginalType().equals(OriginalType.UTF8)) { - throw new UnsupportedOperationException( - "Map key type must be binary (UTF8): " + keyType); + throw new UnsupportedOperationException("Map key type must be binary (UTF8): " + keyType); } Type valueType = mapKeyValType.getType(1); return createHiveMap(convertField(keyType), convertField(valueType)); @@ -292,8 +280,8 @@ public class SchemaUtil { StringBuilder struct = new StringBuilder(); struct.append("STRUCT< "); for (Type field : parquetFields) { - //TODO: struct field name is only translated to support special char($) - //We will need to extend it to other collection type + // TODO: struct field name is only translated to support special char($) + // We will need to extend it to other collection type struct.append(hiveCompatibleFieldName(field.getName(), true)).append(" : "); struct.append(convertField(field)).append(", "); } @@ -353,9 +341,8 @@ public class SchemaUtil { } else { final GroupType groupType = elementType.asGroupType(); final List groupFields = groupType.getFields(); - if (groupFields.size() > 1 || (groupFields.size() == 1 && ( - elementType.getName().equals("array") || elementType.getName() - .equals(elementName + "_tuple")))) { + if (groupFields.size() > 1 || (groupFields.size() == 1 + && (elementType.getName().equals("array") || elementType.getName().equals(elementName + "_tuple")))) { array.append(convertField(elementType)); } else { array.append(convertField(groupType.getFields().get(0))); @@ -366,8 +353,7 @@ public class SchemaUtil { } public static boolean isSchemaTypeUpdateAllowed(String prevType, String newType) { - if (prevType == null || prevType.trim().isEmpty() || newType == null || newType.trim() - .isEmpty()) { + if (prevType == null || prevType.trim().isEmpty() || newType == null || newType.trim().isEmpty()) { return false; } prevType = prevType.toLowerCase(); @@ -402,8 +388,8 @@ public class SchemaUtil { return columns.toString(); } - public static String generateCreateDDL(MessageType storageSchema, HiveSyncConfig config, - String inputFormatClass, String outputFormatClass, String serdeClass) throws IOException { + public static String generateCreateDDL(MessageType storageSchema, HiveSyncConfig config, String inputFormatClass, + String outputFormatClass, String serdeClass) throws IOException { Map hiveSchema = convertParquetSchemaToHiveSchema(storageSchema); String columns = generateSchemaString(storageSchema, config.partitionFields); @@ -423,8 +409,8 @@ public class SchemaUtil { } sb = sb.append(" ROW FORMAT SERDE '").append(serdeClass).append("'"); sb = sb.append(" STORED AS INPUTFORMAT '").append(inputFormatClass).append("'"); - sb = sb.append(" OUTPUTFORMAT '").append(outputFormatClass).append("' LOCATION '") - .append(config.basePath).append("'"); + sb = sb.append(" OUTPUTFORMAT '").append(outputFormatClass).append("' LOCATION '").append(config.basePath) + .append("'"); return sb.toString(); } @@ -440,6 +426,7 @@ public class SchemaUtil { /** * Read the schema from the log file on path + * * @return */ @SuppressWarnings("OptionalUsedAsFieldOrParameterType") diff --git a/hudi-hive/src/test/java/org/apache/hudi/hive/HiveSyncToolTest.java b/hudi-hive/src/test/java/org/apache/hudi/hive/HiveSyncToolTest.java index c1ef7567a..a27118d5e 100644 --- a/hudi-hive/src/test/java/org/apache/hudi/hive/HiveSyncToolTest.java +++ b/hudi-hive/src/test/java/org/apache/hudi/hive/HiveSyncToolTest.java @@ -57,7 +57,7 @@ public class HiveSyncToolTest { @Parameterized.Parameters(name = "UseJdbc") public static Collection data() { - return Arrays.asList(new Boolean[][]{{false}, {true}}); + return Arrays.asList(new Boolean[][] {{false}, {true}}); } @Before @@ -71,45 +71,38 @@ public class HiveSyncToolTest { } /** - * Testing converting array types to Hive field declaration strings, according to the Parquet-113 - * spec: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists + * Testing converting array types to Hive field declaration strings, according to the Parquet-113 spec: + * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists */ @Test public void testSchemaConvertArray() throws IOException { // Testing the 3-level annotation structure - MessageType schema = Types.buildMessage().optionalGroup() - .as(OriginalType.LIST).repeatedGroup() - .optional(PrimitiveType.PrimitiveTypeName.INT32).named("element") - .named("list").named("int_list").named("ArrayOfInts"); + MessageType schema = Types.buildMessage().optionalGroup().as(OriginalType.LIST).repeatedGroup() + .optional(PrimitiveType.PrimitiveTypeName.INT32).named("element").named("list").named("int_list") + .named("ArrayOfInts"); String schemaString = SchemaUtil.generateSchemaString(schema); assertEquals("`int_list` ARRAY< int>", schemaString); // A array of arrays - schema = Types.buildMessage().optionalGroup() - .as(OriginalType.LIST).repeatedGroup().requiredGroup() - .as(OriginalType.LIST).repeatedGroup() - .required(PrimitiveType.PrimitiveTypeName.INT32).named("element").named("list") - .named("element").named("list").named("int_list_list").named("ArrayOfArrayOfInts"); + schema = Types.buildMessage().optionalGroup().as(OriginalType.LIST).repeatedGroup().requiredGroup() + .as(OriginalType.LIST).repeatedGroup().required(PrimitiveType.PrimitiveTypeName.INT32).named("element") + .named("list").named("element").named("list").named("int_list_list").named("ArrayOfArrayOfInts"); schemaString = SchemaUtil.generateSchemaString(schema); assertEquals("`int_list_list` ARRAY< ARRAY< int>>", schemaString); // A list of integers - schema = Types.buildMessage().optionalGroup() - .as(OriginalType.LIST) - .repeated(PrimitiveType.PrimitiveTypeName.INT32).named("element").named("int_list") - .named("ArrayOfInts"); + schema = Types.buildMessage().optionalGroup().as(OriginalType.LIST).repeated(PrimitiveType.PrimitiveTypeName.INT32) + .named("element").named("int_list").named("ArrayOfInts"); schemaString = SchemaUtil.generateSchemaString(schema); assertEquals("`int_list` ARRAY< int>", schemaString); // A list of structs with two fields - schema = Types.buildMessage().optionalGroup() - .as(OriginalType.LIST).repeatedGroup() - .required(PrimitiveType.PrimitiveTypeName.BINARY).named("str") - .required(PrimitiveType.PrimitiveTypeName.INT32).named("num").named("element") - .named("tuple_list").named("ArrayOfTuples"); + schema = Types.buildMessage().optionalGroup().as(OriginalType.LIST).repeatedGroup() + .required(PrimitiveType.PrimitiveTypeName.BINARY).named("str").required(PrimitiveType.PrimitiveTypeName.INT32) + .named("num").named("element").named("tuple_list").named("ArrayOfTuples"); schemaString = SchemaUtil.generateSchemaString(schema); assertEquals("`tuple_list` ARRAY< STRUCT< `str` : binary, `num` : int>>", schemaString); @@ -117,10 +110,9 @@ public class HiveSyncToolTest { // A list of structs with a single field // For this case, since the inner group name is "array", we treat the // element type as a one-element struct. - schema = Types.buildMessage().optionalGroup() - .as(OriginalType.LIST).repeatedGroup() - .required(PrimitiveType.PrimitiveTypeName.BINARY).named("str").named("array") - .named("one_tuple_list").named("ArrayOfOneTuples"); + schema = Types.buildMessage().optionalGroup().as(OriginalType.LIST).repeatedGroup() + .required(PrimitiveType.PrimitiveTypeName.BINARY).named("str").named("array").named("one_tuple_list") + .named("ArrayOfOneTuples"); schemaString = SchemaUtil.generateSchemaString(schema); assertEquals("`one_tuple_list` ARRAY< STRUCT< `str` : binary>>", schemaString); @@ -128,10 +120,9 @@ public class HiveSyncToolTest { // A list of structs with a single field // For this case, since the inner group name ends with "_tuple", we also treat the // element type as a one-element struct. - schema = Types.buildMessage().optionalGroup() - .as(OriginalType.LIST).repeatedGroup() - .required(PrimitiveType.PrimitiveTypeName.BINARY).named("str") - .named("one_tuple_list_tuple").named("one_tuple_list").named("ArrayOfOneTuples2"); + schema = Types.buildMessage().optionalGroup().as(OriginalType.LIST).repeatedGroup() + .required(PrimitiveType.PrimitiveTypeName.BINARY).named("str").named("one_tuple_list_tuple") + .named("one_tuple_list").named("ArrayOfOneTuples2"); schemaString = SchemaUtil.generateSchemaString(schema); assertEquals("`one_tuple_list` ARRAY< STRUCT< `str` : binary>>", schemaString); @@ -139,22 +130,18 @@ public class HiveSyncToolTest { // A list of structs with a single field // Unlike the above two cases, for this the element type is the type of the // only field in the struct. - schema = Types.buildMessage().optionalGroup() - .as(OriginalType.LIST).repeatedGroup() - .required(PrimitiveType.PrimitiveTypeName.BINARY).named("str") - .named("one_tuple_list").named("one_tuple_list").named("ArrayOfOneTuples3"); + schema = Types.buildMessage().optionalGroup().as(OriginalType.LIST).repeatedGroup() + .required(PrimitiveType.PrimitiveTypeName.BINARY).named("str").named("one_tuple_list").named("one_tuple_list") + .named("ArrayOfOneTuples3"); schemaString = SchemaUtil.generateSchemaString(schema); assertEquals("`one_tuple_list` ARRAY< binary>", schemaString); // A list of maps - schema = Types.buildMessage().optionalGroup() - .as(OriginalType.LIST).repeatedGroup().as(OriginalType.MAP) - .repeatedGroup().as(OriginalType.MAP_KEY_VALUE) - .required(PrimitiveType.PrimitiveTypeName.BINARY).as(OriginalType.UTF8) - .named("string_key").required(PrimitiveType.PrimitiveTypeName.INT32) - .named("int_value").named("key_value").named("array").named("map_list") - .named("ArrayOfMaps"); + schema = Types.buildMessage().optionalGroup().as(OriginalType.LIST).repeatedGroup().as(OriginalType.MAP) + .repeatedGroup().as(OriginalType.MAP_KEY_VALUE).required(PrimitiveType.PrimitiveTypeName.BINARY) + .as(OriginalType.UTF8).named("string_key").required(PrimitiveType.PrimitiveTypeName.INT32).named("int_value") + .named("key_value").named("array").named("map_list").named("ArrayOfMaps"); schemaString = SchemaUtil.generateSchemaString(schema); assertEquals("`map_list` ARRAY< MAP< string, int>>", schemaString); @@ -166,22 +153,21 @@ public class HiveSyncToolTest { TestUtil.hiveSyncConfig.useJdbc = this.useJdbc; String commitTime = "100"; TestUtil.createCOWDataset(commitTime, 5); - HoodieHiveClient hiveClient = new HoodieHiveClient(TestUtil.hiveSyncConfig, - TestUtil.getHiveConf(), TestUtil.fileSystem); + HoodieHiveClient hiveClient = + new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem); assertFalse("Table " + TestUtil.hiveSyncConfig.tableName + " should not exist initially", hiveClient.doesTableExist()); // Lets do the sync - HiveSyncTool tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), - TestUtil.fileSystem); + HiveSyncTool tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem); tool.syncHoodieTable(); assertTrue("Table " + TestUtil.hiveSyncConfig.tableName + " should exist after sync completes", hiveClient.doesTableExist()); - assertEquals("Hive Schema should match the dataset schema + partition field", - hiveClient.getTableSchema().size(), hiveClient.getDataSchema().getColumns().size() + 1); + assertEquals("Hive Schema should match the dataset schema + partition field", hiveClient.getTableSchema().size(), + hiveClient.getDataSchema().getColumns().size() + 1); assertEquals("Table partitions should match the number of partitions we wrote", 5, hiveClient.scanTablePartitions().size()); - assertEquals("The last commit that was sycned should be updated in the TBLPROPERTIES", - commitTime, hiveClient.getLastCommitTimeSynced().get()); + assertEquals("The last commit that was sycned should be updated in the TBLPROPERTIES", commitTime, + hiveClient.getLastCommitTimeSynced().get()); } @Test @@ -189,16 +175,15 @@ public class HiveSyncToolTest { TestUtil.hiveSyncConfig.useJdbc = this.useJdbc; String commitTime1 = "100"; TestUtil.createCOWDataset(commitTime1, 5); - HoodieHiveClient hiveClient = new HoodieHiveClient(TestUtil.hiveSyncConfig, - TestUtil.getHiveConf(), TestUtil.fileSystem); + HoodieHiveClient hiveClient = + new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem); // Lets do the sync - HiveSyncTool tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), - TestUtil.fileSystem); + HiveSyncTool tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem); tool.syncHoodieTable(); assertEquals("Table partitions should match the number of partitions we wrote", 5, hiveClient.scanTablePartitions().size()); - assertEquals("The last commit that was sycned should be updated in the TBLPROPERTIES", - commitTime1, hiveClient.getLastCommitTimeSynced().get()); + assertEquals("The last commit that was sycned should be updated in the TBLPROPERTIES", commitTime1, + hiveClient.getLastCommitTimeSynced().get()); // Now lets create more parititions and these are the only ones which needs to be synced DateTime dateTime = DateTime.now().plusDays(6); @@ -206,15 +191,11 @@ public class HiveSyncToolTest { TestUtil.addCOWPartitions(1, true, dateTime, commitTime2); // Lets do the sync - hiveClient = new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), - TestUtil.fileSystem); - List writtenPartitionsSince = hiveClient.getPartitionsWrittenToSince( - Option.of(commitTime1)); - assertEquals("We should have one partition written after 100 commit", 1, - writtenPartitionsSince.size()); + hiveClient = new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem); + List writtenPartitionsSince = hiveClient.getPartitionsWrittenToSince(Option.of(commitTime1)); + assertEquals("We should have one partition written after 100 commit", 1, writtenPartitionsSince.size()); List hivePartitions = hiveClient.scanTablePartitions(); - List partitionEvents = hiveClient.getPartitionEvents(hivePartitions, - writtenPartitionsSince); + List partitionEvents = hiveClient.getPartitionEvents(hivePartitions, writtenPartitionsSince); assertEquals("There should be only one paritition event", 1, partitionEvents.size()); assertEquals("The one partition event must of type ADD", PartitionEventType.ADD, partitionEvents.iterator().next().eventType); @@ -222,8 +203,7 @@ public class HiveSyncToolTest { tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem); tool.syncHoodieTable(); // Sync should add the one partition - assertEquals("The one partition we wrote should be added to hive", 6, - hiveClient.scanTablePartitions().size()); + assertEquals("The one partition we wrote should be added to hive", 6, hiveClient.scanTablePartitions().size()); assertEquals("The last commit that was sycned should be 101", commitTime2, hiveClient.getLastCommitTimeSynced().get()); } @@ -233,11 +213,10 @@ public class HiveSyncToolTest { TestUtil.hiveSyncConfig.useJdbc = this.useJdbc; String commitTime1 = "100"; TestUtil.createCOWDataset(commitTime1, 5); - HoodieHiveClient hiveClient = new HoodieHiveClient(TestUtil.hiveSyncConfig, - TestUtil.getHiveConf(), TestUtil.fileSystem); + HoodieHiveClient hiveClient = + new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem); // Lets do the sync - HiveSyncTool tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), - TestUtil.fileSystem); + HiveSyncTool tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem); tool.syncHoodieTable(); int fields = hiveClient.getTableSchema().size(); @@ -253,14 +232,13 @@ public class HiveSyncToolTest { assertEquals("Hive Schema has evolved and should not be 3 more field", fields + 3, hiveClient.getTableSchema().size()); - assertEquals("Hive Schema has evolved - Field favorite_number has evolved from int to long", - "BIGINT", hiveClient.getTableSchema().get("favorite_number")); + assertEquals("Hive Schema has evolved - Field favorite_number has evolved from int to long", "BIGINT", + hiveClient.getTableSchema().get("favorite_number")); assertTrue("Hive Schema has evolved - Field favorite_movie was added", hiveClient.getTableSchema().containsKey("favorite_movie")); // Sync should add the one partition - assertEquals("The one partition we wrote should be added to hive", 6, - hiveClient.scanTablePartitions().size()); + assertEquals("The one partition we wrote should be added to hive", 6, hiveClient.scanTablePartitions().size()); assertEquals("The last commit that was sycned should be 101", commitTime2, hiveClient.getLastCommitTimeSynced().get()); } @@ -271,24 +249,22 @@ public class HiveSyncToolTest { String commitTime = "100"; String deltaCommitTime = "101"; TestUtil.createMORDataset(commitTime, deltaCommitTime, 5); - HoodieHiveClient hiveClient = new HoodieHiveClient(TestUtil.hiveSyncConfig, - TestUtil.getHiveConf(), TestUtil.fileSystem); + HoodieHiveClient hiveClient = + new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem); assertFalse("Table " + TestUtil.hiveSyncConfig.tableName + " should not exist initially", hiveClient.doesTableExist()); // Lets do the sync - HiveSyncTool tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), - TestUtil.fileSystem); + HiveSyncTool tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem); tool.syncHoodieTable(); assertTrue("Table " + TestUtil.hiveSyncConfig.tableName + " should exist after sync completes", hiveClient.doesTableExist()); - assertEquals("Hive Schema should match the dataset schema + partition field", - hiveClient.getTableSchema().size(), + assertEquals("Hive Schema should match the dataset schema + partition field", hiveClient.getTableSchema().size(), SchemaTestUtil.getSimpleSchema().getFields().size() + 1); assertEquals("Table partitions should match the number of partitions we wrote", 5, hiveClient.scanTablePartitions().size()); - assertEquals("The last commit that was sycned should be updated in the TBLPROPERTIES", - deltaCommitTime, hiveClient.getLastCommitTimeSynced().get()); + assertEquals("The last commit that was sycned should be updated in the TBLPROPERTIES", deltaCommitTime, + hiveClient.getLastCommitTimeSynced().get()); // Now lets create more parititions and these are the only ones which needs to be synced DateTime dateTime = DateTime.now().plusDays(6); @@ -300,50 +276,43 @@ public class HiveSyncToolTest { // Lets do the sync tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem); tool.syncHoodieTable(); - hiveClient = new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), - TestUtil.fileSystem); + hiveClient = new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem); assertEquals("Hive Schema should match the evolved dataset schema + partition field", - hiveClient.getTableSchema().size(), - SchemaTestUtil.getEvolvedSchema().getFields().size() + 1); + hiveClient.getTableSchema().size(), SchemaTestUtil.getEvolvedSchema().getFields().size() + 1); // Sync should add the one partition - assertEquals("The 2 partitions we wrote should be added to hive", 6, - hiveClient.scanTablePartitions().size()); + assertEquals("The 2 partitions we wrote should be added to hive", 6, hiveClient.scanTablePartitions().size()); assertEquals("The last commit that was sycned should be 103", deltaCommitTime2, hiveClient.getLastCommitTimeSynced().get()); } @Test - public void testSyncMergeOnReadRT() - throws Exception { + public void testSyncMergeOnReadRT() throws Exception { TestUtil.hiveSyncConfig.useJdbc = this.useJdbc; String commitTime = "100"; String deltaCommitTime = "101"; String roTablename = TestUtil.hiveSyncConfig.tableName; - TestUtil.hiveSyncConfig.tableName = - TestUtil.hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_REALTIME_TABLE; + TestUtil.hiveSyncConfig.tableName = TestUtil.hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_REALTIME_TABLE; TestUtil.createMORDataset(commitTime, deltaCommitTime, 5); - HoodieHiveClient hiveClientRT = new HoodieHiveClient(TestUtil.hiveSyncConfig, - TestUtil.getHiveConf(), TestUtil.fileSystem); + HoodieHiveClient hiveClientRT = + new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem); assertFalse("Table " + TestUtil.hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_REALTIME_TABLE + " should not exist initially", hiveClientRT.doesTableExist()); // Lets do the sync - HiveSyncTool tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), - TestUtil.fileSystem); + HiveSyncTool tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem); tool.syncHoodieTable(); assertTrue("Table " + TestUtil.hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_REALTIME_TABLE + " should exist after sync completes", hiveClientRT.doesTableExist()); - assertEquals("Hive Schema should match the dataset schema + partition field", - hiveClientRT.getTableSchema().size(), + assertEquals("Hive Schema should match the dataset schema + partition field", hiveClientRT.getTableSchema().size(), SchemaTestUtil.getSimpleSchema().getFields().size() + 1); assertEquals("Table partitions should match the number of partitions we wrote", 5, hiveClientRT.scanTablePartitions().size()); - assertEquals("The last commit that was sycned should be updated in the TBLPROPERTIES", - deltaCommitTime, hiveClientRT.getLastCommitTimeSynced().get()); + assertEquals("The last commit that was sycned should be updated in the TBLPROPERTIES", deltaCommitTime, + hiveClientRT.getLastCommitTimeSynced().get()); // Now lets create more parititions and these are the only ones which needs to be synced DateTime dateTime = DateTime.now().plusDays(6); @@ -355,23 +324,19 @@ public class HiveSyncToolTest { // Lets do the sync tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem); tool.syncHoodieTable(); - hiveClientRT = new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), - TestUtil.fileSystem); + hiveClientRT = new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem); assertEquals("Hive Schema should match the evolved dataset schema + partition field", - hiveClientRT.getTableSchema().size(), - SchemaTestUtil.getEvolvedSchema().getFields().size() + 1); + hiveClientRT.getTableSchema().size(), SchemaTestUtil.getEvolvedSchema().getFields().size() + 1); // Sync should add the one partition - assertEquals("The 2 partitions we wrote should be added to hive", 6, - hiveClientRT.scanTablePartitions().size()); + assertEquals("The 2 partitions we wrote should be added to hive", 6, hiveClientRT.scanTablePartitions().size()); assertEquals("The last commit that was sycned should be 103", deltaCommitTime2, hiveClientRT.getLastCommitTimeSynced().get()); TestUtil.hiveSyncConfig.tableName = roTablename; } @Test - public void testMultiPartitionKeySync() - throws Exception { + public void testMultiPartitionKeySync() throws Exception { TestUtil.hiveSyncConfig.useJdbc = this.useJdbc; String commitTime = "100"; TestUtil.createCOWDataset(commitTime, 5); @@ -382,20 +347,17 @@ public class HiveSyncToolTest { hiveSyncConfig.partitionFields = Lists.newArrayList("year", "month", "day"); TestUtil.getCreatedTablesSet().add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName); - HoodieHiveClient hiveClient = new HoodieHiveClient(hiveSyncConfig, - TestUtil.getHiveConf(), TestUtil.fileSystem); - assertFalse("Table " + hiveSyncConfig.tableName + " should not exist initially", - hiveClient.doesTableExist()); + HoodieHiveClient hiveClient = new HoodieHiveClient(hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem); + assertFalse("Table " + hiveSyncConfig.tableName + " should not exist initially", hiveClient.doesTableExist()); // Lets do the sync HiveSyncTool tool = new HiveSyncTool(hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem); tool.syncHoodieTable(); - assertTrue("Table " + hiveSyncConfig.tableName + " should exist after sync completes", - hiveClient.doesTableExist()); - assertEquals("Hive Schema should match the dataset schema + partition fields", - hiveClient.getTableSchema().size(), hiveClient.getDataSchema().getColumns().size() + 3); + assertTrue("Table " + hiveSyncConfig.tableName + " should exist after sync completes", hiveClient.doesTableExist()); + assertEquals("Hive Schema should match the dataset schema + partition fields", hiveClient.getTableSchema().size(), + hiveClient.getDataSchema().getColumns().size() + 3); assertEquals("Table partitions should match the number of partitions we wrote", 5, hiveClient.scanTablePartitions().size()); - assertEquals("The last commit that was sycned should be updated in the TBLPROPERTIES", - commitTime, hiveClient.getLastCommitTimeSynced().get()); + assertEquals("The last commit that was sycned should be updated in the TBLPROPERTIES", commitTime, + hiveClient.getLastCommitTimeSynced().get()); } } diff --git a/hudi-hive/src/test/java/org/apache/hudi/hive/TestUtil.java b/hudi-hive/src/test/java/org/apache/hudi/hive/TestUtil.java index f49b7df90..cd18faa7f 100644 --- a/hudi-hive/src/test/java/org/apache/hudi/hive/TestUtil.java +++ b/hudi-hive/src/test/java/org/apache/hudi/hive/TestUtil.java @@ -119,12 +119,10 @@ public class TestUtil { static void clear() throws IOException { fileSystem.delete(new Path(hiveSyncConfig.basePath), true); - HoodieTableMetaClient - .initTableType(configuration, hiveSyncConfig.basePath, HoodieTableType.COPY_ON_WRITE, - hiveSyncConfig.tableName, HoodieAvroPayload.class.getName()); + HoodieTableMetaClient.initTableType(configuration, hiveSyncConfig.basePath, HoodieTableType.COPY_ON_WRITE, + hiveSyncConfig.tableName, HoodieAvroPayload.class.getName()); - HoodieHiveClient client = new HoodieHiveClient(hiveSyncConfig, hiveServer.getHiveConf(), - fileSystem); + HoodieHiveClient client = new HoodieHiveClient(hiveSyncConfig, hiveServer.getHiveConf(), fileSystem); for (String tableName : createdTablesSet) { client.updateHiveSQL("drop table if exists " + tableName); } @@ -154,14 +152,12 @@ public class TestUtil { throws IOException, InitializationError, URISyntaxException, InterruptedException { Path path = new Path(hiveSyncConfig.basePath); FileIOUtils.deleteDirectory(new File(hiveSyncConfig.basePath)); - HoodieTableMetaClient - .initTableType(configuration, hiveSyncConfig.basePath, HoodieTableType.COPY_ON_WRITE, - hiveSyncConfig.tableName, HoodieAvroPayload.class.getName()); + HoodieTableMetaClient.initTableType(configuration, hiveSyncConfig.basePath, HoodieTableType.COPY_ON_WRITE, + hiveSyncConfig.tableName, HoodieAvroPayload.class.getName()); boolean result = fileSystem.mkdirs(path); checkResult(result); DateTime dateTime = DateTime.now(); - HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions, true, dateTime, - commitTime); + HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions, true, dateTime, commitTime); createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName); createCommitFile(commitMetadata, commitTime); } @@ -170,57 +166,51 @@ public class TestUtil { throws IOException, InitializationError, URISyntaxException, InterruptedException { Path path = new Path(hiveSyncConfig.basePath); FileIOUtils.deleteDirectory(new File(hiveSyncConfig.basePath)); - HoodieTableMetaClient - .initTableType(configuration, hiveSyncConfig.basePath, HoodieTableType.MERGE_ON_READ, - hiveSyncConfig.tableName, HoodieAvroPayload.class.getName()); + HoodieTableMetaClient.initTableType(configuration, hiveSyncConfig.basePath, HoodieTableType.MERGE_ON_READ, + hiveSyncConfig.tableName, HoodieAvroPayload.class.getName()); boolean result = fileSystem.mkdirs(path); checkResult(result); DateTime dateTime = DateTime.now(); - HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions, true, dateTime, - commitTime); + HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions, true, dateTime, commitTime); createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName); - createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName - + HiveSyncTool.SUFFIX_REALTIME_TABLE); + createdTablesSet + .add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_REALTIME_TABLE); HoodieCommitMetadata compactionMetadata = new HoodieCommitMetadata(); - commitMetadata.getPartitionToWriteStats().forEach( - (key, value) -> value.stream().forEach(l -> compactionMetadata.addWriteStat(key, l))); + commitMetadata.getPartitionToWriteStats() + .forEach((key, value) -> value.stream().forEach(l -> compactionMetadata.addWriteStat(key, l))); createCompactionCommitFile(compactionMetadata, commitTime); // Write a delta commit - HoodieCommitMetadata deltaMetadata = createLogFiles(commitMetadata.getPartitionToWriteStats(), - true); + HoodieCommitMetadata deltaMetadata = createLogFiles(commitMetadata.getPartitionToWriteStats(), true); createDeltaCommitFile(deltaMetadata, deltaCommitTime); } - static void addCOWPartitions(int numberOfPartitions, boolean isParquetSchemaSimple, - DateTime startFrom, String commitTime) - throws IOException, URISyntaxException, InterruptedException { - HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions, - isParquetSchemaSimple, startFrom, commitTime); + static void addCOWPartitions(int numberOfPartitions, boolean isParquetSchemaSimple, DateTime startFrom, + String commitTime) throws IOException, URISyntaxException, InterruptedException { + HoodieCommitMetadata commitMetadata = + createPartitions(numberOfPartitions, isParquetSchemaSimple, startFrom, commitTime); createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName); createCommitFile(commitMetadata, commitTime); } - static void addMORPartitions(int numberOfPartitions, boolean isParquetSchemaSimple, - boolean isLogSchemaSimple, DateTime startFrom, String commitTime, String deltaCommitTime) + static void addMORPartitions(int numberOfPartitions, boolean isParquetSchemaSimple, boolean isLogSchemaSimple, + DateTime startFrom, String commitTime, String deltaCommitTime) throws IOException, URISyntaxException, InterruptedException { - HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions, - isParquetSchemaSimple, startFrom, commitTime); + HoodieCommitMetadata commitMetadata = + createPartitions(numberOfPartitions, isParquetSchemaSimple, startFrom, commitTime); createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName); - createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName - + HiveSyncTool.SUFFIX_REALTIME_TABLE); + createdTablesSet + .add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_REALTIME_TABLE); HoodieCommitMetadata compactionMetadata = new HoodieCommitMetadata(); - commitMetadata.getPartitionToWriteStats().forEach( - (key, value) -> value.stream().forEach(l -> compactionMetadata.addWriteStat(key, l))); + commitMetadata.getPartitionToWriteStats() + .forEach((key, value) -> value.stream().forEach(l -> compactionMetadata.addWriteStat(key, l))); createCompactionCommitFile(compactionMetadata, commitTime); - HoodieCommitMetadata deltaMetadata = createLogFiles(commitMetadata.getPartitionToWriteStats(), - isLogSchemaSimple); + HoodieCommitMetadata deltaMetadata = createLogFiles(commitMetadata.getPartitionToWriteStats(), isLogSchemaSimple); createDeltaCommitFile(deltaMetadata, deltaCommitTime); } - private static HoodieCommitMetadata createLogFiles( - Map> partitionWriteStats, boolean isLogSchemaSimple) - throws InterruptedException, IOException, URISyntaxException { + private static HoodieCommitMetadata createLogFiles(Map> partitionWriteStats, + boolean isLogSchemaSimple) throws InterruptedException, IOException, URISyntaxException { HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); for (Entry> wEntry : partitionWriteStats.entrySet()) { String partitionPath = wEntry.getKey(); @@ -237,9 +227,8 @@ public class TestUtil { return commitMetadata; } - private static HoodieCommitMetadata createPartitions(int numberOfPartitions, - boolean isParquetSchemaSimple, DateTime startFrom, String commitTime) - throws IOException, URISyntaxException, InterruptedException { + private static HoodieCommitMetadata createPartitions(int numberOfPartitions, boolean isParquetSchemaSimple, + DateTime startFrom, String commitTime) throws IOException, URISyntaxException, InterruptedException { startFrom = startFrom.withTimeAtStartOfDay(); HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); @@ -248,22 +237,20 @@ public class TestUtil { Path partPath = new Path(hiveSyncConfig.basePath + "/" + partitionPath); fileSystem.makeQualified(partPath); fileSystem.mkdirs(partPath); - List writeStats = createTestData(partPath, isParquetSchemaSimple, - commitTime); + List writeStats = createTestData(partPath, isParquetSchemaSimple, commitTime); startFrom = startFrom.minusDays(1); writeStats.forEach(s -> commitMetadata.addWriteStat(partitionPath, s)); } return commitMetadata; } - private static List createTestData(Path partPath, boolean isParquetSchemaSimple, - String commitTime) throws IOException, URISyntaxException, InterruptedException { + private static List createTestData(Path partPath, boolean isParquetSchemaSimple, String commitTime) + throws IOException, URISyntaxException, InterruptedException { List writeStats = Lists.newArrayList(); for (int i = 0; i < 5; i++) { // Create 5 files String fileId = UUID.randomUUID().toString(); - Path filePath = new Path(partPath.toString() + "/" + FSUtils.makeDataFileName(commitTime, - "1-0-1", fileId)); + Path filePath = new Path(partPath.toString() + "/" + FSUtils.makeDataFileName(commitTime, "1-0-1", fileId)); generateParquetData(filePath, isParquetSchemaSimple); HoodieWriteStat writeStat = new HoodieWriteStat(); writeStat.setFileId(fileId); @@ -276,18 +263,15 @@ public class TestUtil { @SuppressWarnings({"unchecked", "deprecation"}) private static void generateParquetData(Path filePath, boolean isParquetSchemaSimple) throws IOException, URISyntaxException, InterruptedException { - Schema schema = (isParquetSchemaSimple ? SchemaTestUtil.getSimpleSchema() - : SchemaTestUtil.getEvolvedSchema()); + Schema schema = (isParquetSchemaSimple ? SchemaTestUtil.getSimpleSchema() : SchemaTestUtil.getEvolvedSchema()); org.apache.parquet.schema.MessageType parquetSchema = new AvroSchemaConverter().convert(schema); BloomFilter filter = new BloomFilter(1000, 0.0001); HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(parquetSchema, schema, filter); - ParquetWriter writer = new ParquetWriter(filePath, writeSupport, CompressionCodecName.GZIP, - 120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, - ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED, ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED, - ParquetWriter.DEFAULT_WRITER_VERSION, fileSystem.getConf()); + ParquetWriter writer = new ParquetWriter(filePath, writeSupport, CompressionCodecName.GZIP, 120 * 1024 * 1024, + ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED, + ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED, ParquetWriter.DEFAULT_WRITER_VERSION, fileSystem.getConf()); - List testRecords = (isParquetSchemaSimple ? SchemaTestUtil - .generateTestRecords(0, 100) + List testRecords = (isParquetSchemaSimple ? SchemaTestUtil.generateTestRecords(0, 100) : SchemaTestUtil.generateEvolvedTestRecords(100, 100)); testRecords.forEach(s -> { try { @@ -301,13 +285,11 @@ public class TestUtil { private static HoodieLogFile generateLogData(Path parquetFilePath, boolean isLogSchemaSimple) throws IOException, InterruptedException, URISyntaxException { - Schema schema = (isLogSchemaSimple ? SchemaTestUtil.getSimpleSchema() - : SchemaTestUtil.getEvolvedSchema()); + Schema schema = (isLogSchemaSimple ? SchemaTestUtil.getSimpleSchema() : SchemaTestUtil.getEvolvedSchema()); HoodieDataFile dataFile = new HoodieDataFile(fileSystem.getFileStatus(parquetFilePath)); // Write a log file for this parquet file Writer logWriter = HoodieLogFormat.newWriterBuilder().onParentPath(parquetFilePath.getParent()) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION) - .withFileId(dataFile.getFileId()) + .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId(dataFile.getFileId()) .overBaseCommit(dataFile.getCommitTime()).withFs(fileSystem).build(); List records = (isLogSchemaSimple ? SchemaTestUtil.generateTestRecords(0, 100) : SchemaTestUtil.generateEvolvedTestRecords(100, 100)); @@ -326,37 +308,30 @@ public class TestUtil { } } - private static void createCommitFile(HoodieCommitMetadata commitMetadata, String commitTime) + private static void createCommitFile(HoodieCommitMetadata commitMetadata, String commitTime) throws IOException { + byte[] bytes = commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8); + Path fullPath = new Path(hiveSyncConfig.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + + HoodieTimeline.makeCommitFileName(commitTime)); + FSDataOutputStream fsout = fileSystem.create(fullPath, true); + fsout.write(bytes); + fsout.close(); + } + + private static void createCompactionCommitFile(HoodieCommitMetadata commitMetadata, String commitTime) throws IOException { byte[] bytes = commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8); - Path fullPath = new Path( - hiveSyncConfig.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline - .makeCommitFileName( - commitTime)); + Path fullPath = new Path(hiveSyncConfig.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + + HoodieTimeline.makeCommitFileName(commitTime)); FSDataOutputStream fsout = fileSystem.create(fullPath, true); fsout.write(bytes); fsout.close(); } - private static void createCompactionCommitFile(HoodieCommitMetadata commitMetadata, - String commitTime) throws IOException { - byte[] bytes = commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8); - Path fullPath = new Path( - hiveSyncConfig.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline - .makeCommitFileName( - commitTime)); - FSDataOutputStream fsout = fileSystem.create(fullPath, true); - fsout.write(bytes); - fsout.close(); - } - - private static void createDeltaCommitFile(HoodieCommitMetadata deltaCommitMetadata, - String deltaCommitTime) throws IOException { + private static void createDeltaCommitFile(HoodieCommitMetadata deltaCommitMetadata, String deltaCommitTime) + throws IOException { byte[] bytes = deltaCommitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8); - Path fullPath = new Path( - hiveSyncConfig.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline - .makeDeltaFileName( - deltaCommitTime)); + Path fullPath = new Path(hiveSyncConfig.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + + HoodieTimeline.makeDeltaFileName(deltaCommitTime)); FSDataOutputStream fsout = fileSystem.create(fullPath, true); fsout.write(bytes); fsout.close(); diff --git a/hudi-hive/src/test/java/org/apache/hudi/hive/util/HiveTestService.java b/hudi-hive/src/test/java/org/apache/hudi/hive/util/HiveTestService.java index 8961558bb..924d7a14a 100644 --- a/hudi-hive/src/test/java/org/apache/hudi/hive/util/HiveTestService.java +++ b/hudi-hive/src/test/java/org/apache/hudi/hive/util/HiveTestService.java @@ -142,7 +142,7 @@ public class HiveTestService { // 'new HiveConf()'. This is fixed by https://issues.apache.org/jira/browse/HIVE-6657, // in Hive 0.14. // As a workaround, the property is set in hive-site.xml in this module. - //conf.set(HiveConf.ConfVars.HIVE_SERVER2_AUTHENTICATION.varname, "NOSASL"); + // conf.set(HiveConf.ConfVars.HIVE_SERVER2_AUTHENTICATION.varname, "NOSASL"); File localHiveDir = new File(localHiveLocation); localHiveDir.mkdirs(); File metastoreDbDir = new File(localHiveDir, "metastore_db"); @@ -225,8 +225,7 @@ public class HiveTestService { private final TTransportFactory parentTransFactory; private final TTransportFactory childTransFactory; - private ChainedTTransportFactory(TTransportFactory parentTransFactory, - TTransportFactory childTransFactory) { + private ChainedTTransportFactory(TTransportFactory parentTransFactory, TTransportFactory childTransFactory) { this.parentTransFactory = parentTransFactory; this.childTransFactory = childTransFactory; } @@ -268,17 +267,15 @@ public class HiveTestService { int minWorkerThreads = conf.getIntVar(HiveConf.ConfVars.METASTORESERVERMINTHREADS); int maxWorkerThreads = conf.getIntVar(HiveConf.ConfVars.METASTORESERVERMAXTHREADS); boolean tcpKeepAlive = conf.getBoolVar(HiveConf.ConfVars.METASTORE_TCP_KEEP_ALIVE); - boolean useFramedTransport = conf.getBoolVar( - HiveConf.ConfVars.METASTORE_USE_THRIFT_FRAMED_TRANSPORT); + boolean useFramedTransport = conf.getBoolVar(HiveConf.ConfVars.METASTORE_USE_THRIFT_FRAMED_TRANSPORT); // don't support SASL yet - //boolean useSasl = conf.getBoolVar(HiveConf.ConfVars.METASTORE_USE_THRIFT_SASL); + // boolean useSasl = conf.getBoolVar(HiveConf.ConfVars.METASTORE_USE_THRIFT_SASL); TServerTransport serverTransport; if (forceBindIP != null) { InetSocketAddress address = new InetSocketAddress(forceBindIP, port); - serverTransport = - tcpKeepAlive ? new TServerSocketKeepAlive(address) : new TServerSocket(address); + serverTransport = tcpKeepAlive ? new TServerSocketKeepAlive(address) : new TServerSocket(address); } else { serverTransport = tcpKeepAlive ? new TServerSocketKeepAlive(port) : new TServerSocket(port); @@ -287,29 +284,24 @@ public class HiveTestService { TProcessor processor; TTransportFactory transFactory; - IHMSHandler handler = (IHMSHandler) HiveMetaStore - .newRetryingHMSHandler("new db based metaserver", - conf, true); + IHMSHandler handler = (IHMSHandler) HiveMetaStore.newRetryingHMSHandler("new db based metaserver", conf, true); if (conf.getBoolVar(HiveConf.ConfVars.METASTORE_EXECUTE_SET_UGI)) { - transFactory = - useFramedTransport ? new ChainedTTransportFactory(new TFramedTransport.Factory(), - new TUGIContainingTransport.Factory()) : new TUGIContainingTransport.Factory(); + transFactory = useFramedTransport + ? new ChainedTTransportFactory(new TFramedTransport.Factory(), new TUGIContainingTransport.Factory()) + : new TUGIContainingTransport.Factory(); processor = new TUGIBasedProcessor(handler); LOG.info("Starting DB backed MetaStore Server with SetUGI enabled"); } else { - transFactory = - useFramedTransport ? new TFramedTransport.Factory() : new TTransportFactory(); + transFactory = useFramedTransport ? new TFramedTransport.Factory() : new TTransportFactory(); processor = new TSetIpAddressProcessor(handler); LOG.info("Starting DB backed MetaStore Server"); } TThreadPoolServer.Args args = new TThreadPoolServer.Args(serverTransport).processor(processor) - .transportFactory(transFactory) - .protocolFactory(new TBinaryProtocol.Factory()) - .minWorkerThreads(minWorkerThreads) - .maxWorkerThreads(maxWorkerThreads); + .transportFactory(transFactory).protocolFactory(new TBinaryProtocol.Factory()) + .minWorkerThreads(minWorkerThreads).maxWorkerThreads(maxWorkerThreads); final TServer tServer = new TThreadPoolServer(args); executorService.submit(new Runnable() { diff --git a/hudi-integ-test/pom.xml b/hudi-integ-test/pom.xml index 486473daf..bb7cec56f 100644 --- a/hudi-integ-test/pom.xml +++ b/hudi-integ-test/pom.xml @@ -135,6 +135,7 @@ false ${skipITs} true + ${project.parent.basedir} diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestBase.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestBase.java index 9b33ba3a8..1c97cec1e 100644 --- a/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestBase.java +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestBase.java @@ -87,8 +87,7 @@ public abstract class ITTestBase { } private static String getHiveConsoleCommandFile(String commandFile, String additionalVar) { - StringBuilder builder = new StringBuilder() - .append("beeline -u " + HIVE_SERVER_JDBC_URL) + StringBuilder builder = new StringBuilder().append("beeline -u " + HIVE_SERVER_JDBC_URL) .append(" --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat ") .append(" --hiveconf hive.stats.autogather=false ") .append(" --hivevar hudi.hadoop.bundle=" + HUDI_HADOOP_BUNDLE); @@ -100,30 +99,23 @@ public abstract class ITTestBase { } static String getSparkShellCommand(String commandFile) { - return new StringBuilder() - .append("spark-shell --jars ").append(HUDI_SPARK_BUNDLE) + return new StringBuilder().append("spark-shell --jars ").append(HUDI_SPARK_BUNDLE) .append(" --master local[2] --driver-class-path ").append(HADOOP_CONF_DIR) - .append(" --conf spark.sql.hive.convertMetastoreParquet=false --deploy-mode client --driver-memory 1G --executor-memory 1G --num-executors 1 ") - .append(" --packages com.databricks:spark-avro_2.11:4.0.0 ") - .append(" -i ").append(commandFile) - .toString(); + .append( + " --conf spark.sql.hive.convertMetastoreParquet=false --deploy-mode client --driver-memory 1G --executor-memory 1G --num-executors 1 ") + .append(" --packages com.databricks:spark-avro_2.11:4.0.0 ").append(" -i ").append(commandFile).toString(); } @Before public void init() { String dockerHost = (OVERRIDDEN_DOCKER_HOST != null) ? OVERRIDDEN_DOCKER_HOST : DEFAULT_DOCKER_HOST; - //Assuming insecure docker engine - DockerClientConfig config = DefaultDockerClientConfig.createDefaultConfigBuilder() - .withDockerHost(dockerHost) - .build(); + // Assuming insecure docker engine + DockerClientConfig config = + DefaultDockerClientConfig.createDefaultConfigBuilder().withDockerHost(dockerHost).build(); // using jaxrs/jersey implementation here (netty impl is also available) - DockerCmdExecFactory dockerCmdExecFactory = new JerseyDockerCmdExecFactory() - .withConnectTimeout(1000) - .withMaxTotalConnections(100) - .withMaxPerRouteConnections(10); - dockerClient = DockerClientBuilder.getInstance(config) - .withDockerCmdExecFactory(dockerCmdExecFactory) - .build(); + DockerCmdExecFactory dockerCmdExecFactory = new JerseyDockerCmdExecFactory().withConnectTimeout(1000) + .withMaxTotalConnections(100).withMaxPerRouteConnections(10); + dockerClient = DockerClientBuilder.getInstance(config).withDockerCmdExecFactory(dockerCmdExecFactory).build(); await().atMost(60, SECONDS).until(this::servicesUp); } @@ -131,8 +123,7 @@ public abstract class ITTestBase { List containerList = dockerClient.listContainersCmd().exec(); for (Container c : containerList) { if (!c.getState().equalsIgnoreCase("running")) { - LOG.info("Container : " + Arrays.toString(c.getNames()) - + "not in running state, Curr State :" + c.getState()); + LOG.info("Container : " + Arrays.toString(c.getNames()) + "not in running state, Curr State :" + c.getState()); return false; } } @@ -142,31 +133,31 @@ public abstract class ITTestBase { } private String singleSpace(String str) { - return str.replaceAll("[\\s]+"," "); + return str.replaceAll("[\\s]+", " "); } private TestExecStartResultCallback executeCommandInDocker(String containerName, String[] command, boolean expectedToSucceed) throws Exception { Container sparkWorkerContainer = runningContainers.get(containerName); - ExecCreateCmd cmd = dockerClient.execCreateCmd(sparkWorkerContainer.getId()) - .withCmd(command).withAttachStdout(true).withAttachStderr(true); + ExecCreateCmd cmd = dockerClient.execCreateCmd(sparkWorkerContainer.getId()).withCmd(command).withAttachStdout(true) + .withAttachStderr(true); ExecCreateCmdResponse createCmdResponse = cmd.exec(); - TestExecStartResultCallback callback = new TestExecStartResultCallback(new ByteArrayOutputStream(), - new ByteArrayOutputStream()); - dockerClient.execStartCmd(createCmdResponse.getId()).withDetach(false).withTty(false) - .exec(callback).awaitCompletion(); + TestExecStartResultCallback callback = + new TestExecStartResultCallback(new ByteArrayOutputStream(), new ByteArrayOutputStream()); + dockerClient.execStartCmd(createCmdResponse.getId()).withDetach(false).withTty(false).exec(callback) + .awaitCompletion(); int exitCode = dockerClient.inspectExecCmd(createCmdResponse.getId()).exec().getExitCode(); LOG.info("Exit code for command : " + exitCode); LOG.error("\n\n ###### Stdout #######\n" + callback.getStdout().toString()); LOG.error("\n\n ###### Stderr #######\n" + callback.getStderr().toString()); if (expectedToSucceed) { - Assert.assertTrue("Command (" + Arrays.toString(command) - + ") expected to succeed. Exit (" + exitCode + ")", exitCode == 0); + Assert.assertTrue("Command (" + Arrays.toString(command) + ") expected to succeed. Exit (" + exitCode + ")", + exitCode == 0); } else { - Assert.assertTrue("Command (" + Arrays.toString(command) - + ") expected to fail. Exit (" + exitCode + ")", exitCode != 0); + Assert.assertTrue("Command (" + Arrays.toString(command) + ") expected to fail. Exit (" + exitCode + ")", + exitCode != 0); } cmd.close(); return callback; @@ -178,8 +169,8 @@ public abstract class ITTestBase { } } - TestExecStartResultCallback executeCommandStringInDocker(String containerName, String cmd, - boolean expectedToSucceed) throws Exception { + TestExecStartResultCallback executeCommandStringInDocker(String containerName, String cmd, boolean expectedToSucceed) + throws Exception { LOG.info("\n\n#################################################################################################"); LOG.info("Container : " + containerName + ", Running command :" + cmd); LOG.info("\n#################################################################################################"); @@ -211,16 +202,16 @@ public abstract class ITTestBase { Pair executeSparkSQLCommand(String commandFile, boolean expectedToSucceed) throws Exception { String sparkShellCmd = getSparkShellCommand(commandFile); - TestExecStartResultCallback callback = executeCommandStringInDocker(ADHOC_1_CONTAINER, - sparkShellCmd, expectedToSucceed); + TestExecStartResultCallback callback = + executeCommandStringInDocker(ADHOC_1_CONTAINER, sparkShellCmd, expectedToSucceed); return Pair.of(callback.getStdout().toString(), callback.getStderr().toString()); } private void saveUpLogs() { try { // save up the Hive log files for introspection - String hiveLogStr = executeCommandStringInDocker(HIVESERVER, "cat /tmp/root/hive.log", true) - .getStdout().toString(); + String hiveLogStr = + executeCommandStringInDocker(HIVESERVER, "cat /tmp/root/hive.log", true).getStdout().toString(); String filePath = System.getProperty("java.io.tmpdir") + "/" + System.currentTimeMillis() + "-hive.log"; FileIOUtils.writeStringToFile(hiveLogStr, filePath); LOG.info("Hive log saved up at : " + filePath); @@ -240,10 +231,10 @@ public abstract class ITTestBase { int lastIndex = 0; int count = 0; - while(lastIndex != -1){ + while (lastIndex != -1) { lastIndex = stdOutSingleSpaced.indexOf(expectedOutput, lastIndex); - if(lastIndex != -1){ - count ++; + if (lastIndex != -1) { + count++; lastIndex += expectedOutput.length(); } } diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieDemo.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieDemo.java index 177962521..7d079c59c 100644 --- a/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieDemo.java +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieDemo.java @@ -35,10 +35,8 @@ public class ITTestHoodieDemo extends ITTestBase { private static String HDFS_BATCH_PATH1 = HDFS_DATA_DIR + "/" + "batch_1.json"; private static String HDFS_BATCH_PATH2 = HDFS_DATA_DIR + "/" + "batch_2.json"; - private static String INPUT_BATCH_PATH1 = HOODIE_WS_ROOT + - "/docker/demo/data/batch_1.json"; - private static String INPUT_BATCH_PATH2 = HOODIE_WS_ROOT + - "/docker/demo/data/batch_2.json"; + private static String INPUT_BATCH_PATH1 = HOODIE_WS_ROOT + "/docker/demo/data/batch_1.json"; + private static String INPUT_BATCH_PATH2 = HOODIE_WS_ROOT + "/docker/demo/data/batch_2.json"; private static String COW_BASE_PATH = "/user/hive/warehouse/stock_ticks_cow"; private static String MOR_BASE_PATH = "/user/hive/warehouse/stock_ticks_mor"; @@ -58,13 +56,13 @@ public class ITTestHoodieDemo extends ITTestBase { private static String HIVE_INCREMENTAL_COMMANDS = HOODIE_WS_ROOT + "/docker/demo/hive-incremental.commands"; - private static String HIVE_SYNC_CMD_FMT = " --enable-hive-sync " - + " --hoodie-conf hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000 " - + " --hoodie-conf hoodie.datasource.hive_sync.username=hive " - + " --hoodie-conf hoodie.datasource.hive_sync.password=hive " - + " --hoodie-conf hoodie.datasource.hive_sync.partition_fields=%s " - + " --hoodie-conf hoodie.datasource.hive_sync.database=default " - + " --hoodie-conf hoodie.datasource.hive_sync.table=%s"; + private static String HIVE_SYNC_CMD_FMT = + " --enable-hive-sync " + " --hoodie-conf hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000 " + + " --hoodie-conf hoodie.datasource.hive_sync.username=hive " + + " --hoodie-conf hoodie.datasource.hive_sync.password=hive " + + " --hoodie-conf hoodie.datasource.hive_sync.partition_fields=%s " + + " --hoodie-conf hoodie.datasource.hive_sync.database=default " + + " --hoodie-conf hoodie.datasource.hive_sync.table=%s"; @Test @@ -90,32 +88,30 @@ public class ITTestHoodieDemo extends ITTestBase { } private void setupDemo() throws Exception { - List cmds = new ImmutableList.Builder() - .add("hdfs dfsadmin -safemode wait") // handle NN going into safe mode at times + List cmds = new ImmutableList.Builder().add("hdfs dfsadmin -safemode wait") // handle NN going into + // safe mode at times .add("hdfs dfs -mkdir -p " + HDFS_DATA_DIR) .add("hdfs dfs -copyFromLocal -f " + INPUT_BATCH_PATH1 + " " + HDFS_BATCH_PATH1) - .add("/bin/bash " + DEMO_CONTAINER_SCRIPT) - .build(); + .add("/bin/bash " + DEMO_CONTAINER_SCRIPT).build(); executeCommandStringsInDocker(ADHOC_1_CONTAINER, cmds); } private void ingestFirstBatchAndHiveSync() throws Exception { List cmds = new ImmutableList.Builder() - .add("spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer " - + HUDI_UTILITIES_BUNDLE + " --storage-type COPY_ON_WRITE " - + " --source-class org.apache.hudi.utilities.sources.JsonDFSSource --source-ordering-field ts " - + " --target-base-path " + COW_BASE_PATH + " --target-table " + COW_TABLE_NAME - + " --props /var/demo/config/dfs-source.properties " - + " --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider " - + String.format(HIVE_SYNC_CMD_FMT, "dt", COW_TABLE_NAME)) - .add("spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer " - + HUDI_UTILITIES_BUNDLE + " --storage-type MERGE_ON_READ " - + " --source-class org.apache.hudi.utilities.sources.JsonDFSSource --source-ordering-field ts " - + " --target-base-path " + MOR_BASE_PATH + " --target-table " + MOR_TABLE_NAME - + " --props /var/demo/config/dfs-source.properties " - + " --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider " - + " --disable-compaction " - + String.format(HIVE_SYNC_CMD_FMT, "dt", MOR_TABLE_NAME)) + .add("spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer " + HUDI_UTILITIES_BUNDLE + + " --storage-type COPY_ON_WRITE " + + " --source-class org.apache.hudi.utilities.sources.JsonDFSSource --source-ordering-field ts " + + " --target-base-path " + COW_BASE_PATH + " --target-table " + COW_TABLE_NAME + + " --props /var/demo/config/dfs-source.properties " + + " --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider " + + String.format(HIVE_SYNC_CMD_FMT, "dt", COW_TABLE_NAME)) + .add("spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer " + HUDI_UTILITIES_BUNDLE + + " --storage-type MERGE_ON_READ " + + " --source-class org.apache.hudi.utilities.sources.JsonDFSSource --source-ordering-field ts " + + " --target-base-path " + MOR_BASE_PATH + " --target-table " + MOR_TABLE_NAME + + " --props /var/demo/config/dfs-source.properties " + + " --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider " + + " --disable-compaction " + String.format(HIVE_SYNC_CMD_FMT, "dt", MOR_TABLE_NAME)) .build(); executeCommandStringsInDocker(ADHOC_1_CONTAINER, cmds); @@ -128,32 +124,25 @@ public class ITTestHoodieDemo extends ITTestBase { assertStdOutContains(stdOutErrPair, "| stock_ticks_mor_rt |"); assertStdOutContains(stdOutErrPair, - "| partition |\n" - + "+----------------+\n" - + "| dt=2018-08-31 |\n" - + "+----------------+\n", 3); + "| partition |\n" + "+----------------+\n" + "| dt=2018-08-31 |\n" + "+----------------+\n", 3); stdOutErrPair = executeHiveCommandFile(HIVE_BATCH1_COMMANDS); - assertStdOutContains(stdOutErrPair, "| symbol | _c1 |\n" - + "+---------+----------------------+\n" + assertStdOutContains(stdOutErrPair, "| symbol | _c1 |\n" + "+---------+----------------------+\n" + "| GOOG | 2018-08-31 10:29:00 |\n", 3); assertStdOutContains(stdOutErrPair, "| symbol | ts | volume | open | close |\n" - + "+---------+----------------------+---------+------------+-----------+\n" - + "| GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |\n" - + "| GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 |\n", 3); + + "+---------+----------------------+---------+------------+-----------+\n" + + "| GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |\n" + + "| GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 |\n", + 3); } private void testSparkSQLAfterFirstBatch() throws Exception { Pair stdOutErrPair = executeSparkSQLCommand(SPARKSQL_BATCH1_COMMANDS, true); + assertStdOutContains(stdOutErrPair, "|default |stock_ticks_cow |false |\n" + + "|default |stock_ticks_mor |false |\n" + "|default |stock_ticks_mor_rt |false |"); assertStdOutContains(stdOutErrPair, - "|default |stock_ticks_cow |false |\n" - + "|default |stock_ticks_mor |false |\n" - + "|default |stock_ticks_mor_rt |false |"); - assertStdOutContains(stdOutErrPair, - "+------+-------------------+\n" - + "|GOOG |2018-08-31 10:29:00|\n" - + "+------+-------------------+", 3); + "+------+-------------------+\n" + "|GOOG |2018-08-31 10:29:00|\n" + "+------+-------------------+", 3); assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 09:59:00|6330 |1230.5 |1230.02 |", 3); assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 10:29:00|3391 |1230.1899|1230.085|", 3); } @@ -161,34 +150,29 @@ public class ITTestHoodieDemo extends ITTestBase { private void ingestSecondBatchAndHiveSync() throws Exception { List cmds = new ImmutableList.Builder() .add("hdfs dfs -copyFromLocal -f " + INPUT_BATCH_PATH2 + " " + HDFS_BATCH_PATH2) - .add("spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer " - + HUDI_UTILITIES_BUNDLE + " --storage-type COPY_ON_WRITE " - + " --source-class org.apache.hudi.utilities.sources.JsonDFSSource --source-ordering-field ts " - + " --target-base-path " + COW_BASE_PATH + " --target-table " + COW_TABLE_NAME - + " --props /var/demo/config/dfs-source.properties " - + " --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider " - + String.format(HIVE_SYNC_CMD_FMT, "dt", COW_TABLE_NAME)) - .add("spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer " - + HUDI_UTILITIES_BUNDLE + " --storage-type MERGE_ON_READ " - + " --source-class org.apache.hudi.utilities.sources.JsonDFSSource --source-ordering-field ts " - + " --target-base-path " + MOR_BASE_PATH + " --target-table " + MOR_TABLE_NAME - + " --props /var/demo/config/dfs-source.properties " - + " --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider " - + " --disable-compaction " - + String.format(HIVE_SYNC_CMD_FMT, "dt", MOR_TABLE_NAME)) + .add("spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer " + HUDI_UTILITIES_BUNDLE + + " --storage-type COPY_ON_WRITE " + + " --source-class org.apache.hudi.utilities.sources.JsonDFSSource --source-ordering-field ts " + + " --target-base-path " + COW_BASE_PATH + " --target-table " + COW_TABLE_NAME + + " --props /var/demo/config/dfs-source.properties " + + " --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider " + + String.format(HIVE_SYNC_CMD_FMT, "dt", COW_TABLE_NAME)) + .add("spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer " + HUDI_UTILITIES_BUNDLE + + " --storage-type MERGE_ON_READ " + + " --source-class org.apache.hudi.utilities.sources.JsonDFSSource --source-ordering-field ts " + + " --target-base-path " + MOR_BASE_PATH + " --target-table " + MOR_TABLE_NAME + + " --props /var/demo/config/dfs-source.properties " + + " --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider " + + " --disable-compaction " + String.format(HIVE_SYNC_CMD_FMT, "dt", MOR_TABLE_NAME)) .build(); executeCommandStringsInDocker(ADHOC_1_CONTAINER, cmds); } private void testHiveAfterSecondBatch() throws Exception { Pair stdOutErrPair = executeHiveCommandFile(HIVE_BATCH1_COMMANDS); - assertStdOutContains(stdOutErrPair, - "| symbol | _c1 |\n" - + "+---------+----------------------+\n" + assertStdOutContains(stdOutErrPair, "| symbol | _c1 |\n" + "+---------+----------------------+\n" + "| GOOG | 2018-08-31 10:29:00 |\n"); - assertStdOutContains(stdOutErrPair, - "| symbol | _c1 |\n" - + "+---------+----------------------+\n" + assertStdOutContains(stdOutErrPair, "| symbol | _c1 |\n" + "+---------+----------------------+\n" + "| GOOG | 2018-08-31 10:59:00 |\n", 2); assertStdOutContains(stdOutErrPair, "| symbol | ts | volume | open | close |\n" @@ -197,75 +181,66 @@ public class ITTestHoodieDemo extends ITTestBase { + "| GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 |\n"); assertStdOutContains(stdOutErrPair, "| symbol | ts | volume | open | close |\n" - + "+---------+----------------------+---------+------------+-----------+\n" - + "| GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |\n" - + "| GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |\n", 2); + + "+---------+----------------------+---------+------------+-----------+\n" + + "| GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |\n" + + "| GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |\n", + 2); } private void testHiveAfterSecondBatchAfterCompaction() throws Exception { Pair stdOutErrPair = executeHiveCommandFile(HIVE_BATCH2_COMMANDS); - assertStdOutContains(stdOutErrPair, - "| symbol | _c1 |\n" - + "+---------+----------------------+\n" + assertStdOutContains(stdOutErrPair, "| symbol | _c1 |\n" + "+---------+----------------------+\n" + "| GOOG | 2018-08-31 10:59:00 |", 2); - assertStdOutContains(stdOutErrPair, "| symbol | ts | volume | open | close |\n" - + "+---------+----------------------+---------+------------+-----------+\n" - + "| GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |\n" - + "| GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |", 2); + assertStdOutContains(stdOutErrPair, + "| symbol | ts | volume | open | close |\n" + + "+---------+----------------------+---------+------------+-----------+\n" + + "| GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |\n" + + "| GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |", + 2); } private void testSparkSQLAfterSecondBatch() throws Exception { Pair stdOutErrPair = executeSparkSQLCommand(SPARKSQL_BATCH2_COMMANDS, true); assertStdOutContains(stdOutErrPair, - "+------+-------------------+\n" - + "|GOOG |2018-08-31 10:59:00|\n" - + "+------+-------------------+", 2); + "+------+-------------------+\n" + "|GOOG |2018-08-31 10:59:00|\n" + "+------+-------------------+", 2); assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 09:59:00|6330 |1230.5 |1230.02 |", 3); assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 10:59:00|9021 |1227.1993|1227.215|", 2); assertStdOutContains(stdOutErrPair, - "+------+-------------------+\n" - + "|GOOG |2018-08-31 10:29:00|\n" - + "+------+-------------------+"); + "+------+-------------------+\n" + "|GOOG |2018-08-31 10:29:00|\n" + "+------+-------------------+"); assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 10:29:00|3391 |1230.1899|1230.085|"); } private void testIncrementalHiveQuery() throws Exception { - String minCommitTime = executeCommandStringInDocker(ADHOC_2_CONTAINER, MIN_COMMIT_TIME_SCRIPT, true) - .getStdout().toString(); - Pair stdOutErrPair = executeHiveCommandFile(HIVE_INCREMENTAL_COMMANDS, - "min.commit.time=" + minCommitTime +"`"); + String minCommitTime = + executeCommandStringInDocker(ADHOC_2_CONTAINER, MIN_COMMIT_TIME_SCRIPT, true).getStdout().toString(); + Pair stdOutErrPair = + executeHiveCommandFile(HIVE_INCREMENTAL_COMMANDS, "min.commit.time=" + minCommitTime + "`"); assertStdOutContains(stdOutErrPair, "| GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |"); } private void testIncrementalHiveQueryAfterCompaction() throws Exception { - String minCommitTime = executeCommandStringInDocker(ADHOC_2_CONTAINER, MIN_COMMIT_TIME_SCRIPT, true) - .getStdout().toString(); - Pair stdOutErrPair = executeHiveCommandFile(HIVE_INCREMENTAL_COMMANDS, - "min.commit.time=" + minCommitTime +"`"); + String minCommitTime = + executeCommandStringInDocker(ADHOC_2_CONTAINER, MIN_COMMIT_TIME_SCRIPT, true).getStdout().toString(); + Pair stdOutErrPair = + executeHiveCommandFile(HIVE_INCREMENTAL_COMMANDS, "min.commit.time=" + minCommitTime + "`"); assertStdOutContains(stdOutErrPair, "| symbol | ts | volume | open | close |\n" - + "+---------+----------------------+---------+------------+-----------+\n" - + "| GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |"); + + "+---------+----------------------+---------+------------+-----------+\n" + + "| GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |"); } private void testIncrementalSparkSQLQuery() throws Exception { Pair stdOutErrPair = executeSparkSQLCommand(SPARKSQL_INCREMENTAL_COMMANDS, true); assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 10:59:00|9021 |1227.1993|1227.215|"); - assertStdOutContains(stdOutErrPair, - "|default |stock_ticks_cow |false |\n" - + "|default |stock_ticks_derived_mor |false |\n" - + "|default |stock_ticks_derived_mor_rt|false |\n" - + "|default |stock_ticks_mor |false |\n" - + "|default |stock_ticks_mor_rt |false |\n" + assertStdOutContains(stdOutErrPair, "|default |stock_ticks_cow |false |\n" + + "|default |stock_ticks_derived_mor |false |\n" + "|default |stock_ticks_derived_mor_rt|false |\n" + + "|default |stock_ticks_mor |false |\n" + "|default |stock_ticks_mor_rt |false |\n" + "| |stock_ticks_cow_incr |true |"); - assertStdOutContains(stdOutErrPair, - "|count(1)|\n" - + "+--------+\n" - + "|99 |", 2); + assertStdOutContains(stdOutErrPair, "|count(1)|\n" + "+--------+\n" + "|99 |", 2); } private void scheduleAndRunCompaction() throws Exception { executeCommandStringInDocker(ADHOC_1_CONTAINER, HUDI_CLI_TOOL + " --cmdfile " + COMPACTION_COMMANDS, true); } -} \ No newline at end of file +} diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieSanity.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieSanity.java index c8e112624..2795ab78c 100644 --- a/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieSanity.java +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieSanity.java @@ -28,16 +28,14 @@ import org.junit.Test; public class ITTestHoodieSanity extends ITTestBase { enum PartitionType { - SINGLE_KEY_PARTITIONED, - MULTI_KEYS_PARTITIONED, - NON_PARTITIONED, + SINGLE_KEY_PARTITIONED, MULTI_KEYS_PARTITIONED, NON_PARTITIONED, } @Test /** - * A basic integration test that runs HoodieJavaApp to create a sample COW Hoodie with single partition key - * data-set and performs upserts on it. Hive integration and upsert functionality is checked by running a count - * query in hive console. + * A basic integration test that runs HoodieJavaApp to create a sample COW Hoodie with single partition key data-set + * and performs upserts on it. Hive integration and upsert functionality is checked by running a count query in hive + * console. */ public void testRunHoodieJavaAppOnSinglePartitionKeyCOWTable() throws Exception { String hiveTableName = "docker_hoodie_single_partition_key_cow_test"; @@ -48,8 +46,8 @@ public class ITTestHoodieSanity extends ITTestBase { @Test /** * A basic integration test that runs HoodieJavaApp to create a sample COW Hoodie with multiple partition-keys - * data-set and performs upserts on it. Hive integration and upsert functionality is checked by running a count - * query in hive console. + * data-set and performs upserts on it. Hive integration and upsert functionality is checked by running a count query + * in hive console. */ public void testRunHoodieJavaAppOnMultiPartitionKeysCOWTable() throws Exception { String hiveTableName = "docker_hoodie_multi_partition_key_cow_test"; @@ -59,9 +57,9 @@ public class ITTestHoodieSanity extends ITTestBase { @Test /** - * A basic integration test that runs HoodieJavaApp to create a sample non-partitioned COW Hoodie - * data-set and performs upserts on it. Hive integration and upsert functionality is checked by running a count - * query in hive console. + * A basic integration test that runs HoodieJavaApp to create a sample non-partitioned COW Hoodie data-set and + * performs upserts on it. Hive integration and upsert functionality is checked by running a count query in hive + * console. */ public void testRunHoodieJavaAppOnNonPartitionedCOWTable() throws Exception { String hiveTableName = "docker_hoodie_non_partition_key_cow_test"; @@ -70,10 +68,9 @@ public class ITTestHoodieSanity extends ITTestBase { } /** - * A basic integration test that runs HoodieJavaApp to create a sample COW Hoodie - * data-set and performs upserts on it. Hive integration and upsert functionality is checked by running a count - * query in hive console. - * TODO: Add spark-shell test-case + * A basic integration test that runs HoodieJavaApp to create a sample COW Hoodie data-set and performs upserts on it. + * Hive integration and upsert functionality is checked by running a count query in hive console. TODO: Add + * spark-shell test-case */ public void testRunHoodieJavaAppOnCOWTable(String hiveTableName, PartitionType partitionType) throws Exception { @@ -98,16 +95,14 @@ public class ITTestHoodieSanity extends ITTestBase { // Run Hoodie Java App String cmd; if (partitionType == PartitionType.SINGLE_KEY_PARTITIONED) { - cmd = HOODIE_JAVA_APP + " --hive-sync --table-path " + hdfsUrl - + " --hive-url " + HIVE_SERVER_JDBC_URL + " --hive-table " + hiveTableName; + cmd = HOODIE_JAVA_APP + " --hive-sync --table-path " + hdfsUrl + " --hive-url " + HIVE_SERVER_JDBC_URL + + " --hive-table " + hiveTableName; } else if (partitionType == PartitionType.MULTI_KEYS_PARTITIONED) { - cmd = HOODIE_JAVA_APP + " --hive-sync --table-path " + hdfsUrl - + " --hive-url " + HIVE_SERVER_JDBC_URL + " --hive-table " + hiveTableName - + " --use-multi-partition-keys"; + cmd = HOODIE_JAVA_APP + " --hive-sync --table-path " + hdfsUrl + " --hive-url " + HIVE_SERVER_JDBC_URL + + " --hive-table " + hiveTableName + " --use-multi-partition-keys"; } else { - cmd = HOODIE_JAVA_APP + " --hive-sync --table-path " + hdfsUrl - + " --hive-url " + HIVE_SERVER_JDBC_URL + " --hive-table " + hiveTableName - + " --non-partitioned"; + cmd = HOODIE_JAVA_APP + " --hive-sync --table-path " + hdfsUrl + " --hive-url " + HIVE_SERVER_JDBC_URL + + " --hive-table " + hiveTableName + " --non-partitioned"; } executeCommandStringInDocker(ADHOC_1_CONTAINER, cmd, true); diff --git a/hudi-spark/pom.xml b/hudi-spark/pom.xml index 948e5afdb..b490dd740 100644 --- a/hudi-spark/pom.xml +++ b/hudi-spark/pom.xml @@ -26,6 +26,10 @@ hudi-spark jar + + ${project.parent.basedir} + + diff --git a/hudi-spark/src/main/java/org/apache/hudi/BaseAvroPayload.java b/hudi-spark/src/main/java/org/apache/hudi/BaseAvroPayload.java index 38acdea3b..30c3fdd17 100644 --- a/hudi-spark/src/main/java/org/apache/hudi/BaseAvroPayload.java +++ b/hudi-spark/src/main/java/org/apache/hudi/BaseAvroPayload.java @@ -33,7 +33,7 @@ public abstract class BaseAvroPayload implements Serializable { /** * Avro data extracted from the source converted to bytes */ - protected final byte [] recordBytes; + protected final byte[] recordBytes; /** * For purposes of preCombining diff --git a/hudi-spark/src/main/java/org/apache/hudi/ComplexKeyGenerator.java b/hudi-spark/src/main/java/org/apache/hudi/ComplexKeyGenerator.java index cc2ccb277..8419257d8 100644 --- a/hudi-spark/src/main/java/org/apache/hudi/ComplexKeyGenerator.java +++ b/hudi-spark/src/main/java/org/apache/hudi/ComplexKeyGenerator.java @@ -26,8 +26,7 @@ import org.apache.hudi.common.util.TypedProperties; import org.apache.hudi.exception.HoodieException; /** - * Complex key generator, which takes names of fields to be used for recordKey and partitionPath as - * configs. + * Complex key generator, which takes names of fields to be used for recordKey and partitionPath as configs. */ public class ComplexKeyGenerator extends KeyGenerator { @@ -42,15 +41,14 @@ public class ComplexKeyGenerator extends KeyGenerator { public ComplexKeyGenerator(TypedProperties props) { super(props); this.recordKeyFields = Arrays.asList(props.getString(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY()).split(",")); - this.partitionPathFields = Arrays.asList(props - .getString(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY()).split(",")); + this.partitionPathFields = + Arrays.asList(props.getString(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY()).split(",")); } @Override public HoodieKey getKey(GenericRecord record) { if (recordKeyFields == null || partitionPathFields == null) { - throw new HoodieException( - "Unable to find field names for record key or partition path in cfg"); + throw new HoodieException("Unable to find field names for record key or partition path in cfg"); } StringBuilder recordKey = new StringBuilder(); for (String recordKeyField : recordKeyFields) { diff --git a/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java b/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java index 3c9709855..da9002299 100644 --- a/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java +++ b/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java @@ -66,7 +66,7 @@ public class DataSourceUtils { String[] parts = fieldName.split("\\."); GenericRecord valueNode = record; int i = 0; - for (;i < parts.length; i++) { + for (; i < parts.length; i++) { String part = parts[i]; Object val = valueNode.get(part); if (val == null) { @@ -84,23 +84,21 @@ public class DataSourceUtils { valueNode = (GenericRecord) val; } } - throw new HoodieException(fieldName + "(Part -" + parts[i] + ") field not found in record. " - + "Acceptable fields were :" + valueNode.getSchema().getFields() - .stream().map(Field::name).collect(Collectors.toList())); + throw new HoodieException( + fieldName + "(Part -" + parts[i] + ") field not found in record. " + "Acceptable fields were :" + + valueNode.getSchema().getFields().stream().map(Field::name).collect(Collectors.toList())); } /** * Create a key generator class via reflection, passing in any configs needed. * * If the class name of key generator is configured through the properties file, i.e., {@code - * props}, use the corresponding key generator class; otherwise, use the default key generator - * class specified in {@code DataSourceWriteOptions}. + * props}, use the corresponding key generator class; otherwise, use the default key generator class specified in + * {@code DataSourceWriteOptions}. */ public static KeyGenerator createKeyGenerator(TypedProperties props) throws IOException { - String keyGeneratorClass = props.getString( - DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(), - DataSourceWriteOptions.DEFAULT_KEYGENERATOR_CLASS_OPT_VAL() - ); + String keyGeneratorClass = props.getString(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(), + DataSourceWriteOptions.DEFAULT_KEYGENERATOR_CLASS_OPT_VAL()); try { return (KeyGenerator) ReflectionUtils.loadClass(keyGeneratorClass, props); } catch (Throwable e) { @@ -111,7 +109,7 @@ public class DataSourceUtils { /** * Create a partition value extractor class via reflection, passing in any configs needed */ - public static PartitionValueExtractor createPartitionExtractor(String partitionExtractorClass) { + public static PartitionValueExtractor createPartitionExtractor(String partitionExtractorClass) { try { return (PartitionValueExtractor) ReflectionUtils.loadClass(partitionExtractorClass); } catch (Throwable e) { @@ -122,18 +120,17 @@ public class DataSourceUtils { /** * Create a payload class via reflection, passing in an ordering/precombine value. */ - public static HoodieRecordPayload createPayload(String payloadClass, GenericRecord record, - Comparable orderingVal) throws IOException { + public static HoodieRecordPayload createPayload(String payloadClass, GenericRecord record, Comparable orderingVal) + throws IOException { try { - return (HoodieRecordPayload) ReflectionUtils - .loadClass(payloadClass, new Class[]{GenericRecord.class, Comparable.class}, record, orderingVal); + return (HoodieRecordPayload) ReflectionUtils.loadClass(payloadClass, + new Class[] {GenericRecord.class, Comparable.class}, record, orderingVal); } catch (Throwable e) { throw new IOException("Could not create payload for class: " + payloadClass, e); } } - public static void checkRequiredProperties(TypedProperties props, - List checkPropNames) { + public static void checkRequiredProperties(TypedProperties props, List checkPropNames) { checkPropNames.stream().forEach(prop -> { if (!props.containsKey(prop)) { throw new HoodieNotSupportedException("Required property " + prop + " is missing"); @@ -141,28 +138,22 @@ public class DataSourceUtils { }); } - public static HoodieWriteClient createHoodieClient(JavaSparkContext jssc, String schemaStr, - String basePath, String tblName, Map parameters) throws Exception { + public static HoodieWriteClient createHoodieClient(JavaSparkContext jssc, String schemaStr, String basePath, + String tblName, Map parameters) throws Exception { // inline compaction is on by default for MOR boolean inlineCompact = parameters.get(DataSourceWriteOptions.STORAGE_TYPE_OPT_KEY()) .equals(DataSourceWriteOptions.MOR_STORAGE_TYPE_OPT_VAL()); // insert/bulk-insert combining to be true, if filtering for duplicates - boolean combineInserts = Boolean.parseBoolean(parameters.get( - DataSourceWriteOptions.INSERT_DROP_DUPS_OPT_KEY())); + boolean combineInserts = Boolean.parseBoolean(parameters.get(DataSourceWriteOptions.INSERT_DROP_DUPS_OPT_KEY())); - HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder() - .withPath(basePath).withAutoCommit(false) - .combineInput(combineInserts, true) - .withSchema(schemaStr).forTable(tblName).withIndexConfig( - HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) + HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath(basePath).withAutoCommit(false) + .combineInput(combineInserts, true).withSchema(schemaStr).forTable(tblName) + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) .withCompactionConfig(HoodieCompactionConfig.newBuilder() - .withPayloadClass(parameters.get( - DataSourceWriteOptions - .PAYLOAD_CLASS_OPT_KEY())) - .withInlineCompaction(inlineCompact) - .build()) + .withPayloadClass(parameters.get(DataSourceWriteOptions.PAYLOAD_CLASS_OPT_KEY())) + .withInlineCompaction(inlineCompact).build()) // override above with Hoodie configs specified as options. .withProps(parameters).build(); @@ -170,27 +161,26 @@ public class DataSourceUtils { } - public static JavaRDD doWriteOperation(HoodieWriteClient client, - JavaRDD hoodieRecords, String commitTime, String operation) { + public static JavaRDD doWriteOperation(HoodieWriteClient client, JavaRDD hoodieRecords, + String commitTime, String operation) { if (operation.equals(DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL())) { return client.bulkInsert(hoodieRecords, commitTime); } else if (operation.equals(DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL())) { return client.insert(hoodieRecords, commitTime); } else { - //default is upsert + // default is upsert return client.upsert(hoodieRecords, commitTime); } } - public static HoodieRecord createHoodieRecord(GenericRecord gr, Comparable orderingVal, - HoodieKey hKey, String payloadClass) throws IOException { + public static HoodieRecord createHoodieRecord(GenericRecord gr, Comparable orderingVal, HoodieKey hKey, + String payloadClass) throws IOException { HoodieRecordPayload payload = DataSourceUtils.createPayload(payloadClass, gr, orderingVal); return new HoodieRecord<>(hKey, payload); } @SuppressWarnings("unchecked") - public static JavaRDD dropDuplicates(JavaSparkContext jssc, - JavaRDD incomingHoodieRecords, + public static JavaRDD dropDuplicates(JavaSparkContext jssc, JavaRDD incomingHoodieRecords, HoodieWriteConfig writeConfig, Option timelineService) throws Exception { HoodieReadClient client = null; try { @@ -209,15 +199,10 @@ public class DataSourceUtils { } @SuppressWarnings("unchecked") - public static JavaRDD dropDuplicates(JavaSparkContext jssc, - JavaRDD incomingHoodieRecords, - Map parameters, - Option timelineService) - throws Exception { - HoodieWriteConfig writeConfig = HoodieWriteConfig - .newBuilder() - .withPath(parameters.get("path")) - .withProps(parameters).build(); + public static JavaRDD dropDuplicates(JavaSparkContext jssc, JavaRDD incomingHoodieRecords, + Map parameters, Option timelineService) throws Exception { + HoodieWriteConfig writeConfig = + HoodieWriteConfig.newBuilder().withPath(parameters.get("path")).withProps(parameters).build(); return dropDuplicates(jssc, incomingHoodieRecords, writeConfig, timelineService); } @@ -234,17 +219,17 @@ public class DataSourceUtils { hiveSyncConfig.databaseName = props.getString(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY(), DataSourceWriteOptions.DEFAULT_HIVE_DATABASE_OPT_VAL()); hiveSyncConfig.tableName = props.getString(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY()); - hiveSyncConfig.hiveUser = props.getString(DataSourceWriteOptions.HIVE_USER_OPT_KEY(), - DataSourceWriteOptions.DEFAULT_HIVE_USER_OPT_VAL()); - hiveSyncConfig.hivePass = props.getString(DataSourceWriteOptions.HIVE_PASS_OPT_KEY(), - DataSourceWriteOptions.DEFAULT_HIVE_PASS_OPT_VAL()); - hiveSyncConfig.jdbcUrl = props.getString(DataSourceWriteOptions.HIVE_URL_OPT_KEY(), - DataSourceWriteOptions.DEFAULT_HIVE_URL_OPT_VAL()); + hiveSyncConfig.hiveUser = + props.getString(DataSourceWriteOptions.HIVE_USER_OPT_KEY(), DataSourceWriteOptions.DEFAULT_HIVE_USER_OPT_VAL()); + hiveSyncConfig.hivePass = + props.getString(DataSourceWriteOptions.HIVE_PASS_OPT_KEY(), DataSourceWriteOptions.DEFAULT_HIVE_PASS_OPT_VAL()); + hiveSyncConfig.jdbcUrl = + props.getString(DataSourceWriteOptions.HIVE_URL_OPT_KEY(), DataSourceWriteOptions.DEFAULT_HIVE_URL_OPT_VAL()); hiveSyncConfig.partitionFields = - props.getStringList(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), ",", new ArrayList<>()); + props.getStringList(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), ",", new ArrayList<>()); hiveSyncConfig.partitionValueExtractorClass = - props.getString(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(), - SlashEncodedDayPartitionValueExtractor.class.getName()); + props.getString(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(), + SlashEncodedDayPartitionValueExtractor.class.getName()); return hiveSyncConfig; } } diff --git a/hudi-spark/src/main/java/org/apache/hudi/EmptyHoodieRecordPayload.java b/hudi-spark/src/main/java/org/apache/hudi/EmptyHoodieRecordPayload.java index 0d8982dcb..ddcbeb74e 100644 --- a/hudi-spark/src/main/java/org/apache/hudi/EmptyHoodieRecordPayload.java +++ b/hudi-spark/src/main/java/org/apache/hudi/EmptyHoodieRecordPayload.java @@ -29,7 +29,7 @@ import org.apache.hudi.common.util.Option; */ public class EmptyHoodieRecordPayload implements HoodieRecordPayload { - public EmptyHoodieRecordPayload(GenericRecord record, Comparable orderingVal) { } + public EmptyHoodieRecordPayload(GenericRecord record, Comparable orderingVal) {} @Override public EmptyHoodieRecordPayload preCombine(EmptyHoodieRecordPayload another) { diff --git a/hudi-spark/src/main/java/org/apache/hudi/HoodieDataSourceHelpers.java b/hudi-spark/src/main/java/org/apache/hudi/HoodieDataSourceHelpers.java index a5573fb62..d31036822 100644 --- a/hudi-spark/src/main/java/org/apache/hudi/HoodieDataSourceHelpers.java +++ b/hudi-spark/src/main/java/org/apache/hudi/HoodieDataSourceHelpers.java @@ -29,14 +29,13 @@ import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; /** - * List of helpers to aid, construction of instanttime for read and write operations using - * datasource + * List of helpers to aid, construction of instanttime for read and write operations using datasource */ public class HoodieDataSourceHelpers { /** - * Checks if the Hoodie dataset has new data since given timestamp. This can be subsequently fed - * to an incremental view read, to perform incremental processing. + * Checks if the Hoodie dataset has new data since given timestamp. This can be subsequently fed to an incremental + * view read, to perform incremental processing. */ public static boolean hasNewCommits(FileSystem fs, String basePath, String commitTimestamp) { return listCommitsSince(fs, basePath, commitTimestamp).size() > 0; @@ -45,8 +44,7 @@ public class HoodieDataSourceHelpers { /** * Get a list of instant times that have occurred, from the given instant timestamp. */ - public static List listCommitsSince(FileSystem fs, String basePath, - String instantTimestamp) { + public static List listCommitsSince(FileSystem fs, String basePath, String instantTimestamp) { HoodieTimeline timeline = allCompletedCommitsCompactions(fs, basePath); return timeline.findInstantsAfter(instantTimestamp, Integer.MAX_VALUE).getInstants() .map(HoodieInstant::getTimestamp).collect(Collectors.toList()); @@ -61,15 +59,14 @@ public class HoodieDataSourceHelpers { } /** - * Obtain all the commits, compactions that have occurred on the timeline, whose instant times - * could be fed into the datasource options. + * Obtain all the commits, compactions that have occurred on the timeline, whose instant times could be fed into the + * datasource options. */ public static HoodieTimeline allCompletedCommitsCompactions(FileSystem fs, String basePath) { HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), basePath, true); if (metaClient.getTableType().equals(HoodieTableType.MERGE_ON_READ)) { return metaClient.getActiveTimeline().getTimelineOfActions( - Sets.newHashSet(HoodieActiveTimeline.COMMIT_ACTION, - HoodieActiveTimeline.DELTA_COMMIT_ACTION)); + Sets.newHashSet(HoodieActiveTimeline.COMMIT_ACTION, HoodieActiveTimeline.DELTA_COMMIT_ACTION)); } else { return metaClient.getCommitTimeline().filterCompletedInstants(); } diff --git a/hudi-spark/src/main/java/org/apache/hudi/KeyGenerator.java b/hudi-spark/src/main/java/org/apache/hudi/KeyGenerator.java index 583db3c10..c35663e60 100644 --- a/hudi-spark/src/main/java/org/apache/hudi/KeyGenerator.java +++ b/hudi-spark/src/main/java/org/apache/hudi/KeyGenerator.java @@ -24,9 +24,7 @@ import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.util.TypedProperties; /** - * Abstract class to extend for plugging in extraction of - * {@link HoodieKey} - * from an Avro record + * Abstract class to extend for plugging in extraction of {@link HoodieKey} from an Avro record */ public abstract class KeyGenerator implements Serializable { diff --git a/hudi-spark/src/main/java/org/apache/hudi/OverwriteWithLatestAvroPayload.java b/hudi-spark/src/main/java/org/apache/hudi/OverwriteWithLatestAvroPayload.java index 39e2df706..f2646ccd7 100644 --- a/hudi-spark/src/main/java/org/apache/hudi/OverwriteWithLatestAvroPayload.java +++ b/hudi-spark/src/main/java/org/apache/hudi/OverwriteWithLatestAvroPayload.java @@ -32,8 +32,8 @@ import org.apache.hudi.common.util.Option; * 1. preCombine - Picks the latest delta record for a key, based on an ordering field 2. * combineAndGetUpdateValue/getInsertValue - Simply overwrites storage with latest delta record */ -public class OverwriteWithLatestAvroPayload extends BaseAvroPayload implements - HoodieRecordPayload { +public class OverwriteWithLatestAvroPayload extends BaseAvroPayload + implements HoodieRecordPayload { /** * @param record @@ -58,8 +58,7 @@ public class OverwriteWithLatestAvroPayload extends BaseAvroPayload implements } @Override - public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) - throws IOException { + public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) throws IOException { // combining strategy here trivially ignores currentValue on disk and writes this record return getInsertValue(schema); } diff --git a/hudi-spark/src/main/java/org/apache/hudi/QuickstartUtils.java b/hudi-spark/src/main/java/org/apache/hudi/QuickstartUtils.java index c21e410a1..504ad18f3 100644 --- a/hudi-spark/src/main/java/org/apache/hudi/QuickstartUtils.java +++ b/hudi-spark/src/main/java/org/apache/hudi/QuickstartUtils.java @@ -39,8 +39,8 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.exception.HoodieIOException; /** - * Class to be used in quickstart guide for generating inserts and updates against a corpus. - * Test data uses a toy Uber trips, data model. + * Class to be used in quickstart guide for generating inserts and updates against a corpus. Test data uses a toy Uber + * trips, data model. */ public class QuickstartUtils { @@ -49,20 +49,13 @@ public class QuickstartUtils { private static final String DEFAULT_SECOND_PARTITION_PATH = "americas/brazil/sao_paulo"; private static final String DEFAULT_THIRD_PARTITION_PATH = "asia/india/chennai"; - private static final String[] DEFAULT_PARTITION_PATHS = { - DEFAULT_FIRST_PARTITION_PATH, - DEFAULT_SECOND_PARTITION_PATH, - DEFAULT_THIRD_PARTITION_PATH - }; + private static final String[] DEFAULT_PARTITION_PATHS = + {DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH, DEFAULT_THIRD_PARTITION_PATH}; static String TRIP_EXAMPLE_SCHEMA = "{\"type\": \"record\"," + "\"name\": \"triprec\"," + "\"fields\": [ " - + "{\"name\": \"ts\",\"type\": \"double\"}," - + "{\"name\": \"uuid\", \"type\": \"string\"}," - + "{\"name\": \"rider\", \"type\": \"string\"}," - + "{\"name\": \"driver\", \"type\": \"string\"}," - + "{\"name\": \"begin_lat\", \"type\": \"double\"}," - + "{\"name\": \"begin_lon\", \"type\": \"double\"}," - + "{\"name\": \"end_lat\", \"type\": \"double\"}," - + "{\"name\": \"end_lon\", \"type\": \"double\"}," + + "{\"name\": \"ts\",\"type\": \"double\"}," + "{\"name\": \"uuid\", \"type\": \"string\"}," + + "{\"name\": \"rider\", \"type\": \"string\"}," + "{\"name\": \"driver\", \"type\": \"string\"}," + + "{\"name\": \"begin_lat\", \"type\": \"double\"}," + "{\"name\": \"begin_lon\", \"type\": \"double\"}," + + "{\"name\": \"end_lat\", \"type\": \"double\"}," + "{\"name\": \"end_lon\", \"type\": \"double\"}," + "{\"name\":\"fare\",\"type\": \"double\"}]}"; static Schema avroSchema = new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA); @@ -87,8 +80,7 @@ public class QuickstartUtils { int stringLength = 3; StringBuilder buffer = new StringBuilder(stringLength); for (int i = 0; i < stringLength; i++) { - int randomLimitedInt = leftLimit + (int) - (rand.nextFloat() * (rightLimit - leftLimit + 1)); + int randomLimitedInt = leftLimit + (int) (rand.nextFloat() * (rightLimit - leftLimit + 1)); buffer.append((char) randomLimitedInt); } return buffer.toString(); @@ -99,7 +91,7 @@ public class QuickstartUtils { } public static GenericRecord generateGenericRecord(String rowKey, String riderName, String driverName, - double timestamp) { + double timestamp) { GenericRecord rec = new GenericData.Record(avroSchema); rec.put("uuid", rowKey); rec.put("ts", timestamp); @@ -114,15 +106,15 @@ public class QuickstartUtils { } /** - * Generates a new avro record of the above schema format, retaining the key if optionally provided. - * The riderDriverSuffix string is a random String to simulate updates by changing the rider driver fields - * for records belonging to the same commit. It is purely used for demo purposes. In real world, the actual - * updates are assumed to be provided based on the application requirements. + * Generates a new avro record of the above schema format, retaining the key if optionally provided. The + * riderDriverSuffix string is a random String to simulate updates by changing the rider driver fields for records + * belonging to the same commit. It is purely used for demo purposes. In real world, the actual updates are assumed + * to be provided based on the application requirements. */ - public static OverwriteWithLatestAvroPayload generateRandomValue(HoodieKey key, String riderDriverSuffix) throws - IOException { - GenericRecord rec = generateGenericRecord(key.getRecordKey(), "rider-" + riderDriverSuffix, "driver-" - + riderDriverSuffix, 0.0); + public static OverwriteWithLatestAvroPayload generateRandomValue(HoodieKey key, String riderDriverSuffix) + throws IOException { + GenericRecord rec = + generateGenericRecord(key.getRecordKey(), "rider-" + riderDriverSuffix, "driver-" + riderDriverSuffix, 0.0); return new OverwriteWithLatestAvroPayload(Option.of(rec)); } @@ -182,19 +174,19 @@ public class QuickstartUtils { private static Option convertToString(HoodieRecord record) { try { - String str = HoodieAvroUtils.bytesToAvro(((OverwriteWithLatestAvroPayload) record.getData()).recordBytes, - DataGenerator.avroSchema).toString(); + String str = HoodieAvroUtils + .bytesToAvro(((OverwriteWithLatestAvroPayload) record.getData()).recordBytes, DataGenerator.avroSchema) + .toString(); str = "{" + str.substring(str.indexOf("\"ts\":")); - return Option.of(str.replaceAll("}", - ", \"partitionpath\": \"" + record.getPartitionPath() + "\"}")); + return Option.of(str.replaceAll("}", ", \"partitionpath\": \"" + record.getPartitionPath() + "\"}")); } catch (IOException e) { return Option.empty(); } } public static List convertToStringList(List records) { - return records.stream().map(hr -> convertToString(hr)).filter(os -> os.isPresent()) - .map(os -> os.get()).collect(Collectors.toList()); + return records.stream().map(hr -> convertToString(hr)).filter(os -> os.isPresent()).map(os -> os.get()) + .collect(Collectors.toList()); } public static Map getQuickstartWriteConfigs() { diff --git a/hudi-spark/src/main/java/org/apache/hudi/SimpleKeyGenerator.java b/hudi-spark/src/main/java/org/apache/hudi/SimpleKeyGenerator.java index fec48ffe4..b9bb25865 100644 --- a/hudi-spark/src/main/java/org/apache/hudi/SimpleKeyGenerator.java +++ b/hudi-spark/src/main/java/org/apache/hudi/SimpleKeyGenerator.java @@ -24,8 +24,7 @@ import org.apache.hudi.common.util.TypedProperties; import org.apache.hudi.exception.HoodieException; /** - * Simple key generator, which takes names of fields to be used for recordKey and partitionPath as - * configs. + * Simple key generator, which takes names of fields to be used for recordKey and partitionPath as configs. */ public class SimpleKeyGenerator extends KeyGenerator { @@ -38,15 +37,13 @@ public class SimpleKeyGenerator extends KeyGenerator { public SimpleKeyGenerator(TypedProperties props) { super(props); this.recordKeyField = props.getString(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY()); - this.partitionPathField = props - .getString(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY()); + this.partitionPathField = props.getString(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY()); } @Override public HoodieKey getKey(GenericRecord record) { if (recordKeyField == null || partitionPathField == null) { - throw new HoodieException( - "Unable to find field names for record key or partition path in cfg"); + throw new HoodieException("Unable to find field names for record key or partition path in cfg"); } String recordKey = DataSourceUtils.getNestedFieldValAsString(record, recordKeyField); diff --git a/hudi-spark/src/test/java/DataSourceTestUtils.java b/hudi-spark/src/test/java/DataSourceTestUtils.java index 30d991856..9f85b5391 100644 --- a/hudi-spark/src/test/java/DataSourceTestUtils.java +++ b/hudi-spark/src/test/java/DataSourceTestUtils.java @@ -32,15 +32,14 @@ public class DataSourceTestUtils { try { String str = ((TestRawTripPayload) record.getData()).getJsonData(); str = "{" + str.substring(str.indexOf("\"timestamp\":")); - return Option.of(str.replaceAll("}", - ", \"partition\": \"" + record.getPartitionPath() + "\"}")); + return Option.of(str.replaceAll("}", ", \"partition\": \"" + record.getPartitionPath() + "\"}")); } catch (IOException e) { return Option.empty(); } } public static List convertToStringList(List records) { - return records.stream().map(hr -> convertToString(hr)).filter(os -> os.isPresent()) - .map(os -> os.get()).collect(Collectors.toList()); + return records.stream().map(hr -> convertToString(hr)).filter(os -> os.isPresent()).map(os -> os.get()) + .collect(Collectors.toList()); } } diff --git a/hudi-spark/src/test/java/HoodieJavaApp.java b/hudi-spark/src/test/java/HoodieJavaApp.java index ee7c6b939..32ae31bf6 100644 --- a/hudi-spark/src/test/java/HoodieJavaApp.java +++ b/hudi-spark/src/test/java/HoodieJavaApp.java @@ -97,9 +97,7 @@ public class HoodieJavaApp { // Spark session setup.. SparkSession spark = SparkSession.builder().appName("Hoodie Spark APP") - .config("spark.serializer", - "org.apache.spark.serializer.KryoSerializer").master("local[1]") - .getOrCreate(); + .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer").master("local[1]").getOrCreate(); JavaSparkContext jssc = new JavaSparkContext(spark.sparkContext()); FileSystem fs = FileSystem.get(jssc.hadoopConfiguration()); @@ -107,7 +105,7 @@ public class HoodieJavaApp { HoodieTestDataGenerator dataGen = null; if (nonPartitionedTable) { // All data goes to base-path - dataGen = new HoodieTestDataGenerator(new String[]{""}); + dataGen = new HoodieTestDataGenerator(new String[] {""}); } else { dataGen = new HoodieTestDataGenerator(); } @@ -116,31 +114,34 @@ public class HoodieJavaApp { * Commit with only inserts */ // Generate some input.. - List records1 = DataSourceTestUtils.convertToStringList( - dataGen.generateInserts("001"/* ignore */, 100)); + List records1 = DataSourceTestUtils.convertToStringList(dataGen.generateInserts("001"/* ignore */, 100)); Dataset inputDF1 = spark.read().json(jssc.parallelize(records1, 2)); // Save as hoodie dataset (copy on write) - DataFrameWriter writer = inputDF1.write().format("org.apache.hudi") // specify the hoodie source - .option("hoodie.insert.shuffle.parallelism", - "2") // any hoodie client config can be passed like this - .option("hoodie.upsert.shuffle.parallelism", - "2") // full list in HoodieWriteConfig & its package - .option(DataSourceWriteOptions.STORAGE_TYPE_OPT_KEY(), tableType) // Hoodie Table Type - .option(DataSourceWriteOptions.OPERATION_OPT_KEY(), - DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL()) // insert - .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), - "_row_key") // This is the record key - .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), - "partition") // this is the partition to place it into - .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), - "timestamp") // use to combine duplicate records in input/with disk val - .option(HoodieWriteConfig.TABLE_NAME, tableName) // Used by hive sync and queries + // specify the hoodie source + DataFrameWriter writer = inputDF1.write().format("org.apache.hudi") + // any hoodie client config can be passed like this + .option("hoodie.insert.shuffle.parallelism", "2") + // full list in HoodieWriteConfig & its package + .option("hoodie.upsert.shuffle.parallelism", "2") + // Hoodie Table Type + .option(DataSourceWriteOptions.STORAGE_TYPE_OPT_KEY(), tableType) + // insert + .option(DataSourceWriteOptions.OPERATION_OPT_KEY(), DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL()) + // This is the record key + .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key") + // this is the partition to place it into + .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition") + // use to combine duplicate records in input/with disk val + .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp") + // Used by hive sync and queries + .option(HoodieWriteConfig.TABLE_NAME, tableName) + // Add Key Extractor .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(), - nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName() : - SimpleKeyGenerator.class.getCanonicalName()) // Add Key Extractor - .mode( - SaveMode.Overwrite); // This will remove any existing data at path below, and create a + nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName() + : SimpleKeyGenerator.class.getCanonicalName()) + // This will remove any existing data at path below, and create a + .mode(SaveMode.Overwrite); updateHiveSyncConfig(writer); // new dataset if needed @@ -151,8 +152,7 @@ public class HoodieJavaApp { /** * Commit that updates records */ - List records2 = DataSourceTestUtils.convertToStringList( - dataGen.generateUpdates("002"/* ignore */, 100)); + List records2 = DataSourceTestUtils.convertToStringList(dataGen.generateUpdates("002"/* ignore */, 100)); Dataset inputDF2 = spark.read().json(jssc.parallelize(records2, 2)); writer = inputDF2.write().format("org.apache.hudi").option("hoodie.insert.shuffle.parallelism", "2") .option("hoodie.upsert.shuffle.parallelism", "2") @@ -161,8 +161,8 @@ public class HoodieJavaApp { .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition") .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp") .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(), - nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName() : - SimpleKeyGenerator.class.getCanonicalName()) // Add Key Extractor + nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName() + : SimpleKeyGenerator.class.getCanonicalName()) // Add Key Extractor .option(HoodieWriteConfig.TABLE_NAME, tableName).mode(SaveMode.Append); updateHiveSyncConfig(writer); @@ -180,20 +180,18 @@ public class HoodieJavaApp { hoodieROViewDF.registerTempTable("hoodie_ro"); spark.sql("describe hoodie_ro").show(); // all trips whose fare was greater than 2. - spark.sql("select fare, begin_lon, begin_lat, timestamp from hoodie_ro where fare > 2.0") - .show(); + spark.sql("select fare, begin_lon, begin_lat, timestamp from hoodie_ro where fare > 2.0").show(); if (tableType.equals(HoodieTableType.COPY_ON_WRITE.name())) { /** * Consume incrementally, only changes in commit 2 above. Currently only supported for COPY_ON_WRITE TABLE */ Dataset hoodieIncViewDF = spark.read().format("org.apache.hudi") - .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY(), - DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL()) - .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), - commitInstantTime1) // Only changes in write 2 above - .load( - tablePath); // For incremental view, pass in the root/base path of dataset + .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY(), DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL()) + // Only changes in write 2 above + .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), commitInstantTime1) + // For incremental view, pass in the root/base path of dataset + .load(tablePath); logger.info("You will only see records from : " + commitInstantTime2); hoodieIncViewDF.groupBy(hoodieIncViewDF.col("_hoodie_commit_time")).count().show(); @@ -202,6 +200,7 @@ public class HoodieJavaApp { /** * Setup configs for syncing to hive + * * @param writer * @return */ @@ -215,12 +214,13 @@ public class HoodieJavaApp { .option(DataSourceWriteOptions.HIVE_PASS_OPT_KEY(), hivePass) .option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY(), "true"); if (nonPartitionedTable) { - writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(), - NonPartitionedExtractor.class.getCanonicalName()) + writer = writer + .option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(), + NonPartitionedExtractor.class.getCanonicalName()) .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), ""); } else if (useMultiPartitionKeys) { - writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "year,month,day") - .option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(), + writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "year,month,day").option( + DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(), MultiPartKeysValueExtractor.class.getCanonicalName()); } else { writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "dateStr"); diff --git a/hudi-spark/src/test/java/HoodieJavaStreamingApp.java b/hudi-spark/src/test/java/HoodieJavaStreamingApp.java index 2a0b6e228..fb04c6d3f 100644 --- a/hudi-spark/src/test/java/HoodieJavaStreamingApp.java +++ b/hudi-spark/src/test/java/HoodieJavaStreamingApp.java @@ -113,9 +113,7 @@ public class HoodieJavaStreamingApp { public void run() throws Exception { // Spark session setup.. SparkSession spark = SparkSession.builder().appName("Hoodie Spark Streaming APP") - .config("spark.serializer", - "org.apache.spark.serializer.KryoSerializer").master("local[1]") - .getOrCreate(); + .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer").master("local[1]").getOrCreate(); JavaSparkContext jssc = new JavaSparkContext(spark.sparkContext()); // folder path clean up and creation, preparing the environment @@ -128,18 +126,15 @@ public class HoodieJavaStreamingApp { // Generator of some records to be loaded in. HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); - List records1 = DataSourceTestUtils.convertToStringList( - dataGen.generateInserts("001", 100)); + List records1 = DataSourceTestUtils.convertToStringList(dataGen.generateInserts("001", 100)); Dataset inputDF1 = spark.read().json(jssc.parallelize(records1, 2)); - List records2 = DataSourceTestUtils.convertToStringList( - dataGen.generateUpdates("002", 100)); + List records2 = DataSourceTestUtils.convertToStringList(dataGen.generateUpdates("002", 100)); Dataset inputDF2 = spark.read().json(jssc.parallelize(records2, 2)); // setup the input for streaming - Dataset streamingInput = spark.readStream().schema(inputDF1.schema()) - .json(streamingSourcePath); + Dataset streamingInput = spark.readStream().schema(inputDF1.schema()).json(streamingSourcePath); // start streaming and showing @@ -174,16 +169,14 @@ public class HoodieJavaStreamingApp { /** * Adding data to the streaming source and showing results over time + * * @param spark * @param fs * @param inputDF1 * @param inputDF2 * @throws Exception */ - public void show(SparkSession spark, - FileSystem fs, - Dataset inputDF1, - Dataset inputDF2) throws Exception { + public void show(SparkSession spark, FileSystem fs, Dataset inputDF1, Dataset inputDF2) throws Exception { inputDF1.write().mode(SaveMode.Append).json(streamingSourcePath); // wait for spark streaming to process one microbatch Thread.sleep(3000); @@ -206,20 +199,18 @@ public class HoodieJavaStreamingApp { hoodieROViewDF.registerTempTable("hoodie_ro"); spark.sql("describe hoodie_ro").show(); // all trips whose fare was greater than 2. - spark.sql("select fare, begin_lon, begin_lat, timestamp from hoodie_ro where fare > 2.0") - .show(); + spark.sql("select fare, begin_lon, begin_lat, timestamp from hoodie_ro where fare > 2.0").show(); if (tableType.equals(HoodieTableType.COPY_ON_WRITE.name())) { /** * Consume incrementally, only changes in commit 2 above. Currently only supported for COPY_ON_WRITE TABLE */ Dataset hoodieIncViewDF = spark.read().format("org.apache.hudi") - .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY(), - DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL()) - .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), - commitInstantTime1) // Only changes in write 2 above - .load( - tablePath); // For incremental view, pass in the root/base path of dataset + .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY(), DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL()) + // Only changes in write 2 above + .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), commitInstantTime1) + // For incremental view, pass in the root/base path of dataset + .load(tablePath); logger.info("You will only see records from : " + commitInstantTime2); hoodieIncViewDF.groupBy(hoodieIncViewDF.col("_hoodie_commit_time")).count().show(); @@ -228,33 +219,28 @@ public class HoodieJavaStreamingApp { /** * Hoodie spark streaming job + * * @param streamingInput * @throws Exception */ public void stream(Dataset streamingInput) throws Exception { - DataStreamWriter writer = streamingInput - .writeStream() - .format("org.apache.hudi") - .option("hoodie.insert.shuffle.parallelism", "2") - .option("hoodie.upsert.shuffle.parallelism", "2") + DataStreamWriter writer = streamingInput.writeStream().format("org.apache.hudi") + .option("hoodie.insert.shuffle.parallelism", "2").option("hoodie.upsert.shuffle.parallelism", "2") .option(DataSourceWriteOptions.STORAGE_TYPE_OPT_KEY(), tableType) .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key") .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition") .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp") - .option(HoodieWriteConfig.TABLE_NAME, tableName) - .option("checkpointLocation", streamingCheckpointingPath) + .option(HoodieWriteConfig.TABLE_NAME, tableName).option("checkpointLocation", streamingCheckpointingPath) .outputMode(OutputMode.Append()); updateHiveSyncConfig(writer); - writer - .trigger(new ProcessingTime(500)) - .start(tablePath) - .awaitTermination(streamingDurationInMs); + writer.trigger(new ProcessingTime(500)).start(tablePath).awaitTermination(streamingDurationInMs); } /** * Setup configs for syncing to hive + * * @param writer * @return */ @@ -268,8 +254,8 @@ public class HoodieJavaStreamingApp { .option(DataSourceWriteOptions.HIVE_PASS_OPT_KEY(), hivePass) .option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY(), "true"); if (useMultiPartitionKeys) { - writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "year,month,day") - .option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(), + writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "year,month,day").option( + DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(), MultiPartKeysValueExtractor.class.getCanonicalName()); } else { writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "dateStr"); diff --git a/hudi-timeline-service/pom.xml b/hudi-timeline-service/pom.xml index 8d661033a..7c7d2221a 100644 --- a/hudi-timeline-service/pom.xml +++ b/hudi-timeline-service/pom.xml @@ -26,6 +26,10 @@ hudi-timeline-service jar + + ${project.parent.basedir} + + diff --git a/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/FileSystemViewHandler.java b/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/FileSystemViewHandler.java index a19a2749b..cb3d8a70e 100644 --- a/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/FileSystemViewHandler.java +++ b/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/FileSystemViewHandler.java @@ -81,15 +81,14 @@ public class FileSystemViewHandler { */ private boolean isLocalViewBehind(Context ctx) { String basePath = ctx.queryParam(RemoteHoodieTableFileSystemView.BASEPATH_PARAM); - String lastKnownInstantFromClient = ctx - .queryParam(RemoteHoodieTableFileSystemView.LAST_INSTANT_TS, HoodieTimeline.INVALID_INSTANT_TS); + String lastKnownInstantFromClient = + ctx.queryParam(RemoteHoodieTableFileSystemView.LAST_INSTANT_TS, HoodieTimeline.INVALID_INSTANT_TS); String timelineHashFromClient = ctx.queryParam(RemoteHoodieTableFileSystemView.TIMELINE_HASH, ""); - HoodieTimeline localTimeline = viewManager.getFileSystemView(basePath).getTimeline() - .filterCompletedAndCompactionInstants(); + HoodieTimeline localTimeline = + viewManager.getFileSystemView(basePath).getTimeline().filterCompletedAndCompactionInstants(); if (logger.isDebugEnabled()) { - logger.debug("Client [ LastTs=" + lastKnownInstantFromClient - + ", TimelineHash=" + timelineHashFromClient + "], localTimeline=" - + localTimeline.getInstants().collect(Collectors.toList())); + logger.debug("Client [ LastTs=" + lastKnownInstantFromClient + ", TimelineHash=" + timelineHashFromClient + + "], localTimeline=" + localTimeline.getInstants().collect(Collectors.toList())); } if ((localTimeline.getInstants().count() == 0) @@ -132,8 +131,8 @@ public class FileSystemViewHandler { private void writeValueAsString(Context ctx, Object obj) throws JsonProcessingException { boolean prettyPrint = ctx.queryParam("pretty") != null ? true : false; long beginJsonTs = System.currentTimeMillis(); - String result = prettyPrint ? mapper.writerWithDefaultPrettyPrinter().writeValueAsString(obj) - : mapper.writeValueAsString(obj); + String result = + prettyPrint ? mapper.writerWithDefaultPrettyPrinter().writeValueAsString(obj) : mapper.writeValueAsString(obj); long endJsonTs = System.currentTimeMillis(); logger.debug("Jsonify TimeTaken=" + (endJsonTs - beginJsonTs)); ctx.result(result); @@ -144,14 +143,14 @@ public class FileSystemViewHandler { */ private void registerTimelineAPI() { app.get(RemoteHoodieTableFileSystemView.LAST_INSTANT, new ViewHandler(ctx -> { - List dtos = instantHandler.getLastInstant(ctx.validatedQueryParam( - RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getValue()); + List dtos = instantHandler + .getLastInstant(ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getValue()); writeValueAsString(ctx, dtos); }, false)); app.get(RemoteHoodieTableFileSystemView.TIMELINE, new ViewHandler(ctx -> { - TimelineDTO dto = instantHandler.getTimeline(ctx.validatedQueryParam( - RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getValue()); + TimelineDTO dto = instantHandler + .getTimeline(ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getValue()); writeValueAsString(ctx, dto); }, false)); } @@ -161,62 +160,54 @@ public class FileSystemViewHandler { */ private void registerDataFilesAPI() { app.get(RemoteHoodieTableFileSystemView.LATEST_PARTITION_DATA_FILES_URL, new ViewHandler(ctx -> { - List dtos = - dataFileHandler - .getLatestDataFiles(ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getOrThrow(), - ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.PARTITION_PARAM).getOrThrow()); + List dtos = dataFileHandler.getLatestDataFiles( + ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getOrThrow(), + ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.PARTITION_PARAM).getOrThrow()); writeValueAsString(ctx, dtos); }, true)); app.get(RemoteHoodieTableFileSystemView.LATEST_PARTITION_DATA_FILE_URL, new ViewHandler(ctx -> { - List dtos = - dataFileHandler - .getLatestDataFile(ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getOrThrow(), - ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.PARTITION_PARAM).getOrThrow(), - ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.FILEID_PARAM).getOrThrow()); + List dtos = dataFileHandler.getLatestDataFile( + ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getOrThrow(), + ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.PARTITION_PARAM).getOrThrow(), + ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.FILEID_PARAM).getOrThrow()); writeValueAsString(ctx, dtos); }, true)); app.get(RemoteHoodieTableFileSystemView.LATEST_ALL_DATA_FILES, new ViewHandler(ctx -> { - List dtos = - dataFileHandler - .getLatestDataFiles(ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getOrThrow()); + List dtos = dataFileHandler + .getLatestDataFiles(ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getOrThrow()); writeValueAsString(ctx, dtos); }, true)); app.get(RemoteHoodieTableFileSystemView.LATEST_DATA_FILES_BEFORE_ON_INSTANT_URL, new ViewHandler(ctx -> { - List dtos = - dataFileHandler.getLatestDataFilesBeforeOrOn(ctx.validatedQueryParam( - RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getOrThrow(), - ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.PARTITION_PARAM).getOrThrow(), - ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.MAX_INSTANT_PARAM).getOrThrow()); + List dtos = dataFileHandler.getLatestDataFilesBeforeOrOn( + ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getOrThrow(), + ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.PARTITION_PARAM).getOrThrow(), + ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.MAX_INSTANT_PARAM).getOrThrow()); writeValueAsString(ctx, dtos); }, true)); app.get(RemoteHoodieTableFileSystemView.LATEST_DATA_FILE_ON_INSTANT_URL, new ViewHandler(ctx -> { - List dtos = - dataFileHandler - .getLatestDataFileOn(ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getOrThrow(), - ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.PARTITION_PARAM).getOrThrow(), - ctx.queryParam(RemoteHoodieTableFileSystemView.INSTANT_PARAM), ctx.validatedQueryParam( - RemoteHoodieTableFileSystemView.FILEID_PARAM).getOrThrow()); + List dtos = dataFileHandler.getLatestDataFileOn( + ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getOrThrow(), + ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.PARTITION_PARAM).getOrThrow(), + ctx.queryParam(RemoteHoodieTableFileSystemView.INSTANT_PARAM), + ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.FILEID_PARAM).getOrThrow()); writeValueAsString(ctx, dtos); }, true)); app.get(RemoteHoodieTableFileSystemView.ALL_DATA_FILES, new ViewHandler(ctx -> { - List dtos = - dataFileHandler - .getAllDataFiles(ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getOrThrow(), - ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.PARTITION_PARAM).getOrThrow()); + List dtos = dataFileHandler.getAllDataFiles( + ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getOrThrow(), + ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.PARTITION_PARAM).getOrThrow()); writeValueAsString(ctx, dtos); }, true)); app.get(RemoteHoodieTableFileSystemView.LATEST_DATA_FILES_RANGE_INSTANT_URL, new ViewHandler(ctx -> { - List dtos = - dataFileHandler.getLatestDataFilesInRange(ctx.validatedQueryParam( - RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getOrThrow(), - Arrays.asList( - ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.INSTANTS_PARAM).getOrThrow().split(","))); + List dtos = dataFileHandler.getLatestDataFilesInRange( + ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getOrThrow(), Arrays + .asList(ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.INSTANTS_PARAM).getOrThrow().split(","))); writeValueAsString(ctx, dtos); }, true)); } @@ -226,64 +217,57 @@ public class FileSystemViewHandler { */ private void registerFileSlicesAPI() { app.get(RemoteHoodieTableFileSystemView.LATEST_PARTITION_SLICES_URL, new ViewHandler(ctx -> { - List dtos = - sliceHandler - .getLatestFileSlices(ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getOrThrow(), - ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.PARTITION_PARAM).getOrThrow()); + List dtos = sliceHandler.getLatestFileSlices( + ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getOrThrow(), + ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.PARTITION_PARAM).getOrThrow()); writeValueAsString(ctx, dtos); }, true)); app.get(RemoteHoodieTableFileSystemView.LATEST_PARTITION_SLICE_URL, new ViewHandler(ctx -> { - List dtos = - sliceHandler - .getLatestFileSlice(ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getOrThrow(), - ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.PARTITION_PARAM).getOrThrow(), - ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.FILEID_PARAM).getOrThrow()); + List dtos = sliceHandler.getLatestFileSlice( + ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getOrThrow(), + ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.PARTITION_PARAM).getOrThrow(), + ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.FILEID_PARAM).getOrThrow()); writeValueAsString(ctx, dtos); }, true)); app.get(RemoteHoodieTableFileSystemView.LATEST_PARTITION_UNCOMPACTED_SLICES_URL, new ViewHandler(ctx -> { - List dtos = - sliceHandler.getLatestUnCompactedFileSlices(ctx.validatedQueryParam( - RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getOrThrow(), - ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.PARTITION_PARAM).getOrThrow()); + List dtos = sliceHandler.getLatestUnCompactedFileSlices( + ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getOrThrow(), + ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.PARTITION_PARAM).getOrThrow()); writeValueAsString(ctx, dtos); }, true)); app.get(RemoteHoodieTableFileSystemView.ALL_SLICES_URL, new ViewHandler(ctx -> { - List dtos = - sliceHandler - .getAllFileSlices(ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getOrThrow(), - ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.PARTITION_PARAM).getOrThrow()); + List dtos = sliceHandler.getAllFileSlices( + ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getOrThrow(), + ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.PARTITION_PARAM).getOrThrow()); writeValueAsString(ctx, dtos); }, true)); app.get(RemoteHoodieTableFileSystemView.LATEST_SLICES_RANGE_INSTANT_URL, new ViewHandler(ctx -> { - List dtos = - sliceHandler.getLatestFileSliceInRange( - ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getOrThrow(), - Arrays.asList( - ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.INSTANTS_PARAM).getOrThrow().split(","))); + List dtos = sliceHandler.getLatestFileSliceInRange( + ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getOrThrow(), Arrays + .asList(ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.INSTANTS_PARAM).getOrThrow().split(","))); writeValueAsString(ctx, dtos); }, true)); app.get(RemoteHoodieTableFileSystemView.LATEST_SLICES_MERGED_BEFORE_ON_INSTANT_URL, new ViewHandler(ctx -> { - List dtos = - sliceHandler.getLatestMergedFileSlicesBeforeOrOn(ctx.validatedQueryParam( - RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getOrThrow(), - ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.PARTITION_PARAM).getOrThrow(), - ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.MAX_INSTANT_PARAM).getOrThrow()); + List dtos = sliceHandler.getLatestMergedFileSlicesBeforeOrOn( + ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getOrThrow(), + ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.PARTITION_PARAM).getOrThrow(), + ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.MAX_INSTANT_PARAM).getOrThrow()); writeValueAsString(ctx, dtos); }, true)); app.get(RemoteHoodieTableFileSystemView.LATEST_SLICES_BEFORE_ON_INSTANT_URL, new ViewHandler(ctx -> { - List dtos = - sliceHandler.getLatestFileSlicesBeforeOrOn(ctx.validatedQueryParam( - RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getOrThrow(), - ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.PARTITION_PARAM).getOrThrow(), - ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.MAX_INSTANT_PARAM).getOrThrow(), - Boolean.valueOf(ctx.validatedQueryParam( - RemoteHoodieTableFileSystemView.INCLUDE_FILES_IN_PENDING_COMPACTION_PARAM).getOrThrow())); + List dtos = sliceHandler.getLatestFileSlicesBeforeOrOn( + ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getOrThrow(), + ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.PARTITION_PARAM).getOrThrow(), + ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.MAX_INSTANT_PARAM).getOrThrow(), + Boolean.valueOf( + ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.INCLUDE_FILES_IN_PENDING_COMPACTION_PARAM) + .getOrThrow())); writeValueAsString(ctx, dtos); }, true)); @@ -294,15 +278,15 @@ public class FileSystemViewHandler { }, true)); app.get(RemoteHoodieTableFileSystemView.ALL_FILEGROUPS_FOR_PARTITION_URL, new ViewHandler(ctx -> { - List dtos = sliceHandler.getAllFileGroups(ctx.validatedQueryParam( - RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getOrThrow(), + List dtos = sliceHandler.getAllFileGroups( + ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getOrThrow(), ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.PARTITION_PARAM).getOrThrow()); writeValueAsString(ctx, dtos); }, true)); app.post(RemoteHoodieTableFileSystemView.REFRESH_DATASET, new ViewHandler(ctx -> { - boolean success = sliceHandler.refreshDataset(ctx.validatedQueryParam( - RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getOrThrow()); + boolean success = sliceHandler + .refreshDataset(ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.BASEPATH_PARAM).getOrThrow()); writeValueAsString(ctx, success); }, false)); } @@ -348,11 +332,13 @@ public class FileSystemViewHandler { if (refreshCheck) { long beginFinalCheck = System.currentTimeMillis(); - String errMsg = "Last known instant from client was " - + context.queryParam(RemoteHoodieTableFileSystemView.LAST_INSTANT_TS, HoodieTimeline.INVALID_INSTANT_TS) - + " but server has the following timeline " - + viewManager.getFileSystemView(context.queryParam(RemoteHoodieTableFileSystemView.BASEPATH_PARAM)) - .getTimeline().getInstants().collect(Collectors.toList()); + String errMsg = + "Last known instant from client was " + + context.queryParam(RemoteHoodieTableFileSystemView.LAST_INSTANT_TS, + HoodieTimeline.INVALID_INSTANT_TS) + + " but server has the following timeline " + + viewManager.getFileSystemView(context.queryParam(RemoteHoodieTableFileSystemView.BASEPATH_PARAM)) + .getTimeline().getInstants().collect(Collectors.toList()); Preconditions.checkArgument(!isLocalViewBehind(context), errMsg); long endFinalCheck = System.currentTimeMillis(); finalCheckTimeTaken = endFinalCheck - beginFinalCheck; @@ -364,9 +350,12 @@ public class FileSystemViewHandler { } finally { long endTs = System.currentTimeMillis(); long timeTakenMillis = endTs - beginTs; - logger.info(String.format("TimeTakenMillis[Total=%d, Refresh=%d, handle=%d, Check=%d], " - + "Success=%s, Query=%s, Host=%s, synced=%s", timeTakenMillis, refreshCheckTimeTaken, handleTimeTaken, - finalCheckTimeTaken, success, context.queryString(), context.host(), synced)); + logger + .info(String.format( + "TimeTakenMillis[Total=%d, Refresh=%d, handle=%d, Check=%d], " + + "Success=%s, Query=%s, Host=%s, synced=%s", + timeTakenMillis, refreshCheckTimeTaken, handleTimeTaken, finalCheckTimeTaken, success, + context.queryString(), context.host(), synced)); } } } diff --git a/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/TimelineService.java b/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/TimelineService.java index bf7f99a0b..531917310 100644 --- a/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/TimelineService.java +++ b/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/TimelineService.java @@ -50,8 +50,8 @@ public class TimelineService { return serverPort; } - public TimelineService(int serverPort, FileSystemViewManager globalFileSystemViewManager, - Configuration conf) throws IOException { + public TimelineService(int serverPort, FileSystemViewManager globalFileSystemViewManager, Configuration conf) + throws IOException { this.conf = FSUtils.prepareHadoopConf(conf); this.fs = FileSystem.get(conf); this.serverPort = serverPort; @@ -89,8 +89,7 @@ public class TimelineService { description = "Directory where spilled view entries will be stored. Used for SPILLABLE_DISK storage type") public String baseStorePathForFileGroups = FileSystemViewStorageConfig.DEFAULT_VIEW_SPILLABLE_DIR; - @Parameter(names = {"--rocksdb-path", "-rp"}, - description = "Root directory for RocksDB") + @Parameter(names = {"--rocksdb-path", "-rp"}, description = "Root directory for RocksDB") public String rocksDBPath = FileSystemViewStorageConfig.DEFAULT_ROCKSDB_BASE_PATH; @Parameter(names = {"--help", "-h"}) diff --git a/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/handlers/DataFileHandler.java b/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/handlers/DataFileHandler.java index a1cb8ce8c..f3364495c 100644 --- a/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/handlers/DataFileHandler.java +++ b/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/handlers/DataFileHandler.java @@ -32,8 +32,7 @@ import org.apache.hudi.common.table.view.FileSystemViewManager; */ public class DataFileHandler extends Handler { - public DataFileHandler(Configuration conf, - FileSystemViewManager viewManager) throws IOException { + public DataFileHandler(Configuration conf, FileSystemViewManager viewManager) throws IOException { super(conf, viewManager); } @@ -48,8 +47,8 @@ public class DataFileHandler extends Handler { } public List getLatestDataFiles(String basePath) { - return viewManager.getFileSystemView(basePath).getLatestDataFiles() - .map(DataFileDTO::fromHoodieDataFile).collect(Collectors.toList()); + return viewManager.getFileSystemView(basePath).getLatestDataFiles().map(DataFileDTO::fromHoodieDataFile) + .collect(Collectors.toList()); } public List getLatestDataFilesBeforeOrOn(String basePath, String partitionPath, String maxInstantTime) { @@ -71,8 +70,8 @@ public class DataFileHandler extends Handler { } public List getAllDataFiles(String basePath, String partitionPath) { - return viewManager.getFileSystemView(basePath).getAllDataFiles(partitionPath) - .map(DataFileDTO::fromHoodieDataFile).collect(Collectors.toList()); + return viewManager.getFileSystemView(basePath).getAllDataFiles(partitionPath).map(DataFileDTO::fromHoodieDataFile) + .collect(Collectors.toList()); } } diff --git a/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/handlers/FileSliceHandler.java b/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/handlers/FileSliceHandler.java index d63d31739..eb283fb3d 100644 --- a/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/handlers/FileSliceHandler.java +++ b/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/handlers/FileSliceHandler.java @@ -34,8 +34,7 @@ import org.apache.hudi.common.table.view.FileSystemViewManager; */ public class FileSliceHandler extends Handler { - public FileSliceHandler(Configuration conf, - FileSystemViewManager viewManager) throws IOException { + public FileSliceHandler(Configuration conf, FileSystemViewManager viewManager) throws IOException { super(conf, viewManager); } @@ -49,16 +48,17 @@ public class FileSliceHandler extends Handler { .map(FileSliceDTO::fromFileSlice).collect(Collectors.toList()); } - public List getLatestMergedFileSlicesBeforeOrOn(String basePath, - String partitionPath, String maxInstantTime) { + public List getLatestMergedFileSlicesBeforeOrOn(String basePath, String partitionPath, + String maxInstantTime) { return viewManager.getFileSystemView(basePath).getLatestMergedFileSlicesBeforeOrOn(partitionPath, maxInstantTime) .map(FileSliceDTO::fromFileSlice).collect(Collectors.toList()); } - public List getLatestFileSlicesBeforeOrOn(String basePath, String partitionPath, - String maxInstantTime, boolean includeFileSlicesInPendingCompaction) { - return viewManager.getFileSystemView(basePath).getLatestFileSlicesBeforeOrOn(partitionPath, maxInstantTime, - includeFileSlicesInPendingCompaction).map(FileSliceDTO::fromFileSlice).collect(Collectors.toList()); + public List getLatestFileSlicesBeforeOrOn(String basePath, String partitionPath, String maxInstantTime, + boolean includeFileSlicesInPendingCompaction) { + return viewManager.getFileSystemView(basePath) + .getLatestFileSlicesBeforeOrOn(partitionPath, maxInstantTime, includeFileSlicesInPendingCompaction) + .map(FileSliceDTO::fromFileSlice).collect(Collectors.toList()); } public List getLatestUnCompactedFileSlices(String basePath, String partitionPath) { @@ -67,8 +67,8 @@ public class FileSliceHandler extends Handler { } public List getLatestFileSlices(String basePath, String partitionPath) { - return viewManager.getFileSystemView(basePath).getLatestFileSlices(partitionPath) - .map(FileSliceDTO::fromFileSlice).collect(Collectors.toList()); + return viewManager.getFileSystemView(basePath).getLatestFileSlices(partitionPath).map(FileSliceDTO::fromFileSlice) + .collect(Collectors.toList()); } public List getLatestFileSlice(String basePath, String partitionPath, String fileId) { @@ -83,8 +83,8 @@ public class FileSliceHandler extends Handler { } public List getAllFileGroups(String basePath, String partitionPath) { - return viewManager.getFileSystemView(basePath).getAllFileGroups(partitionPath) - .map(FileGroupDTO::fromFileGroup).collect(Collectors.toList()); + return viewManager.getFileSystemView(basePath).getAllFileGroups(partitionPath).map(FileGroupDTO::fromFileGroup) + .collect(Collectors.toList()); } public boolean refreshDataset(String basePath) { diff --git a/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/handlers/TimelineHandler.java b/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/handlers/TimelineHandler.java index 1dfb5d6fe..49fc2ce98 100644 --- a/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/handlers/TimelineHandler.java +++ b/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/handlers/TimelineHandler.java @@ -32,14 +32,13 @@ import org.apache.hudi.common.table.view.FileSystemViewManager; */ public class TimelineHandler extends Handler { - public TimelineHandler(Configuration conf, - FileSystemViewManager viewManager) throws IOException { + public TimelineHandler(Configuration conf, FileSystemViewManager viewManager) throws IOException { super(conf, viewManager); } public List getLastInstant(String basePath) { - return viewManager.getFileSystemView(basePath).getLastInstant() - .map(InstantDTO::fromInstant).map(dto -> Arrays.asList(dto)).orElse(new ArrayList<>()); + return viewManager.getFileSystemView(basePath).getLastInstant().map(InstantDTO::fromInstant) + .map(dto -> Arrays.asList(dto)).orElse(new ArrayList<>()); } public TimelineDTO getTimeline(String basePath) { diff --git a/hudi-timeline-service/src/test/java/org/apache/hudi/timeline/table/view/RemoteHoodieTableFileSystemViewTest.java b/hudi-timeline-service/src/test/java/org/apache/hudi/timeline/table/view/RemoteHoodieTableFileSystemViewTest.java index cf901e039..9784d2fab 100644 --- a/hudi-timeline-service/src/test/java/org/apache/hudi/timeline/table/view/RemoteHoodieTableFileSystemViewTest.java +++ b/hudi-timeline-service/src/test/java/org/apache/hudi/timeline/table/view/RemoteHoodieTableFileSystemViewTest.java @@ -41,13 +41,11 @@ public class RemoteHoodieTableFileSystemViewTest extends HoodieTableFileSystemVi private RemoteHoodieTableFileSystemView view; protected SyncableFileSystemView getFileSystemView(HoodieTimeline timeline) { - FileSystemViewStorageConfig sConf = FileSystemViewStorageConfig.newBuilder() - .withStorageType(FileSystemViewStorageType.SPILLABLE_DISK).build(); + FileSystemViewStorageConfig sConf = + FileSystemViewStorageConfig.newBuilder().withStorageType(FileSystemViewStorageType.SPILLABLE_DISK).build(); try { - server = - new TimelineService(0, - FileSystemViewManager.createViewManager(new SerializableConfiguration(metaClient.getHadoopConf()), - sConf)); + server = new TimelineService(0, + FileSystemViewManager.createViewManager(new SerializableConfiguration(metaClient.getHadoopConf()), sConf)); server.startService(); } catch (Exception ex) { throw new RuntimeException(ex); diff --git a/hudi-utilities/pom.xml b/hudi-utilities/pom.xml index 98c491971..612aa2f79 100644 --- a/hudi-utilities/pom.xml +++ b/hudi-utilities/pom.xml @@ -26,6 +26,10 @@ hudi-utilities jar + + ${project.parent.basedir} + + diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HDFSParquetImporter.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HDFSParquetImporter.java index 2dc907cd5..a99b5659a 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HDFSParquetImporter.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HDFSParquetImporter.java @@ -75,8 +75,8 @@ public class HDFSParquetImporter implements Serializable { public HDFSParquetImporter(Config cfg) throws IOException { this.cfg = cfg; - this.props = cfg.propsFilePath == null ? UtilHelpers.buildProperties(cfg.configs) : - UtilHelpers.readConfig(fs, new Path(cfg.propsFilePath), cfg.configs).getConfig(); + this.props = cfg.propsFilePath == null ? UtilHelpers.buildProperties(cfg.configs) + : UtilHelpers.readConfig(fs, new Path(cfg.propsFilePath), cfg.configs).getConfig(); log.info("Creating Cleaner with configs : " + props.toString()); } @@ -88,8 +88,8 @@ public class HDFSParquetImporter implements Serializable { System.exit(1); } HDFSParquetImporter dataImporter = new HDFSParquetImporter(cfg); - JavaSparkContext jssc = UtilHelpers - .buildSparkContext("data-importer-" + cfg.tableName, cfg.sparkMaster, cfg.sparkMemory); + JavaSparkContext jssc = + UtilHelpers.buildSparkContext("data-importer-" + cfg.tableName, cfg.sparkMaster, cfg.sparkMemory); try { dataImporter.dataImport(jssc, cfg.retry); } finally { @@ -123,18 +123,17 @@ public class HDFSParquetImporter implements Serializable { fs.delete(new Path(cfg.targetPath), true); } - //Get schema. + // Get schema. String schemaStr = UtilHelpers.parseSchema(fs, cfg.schemaFile); // Initialize target hoodie table. Properties properties = new Properties(); properties.put(HoodieTableConfig.HOODIE_TABLE_NAME_PROP_NAME, cfg.tableName); properties.put(HoodieTableConfig.HOODIE_TABLE_TYPE_PROP_NAME, cfg.tableType); - HoodieTableMetaClient - .initDatasetAndGetMetaClient(jsc.hadoopConfiguration(), cfg.targetPath, properties); + HoodieTableMetaClient.initDatasetAndGetMetaClient(jsc.hadoopConfiguration(), cfg.targetPath, properties); - HoodieWriteClient client = UtilHelpers.createHoodieClient(jsc, cfg.targetPath, schemaStr, - cfg.parallelism, Option.empty(), props); + HoodieWriteClient client = + UtilHelpers.createHoodieClient(jsc, cfg.targetPath, schemaStr, cfg.parallelism, Option.empty(), props); JavaRDD> hoodieRecords = buildHoodieRecordsForImport(jsc, schemaStr); // Get instant time. @@ -147,66 +146,56 @@ public class HDFSParquetImporter implements Serializable { return -1; } - protected JavaRDD> buildHoodieRecordsForImport( - JavaSparkContext jsc, String schemaStr) throws IOException { + protected JavaRDD> buildHoodieRecordsForImport(JavaSparkContext jsc, + String schemaStr) throws IOException { Job job = Job.getInstance(jsc.hadoopConfiguration()); // Allow recursive directories to be found job.getConfiguration().set(FileInputFormat.INPUT_DIR_RECURSIVE, "true"); // To parallelize reading file status. job.getConfiguration().set(FileInputFormat.LIST_STATUS_NUM_THREADS, "1024"); - AvroReadSupport - .setAvroReadSchema(jsc.hadoopConfiguration(), (new Schema.Parser().parse(schemaStr))); + AvroReadSupport.setAvroReadSchema(jsc.hadoopConfiguration(), (new Schema.Parser().parse(schemaStr))); ParquetInputFormat.setReadSupportClass(job, (AvroReadSupport.class)); - return jsc.newAPIHadoopFile(cfg.srcPath, - ParquetInputFormat.class, Void.class, GenericRecord.class, job.getConfiguration()) + return jsc + .newAPIHadoopFile(cfg.srcPath, ParquetInputFormat.class, Void.class, GenericRecord.class, + job.getConfiguration()) // To reduce large number of // tasks. - .coalesce(16 * cfg.parallelism) - .map(entry -> { - GenericRecord genericRecord - = ((Tuple2) entry)._2(); - Object partitionField = - genericRecord.get(cfg.partitionKey); + .coalesce(16 * cfg.parallelism).map(entry -> { + GenericRecord genericRecord = ((Tuple2) entry)._2(); + Object partitionField = genericRecord.get(cfg.partitionKey); if (partitionField == null) { - throw new HoodieIOException( - "partition key is missing. :" - + cfg.partitionKey); + throw new HoodieIOException("partition key is missing. :" + cfg.partitionKey); } Object rowField = genericRecord.get(cfg.rowKey); if (rowField == null) { - throw new HoodieIOException( - "row field is missing. :" + cfg.rowKey); + throw new HoodieIOException("row field is missing. :" + cfg.rowKey); } String partitionPath = partitionField.toString(); logger.info("Row Key : " + rowField + ", Partition Path is (" + partitionPath + ")"); if (partitionField instanceof Number) { try { long ts = (long) (Double.parseDouble(partitionField.toString()) * 1000L); - partitionPath = - PARTITION_FORMATTER.format(new Date(ts)); + partitionPath = PARTITION_FORMATTER.format(new Date(ts)); } catch (NumberFormatException nfe) { logger.warn("Unable to parse date from partition field. Assuming partition as (" + partitionField + ")"); } } - return new HoodieRecord<>( - new HoodieKey( - (String) rowField, partitionPath), - new HoodieJsonPayload( - genericRecord.toString())); + return new HoodieRecord<>(new HoodieKey((String) rowField, partitionPath), + new HoodieJsonPayload(genericRecord.toString())); }); } /** * Imports records to Hoodie dataset * - * @param client Hoodie Client - * @param instantTime Instant Time + * @param client Hoodie Client + * @param instantTime Instant Time * @param hoodieRecords Hoodie Records - * @param Type + * @param Type */ - protected JavaRDD load(HoodieWriteClient client, - String instantTime, JavaRDD> hoodieRecords) { + protected JavaRDD load(HoodieWriteClient client, String instantTime, + JavaRDD> hoodieRecords) { if (cfg.command.toLowerCase().equals("insert")) { return client.insert(hoodieRecords, instantTime); } @@ -220,48 +209,40 @@ public class HDFSParquetImporter implements Serializable { @Override public void validate(String name, String value) throws ParameterException { if (value == null || !validFormats.contains(value)) { - throw new ParameterException(String.format( - "Invalid format type: value:%s: supported formats:%s", value, validFormats)); + throw new ParameterException( + String.format("Invalid format type: value:%s: supported formats:%s", value, validFormats)); } } } public static class Config implements Serializable { - @Parameter(names = {"--command", "-c"}, - description = "Write command Valid values are insert(default)/upsert", + @Parameter(names = {"--command", "-c"}, description = "Write command Valid values are insert(default)/upsert", required = false) public String command = "INSERT"; - @Parameter(names = {"--src-path", - "-sp"}, description = "Base path for the input dataset", required = true) + @Parameter(names = {"--src-path", "-sp"}, description = "Base path for the input dataset", required = true) public String srcPath = null; - @Parameter(names = {"--target-path", - "-tp"}, description = "Base path for the target hoodie dataset", required = true) + @Parameter(names = {"--target-path", "-tp"}, description = "Base path for the target hoodie dataset", + required = true) public String targetPath = null; @Parameter(names = {"--table-name", "-tn"}, description = "Table name", required = true) public String tableName = null; @Parameter(names = {"--table-type", "-tt"}, description = "Table type", required = true) public String tableType = null; - @Parameter(names = {"--row-key-field", - "-rk"}, description = "Row key field name", required = true) + @Parameter(names = {"--row-key-field", "-rk"}, description = "Row key field name", required = true) public String rowKey = null; - @Parameter(names = {"--partition-key-field", - "-pk"}, description = "Partition key field name", required = true) + @Parameter(names = {"--partition-key-field", "-pk"}, description = "Partition key field name", required = true) public String partitionKey = null; - @Parameter(names = {"--parallelism", - "-pl"}, description = "Parallelism for hoodie insert", required = true) + @Parameter(names = {"--parallelism", "-pl"}, description = "Parallelism for hoodie insert", required = true) public int parallelism = 1; - @Parameter(names = {"--schema-file", - "-sf"}, description = "path for Avro schema file", required = true) + @Parameter(names = {"--schema-file", "-sf"}, description = "path for Avro schema file", required = true) public String schemaFile = null; - @Parameter(names = {"--format", - "-f"}, description = "Format for the input data.", required = false, validateValueWith = - FormatValidator.class) + @Parameter(names = {"--format", "-f"}, description = "Format for the input data.", required = false, + validateValueWith = FormatValidator.class) public String format = null; @Parameter(names = {"--spark-master", "-ms"}, description = "Spark master", required = false) public String sparkMaster = null; - @Parameter(names = {"--spark-memory", - "-sm"}, description = "spark memory to use", required = true) + @Parameter(names = {"--spark-memory", "-sm"}, description = "spark memory to use", required = true) public String sparkMemory = null; @Parameter(names = {"--retry", "-rt"}, description = "number of retries", required = false) public int retry = 0; diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HiveIncrementalPuller.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HiveIncrementalPuller.java index 3f443c3fe..2cba5fe78 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HiveIncrementalPuller.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HiveIncrementalPuller.java @@ -49,13 +49,13 @@ import org.apache.log4j.Logger; import org.stringtemplate.v4.ST; /** - * Utility to pull data after a given commit, based on the supplied HiveQL and save the delta as - * another hive temporary table. + * Utility to pull data after a given commit, based on the supplied HiveQL and save the delta as another hive temporary + * table. *

    * Current Limitations: *

    - * - Only the source table can be incrementally pulled (usually the largest table) - The - * incrementally pulled table can't be referenced more than once. + * - Only the source table can be incrementally pulled (usually the largest table) - The incrementally pulled table + * can't be referenced more than once. */ public class HiveIncrementalPuller { @@ -109,8 +109,8 @@ public class HiveIncrementalPuller { public HiveIncrementalPuller(Config config) throws IOException { this.config = config; validateConfig(config); - String templateContent = FileIOUtils.readAsUTFString( - this.getClass().getResourceAsStream("IncrementalPull.sqltemplate")); + String templateContent = + FileIOUtils.readAsUTFString(this.getClass().getResourceAsStream("IncrementalPull.sqltemplate")); incrementalPullSQLtemplate = new ST(templateContent); } @@ -143,14 +143,12 @@ public class HiveIncrementalPuller { // drop the temp table if exists String tempDbTable = config.tmpDb + "." + config.targetTable + "__" + config.sourceTable; String tempDbTablePath = - config.hoodieTmpDir + "/" + config.targetTable + "__" + config.sourceTable + "/" - + lastCommitTime; + config.hoodieTmpDir + "/" + config.targetTable + "__" + config.sourceTable + "/" + lastCommitTime; executeStatement("drop table " + tempDbTable, stmt); deleteHDFSPath(fs, tempDbTablePath); if (!ensureTempPathExists(fs, lastCommitTime)) { - throw new IllegalStateException( - "Could not create target path at " + new Path(config.hoodieTmpDir, - config.targetTable + "/" + lastCommitTime)); + throw new IllegalStateException("Could not create target path at " + + new Path(config.hoodieTmpDir, config.targetTable + "/" + lastCommitTime)); } initHiveBeelineProperties(stmt); @@ -178,12 +176,10 @@ public class HiveIncrementalPuller { String storedAsClause = getStoredAsClause(); incrementalPullSQLtemplate.add("storedAsClause", storedAsClause); - String incrementalSQL = new Scanner(new File(config.incrementalSQLFile)).useDelimiter("\\Z") - .next(); + String incrementalSQL = new Scanner(new File(config.incrementalSQLFile)).useDelimiter("\\Z").next(); if (!incrementalSQL.contains(config.sourceDb + "." + config.sourceTable)) { log.info("Incremental SQL does not have " + config.sourceDb + "." + config.sourceTable - + ", which means its pulling from a different table. Fencing this from " - + "happening."); + + ", which means its pulling from a different table. Fencing this from " + "happening."); throw new HoodieIncrementalPullSQLException( "Incremental SQL does not have " + config.sourceDb + "." + config.sourceTable); } @@ -196,8 +192,7 @@ public class HiveIncrementalPuller { + "means its not pulling incrementally"); } - incrementalPullSQLtemplate - .add("incrementalSQL", String.format(incrementalSQL, config.fromCommitTime)); + incrementalPullSQLtemplate.add("incrementalSQL", String.format(incrementalSQL, config.fromCommitTime)); String sql = incrementalPullSQLtemplate.render(); // Check if the SQL is pulling from the right database executeStatement(sql, stmt); @@ -212,8 +207,7 @@ public class HiveIncrementalPuller { // set the queue executeStatement("set mapred.job.queue.name=" + config.yarnQueueName, stmt); // Set the inputformat to HoodieCombineHiveInputFormat - executeStatement( - "set hive.input.format=org.apache.hudi.hadoop.hive.HoodieCombineHiveInputFormat", stmt); + executeStatement("set hive.input.format=org.apache.hudi.hadoop.hive.HoodieCombineHiveInputFormat", stmt); // Allow queries without partition predicate executeStatement("set hive.strict.checks.large.query=false", stmt); // Dont gather stats for the table created @@ -221,12 +215,10 @@ public class HiveIncrementalPuller { // Set the hoodie modie executeStatement("set hoodie." + config.sourceTable + ".consume.mode=INCREMENTAL", stmt); // Set the from commit time - executeStatement( - "set hoodie." + config.sourceTable + ".consume.start.timestamp=" + config.fromCommitTime, - stmt); + executeStatement("set hoodie." + config.sourceTable + ".consume.start.timestamp=" + config.fromCommitTime, stmt); // Set number of commits to pull - executeStatement("set hoodie." + config.sourceTable + ".consume.max.commits=" + String.valueOf( - config.maxCommits), stmt); + executeStatement("set hoodie." + config.sourceTable + ".consume.max.commits=" + String.valueOf(config.maxCommits), + stmt); } private boolean deleteHDFSPath(FileSystem fs, String path) throws IOException { @@ -240,9 +232,8 @@ public class HiveIncrementalPuller { } private String inferCommitTime(FileSystem fs) throws SQLException, IOException { - log.info( - "FromCommitTime not specified. Trying to infer it from Hoodie dataset " + config.targetDb - + "." + config.targetTable); + log.info("FromCommitTime not specified. Trying to infer it from Hoodie dataset " + config.targetDb + "." + + config.targetTable); String targetDataLocation = getTableLocation(config.targetDb, config.targetTable); return scanForCommitTime(fs, targetDataLocation); } @@ -256,14 +247,12 @@ public class HiveIncrementalPuller { resultSet = stmt.executeQuery("describe formatted `" + db + "." + table + "`"); while (resultSet.next()) { if (resultSet.getString(1).trim().equals("Location:")) { - log.info( - "Inferred table location for " + db + "." + table + " as " + resultSet.getString(2)); + log.info("Inferred table location for " + db + "." + table + " as " + resultSet.getString(2)); return resultSet.getString(2); } } } catch (SQLException e) { - throw new HoodieIncrementalPullException( - "Failed to get data location for table " + db + "." + table, e); + throw new HoodieIncrementalPullException("Failed to get data location for table " + db + "." + table, e); } finally { try { if (stmt != null) { @@ -281,16 +270,15 @@ public class HiveIncrementalPuller { private String scanForCommitTime(FileSystem fs, String targetDataPath) throws IOException { if (targetDataPath == null) { - throw new IllegalArgumentException( - "Please specify either --fromCommitTime or --targetDataPath"); + throw new IllegalArgumentException("Please specify either --fromCommitTime or --targetDataPath"); } if (!fs.exists(new Path(targetDataPath)) || !fs.exists(new Path(targetDataPath + "/.hoodie"))) { return "0"; } HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs.getConf(), targetDataPath); - Option lastCommit = metadata.getActiveTimeline().getCommitsTimeline() - .filterCompletedInstants().lastInstant(); + Option lastCommit = + metadata.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().lastInstant(); if (lastCommit.isPresent()) { return lastCommit.get().getTimestamp(); } @@ -298,15 +286,13 @@ public class HiveIncrementalPuller { } private boolean ensureTempPathExists(FileSystem fs, String lastCommitTime) throws IOException { - Path targetBaseDirPath = new Path(config.hoodieTmpDir, - config.targetTable + "__" + config.sourceTable); + Path targetBaseDirPath = new Path(config.hoodieTmpDir, config.targetTable + "__" + config.sourceTable); if (!fs.exists(targetBaseDirPath)) { log.info("Creating " + targetBaseDirPath + " with permission drwxrwxrwx"); - boolean result = FileSystem.mkdirs(fs, targetBaseDirPath, - new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL)); + boolean result = + FileSystem.mkdirs(fs, targetBaseDirPath, new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL)); if (!result) { - throw new HoodieException( - "Could not create " + targetBaseDirPath + " with the required permissions"); + throw new HoodieException("Could not create " + targetBaseDirPath + " with the required permissions"); } } @@ -318,23 +304,20 @@ public class HiveIncrementalPuller { } } log.info("Creating " + targetPath + " with permission drwxrwxrwx"); - return FileSystem.mkdirs(fs, targetBaseDirPath, - new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL)); + return FileSystem.mkdirs(fs, targetBaseDirPath, new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL)); } - private String getLastCommitTimePulled(FileSystem fs, String sourceTableLocation) - throws IOException { + private String getLastCommitTimePulled(FileSystem fs, String sourceTableLocation) throws IOException { HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs.getConf(), sourceTableLocation); - List commitsToSync = metadata.getActiveTimeline().getCommitsTimeline() - .filterCompletedInstants() - .findInstantsAfter(config.fromCommitTime, config.maxCommits) - .getInstants().map(HoodieInstant::getTimestamp) + List commitsToSync = metadata.getActiveTimeline().getCommitsTimeline().filterCompletedInstants() + .findInstantsAfter(config.fromCommitTime, config.maxCommits).getInstants().map(HoodieInstant::getTimestamp) .collect(Collectors.toList()); if (commitsToSync.isEmpty()) { - log.warn("Nothing to sync. All commits in " + config.sourceTable + " are " - + metadata.getActiveTimeline().getCommitsTimeline().filterCompletedInstants() - .getInstants().collect(Collectors.toList()) - + " and from commit time is " + config.fromCommitTime); + log.warn( + "Nothing to sync. All commits in " + + config.sourceTable + " are " + metadata.getActiveTimeline().getCommitsTimeline() + .filterCompletedInstants().getInstants().collect(Collectors.toList()) + + " and from commit time is " + config.fromCommitTime); return null; } log.info("Syncing commits " + commitsToSync); diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieCleaner.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieCleaner.java index aee1ca436..8ca8cd1aa 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieCleaner.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieCleaner.java @@ -62,8 +62,8 @@ public class HoodieCleaner { this.cfg = cfg; this.jssc = jssc; this.fs = FSUtils.getFs(cfg.basePath, jssc.hadoopConfiguration()); - this.props = cfg.propsFilePath == null ? UtilHelpers.buildProperties(cfg.configs) : - UtilHelpers.readConfig(fs, new Path(cfg.propsFilePath), cfg.configs).getConfig(); + this.props = cfg.propsFilePath == null ? UtilHelpers.buildProperties(cfg.configs) + : UtilHelpers.readConfig(fs, new Path(cfg.propsFilePath), cfg.configs).getConfig(); log.info("Creating Cleaner with configs : " + props.toString()); } @@ -74,8 +74,7 @@ public class HoodieCleaner { } private HoodieWriteConfig getHoodieClientConfig() throws Exception { - return HoodieWriteConfig.newBuilder().combineInput(true, true).withPath(cfg.basePath) - .withAutoCommit(false) + return HoodieWriteConfig.newBuilder().combineInput(true, true).withPath(cfg.basePath).withAutoCommit(false) .withProps(props).build(); } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieCompactionAdminTool.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieCompactionAdminTool.java index 74a4128f3..63bf441ff 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieCompactionAdminTool.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieCompactionAdminTool.java @@ -74,26 +74,23 @@ public class HoodieCompactionAdminTool { serializeOperationResult(fs, res); break; case UNSCHEDULE_FILE: - List r = - admin.unscheduleCompactionFileId(new HoodieFileGroupId(cfg.partitionPath, cfg.fileId), - cfg.skipValidation, cfg.dryRun); + List r = admin.unscheduleCompactionFileId( + new HoodieFileGroupId(cfg.partitionPath, cfg.fileId), cfg.skipValidation, cfg.dryRun); if (cfg.printOutput) { System.out.println(r); } serializeOperationResult(fs, r); break; case UNSCHEDULE_PLAN: - List r2 = - admin - .unscheduleCompactionPlan(cfg.compactionInstantTime, cfg.skipValidation, cfg.parallelism, cfg.dryRun); + List r2 = admin.unscheduleCompactionPlan(cfg.compactionInstantTime, cfg.skipValidation, + cfg.parallelism, cfg.dryRun); if (cfg.printOutput) { printOperationResult("Result of Unscheduling Compaction Plan :", r2); } serializeOperationResult(fs, r2); break; case REPAIR: - List r3 = - admin.repairCompaction(cfg.compactionInstantTime, cfg.parallelism, cfg.dryRun); + List r3 = admin.repairCompaction(cfg.compactionInstantTime, cfg.parallelism, cfg.dryRun); if (cfg.printOutput) { printOperationResult("Result of Repair Operation :", r3); } @@ -122,7 +119,7 @@ public class HoodieCompactionAdminTool { * Print Operation Result * * @param initialLine Initial Line - * @param result Result + * @param result Result */ private void printOperationResult(String initialLine, List result) { System.out.println(initialLine); @@ -135,10 +132,7 @@ public class HoodieCompactionAdminTool { * Operation Types */ public enum Operation { - VALIDATE, - UNSCHEDULE_PLAN, - UNSCHEDULE_FILE, - REPAIR + VALIDATE, UNSCHEDULE_PLAN, UNSCHEDULE_FILE, REPAIR } /** diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieCompactor.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieCompactor.java index 7aa56926b..540653dcc 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieCompactor.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieCompactor.java @@ -45,29 +45,24 @@ public class HoodieCompactor { public HoodieCompactor(Config cfg) { this.cfg = cfg; - this.props = cfg.propsFilePath == null ? UtilHelpers.buildProperties(cfg.configs) : - UtilHelpers.readConfig(fs, new Path(cfg.propsFilePath), cfg.configs).getConfig(); + this.props = cfg.propsFilePath == null ? UtilHelpers.buildProperties(cfg.configs) + : UtilHelpers.readConfig(fs, new Path(cfg.propsFilePath), cfg.configs).getConfig(); } public static class Config implements Serializable { - @Parameter(names = {"--base-path", - "-sp"}, description = "Base path for the dataset", required = true) + @Parameter(names = {"--base-path", "-sp"}, description = "Base path for the dataset", required = true) public String basePath = null; @Parameter(names = {"--table-name", "-tn"}, description = "Table name", required = true) public String tableName = null; - @Parameter(names = {"--instant-time", - "-sp"}, description = "Compaction Instant time", required = true) + @Parameter(names = {"--instant-time", "-sp"}, description = "Compaction Instant time", required = true) public String compactionInstantTime = null; - @Parameter(names = {"--parallelism", - "-pl"}, description = "Parallelism for hoodie insert", required = true) + @Parameter(names = {"--parallelism", "-pl"}, description = "Parallelism for hoodie insert", required = true) public int parallelism = 1; - @Parameter(names = {"--schema-file", - "-sf"}, description = "path for Avro schema file", required = true) + @Parameter(names = {"--schema-file", "-sf"}, description = "path for Avro schema file", required = true) public String schemaFile = null; @Parameter(names = {"--spark-master", "-ms"}, description = "Spark master", required = false) public String sparkMaster = null; - @Parameter(names = {"--spark-memory", - "-sm"}, description = "spark memory to use", required = true) + @Parameter(names = {"--spark-memory", "-sm"}, description = "spark memory to use", required = true) public String sparkMemory = null; @Parameter(names = {"--retry", "-rt"}, description = "number of retries", required = false) public int retry = 0; @@ -120,18 +115,18 @@ public class HoodieCompactor { } private int doCompact(JavaSparkContext jsc) throws Exception { - //Get schema. + // Get schema. String schemaStr = UtilHelpers.parseSchema(fs, cfg.schemaFile); - HoodieWriteClient client = UtilHelpers.createHoodieClient(jsc, cfg.basePath, schemaStr, cfg.parallelism, - Option.empty(), props); + HoodieWriteClient client = + UtilHelpers.createHoodieClient(jsc, cfg.basePath, schemaStr, cfg.parallelism, Option.empty(), props); JavaRDD writeResponse = client.compact(cfg.compactionInstantTime); return UtilHelpers.handleErrors(jsc, cfg.compactionInstantTime, writeResponse); } private int doSchedule(JavaSparkContext jsc) throws Exception { - //Get schema. - HoodieWriteClient client = UtilHelpers.createHoodieClient(jsc, cfg.basePath, "", cfg.parallelism, - Option.of(cfg.strategyClassName), props); + // Get schema. + HoodieWriteClient client = + UtilHelpers.createHoodieClient(jsc, cfg.basePath, "", cfg.parallelism, Option.of(cfg.strategyClassName), props); client.scheduleCompactionAtInstant(cfg.compactionInstantTime, Option.empty()); return 0; } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieSnapshotCopier.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieSnapshotCopier.java index 135095ffa..a37f7da1b 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieSnapshotCopier.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieSnapshotCopier.java @@ -47,8 +47,7 @@ import org.apache.spark.api.java.JavaSparkContext; import scala.Tuple2; /** - * Hoodie snapshot copy job which copies latest files from all partitions to another place, for - * snapshot backup. + * Hoodie snapshot copy job which copies latest files from all partitions to another place, for snapshot backup. */ public class HoodieSnapshotCopier implements Serializable { @@ -56,50 +55,42 @@ public class HoodieSnapshotCopier implements Serializable { static class Config implements Serializable { - @Parameter(names = {"--base-path", - "-bp"}, description = "Hoodie table base path", required = true) + @Parameter(names = {"--base-path", "-bp"}, description = "Hoodie table base path", required = true) String basePath = null; - @Parameter(names = {"--output-path", - "-op"}, description = "The snapshot output path", required = true) + @Parameter(names = {"--output-path", "-op"}, description = "The snapshot output path", required = true) String outputPath = null; - @Parameter(names = {"--date-partitioned", - "-dp"}, description = "Can we assume date partitioning?") + @Parameter(names = {"--date-partitioned", "-dp"}, description = "Can we assume date partitioning?") boolean shouldAssumeDatePartitioning = false; } public void snapshot(JavaSparkContext jsc, String baseDir, final String outputDir, final boolean shouldAssumeDatePartitioning) throws IOException { FileSystem fs = FSUtils.getFs(baseDir, jsc.hadoopConfiguration()); - final SerializableConfiguration serConf = new SerializableConfiguration( - jsc.hadoopConfiguration()); + final SerializableConfiguration serConf = new SerializableConfiguration(jsc.hadoopConfiguration()); final HoodieTableMetaClient tableMetadata = new HoodieTableMetaClient(fs.getConf(), baseDir); - final ReadOptimizedView fsView = new HoodieTableFileSystemView( - tableMetadata, + final ReadOptimizedView fsView = new HoodieTableFileSystemView(tableMetadata, tableMetadata.getActiveTimeline().getCommitsTimeline().filterCompletedInstants()); // Get the latest commit - Option latestCommit = tableMetadata.getActiveTimeline().getCommitsTimeline() - .filterCompletedInstants().lastInstant(); + Option latestCommit = + tableMetadata.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().lastInstant(); if (!latestCommit.isPresent()) { logger.warn("No commits present. Nothing to snapshot"); return; } final String latestCommitTimestamp = latestCommit.get().getTimestamp(); - logger.info(String.format( - "Starting to snapshot latest version files which are also no-late-than %s.", + logger.info(String.format("Starting to snapshot latest version files which are also no-late-than %s.", latestCommitTimestamp)); - List partitions = FSUtils - .getAllPartitionPaths(fs, baseDir, shouldAssumeDatePartitioning); + List partitions = FSUtils.getAllPartitionPaths(fs, baseDir, shouldAssumeDatePartitioning); if (partitions.size() > 0) { logger.info(String.format("The job needs to copy %d partitions.", partitions.size())); // Make sure the output directory is empty Path outputPath = new Path(outputDir); if (fs.exists(outputPath)) { - logger.warn( - String.format("The output path %s targetBasePath already exists, deleting", outputPath)); + logger.warn(String.format("The output path %s targetBasePath already exists, deleting", outputPath)); fs.delete(new Path(outputDir), true); } @@ -107,14 +98,12 @@ public class HoodieSnapshotCopier implements Serializable { // Only take latest version files <= latestCommit. FileSystem fs1 = FSUtils.getFs(baseDir, serConf.newCopy()); List> filePaths = new ArrayList<>(); - Stream dataFiles = fsView.getLatestDataFilesBeforeOrOn(partition, - latestCommitTimestamp); - dataFiles.forEach( - hoodieDataFile -> filePaths.add(new Tuple2<>(partition, hoodieDataFile.getPath()))); + Stream dataFiles = fsView.getLatestDataFilesBeforeOrOn(partition, latestCommitTimestamp); + dataFiles.forEach(hoodieDataFile -> filePaths.add(new Tuple2<>(partition, hoodieDataFile.getPath()))); // also need to copy over partition metadata - Path partitionMetaFile = new Path(new Path(baseDir, partition), - HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE); + Path partitionMetaFile = + new Path(new Path(baseDir, partition), HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE); if (fs1.exists(partitionMetaFile)) { filePaths.add(new Tuple2<>(partition, partitionMetaFile.toString())); } @@ -129,15 +118,14 @@ public class HoodieSnapshotCopier implements Serializable { if (!ifs.exists(toPartitionPath)) { ifs.mkdirs(toPartitionPath); } - FileUtil.copy(ifs, sourceFilePath, ifs, new Path(toPartitionPath, sourceFilePath.getName()), - false, ifs.getConf()); + FileUtil.copy(ifs, sourceFilePath, ifs, new Path(toPartitionPath, sourceFilePath.getName()), false, + ifs.getConf()); }); // Also copy the .commit files - logger.info( - String.format("Copying .commit files which are no-late-than %s.", latestCommitTimestamp)); - FileStatus[] commitFilesToCopy = fs.listStatus( - new Path(baseDir + "/" + HoodieTableMetaClient.METAFOLDER_NAME), (commitFilePath) -> { + logger.info(String.format("Copying .commit files which are no-late-than %s.", latestCommitTimestamp)); + FileStatus[] commitFilesToCopy = + fs.listStatus(new Path(baseDir + "/" + HoodieTableMetaClient.METAFOLDER_NAME), (commitFilePath) -> { if (commitFilePath.getName().equals(HoodieTableConfig.HOODIE_PROPERTIES_FILE)) { return true; } else { @@ -147,15 +135,14 @@ public class HoodieSnapshotCopier implements Serializable { } }); for (FileStatus commitStatus : commitFilesToCopy) { - Path targetFilePath = new Path( - outputDir + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitStatus.getPath() - .getName()); + Path targetFilePath = + new Path(outputDir + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitStatus.getPath().getName()); if (!fs.exists(targetFilePath.getParent())) { fs.mkdirs(targetFilePath.getParent()); } if (fs.exists(targetFilePath)) { - logger.error(String.format( - "The target output commit file (%s targetBasePath) already exists.", targetFilePath)); + logger.error( + String.format("The target output commit file (%s targetBasePath) already exists.", targetFilePath)); } FileUtil.copy(fs, commitStatus.getPath(), fs, targetFilePath, false, fs.getConf()); } @@ -166,8 +153,7 @@ public class HoodieSnapshotCopier implements Serializable { // Create the _SUCCESS tag Path successTagPath = new Path(outputDir + "/_SUCCESS"); if (!fs.exists(successTagPath)) { - logger.info(String.format( - "Creating _SUCCESS under targetBasePath: $s", outputDir)); + logger.info(String.format("Creating _SUCCESS under targetBasePath: $s", outputDir)); fs.createNewFile(successTagPath); } } @@ -176,8 +162,8 @@ public class HoodieSnapshotCopier implements Serializable { // Take input configs final Config cfg = new Config(); new JCommander(cfg, args); - logger.info(String.format("Snapshot hoodie table from %s targetBasePath to %stargetBasePath", - cfg.basePath, cfg.outputPath)); + logger.info(String.format("Snapshot hoodie table from %s targetBasePath to %stargetBasePath", cfg.basePath, + cfg.outputPath)); // Create a spark job to do the snapshot copy SparkConf sparkConf = new SparkConf().setAppName("Hoodie-snapshot-copier"); diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieWithTimelineServer.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieWithTimelineServer.java index f06f3477a..dc49ebd88 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieWithTimelineServer.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieWithTimelineServer.java @@ -51,8 +51,7 @@ public class HoodieWithTimelineServer implements Serializable { @Parameter(names = {"--spark-master", "-ms"}, description = "Spark master", required = false) public String sparkMaster = null; - @Parameter(names = {"--spark-memory", - "-sm"}, description = "spark memory to use", required = true) + @Parameter(names = {"--spark-memory", "-sm"}, description = "spark memory to use", required = true) public String sparkMemory = null; @Parameter(names = {"--num-partitions", "-n"}, description = "Num Partitions", required = false) public Integer numPartitions = 100; @@ -87,8 +86,7 @@ public class HoodieWithTimelineServer implements Serializable { System.out.println("Driver Hostname is :" + driverHost); List messages = new ArrayList<>(); IntStream.range(0, cfg.numPartitions).forEach(i -> messages.add("Hello World")); - List gotMessages = - jsc.parallelize(messages).map(msg -> sendRequest(driverHost, cfg.serverPort)).collect(); + List gotMessages = jsc.parallelize(messages).map(msg -> sendRequest(driverHost, cfg.serverPort)).collect(); System.out.println("Got Messages :" + gotMessages); Preconditions.checkArgument(gotMessages.equals(messages), "Got expected reply from Server"); } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java index e2c6d6043..8c334f695 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java @@ -58,23 +58,22 @@ import org.apache.spark.sql.SparkSession; public class UtilHelpers { private static Logger logger = LogManager.getLogger(UtilHelpers.class); - public static Source createSource(String sourceClass, TypedProperties cfg, - JavaSparkContext jssc, SparkSession sparkSession, SchemaProvider schemaProvider) - throws IOException { + public static Source createSource(String sourceClass, TypedProperties cfg, JavaSparkContext jssc, + SparkSession sparkSession, SchemaProvider schemaProvider) throws IOException { try { return (Source) ReflectionUtils.loadClass(sourceClass, - new Class[]{TypedProperties.class, JavaSparkContext.class, SparkSession.class, SchemaProvider.class}, - cfg, jssc, sparkSession, schemaProvider); + new Class[] {TypedProperties.class, JavaSparkContext.class, SparkSession.class, SchemaProvider.class}, cfg, + jssc, sparkSession, schemaProvider); } catch (Throwable e) { throw new IOException("Could not load source class " + sourceClass, e); } } - public static SchemaProvider createSchemaProvider(String schemaProviderClass, - TypedProperties cfg, JavaSparkContext jssc) throws IOException { + public static SchemaProvider createSchemaProvider(String schemaProviderClass, TypedProperties cfg, + JavaSparkContext jssc) throws IOException { try { - return schemaProviderClass == null ? null : - (SchemaProvider) ReflectionUtils.loadClass(schemaProviderClass, cfg, jssc); + return schemaProviderClass == null ? null + : (SchemaProvider) ReflectionUtils.loadClass(schemaProviderClass, cfg, jssc); } catch (Throwable e) { throw new IOException("Could not load schema provider class " + schemaProviderClass, e); } @@ -116,7 +115,7 @@ public class UtilHelpers { /** * Parse Schema from file * - * @param fs File System + * @param fs File System * @param schemaFile Schema File */ public static String parseSchema(FileSystem fs, String schemaFile) throws Exception { @@ -149,8 +148,7 @@ public class UtilHelpers { sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); sparkConf.set("spark.hadoop.mapred.output.compress", "true"); sparkConf.set("spark.hadoop.mapred.output.compression.codec", "true"); - sparkConf.set("spark.hadoop.mapred.output.compression.codec", - "org.apache.hadoop.io.compress.GzipCodec"); + sparkConf.set("spark.hadoop.mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec"); sparkConf.set("spark.hadoop.mapred.output.compression.type", "BLOCK"); additionalConfigs.entrySet().forEach(e -> sparkConf.set(e.getKey(), e.getValue())); @@ -168,6 +166,7 @@ public class UtilHelpers { /** * Build Spark Context for ingestion/compaction + * * @return */ public static JavaSparkContext buildSparkContext(String appName, String sparkMaster, String sparkMemory) { @@ -179,25 +178,22 @@ public class UtilHelpers { /** * Build Hoodie write client * - * @param jsc Java Spark Context - * @param basePath Base Path - * @param schemaStr Schema + * @param jsc Java Spark Context + * @param basePath Base Path + * @param schemaStr Schema * @param parallelism Parallelism */ - public static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath, - String schemaStr, int parallelism, Option compactionStrategyClass, TypedProperties properties) - throws Exception { - HoodieCompactionConfig compactionConfig = - compactionStrategyClass.map(strategy -> HoodieCompactionConfig.newBuilder().withInlineCompaction(false) - .withCompactionStrategy(ReflectionUtils.loadClass(strategy)) - .build()).orElse(HoodieCompactionConfig.newBuilder().withInlineCompaction(false).build()); - HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) - .withParallelism(parallelism, parallelism).withSchema(schemaStr) - .combineInput(true, true) - .withCompactionConfig(compactionConfig) - .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) - .withProps(properties) - .build(); + public static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath, String schemaStr, + int parallelism, Option compactionStrategyClass, TypedProperties properties) throws Exception { + HoodieCompactionConfig compactionConfig = compactionStrategyClass + .map(strategy -> HoodieCompactionConfig.newBuilder().withInlineCompaction(false) + .withCompactionStrategy(ReflectionUtils.loadClass(strategy)).build()) + .orElse(HoodieCompactionConfig.newBuilder().withInlineCompaction(false).build()); + HoodieWriteConfig config = + HoodieWriteConfig.newBuilder().withPath(basePath).withParallelism(parallelism, parallelism) + .withSchema(schemaStr).combineInput(true, true).withCompactionConfig(compactionConfig) + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) + .withProps(properties).build(); return new HoodieWriteClient(jsc, config); } @@ -206,13 +202,11 @@ public class UtilHelpers { writeResponse.foreach(writeStatus -> { if (writeStatus.hasErrors()) { errors.add(1); - logger.error(String.format("Error processing records :writeStatus:%s", - writeStatus.getStat().toString())); + logger.error(String.format("Error processing records :writeStatus:%s", writeStatus.getStat().toString())); } }); if (errors.value() == 0) { - logger.info( - String.format("Dataset imported into hoodie dataset with %s instant time.", instantTime)); + logger.info(String.format("Dataset imported into hoodie dataset with %s instant time.", instantTime)); return 0; } logger.error(String.format("Import failed with %d errors.", errors.value())); diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/adhoc/UpgradePayloadFromUberToApache.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/adhoc/UpgradePayloadFromUberToApache.java index 0bc6ceb31..679694090 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/adhoc/UpgradePayloadFromUberToApache.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/adhoc/UpgradePayloadFromUberToApache.java @@ -38,10 +38,9 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; /** - * This is an one-time use class meant for migrating the configuration for - * "hoodie.compaction.payload.class" in .hoodie/hoodie.properties from com.uber.hoodie to - * org.apache.hudi - * It takes in a file containing base-paths for a set of hudi datasets and does the migration + * This is an one-time use class meant for migrating the configuration for "hoodie.compaction.payload.class" in + * .hoodie/hoodie.properties from com.uber.hoodie to org.apache.hudi It takes in a file containing base-paths for a set + * of hudi datasets and does the migration */ public class UpgradePayloadFromUberToApache implements Serializable { @@ -66,8 +65,8 @@ public class UpgradePayloadFromUberToApache implements Serializable { if (!basePath.startsWith("#")) { logger.info("Performing upgrade for " + basePath); String metaPath = String.format("%s/.hoodie", basePath); - HoodieTableMetaClient metaClient = new HoodieTableMetaClient( - FSUtils.prepareHadoopConf(new Configuration()), basePath, false); + HoodieTableMetaClient metaClient = + new HoodieTableMetaClient(FSUtils.prepareHadoopConf(new Configuration()), basePath, false); HoodieTableConfig tableConfig = metaClient.getTableConfig(); if (tableConfig.getTableType().equals(HoodieTableType.MERGE_ON_READ)) { Map propsMap = tableConfig.getProps(); @@ -75,10 +74,8 @@ public class UpgradePayloadFromUberToApache implements Serializable { String payloadClass = propsMap.get(HoodieCompactionConfig.PAYLOAD_CLASS_PROP); logger.info("Found payload class=" + payloadClass); if (payloadClass.startsWith("com.uber.hoodie")) { - String newPayloadClass = payloadClass.replace("com.uber.hoodie", - "org.apache.hudi"); - logger.info("Replacing payload class (" + payloadClass - + ") with (" + newPayloadClass + ")"); + String newPayloadClass = payloadClass.replace("com.uber.hoodie", "org.apache.hudi"); + logger.info("Replacing payload class (" + payloadClass + ") with (" + newPayloadClass + ")"); Map newPropsMap = new HashMap<>(propsMap); newPropsMap.put(HoodieCompactionConfig.PAYLOAD_CLASS_PROP, newPayloadClass); Properties props = new Properties(); diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/AbstractDeltaStreamerService.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/AbstractDeltaStreamerService.java index e3d900bbb..ceb745a70 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/AbstractDeltaStreamerService.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/AbstractDeltaStreamerService.java @@ -61,6 +61,7 @@ public abstract class AbstractDeltaStreamerService implements Serializable { /** * Wait till the service shutdown. If the service shutdown with exception, it will be thrown + * * @throws ExecutionException * @throws InterruptedException */ @@ -76,6 +77,7 @@ public abstract class AbstractDeltaStreamerService implements Serializable { /** * Request shutdown either forcefully or gracefully. Graceful shutdown allows the service to finish up the current * round of work and shutdown. For graceful shutdown, it waits till the service is shutdown + * * @param force Forcefully shutdown */ void shutdown(boolean force) { @@ -98,8 +100,9 @@ public abstract class AbstractDeltaStreamerService implements Serializable { } /** - * Start the service. Runs the service in a different thread and returns. Also starts a monitor thread - * to run-callbacks in case of shutdown + * Start the service. Runs the service in a different thread and returns. Also starts a monitor thread to + * run-callbacks in case of shutdown + * * @param onShutdownCallback */ public void start(Function onShutdownCallback) { @@ -112,12 +115,14 @@ public abstract class AbstractDeltaStreamerService implements Serializable { /** * Service implementation + * * @return */ protected abstract Pair startService(); /** * A monitor thread is started which would trigger a callback if the service is shutdown + * * @param onShutdownCallback */ private void monitorThreads(Function onShutdownCallback) { @@ -128,8 +133,7 @@ public abstract class AbstractDeltaStreamerService implements Serializable { log.info("Monitoring thread(s) !!"); future.get(); } catch (ExecutionException ex) { - log.error("Monitor noticed one or more threads failed." - + " Requesting graceful shutdown of other threads", ex); + log.error("Monitor noticed one or more threads failed." + " Requesting graceful shutdown of other threads", ex); error = true; shutdown(false); } catch (InterruptedException ie) { diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/Compactor.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/Compactor.java index 7a2a3f672..a72b68a45 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/Compactor.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/Compactor.java @@ -51,10 +51,9 @@ public class Compactor implements Serializable { long numWriteErrors = res.collect().stream().filter(r -> r.hasErrors()).count(); if (numWriteErrors != 0) { // We treat even a single error in compaction as fatal - log.error("Compaction for instant (" + instant + ") failed with write errors. " - + "Errors :" + numWriteErrors); - throw new HoodieException("Compaction for instant (" + instant + ") failed with write errors. " - + "Errors :" + numWriteErrors); + log.error("Compaction for instant (" + instant + ") failed with write errors. " + "Errors :" + numWriteErrors); + throw new HoodieException( + "Compaction for instant (" + instant + ") failed with write errors. " + "Errors :" + numWriteErrors); } // Commit compaction compactionClient.commitCompaction(instant.getTimestamp(), res, Option.empty()); diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java index 601bcc370..929196e28 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java @@ -155,11 +155,9 @@ public class DeltaSync implements Serializable { private final HoodieTableType tableType; - public DeltaSync(HoodieDeltaStreamer.Config cfg, SparkSession sparkSession, - SchemaProvider schemaProvider, HoodieTableType tableType, TypedProperties props, - JavaSparkContext jssc, FileSystem fs, HiveConf hiveConf, - Function onInitializingHoodieWriteClient) - throws IOException { + public DeltaSync(HoodieDeltaStreamer.Config cfg, SparkSession sparkSession, SchemaProvider schemaProvider, + HoodieTableType tableType, TypedProperties props, JavaSparkContext jssc, FileSystem fs, HiveConf hiveConf, + Function onInitializingHoodieWriteClient) throws IOException { this.cfg = cfg; this.jssc = jssc; @@ -176,8 +174,8 @@ public class DeltaSync implements Serializable { this.transformer = UtilHelpers.createTransformer(cfg.transformerClassName); this.keyGenerator = DataSourceUtils.createKeyGenerator(props); - this.formatAdapter = new SourceFormatAdapter(UtilHelpers.createSource(cfg.sourceClassName, props, jssc, - sparkSession, schemaProvider)); + this.formatAdapter = new SourceFormatAdapter( + UtilHelpers.createSource(cfg.sourceClassName, props, jssc, sparkSession, schemaProvider)); this.hiveConf = hiveConf; if (cfg.filterDupes) { @@ -194,8 +192,7 @@ public class DeltaSync implements Serializable { private void refreshTimeline() throws IOException { if (fs.exists(new Path(cfg.targetBasePath))) { HoodieTableMetaClient meta = new HoodieTableMetaClient(new Configuration(fs.getConf()), cfg.targetBasePath); - this.commitTimelineOpt = Option.of(meta.getActiveTimeline().getCommitsTimeline() - .filterCompletedInstants()); + this.commitTimelineOpt = Option.of(meta.getActiveTimeline().getCommitsTimeline().filterCompletedInstants()); } else { this.commitTimelineOpt = Option.empty(); HoodieTableMetaClient.initTableType(new Configuration(jssc.hadoopConfiguration()), cfg.targetBasePath, @@ -214,8 +211,7 @@ public class DeltaSync implements Serializable { // Refresh Timeline refreshTimeline(); - Pair>> srcRecordsWithCkpt = - readFromSource(commitTimelineOpt); + Pair>> srcRecordsWithCkpt = readFromSource(commitTimelineOpt); if (null != srcRecordsWithCkpt) { // this is the first input batch. If schemaProvider not set, use it and register Avro Schema and start @@ -246,8 +242,8 @@ public class DeltaSync implements Serializable { if (commitTimelineOpt.isPresent()) { Option lastCommit = commitTimelineOpt.get().lastInstant(); if (lastCommit.isPresent()) { - HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes( - commitTimelineOpt.get().getInstantDetails(lastCommit.get()).get(), HoodieCommitMetadata.class); + HoodieCommitMetadata commitMetadata = HoodieCommitMetadata + .fromBytes(commitTimelineOpt.get().getInstantDetails(lastCommit.get()).get(), HoodieCommitMetadata.class); if (cfg.checkpoint != null && !cfg.checkpoint.equals(commitMetadata.getMetadata(CHECKPOINT_RESET_KEY))) { resumeCheckpointStr = Option.of(cfg.checkpoint); } else if (commitMetadata.getMetadata(CHECKPOINT_KEY) != null) { @@ -274,25 +270,22 @@ public class DeltaSync implements Serializable { if (transformer != null) { // Transformation is needed. Fetch New rows in Row Format, apply transformation and then convert them // to generic records for writing - InputBatch> dataAndCheckpoint = formatAdapter.fetchNewDataInRowFormat( - resumeCheckpointStr, cfg.sourceLimit); + InputBatch> dataAndCheckpoint = + formatAdapter.fetchNewDataInRowFormat(resumeCheckpointStr, cfg.sourceLimit); Option> transformed = dataAndCheckpoint.getBatch().map(data -> transformer.apply(jssc, sparkSession, data, props)); checkpointStr = dataAndCheckpoint.getCheckpointForNextBatch(); - avroRDDOptional = transformed.map(t -> - AvroConversionUtils.createRdd(t, HOODIE_RECORD_STRUCT_NAME, HOODIE_RECORD_NAMESPACE).toJavaRDD() - ); + avroRDDOptional = transformed + .map(t -> AvroConversionUtils.createRdd(t, HOODIE_RECORD_STRUCT_NAME, HOODIE_RECORD_NAMESPACE).toJavaRDD()); // Use Transformed Row's schema if not overridden // Use Transformed Row's schema if not overridden. If target schema is not specified // default to RowBasedSchemaProvider - schemaProvider = - this.schemaProvider == null || this.schemaProvider.getTargetSchema() == null - ? transformed - .map(r -> (SchemaProvider) new RowBasedSchemaProvider(r.schema())) - .orElse(dataAndCheckpoint.getSchemaProvider()) - : this.schemaProvider; + schemaProvider = this.schemaProvider == null || this.schemaProvider.getTargetSchema() == null + ? transformed.map(r -> (SchemaProvider) new RowBasedSchemaProvider(r.schema())).orElse( + dataAndCheckpoint.getSchemaProvider()) + : this.schemaProvider; } else { // Pull the data from the source & prepare the write InputBatch> dataAndCheckpoint = @@ -303,8 +296,8 @@ public class DeltaSync implements Serializable { } if (Objects.equals(checkpointStr, resumeCheckpointStr.orElse(null))) { - log.info("No new data, source checkpoint has not changed. Nothing to commit." - + "Old checkpoint=(" + resumeCheckpointStr + "). New Checkpoint=(" + checkpointStr + ")"); + log.info("No new data, source checkpoint has not changed. Nothing to commit." + "Old checkpoint=(" + + resumeCheckpointStr + "). New Checkpoint=(" + checkpointStr + ")"); return null; } @@ -339,8 +332,7 @@ public class DeltaSync implements Serializable { if (cfg.filterDupes) { // turn upserts to insert cfg.operation = cfg.operation == Operation.UPSERT ? Operation.INSERT : cfg.operation; - records = DataSourceUtils.dropDuplicates(jssc, records, writeClient.getConfig(), - writeClient.getTimelineServer()); + records = DataSourceUtils.dropDuplicates(jssc, records, writeClient.getConfig(), writeClient.getTimelineServer()); } boolean isEmpty = records.isEmpty(); @@ -375,8 +367,7 @@ public class DeltaSync implements Serializable { + totalErrorRecords + "/" + totalRecords); } - boolean success = writeClient.commit(commitTime, writeStatusRDD, - Option.of(checkpointCommitMetadata)); + boolean success = writeClient.commit(commitTime, writeStatusRDD, Option.of(checkpointCommitMetadata)); if (success) { log.info("Commit " + commitTime + " successful!"); @@ -396,14 +387,12 @@ public class DeltaSync implements Serializable { throw new HoodieException("Commit " + commitTime + " failed!"); } } else { - log.error("Delta Sync found errors when writing. Errors/Total=" - + totalErrorRecords + "/" + totalRecords); + log.error("Delta Sync found errors when writing. Errors/Total=" + totalErrorRecords + "/" + totalRecords); log.error("Printing out the top 100 errors"); writeStatusRDD.filter(ws -> ws.hasErrors()).take(100).forEach(ws -> { log.error("Global error :", ws.getGlobalError()); if (ws.getErrors().size() > 0) { - ws.getErrors().entrySet().forEach(r -> - log.trace("Error for key:" + r.getKey() + " is " + r.getValue())); + ws.getErrors().entrySet().forEach(r -> log.trace("Error for key:" + r.getKey() + " is " + r.getValue())); } }); // Rolling back instant @@ -432,7 +421,7 @@ public class DeltaSync implements Serializable { try { Thread.sleep(1000); } catch (InterruptedException e) { - //No-Op + // No-Op } } } @@ -445,8 +434,8 @@ public class DeltaSync implements Serializable { private void syncHive() throws ClassNotFoundException { if (cfg.enableHiveSync) { HiveSyncConfig hiveSyncConfig = DataSourceUtils.buildHiveSyncConfig(props, cfg.targetBasePath); - log.info("Syncing target hoodie table with hive table(" + hiveSyncConfig.tableName - + "). Hive metastore URL :" + hiveSyncConfig.jdbcUrl + ", basePath :" + cfg.targetBasePath); + log.info("Syncing target hoodie table with hive table(" + hiveSyncConfig.tableName + "). Hive metastore URL :" + + hiveSyncConfig.jdbcUrl + ", basePath :" + cfg.targetBasePath); new HiveSyncTool(hiveSyncConfig, hiveConf, fs).syncHoodieTable(); } @@ -474,17 +463,13 @@ public class DeltaSync implements Serializable { */ private HoodieWriteConfig getHoodieClientConfig(SchemaProvider schemaProvider) { HoodieWriteConfig.Builder builder = - HoodieWriteConfig.newBuilder() - .withPath(cfg.targetBasePath) - .combineInput(cfg.filterDupes, true) - .withCompactionConfig(HoodieCompactionConfig.newBuilder() - .withPayloadClass(cfg.payloadClassName) + HoodieWriteConfig.newBuilder().withPath(cfg.targetBasePath).combineInput(cfg.filterDupes, true) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().withPayloadClass(cfg.payloadClassName) // Inline compaction is disabled for continuous mode. otherwise enabled for MOR .withInlineCompaction(cfg.isInlineCompactionEnabled()).build()) .forTable(cfg.targetTableName) .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) - .withAutoCommit(false) - .withProps(props); + .withAutoCommit(false).withProps(props); if (null != schemaProvider && null != schemaProvider.getTargetSchema()) { builder = builder.withSchema(schemaProvider.getTargetSchema().toString()); diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamer.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamer.java index ee826ac10..3a6baa518 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamer.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamer.java @@ -71,13 +71,9 @@ import org.apache.spark.sql.SparkSession; * dataset. Does not maintain any state, queries at runtime to see how far behind the target dataset is from the source * dataset. This can be overriden to force sync from a timestamp. * - * In continuous mode, DeltaStreamer runs in loop-mode going through the below operations - * (a) pull-from-source - * (b) write-to-sink - * (c) Schedule Compactions if needed - * (d) Conditionally Sync to Hive - * each cycle. For MOR table with continuous mode enabled, a seperate compactor thread is allocated to execute - * compactions + * In continuous mode, DeltaStreamer runs in loop-mode going through the below operations (a) pull-from-source (b) + * write-to-sink (c) Schedule Compactions if needed (d) Conditionally Sync to Hive each cycle. For MOR table with + * continuous mode enabled, a seperate compactor thread is allocated to execute compactions */ public class HoodieDeltaStreamer implements Serializable { @@ -111,6 +107,7 @@ public class HoodieDeltaStreamer implements Serializable { /** * Main method to start syncing + * * @throws Exception */ public void sync() throws Exception { @@ -146,8 +143,9 @@ public class HoodieDeltaStreamer implements Serializable { public static class Config implements Serializable { - @Parameter(names = {"--target-base-path"}, description = "base path for the target hoodie dataset. " - + "(Will be created if did not exist first time around. If exists, expected to be a hoodie dataset)", + @Parameter(names = {"--target-base-path"}, + description = "base path for the target hoodie dataset. " + + "(Will be created if did not exist first time around. If exists, expected to be a hoodie dataset)", required = true) public String targetBasePath; @@ -155,8 +153,8 @@ public class HoodieDeltaStreamer implements Serializable { @Parameter(names = {"--target-table"}, description = "name of the target table in Hive", required = true) public String targetTableName; - @Parameter(names = {"--storage-type"}, description = "Type of Storage. " - + "COPY_ON_WRITE (or) MERGE_ON_READ", required = true) + @Parameter(names = {"--storage-type"}, description = "Type of Storage. " + "COPY_ON_WRITE (or) MERGE_ON_READ", + required = true) public String storageType; @Parameter(names = {"--props"}, description = "path to properties file on localfs or dfs, with configurations for " @@ -170,9 +168,10 @@ public class HoodieDeltaStreamer implements Serializable { + "(using the CLI parameter \"--propsFilePath\") can also be passed command line using this parameter") public List configs = new ArrayList<>(); - @Parameter(names = {"--source-class"}, description = "Subclass of org.apache.hudi.utilities.sources to read data. " - + "Built-in options: org.apache.hudi.utilities.sources.{JsonDFSSource (default), AvroDFSSource, " - + "JsonKafkaSource, AvroKafkaSource, HiveIncrPullSource}") + @Parameter(names = {"--source-class"}, + description = "Subclass of org.apache.hudi.utilities.sources to read data. " + + "Built-in options: org.apache.hudi.utilities.sources.{JsonDFSSource (default), AvroDFSSource, " + + "JsonKafkaSource, AvroKafkaSource, HiveIncrPullSource}") public String sourceClassName = JsonDFSSource.class.getName(); @Parameter(names = {"--source-ordering-field"}, description = "Field within source record to decide how" @@ -203,12 +202,11 @@ public class HoodieDeltaStreamer implements Serializable { public long sourceLimit = Long.MAX_VALUE; @Parameter(names = {"--op"}, description = "Takes one of these values : UPSERT (default), INSERT (use when input " - + "is purely new data/inserts to gain speed)", - converter = OperationConvertor.class) + + "is purely new data/inserts to gain speed)", converter = OperationConvertor.class) public Operation operation = Operation.UPSERT; - @Parameter(names = {"--filter-dupes"}, description = "Should duplicate records from source be dropped/filtered out" - + "before insert/bulk-insert") + @Parameter(names = {"--filter-dupes"}, + description = "Should duplicate records from source be dropped/filtered out" + "before insert/bulk-insert") public Boolean filterDupes = false; @Parameter(names = {"--enable-hive-sync"}, description = "Enable syncing to hive") @@ -223,8 +221,8 @@ public class HoodieDeltaStreamer implements Serializable { + " source-fetch -> Transform -> Hudi Write in loop") public Boolean continuousMode = false; - @Parameter(names = {"--min-sync-interval-seconds"}, description = "the min sync interval of each sync in " - + "continuous mode") + @Parameter(names = {"--min-sync-interval-seconds"}, + description = "the min sync interval of each sync in " + "continuous mode") public Integer minSyncIntervalSeconds = 0; @Parameter(names = {"--spark-master"}, description = "spark master to use.") @@ -233,8 +231,8 @@ public class HoodieDeltaStreamer implements Serializable { @Parameter(names = {"--commit-on-errors"}, description = "Commit even when some records failed to be written") public Boolean commitOnErrors = false; - @Parameter(names = {"--delta-sync-scheduling-weight"}, description = - "Scheduling weight for delta sync as defined in " + @Parameter(names = {"--delta-sync-scheduling-weight"}, + description = "Scheduling weight for delta sync as defined in " + "https://spark.apache.org/docs/latest/job-scheduling.html") public Integer deltaSyncSchedulingWeight = 1; @@ -253,8 +251,8 @@ public class HoodieDeltaStreamer implements Serializable { /** * Compaction is enabled for MoR table by default. This flag disables it */ - @Parameter(names = {"--disable-compaction"}, description = "Compaction is enabled for MoR table by default." - + "This flag disables it ") + @Parameter(names = {"--disable-compaction"}, + description = "Compaction is enabled for MoR table by default." + "This flag disables it ") public Boolean forceDisableCompaction = false; /** @@ -288,8 +286,8 @@ public class HoodieDeltaStreamer implements Serializable { } Map additionalSparkConfigs = SchedulerConfGenerator.getSparkSchedulingConfigs(cfg); - JavaSparkContext jssc = UtilHelpers.buildSparkContext("delta-streamer-" + cfg.targetTableName, - cfg.sparkMaster, additionalSparkConfigs); + JavaSparkContext jssc = + UtilHelpers.buildSparkContext("delta-streamer-" + cfg.targetTableName, cfg.sparkMaster, additionalSparkConfigs); try { new HoodieDeltaStreamer(cfg, jssc).sync(); } finally { @@ -349,8 +347,8 @@ public class HoodieDeltaStreamer implements Serializable { this.sparkSession = SparkSession.builder().config(jssc.getConf()).getOrCreate(); if (fs.exists(new Path(cfg.targetBasePath))) { - HoodieTableMetaClient meta = new HoodieTableMetaClient( - new Configuration(fs.getConf()), cfg.targetBasePath, false); + HoodieTableMetaClient meta = + new HoodieTableMetaClient(new Configuration(fs.getConf()), cfg.targetBasePath, false); tableType = meta.getTableType(); // This will guarantee there is no surprise with table type Preconditions.checkArgument(tableType.equals(HoodieTableType.valueOf(cfg.storageType)), @@ -367,8 +365,8 @@ public class HoodieDeltaStreamer implements Serializable { cfg.operation = cfg.operation == Operation.UPSERT ? Operation.INSERT : cfg.operation; } - deltaSync = new DeltaSync(cfg, sparkSession, schemaProvider, tableType, - props, jssc, fs, hiveConf, this::onInitializingWriteClient); + deltaSync = new DeltaSync(cfg, sparkSession, schemaProvider, tableType, props, jssc, fs, hiveConf, + this::onInitializingWriteClient); } public DeltaSync getDeltaSync() { @@ -392,9 +390,8 @@ public class HoodieDeltaStreamer implements Serializable { Option scheduledCompactionInstant = deltaSync.syncOnce(); if (scheduledCompactionInstant.isPresent()) { log.info("Enqueuing new pending compaction instant (" + scheduledCompactionInstant + ")"); - asyncCompactService.enqueuePendingCompaction(new HoodieInstant( - State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, - scheduledCompactionInstant.get())); + asyncCompactService.enqueuePendingCompaction(new HoodieInstant(State.REQUESTED, + HoodieTimeline.COMPACTION_ACTION, scheduledCompactionInstant.get())); asyncCompactService.waitTillPendingCompactionsReducesTo(cfg.maxPendingCompactions); } long toSleepMs = cfg.minSyncIntervalSeconds * 1000 - (System.currentTimeMillis() - start); @@ -429,6 +426,7 @@ public class HoodieDeltaStreamer implements Serializable { /** * Callback to initialize write client and start compaction service if required + * * @param writeClient HoodieWriteClient * @return */ @@ -436,8 +434,8 @@ public class HoodieDeltaStreamer implements Serializable { if (cfg.isAsyncCompactionEnabled()) { asyncCompactService = new AsyncCompactService(jssc, writeClient); // Enqueue existing pending compactions first - HoodieTableMetaClient meta = new HoodieTableMetaClient( - new Configuration(jssc.hadoopConfiguration()), cfg.targetBasePath, true); + HoodieTableMetaClient meta = + new HoodieTableMetaClient(new Configuration(jssc.hadoopConfiguration()), cfg.targetBasePath, true); List pending = CompactionUtils.getPendingCompactionInstantTimes(meta); pending.stream().forEach(hoodieInstant -> asyncCompactService.enqueuePendingCompaction(hoodieInstant)); asyncCompactService.start((error) -> { @@ -499,7 +497,7 @@ public class HoodieDeltaStreamer implements Serializable { public AsyncCompactService(JavaSparkContext jssc, HoodieWriteClient client) { this.jssc = jssc; this.compactor = new Compactor(client, jssc); - //TODO: HUDI-157 : Only allow 1 compactor to run in parallel till Incremental View on MOR is fully implemented. + // TODO: HUDI-157 : Only allow 1 compactor to run in parallel till Incremental View on MOR is fully implemented. this.maxConcurrentCompaction = 1; } @@ -512,6 +510,7 @@ public class HoodieDeltaStreamer implements Serializable { /** * Wait till outstanding pending compactions reduces to the passed in value + * * @param numPendingCompactions Maximum pending compactions allowed * @throws InterruptedException */ @@ -528,6 +527,7 @@ public class HoodieDeltaStreamer implements Serializable { /** * Fetch Next pending compaction if available + * * @return * @throws InterruptedException */ diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/SchedulerConfGenerator.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/SchedulerConfGenerator.java index 70b13d4b5..d519085bf 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/SchedulerConfGenerator.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/SchedulerConfGenerator.java @@ -32,8 +32,8 @@ import org.apache.log4j.Logger; import org.apache.spark.SparkConf; /** - * Utility Class to generate Spark Scheduling allocation file. This kicks in only when user - * sets spark.scheduler.mode=FAIR at spark-submit time + * Utility Class to generate Spark Scheduling allocation file. This kicks in only when user sets + * spark.scheduler.mode=FAIR at spark-submit time */ public class SchedulerConfGenerator { @@ -45,25 +45,16 @@ public class SchedulerConfGenerator { public static final String SPARK_SCHEDULER_ALLOCATION_FILE_KEY = "spark.scheduler.allocation.file"; private static String SPARK_SCHEDULING_PATTERN = - "\n" - + "\n" - + " \n" - + " %s\n" - + " %s\n" - + " %s\n" - + " \n" - + " \n" - + " %s\n" - + " %s\n" - + " %s\n" - + " \n" - + ""; + "\n" + "\n" + " \n" + + " %s\n" + " %s\n" + " %s\n" + + " \n" + " \n" + " %s\n" + + " %s\n" + " %s\n" + " \n" + ""; private static String generateConfig(Integer deltaSyncWeight, Integer compactionWeight, Integer deltaSyncMinShare, Integer compactionMinShare) { - return String.format(SPARK_SCHEDULING_PATTERN, - DELTASYNC_POOL_NAME, "FAIR", deltaSyncWeight.toString(), deltaSyncMinShare.toString(), - COMPACT_POOL_NAME, "FAIR", compactionWeight.toString(), compactionMinShare.toString()); + return String.format(SPARK_SCHEDULING_PATTERN, DELTASYNC_POOL_NAME, "FAIR", deltaSyncWeight.toString(), + deltaSyncMinShare.toString(), COMPACT_POOL_NAME, "FAIR", compactionWeight.toString(), + compactionMinShare.toString()); } @@ -75,13 +66,11 @@ public class SchedulerConfGenerator { public static Map getSparkSchedulingConfigs(HoodieDeltaStreamer.Config cfg) throws Exception { scala.Option scheduleModeKeyOption = new SparkConf().getOption(SPARK_SCHEDULER_MODE_KEY); final Option sparkSchedulerMode = - scheduleModeKeyOption.isDefined() - ? Option.of(scheduleModeKeyOption.get()) - : Option.empty(); + scheduleModeKeyOption.isDefined() ? Option.of(scheduleModeKeyOption.get()) : Option.empty(); Map additionalSparkConfigs = new HashMap<>(); - if (sparkSchedulerMode.isPresent() && "FAIR".equals(sparkSchedulerMode.get()) - && cfg.continuousMode && cfg.storageType.equals(HoodieTableType.MERGE_ON_READ.name())) { + if (sparkSchedulerMode.isPresent() && "FAIR".equals(sparkSchedulerMode.get()) && cfg.continuousMode + && cfg.storageType.equals(HoodieTableType.MERGE_ON_READ.name())) { String sparkSchedulingConfFile = generateAndStoreConfig(cfg.deltaSyncSchedulingWeight, cfg.compactSchedulingWeight, cfg.deltaSyncSchedulingMinShare, cfg.compactSchedulingMinShare); additionalSparkConfigs.put(SPARK_SCHEDULER_ALLOCATION_FILE_KEY, sparkSchedulingConfFile); @@ -92,10 +81,8 @@ public class SchedulerConfGenerator { return additionalSparkConfigs; } - private static String generateAndStoreConfig(Integer deltaSyncWeight, - Integer compactionWeight, - Integer deltaSyncMinShare, - Integer compactionMinShare) throws IOException { + private static String generateAndStoreConfig(Integer deltaSyncWeight, Integer compactionWeight, + Integer deltaSyncMinShare, Integer compactionMinShare) throws IOException { File tempConfigFile = File.createTempFile(UUID.randomUUID().toString(), ".xml"); BufferedWriter bw = new BufferedWriter(new FileWriter(tempConfigFile)); bw.write(generateConfig(deltaSyncWeight, compactionWeight, deltaSyncMinShare, compactionMinShare)); diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/SourceFormatAdapter.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/SourceFormatAdapter.java index 1b46d4afd..4df948e41 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/SourceFormatAdapter.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/SourceFormatAdapter.java @@ -49,26 +49,24 @@ public final class SourceFormatAdapter { } /** - * Fetch new data in avro format. If the source provides data in different format, they are translated - * to Avro format + * Fetch new data in avro format. If the source provides data in different format, they are translated to Avro format + * * @param lastCkptStr * @param sourceLimit * @return */ - public InputBatch> fetchNewDataInAvroFormat(Option lastCkptStr, - long sourceLimit) { + public InputBatch> fetchNewDataInAvroFormat(Option lastCkptStr, long sourceLimit) { switch (source.getSourceType()) { case AVRO: - return ((AvroSource)source).fetchNext(lastCkptStr, sourceLimit); + return ((AvroSource) source).fetchNext(lastCkptStr, sourceLimit); case JSON: { - InputBatch> r = ((JsonSource)source).fetchNext(lastCkptStr, sourceLimit); + InputBatch> r = ((JsonSource) source).fetchNext(lastCkptStr, sourceLimit); AvroConvertor convertor = new AvroConvertor(r.getSchemaProvider().getSourceSchema()); - return new InputBatch<>(Option.ofNullable( - r.getBatch().map(rdd -> rdd.map(convertor::fromJson)) - .orElse(null)), r.getCheckpointForNextBatch(), r.getSchemaProvider()); + return new InputBatch<>(Option.ofNullable(r.getBatch().map(rdd -> rdd.map(convertor::fromJson)).orElse(null)), + r.getCheckpointForNextBatch(), r.getSchemaProvider()); } case ROW: { - InputBatch> r = ((RowSource)source).fetchNext(lastCkptStr, sourceLimit); + InputBatch> r = ((RowSource) source).fetchNext(lastCkptStr, sourceLimit); return new InputBatch<>(Option.ofNullable(r.getBatch().map( rdd -> (AvroConversionUtils.createRdd(rdd, HOODIE_RECORD_STRUCT_NAME, HOODIE_RECORD_NAMESPACE).toJavaRDD())) .orElse(null)), r.getCheckpointForNextBatch(), r.getSchemaProvider()); @@ -79,8 +77,8 @@ public final class SourceFormatAdapter { } /** - * Fetch new data in row format. If the source provides data in different format, they are translated - * to Row format + * Fetch new data in row format. If the source provides data in different format, they are translated to Row format + * * @param lastCkptStr * @param sourceLimit * @return @@ -88,22 +86,27 @@ public final class SourceFormatAdapter { public InputBatch> fetchNewDataInRowFormat(Option lastCkptStr, long sourceLimit) { switch (source.getSourceType()) { case ROW: - return ((RowSource)source).fetchNext(lastCkptStr, sourceLimit); + return ((RowSource) source).fetchNext(lastCkptStr, sourceLimit); case AVRO: { - InputBatch> r = ((AvroSource)source).fetchNext(lastCkptStr, sourceLimit); + InputBatch> r = ((AvroSource) source).fetchNext(lastCkptStr, sourceLimit); Schema sourceSchema = r.getSchemaProvider().getSourceSchema(); - return new InputBatch<>(Option.ofNullable( - r.getBatch().map(rdd -> AvroConversionUtils.createDataFrame(JavaRDD.toRDD(rdd), - sourceSchema.toString(), source.getSparkSession())) - .orElse(null)), r.getCheckpointForNextBatch(), r.getSchemaProvider()); + return new InputBatch<>( + Option + .ofNullable( + r.getBatch() + .map(rdd -> AvroConversionUtils.createDataFrame(JavaRDD.toRDD(rdd), sourceSchema.toString(), + source.getSparkSession())) + .orElse(null)), + r.getCheckpointForNextBatch(), r.getSchemaProvider()); } case JSON: { - InputBatch> r = ((JsonSource)source).fetchNext(lastCkptStr, sourceLimit); + InputBatch> r = ((JsonSource) source).fetchNext(lastCkptStr, sourceLimit); Schema sourceSchema = r.getSchemaProvider().getSourceSchema(); StructType dataType = AvroConversionUtils.convertAvroSchemaToStructType(sourceSchema); - return new InputBatch<>(Option.ofNullable( - r.getBatch().map(rdd -> source.getSparkSession().read().schema(dataType).json(rdd)) - .orElse(null)), r.getCheckpointForNextBatch(), r.getSchemaProvider()); + return new InputBatch<>( + Option.ofNullable( + r.getBatch().map(rdd -> source.getSparkSession().read().schema(dataType).json(rdd)).orElse(null)), + r.getCheckpointForNextBatch(), r.getSchemaProvider()); } default: throw new IllegalArgumentException("Unknown source type (" + source.getSourceType() + ")"); diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/keygen/TimestampBasedKeyGenerator.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/keygen/TimestampBasedKeyGenerator.java index fc41859f4..bb6802fa9 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/keygen/TimestampBasedKeyGenerator.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/keygen/TimestampBasedKeyGenerator.java @@ -55,14 +55,11 @@ public class TimestampBasedKeyGenerator extends SimpleKeyGenerator { static class Config { // One value from TimestampType above - private static final String TIMESTAMP_TYPE_FIELD_PROP = "hoodie.deltastreamer.keygen" - + ".timebased.timestamp.type"; - private static final String TIMESTAMP_INPUT_DATE_FORMAT_PROP = "hoodie.deltastreamer.keygen" - + ".timebased.input" - + ".dateformat"; - private static final String TIMESTAMP_OUTPUT_DATE_FORMAT_PROP = "hoodie.deltastreamer.keygen" - + ".timebased.output" - + ".dateformat"; + private static final String TIMESTAMP_TYPE_FIELD_PROP = "hoodie.deltastreamer.keygen" + ".timebased.timestamp.type"; + private static final String TIMESTAMP_INPUT_DATE_FORMAT_PROP = + "hoodie.deltastreamer.keygen" + ".timebased.input" + ".dateformat"; + private static final String TIMESTAMP_OUTPUT_DATE_FORMAT_PROP = + "hoodie.deltastreamer.keygen" + ".timebased.output" + ".dateformat"; } public TimestampBasedKeyGenerator(TypedProperties config) { @@ -73,10 +70,9 @@ public class TimestampBasedKeyGenerator extends SimpleKeyGenerator { this.outputDateFormat = config.getString(Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP); if (timestampType == TimestampType.DATE_STRING || timestampType == TimestampType.MIXED) { - DataSourceUtils - .checkRequiredProperties(config, Collections.singletonList(Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP)); - this.inputDateFormat = new SimpleDateFormat( - config.getString(Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP)); + DataSourceUtils.checkRequiredProperties(config, + Collections.singletonList(Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP)); + this.inputDateFormat = new SimpleDateFormat(config.getString(Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP)); this.inputDateFormat.setTimeZone(TimeZone.getTimeZone("GMT")); } } @@ -105,8 +101,7 @@ public class TimestampBasedKeyGenerator extends SimpleKeyGenerator { return new HoodieKey(DataSourceUtils.getNestedFieldValAsString(record, recordKeyField), partitionPathFormat.format(new Date(unixTime * 1000))); } catch (ParseException pe) { - throw new HoodieDeltaStreamerException( - "Unable to parse input partition field :" + partitionVal, pe); + throw new HoodieDeltaStreamerException("Unable to parse input partition field :" + partitionVal, pe); } } } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/perf/TimelineServerPerf.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/perf/TimelineServerPerf.java index c8469f93e..81a418f2b 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/perf/TimelineServerPerf.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/perf/TimelineServerPerf.java @@ -82,8 +82,8 @@ public class TimelineServerPerf implements Serializable { List allPartitionPaths = FSUtils.getAllPartitionPaths(timelineServer.getFs(), cfg.basePath, true); Collections.shuffle(allPartitionPaths); - List selected = allPartitionPaths.stream().filter(p -> !p.contains("error")) - .limit(cfg.maxPartitions).collect(Collectors.toList()); + List selected = allPartitionPaths.stream().filter(p -> !p.contains("error")).limit(cfg.maxPartitions) + .collect(Collectors.toList()); JavaSparkContext jsc = UtilHelpers.buildSparkContext("hudi-view-perf-" + cfg.basePath, cfg.sparkMaster); if (!useExternalTimelineServer) { this.timelineServer.startService(); @@ -100,15 +100,13 @@ public class TimelineServerPerf implements Serializable { String dumpPrefix = UUID.randomUUID().toString(); System.out.println("First Iteration to load all partitions"); - Dumper d = new Dumper(metaClient.getFs(), new Path(reportDir, - String.format("1_%s.csv", dumpPrefix))); + Dumper d = new Dumper(metaClient.getFs(), new Path(reportDir, String.format("1_%s.csv", dumpPrefix))); d.init(); d.dump(runLookups(jsc, selected, fsView, 1, 0)); d.close(); System.out.println("\n\n\n First Iteration is done"); - Dumper d2 = new Dumper(metaClient.getFs(), new Path(reportDir, - String.format("2_%s.csv", dumpPrefix))); + Dumper d2 = new Dumper(metaClient.getFs(), new Path(reportDir, String.format("2_%s.csv", dumpPrefix))); d2.init(); d2.dump(runLookups(jsc, selected, fsView, cfg.numIterations, cfg.numCoresPerExecutor)); d2.close(); @@ -164,8 +162,8 @@ public class TimelineServerPerf implements Serializable { long beginTs = System.currentTimeMillis(); Option c = fsView.getLatestFileSlice(partition, fileId); long endTs = System.currentTimeMillis(); - System.out.println("Latest File Slice for part=" + partition + ", fileId=" - + fileId + ", Slice=" + c + ", Time=" + (endTs - beginTs)); + System.out.println("Latest File Slice for part=" + partition + ", fileId=" + fileId + ", Slice=" + c + ", Time=" + + (endTs - beginTs)); latencyHistogram.update(endTs - beginTs); } return new PerfStats(partition, id, latencyHistogram.getSnapshot()); @@ -288,8 +286,7 @@ public class TimelineServerPerf implements Serializable { description = "Directory where spilled view entries will be stored. Used for SPILLABLE_DISK storage type") public String baseStorePathForFileGroups = FileSystemViewStorageConfig.DEFAULT_VIEW_SPILLABLE_DIR; - @Parameter(names = {"--rocksdb-path", "-rp"}, - description = "Root directory for RocksDB") + @Parameter(names = {"--rocksdb-path", "-rp"}, description = "Root directory for RocksDB") public String rocksDBPath = FileSystemViewStorageConfig.DEFAULT_ROCKSDB_BASE_PATH; @Parameter(names = {"--wait-for-manual-queries", "-ww"}) diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/FilebasedSchemaProvider.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/FilebasedSchemaProvider.java index 3c9102b44..3eb3a44ec 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/FilebasedSchemaProvider.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/FilebasedSchemaProvider.java @@ -38,10 +38,8 @@ public class FilebasedSchemaProvider extends SchemaProvider { * Configs supported */ public static class Config { - private static final String SOURCE_SCHEMA_FILE_PROP = "hoodie.deltastreamer.schemaprovider" - + ".source.schema.file"; - private static final String TARGET_SCHEMA_FILE_PROP = "hoodie.deltastreamer.schemaprovider" - + ".target.schema.file"; + private static final String SOURCE_SCHEMA_FILE_PROP = "hoodie.deltastreamer.schemaprovider" + ".source.schema.file"; + private static final String TARGET_SCHEMA_FILE_PROP = "hoodie.deltastreamer.schemaprovider" + ".target.schema.file"; } private final FileSystem fs; @@ -55,11 +53,10 @@ public class FilebasedSchemaProvider extends SchemaProvider { DataSourceUtils.checkRequiredProperties(props, Collections.singletonList(Config.SOURCE_SCHEMA_FILE_PROP)); this.fs = FSUtils.getFs(props.getString(Config.SOURCE_SCHEMA_FILE_PROP), jssc.hadoopConfiguration()); try { - this.sourceSchema = new Schema.Parser().parse( - fs.open(new Path(props.getString(Config.SOURCE_SCHEMA_FILE_PROP)))); + this.sourceSchema = new Schema.Parser().parse(fs.open(new Path(props.getString(Config.SOURCE_SCHEMA_FILE_PROP)))); if (props.containsKey(Config.TARGET_SCHEMA_FILE_PROP)) { - this.targetSchema = new Schema.Parser().parse( - fs.open(new Path(props.getString(Config.TARGET_SCHEMA_FILE_PROP)))); + this.targetSchema = + new Schema.Parser().parse(fs.open(new Path(props.getString(Config.TARGET_SCHEMA_FILE_PROP)))); } } catch (IOException ioe) { throw new HoodieIOException("Error reading schema", ioe); diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/NullTargetSchemaRegistryProvider.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/NullTargetSchemaRegistryProvider.java index 109b499fa..d7415a05e 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/NullTargetSchemaRegistryProvider.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/NullTargetSchemaRegistryProvider.java @@ -23,9 +23,8 @@ import org.apache.hudi.common.util.TypedProperties; import org.apache.spark.api.java.JavaSparkContext; /** - * Schema provider that will force DeltaStreamer to infer target schema from the dataset. - * It can be used with SQL or Flattening transformers to avoid having a target schema in the schema - * registry. + * Schema provider that will force DeltaStreamer to infer target schema from the dataset. It can be used with SQL or + * Flattening transformers to avoid having a target schema in the schema registry. */ public class NullTargetSchemaRegistryProvider extends SchemaRegistryProvider { diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/RowBasedSchemaProvider.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/RowBasedSchemaProvider.java index 22a36b40a..3cec79c52 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/RowBasedSchemaProvider.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/RowBasedSchemaProvider.java @@ -38,6 +38,6 @@ public class RowBasedSchemaProvider extends SchemaProvider { @Override public Schema getSourceSchema() { return AvroConversionUtils.convertStructTypeToAvroSchema(rowStruct, HOODIE_RECORD_STRUCT_NAME, - HOODIE_RECORD_NAMESPACE); + HOODIE_RECORD_NAMESPACE); } } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/AvroDFSSource.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/AvroDFSSource.java index 83e81e80e..b7f6f8c73 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/AvroDFSSource.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/AvroDFSSource.java @@ -46,20 +46,17 @@ public class AvroDFSSource extends AvroSource { } @Override - protected InputBatch> fetchNewData(Option lastCkptStr, - long sourceLimit) { + protected InputBatch> fetchNewData(Option lastCkptStr, long sourceLimit) { Pair, String> selectPathsWithMaxModificationTime = pathSelector.getNextFilePathsAndMaxModificationTime(lastCkptStr, sourceLimit); - return selectPathsWithMaxModificationTime.getLeft().map(pathStr -> new InputBatch<>( - Option.of(fromFiles(pathStr)), - selectPathsWithMaxModificationTime.getRight())) + return selectPathsWithMaxModificationTime.getLeft() + .map(pathStr -> new InputBatch<>(Option.of(fromFiles(pathStr)), selectPathsWithMaxModificationTime.getRight())) .orElseGet(() -> new InputBatch<>(Option.empty(), selectPathsWithMaxModificationTime.getRight())); } private JavaRDD fromFiles(String pathStr) { - JavaPairRDD avroRDD = sparkContext.newAPIHadoopFile(pathStr, - AvroKeyInputFormat.class, AvroKey.class, NullWritable.class, - sparkContext.hadoopConfiguration()); + JavaPairRDD avroRDD = sparkContext.newAPIHadoopFile(pathStr, AvroKeyInputFormat.class, + AvroKey.class, NullWritable.class, sparkContext.hadoopConfiguration()); return avroRDD.keys().map(r -> ((GenericRecord) r.datum())); } } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/AvroKafkaSource.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/AvroKafkaSource.java index 3b051ac15..2ce8b4371 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/AvroKafkaSource.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/AvroKafkaSource.java @@ -50,24 +50,21 @@ public class AvroKafkaSource extends AvroSource { } @Override - protected InputBatch> fetchNewData(Option lastCheckpointStr, - long sourceLimit) { + protected InputBatch> fetchNewData(Option lastCheckpointStr, long sourceLimit) { OffsetRange[] offsetRanges = offsetGen.getNextOffsetRanges(lastCheckpointStr, sourceLimit); long totalNewMsgs = CheckpointUtils.totalNewMessages(offsetRanges); if (totalNewMsgs <= 0) { - return new InputBatch<>(Option.empty(), - lastCheckpointStr.isPresent() ? lastCheckpointStr.get() : ""); + return new InputBatch<>(Option.empty(), lastCheckpointStr.isPresent() ? lastCheckpointStr.get() : ""); } else { log.info("About to read " + totalNewMsgs + " from Kafka for topic :" + offsetGen.getTopicName()); } JavaRDD newDataRDD = toRDD(offsetRanges); - return new InputBatch<>(Option.of(newDataRDD), - KafkaOffsetGen.CheckpointUtils.offsetsToStr(offsetRanges)); + return new InputBatch<>(Option.of(newDataRDD), KafkaOffsetGen.CheckpointUtils.offsetsToStr(offsetRanges)); } private JavaRDD toRDD(OffsetRange[] offsetRanges) { - JavaRDD recordRDD = KafkaUtils - .createRDD(sparkContext, String.class, Object.class, StringDecoder.class, KafkaAvroDecoder.class, + JavaRDD recordRDD = + KafkaUtils.createRDD(sparkContext, String.class, Object.class, StringDecoder.class, KafkaAvroDecoder.class, offsetGen.getKafkaParams(), offsetRanges).values().map(obj -> (GenericRecord) obj); return recordRDD; } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/AvroSource.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/AvroSource.java index 815864029..3137cd6fc 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/AvroSource.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/AvroSource.java @@ -27,9 +27,7 @@ import org.apache.spark.sql.SparkSession; public abstract class AvroSource extends Source> { - public AvroSource(TypedProperties props, - JavaSparkContext sparkContext, - SparkSession sparkSession, + public AvroSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession, SchemaProvider schemaProvider) { super(props, sparkContext, sparkSession, schemaProvider, SourceType.AVRO); } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/HiveIncrPullSource.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/HiveIncrPullSource.java index 97a5bac79..621b6fc00 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/HiveIncrPullSource.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/HiveIncrPullSource.java @@ -46,13 +46,12 @@ import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SparkSession; /** - * Source to read deltas produced by {@link HiveIncrementalPuller}, commit by commit and apply - * to the target table + * Source to read deltas produced by {@link HiveIncrementalPuller}, commit by commit and apply to the target table *

    * The general idea here is to have commits sync across the data pipeline. *

    - * [Source Tables(s)] ====> HiveIncrementalScanner ==> incrPullRootPath ==> targetTable {c1,c2,c3,...} - * {c1,c2,c3,...} {c1,c2,c3,...} + * [Source Tables(s)] ====> HiveIncrementalScanner ==> incrPullRootPath ==> targetTable {c1,c2,c3,...} {c1,c2,c3,...} + * {c1,c2,c3,...} *

    * This produces beautiful causality, that makes data issues in ETLs very easy to debug */ @@ -84,8 +83,7 @@ public class HiveIncrPullSource extends AvroSource { /** * Finds the first commit from source, greater than the target's last commit, and reads it out. */ - private Option findCommitToPull(Option latestTargetCommit) - throws IOException { + private Option findCommitToPull(Option latestTargetCommit) throws IOException { log.info("Looking for commits "); @@ -104,7 +102,7 @@ public class HiveIncrPullSource extends AvroSource { } for (String commitTime : commitTimes) { - //TODO(vc): Add an option to delete consumed commits + // TODO(vc): Add an option to delete consumed commits if (commitTime.compareTo(latestTargetCommit.get()) > 0) { return Option.of(commitTime); } @@ -113,30 +111,24 @@ public class HiveIncrPullSource extends AvroSource { } @Override - protected InputBatch> fetchNewData( - Option lastCheckpointStr, long sourceLimit) { + protected InputBatch> fetchNewData(Option lastCheckpointStr, long sourceLimit) { try { // find the source commit to pull Option commitToPull = findCommitToPull(lastCheckpointStr); if (!commitToPull.isPresent()) { - return new InputBatch<>(Option.empty(), - lastCheckpointStr.isPresent() ? lastCheckpointStr.get() : ""); + return new InputBatch<>(Option.empty(), lastCheckpointStr.isPresent() ? lastCheckpointStr.get() : ""); } // read the files out. - List commitDeltaFiles = Arrays.asList( - fs.listStatus(new Path(incrPullRootPath, commitToPull.get()))); - String pathStr = commitDeltaFiles.stream().map(f -> f.getPath().toString()) - .collect(Collectors.joining(",")); - JavaPairRDD avroRDD = sparkContext.newAPIHadoopFile(pathStr, - AvroKeyInputFormat.class, AvroKey.class, NullWritable.class, - sparkContext.hadoopConfiguration()); + List commitDeltaFiles = Arrays.asList(fs.listStatus(new Path(incrPullRootPath, commitToPull.get()))); + String pathStr = commitDeltaFiles.stream().map(f -> f.getPath().toString()).collect(Collectors.joining(",")); + JavaPairRDD avroRDD = sparkContext.newAPIHadoopFile(pathStr, AvroKeyInputFormat.class, + AvroKey.class, NullWritable.class, sparkContext.hadoopConfiguration()); return new InputBatch<>(Option.of(avroRDD.keys().map(r -> ((GenericRecord) r.datum()))), String.valueOf(commitToPull.get())); } catch (IOException ioe) { - throw new HoodieIOException( - "Unable to read from source from checkpoint: " + lastCheckpointStr, ioe); + throw new HoodieIOException("Unable to read from source from checkpoint: " + lastCheckpointStr, ioe); } } } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/HoodieIncrSource.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/HoodieIncrSource.java index 5d95e7d4a..f5ed4e31e 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/HoodieIncrSource.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/HoodieIncrSource.java @@ -72,8 +72,7 @@ public class HoodieIncrSource extends RowSource { private static final Boolean DEFAULT_READ_LATEST_INSTANT_ON_MISSING_CKPT = false; } - public HoodieIncrSource(TypedProperties props, - JavaSparkContext sparkContext, SparkSession sparkSession, + public HoodieIncrSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession, SchemaProvider schemaProvider) { super(props, sparkContext, sparkSession, schemaProvider); } @@ -84,13 +83,12 @@ public class HoodieIncrSource extends RowSource { DataSourceUtils.checkRequiredProperties(props, Arrays.asList(Config.HOODIE_SRC_BASE_PATH)); /** - DataSourceUtils.checkRequiredProperties(props, Arrays.asList(Config.HOODIE_SRC_BASE_PATH, - Config.HOODIE_SRC_PARTITION_FIELDS)); - List partitionFields = props.getStringList(Config.HOODIE_SRC_PARTITION_FIELDS, ",", - new ArrayList<>()); - PartitionValueExtractor extractor = DataSourceUtils.createPartitionExtractor(props.getString( - Config.HOODIE_SRC_PARTITION_EXTRACTORCLASS, Config.DEFAULT_HOODIE_SRC_PARTITION_EXTRACTORCLASS)); - **/ + * DataSourceUtils.checkRequiredProperties(props, Arrays.asList(Config.HOODIE_SRC_BASE_PATH, + * Config.HOODIE_SRC_PARTITION_FIELDS)); List partitionFields = + * props.getStringList(Config.HOODIE_SRC_PARTITION_FIELDS, ",", new ArrayList<>()); PartitionValueExtractor + * extractor = DataSourceUtils.createPartitionExtractor(props.getString( Config.HOODIE_SRC_PARTITION_EXTRACTORCLASS, + * Config.DEFAULT_HOODIE_SRC_PARTITION_EXTRACTORCLASS)); + **/ String srcPath = props.getString(Config.HOODIE_SRC_BASE_PATH); int numInstantsPerFetch = props.getInteger(Config.NUM_INSTANTS_PER_FETCH, Config.DEFAULT_NUM_INSTANTS_PER_FETCH); boolean readLatestOnMissingCkpt = props.getBoolean(Config.READ_LATEST_INSTANT_ON_MISSING_CKPT, @@ -117,42 +115,31 @@ public class HoodieIncrSource extends RowSource { Dataset source = reader.load(srcPath); /** - log.info("Partition Fields are : (" + partitionFields + "). Initial Source Schema :" + source.schema()); - - StructType newSchema = new StructType(source.schema().fields()); - for (String field : partitionFields) { - newSchema = newSchema.add(field, DataTypes.StringType, true); - } - - /** - * Validates if the commit time is sane and also generates Partition fields from _hoodie_partition_path if + * log.info("Partition Fields are : (" + partitionFields + "). Initial Source Schema :" + source.schema()); + * + * StructType newSchema = new StructType(source.schema().fields()); for (String field : partitionFields) { newSchema + * = newSchema.add(field, DataTypes.StringType, true); } + * + * /** Validates if the commit time is sane and also generates Partition fields from _hoodie_partition_path if * configured * - Dataset validated = source.map((MapFunction) (Row row) -> { - // _hoodie_instant_time - String instantTime = row.getString(0); - IncrSourceHelper.validateInstantTime(row, instantTime, instantEndpts.getKey(), instantEndpts.getValue()); - if (!partitionFields.isEmpty()) { - // _hoodie_partition_path - String hoodiePartitionPath = row.getString(3); - List partitionVals = extractor.extractPartitionValuesInPath(hoodiePartitionPath).stream() - .map(o -> (Object) o).collect(Collectors.toList()); - Preconditions.checkArgument(partitionVals.size() == partitionFields.size(), - "#partition-fields != #partition-values-extracted"); - List rowObjs = new ArrayList<>(scala.collection.JavaConversions.seqAsJavaList(row.toSeq())); - rowObjs.addAll(partitionVals); - return RowFactory.create(rowObjs.toArray()); - } - return row; - }, RowEncoder.apply(newSchema)); - - log.info("Validated Source Schema :" + validated.schema()); - **/ + * Dataset validated = source.map((MapFunction) (Row row) -> { // _hoodie_instant_time String + * instantTime = row.getString(0); IncrSourceHelper.validateInstantTime(row, instantTime, instantEndpts.getKey(), + * instantEndpts.getValue()); if (!partitionFields.isEmpty()) { // _hoodie_partition_path String hoodiePartitionPath + * = row.getString(3); List partitionVals = + * extractor.extractPartitionValuesInPath(hoodiePartitionPath).stream() .map(o -> (Object) + * o).collect(Collectors.toList()); Preconditions.checkArgument(partitionVals.size() == partitionFields.size(), + * "#partition-fields != #partition-values-extracted"); List rowObjs = new + * ArrayList<>(scala.collection.JavaConversions.seqAsJavaList(row.toSeq())); rowObjs.addAll(partitionVals); return + * RowFactory.create(rowObjs.toArray()); } return row; }, RowEncoder.apply(newSchema)); + * + * log.info("Validated Source Schema :" + validated.schema()); + **/ // Remove Hoodie meta columns except partition path from input source final Dataset src = source.drop(HoodieRecord.HOODIE_META_COLUMNS.stream() .filter(x -> !x.equals(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).toArray(String[]::new)); - //log.info("Final Schema from Source is :" + src.schema()); + // log.info("Final Schema from Source is :" + src.schema()); return Pair.of(Option.of(src), instantEndpts.getRight()); } } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/InputBatch.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/InputBatch.java index efb8dbeda..6f15dea97 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/InputBatch.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/InputBatch.java @@ -27,8 +27,7 @@ public class InputBatch { private final String checkpointForNextBatch; private final SchemaProvider schemaProvider; - public InputBatch(Option batch, String checkpointForNextBatch, - SchemaProvider schemaProvider) { + public InputBatch(Option batch, String checkpointForNextBatch, SchemaProvider schemaProvider) { this.batch = batch; this.checkpointForNextBatch = checkpointForNextBatch; this.schemaProvider = schemaProvider; diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/JsonDFSSource.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/JsonDFSSource.java index a801d2a4b..6be3a54ba 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/JsonDFSSource.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/JsonDFSSource.java @@ -41,12 +41,11 @@ public class JsonDFSSource extends JsonSource { } @Override - protected InputBatch> fetchNewData(Option lastCkptStr, - long sourceLimit) { + protected InputBatch> fetchNewData(Option lastCkptStr, long sourceLimit) { Pair, String> selPathsWithMaxModificationTime = pathSelector.getNextFilePathsAndMaxModificationTime(lastCkptStr, sourceLimit); - return selPathsWithMaxModificationTime.getLeft().map(pathStr -> new InputBatch<>( - Option.of(fromFiles(pathStr)), selPathsWithMaxModificationTime.getRight())) + return selPathsWithMaxModificationTime.getLeft() + .map(pathStr -> new InputBatch<>(Option.of(fromFiles(pathStr)), selPathsWithMaxModificationTime.getRight())) .orElse(new InputBatch<>(Option.empty(), selPathsWithMaxModificationTime.getRight())); } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/JsonKafkaSource.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/JsonKafkaSource.java index 867798dcb..8e95a8cec 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/JsonKafkaSource.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/JsonKafkaSource.java @@ -48,13 +48,11 @@ public class JsonKafkaSource extends JsonSource { } @Override - protected InputBatch> fetchNewData(Option lastCheckpointStr, - long sourceLimit) { + protected InputBatch> fetchNewData(Option lastCheckpointStr, long sourceLimit) { OffsetRange[] offsetRanges = offsetGen.getNextOffsetRanges(lastCheckpointStr, sourceLimit); long totalNewMsgs = CheckpointUtils.totalNewMessages(offsetRanges); if (totalNewMsgs <= 0) { - return new InputBatch<>(Option.empty(), - lastCheckpointStr.isPresent() ? lastCheckpointStr.get() : ""); + return new InputBatch<>(Option.empty(), lastCheckpointStr.isPresent() ? lastCheckpointStr.get() : ""); } log.info("About to read " + totalNewMsgs + " from Kafka for topic :" + offsetGen.getTopicName()); JavaRDD newDataRDD = toRDD(offsetRanges); diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/JsonSource.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/JsonSource.java index ac47c956b..5c9db183d 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/JsonSource.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/JsonSource.java @@ -26,9 +26,7 @@ import org.apache.spark.sql.SparkSession; public abstract class JsonSource extends Source> { - public JsonSource(TypedProperties props, - JavaSparkContext sparkContext, - SparkSession sparkSession, + public JsonSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession, SchemaProvider schemaProvider) { super(props, sparkContext, sparkSession, schemaProvider, SourceType.JSON); } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/RowSource.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/RowSource.java index 715ba8028..467c66710 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/RowSource.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/RowSource.java @@ -30,15 +30,12 @@ import org.apache.spark.sql.SparkSession; public abstract class RowSource extends Source> { - public RowSource(TypedProperties props, - JavaSparkContext sparkContext, - SparkSession sparkSession, + public RowSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession, SchemaProvider schemaProvider) { super(props, sparkContext, sparkSession, schemaProvider, SourceType.ROW); } - protected abstract Pair>, String> fetchNextBatch(Option lastCkptStr, - long sourceLimit); + protected abstract Pair>, String> fetchNextBatch(Option lastCkptStr, long sourceLimit); @Override protected final InputBatch> fetchNewData(Option lastCkptStr, long sourceLimit) { diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/Source.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/Source.java index 2900ae09d..ea57f4b47 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/Source.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/Source.java @@ -34,9 +34,7 @@ public abstract class Source implements Serializable { protected static volatile Logger log = LogManager.getLogger(Source.class); public enum SourceType { - JSON, - AVRO, - ROW + JSON, AVRO, ROW } protected transient TypedProperties props; @@ -64,6 +62,7 @@ public abstract class Source implements Serializable { /** * Main API called by Hoodie Delta Streamer to fetch records + * * @param lastCkptStr Last Checkpoint * @param sourceLimit Source Limit * @return @@ -71,8 +70,8 @@ public abstract class Source implements Serializable { public final InputBatch fetchNext(Option lastCkptStr, long sourceLimit) { InputBatch batch = fetchNewData(lastCkptStr, sourceLimit); // If overriddenSchemaProvider is passed in CLI, use it - return overriddenSchemaProvider == null ? batch : new InputBatch<>(batch.getBatch(), - batch.getCheckpointForNextBatch(), overriddenSchemaProvider); + return overriddenSchemaProvider == null ? batch + : new InputBatch<>(batch.getBatch(), batch.getCheckpointForNextBatch(), overriddenSchemaProvider); } public SourceType getSourceType() { diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/AvroConvertor.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/AvroConvertor.java index eeaf3f6f2..4049cb655 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/AvroConvertor.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/AvroConvertor.java @@ -27,8 +27,8 @@ import org.apache.avro.generic.GenericRecord; import org.apache.hudi.avro.MercifulJsonConverter; /** - * Convert a variety of datum into Avro GenericRecords. Has a bunch of lazy - * fields to circumvent issues around serializing these objects from driver to executors + * Convert a variety of datum into Avro GenericRecords. Has a bunch of lazy fields to circumvent issues around + * serializing these objects from driver to executors */ public class AvroConvertor implements Serializable { diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/DFSPathSelector.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/DFSPathSelector.java index fd5c67682..ce979d654 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/DFSPathSelector.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/DFSPathSelector.java @@ -59,18 +59,18 @@ public class DFSPathSelector { this.fs = FSUtils.getFs(props.getString(Config.ROOT_INPUT_PATH_PROP), hadoopConf); } - public Pair, String> getNextFilePathsAndMaxModificationTime( - Option lastCheckpointStr, long sourceLimit) { + public Pair, String> getNextFilePathsAndMaxModificationTime(Option lastCheckpointStr, + long sourceLimit) { try { // obtain all eligible files under root folder. List eligibleFiles = new ArrayList<>(); - RemoteIterator fitr = fs.listFiles( - new Path(props.getString(Config.ROOT_INPUT_PATH_PROP)), true); + RemoteIterator fitr = + fs.listFiles(new Path(props.getString(Config.ROOT_INPUT_PATH_PROP)), true); while (fitr.hasNext()) { LocatedFileStatus fileStatus = fitr.next(); - if (fileStatus.isDirectory() || IGNORE_FILEPREFIX_LIST.stream() - .anyMatch(pfx -> fileStatus.getPath().getName().startsWith(pfx))) { + if (fileStatus.isDirectory() + || IGNORE_FILEPREFIX_LIST.stream().anyMatch(pfx -> fileStatus.getPath().getName().startsWith(pfx))) { continue; } eligibleFiles.add(fileStatus); @@ -83,8 +83,7 @@ public class DFSPathSelector { long maxModificationTime = Long.MIN_VALUE; List filteredFiles = new ArrayList<>(); for (FileStatus f : eligibleFiles) { - if (lastCheckpointStr.isPresent() && f.getModificationTime() <= Long.valueOf( - lastCheckpointStr.get())) { + if (lastCheckpointStr.isPresent() && f.getModificationTime() <= Long.valueOf(lastCheckpointStr.get())) { // skip processed files continue; } @@ -101,20 +100,15 @@ public class DFSPathSelector { // no data to read if (filteredFiles.size() == 0) { - return new ImmutablePair<>(Option.empty(), - lastCheckpointStr.orElseGet(() -> String.valueOf(Long.MIN_VALUE))); + return new ImmutablePair<>(Option.empty(), lastCheckpointStr.orElseGet(() -> String.valueOf(Long.MIN_VALUE))); } // read the files out. - String pathStr = filteredFiles.stream().map(f -> f.getPath().toString()) - .collect(Collectors.joining(",")); + String pathStr = filteredFiles.stream().map(f -> f.getPath().toString()).collect(Collectors.joining(",")); - return new ImmutablePair<>( - Option.ofNullable(pathStr), - String.valueOf(maxModificationTime)); + return new ImmutablePair<>(Option.ofNullable(pathStr), String.valueOf(maxModificationTime)); } catch (IOException ioe) { - throw new HoodieIOException( - "Unable to read from source from checkpoint: " + lastCheckpointStr, ioe); + throw new HoodieIOException("Unable to read from source from checkpoint: " + lastCheckpointStr, ioe); } } } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/IncrSourceHelper.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/IncrSourceHelper.java index 021c8ee97..62d35cd7e 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/IncrSourceHelper.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/IncrSourceHelper.java @@ -51,13 +51,11 @@ public class IncrSourceHelper { * @param readLatestOnMissingBeginInstant when begin instant is missing, allow reading from latest committed instant * @return begin and end instants */ - public static Pair calculateBeginAndEndInstants( - JavaSparkContext jssc, String srcBasePath, int numInstantsPerFetch, Option beginInstant, - boolean readLatestOnMissingBeginInstant) { - Preconditions.checkArgument(numInstantsPerFetch > 0, "Make sure the config" - + " hoodie.deltastreamer.source.hoodieincr.num_instants is set to a positive value"); - HoodieTableMetaClient srcMetaClient = new HoodieTableMetaClient(jssc.hadoopConfiguration(), - srcBasePath, true); + public static Pair calculateBeginAndEndInstants(JavaSparkContext jssc, String srcBasePath, + int numInstantsPerFetch, Option beginInstant, boolean readLatestOnMissingBeginInstant) { + Preconditions.checkArgument(numInstantsPerFetch > 0, + "Make sure the config" + " hoodie.deltastreamer.source.hoodieincr.num_instants is set to a positive value"); + HoodieTableMetaClient srcMetaClient = new HoodieTableMetaClient(jssc.hadoopConfiguration(), srcBasePath, true); final HoodieTimeline activeCommitTimeline = srcMetaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(); @@ -72,11 +70,8 @@ public class IncrSourceHelper { } }); - Option nthInstant = Option.fromJavaOptional( - activeCommitTimeline - .findInstantsAfter(beginInstantTime, numInstantsPerFetch) - .getInstants() - .reduce((x, y) -> y)); + Option nthInstant = Option.fromJavaOptional(activeCommitTimeline + .findInstantsAfter(beginInstantTime, numInstantsPerFetch).getInstants().reduce((x, y) -> y)); return Pair.of(beginInstantTime, nthInstant.map(instant -> instant.getTimestamp()).orElse(beginInstantTime)); } @@ -90,14 +85,12 @@ public class IncrSourceHelper { */ public static void validateInstantTime(Row row, String instantTime, String sinceInstant, String endInstant) { Preconditions.checkNotNull(instantTime); - Preconditions.checkArgument(HoodieTimeline.compareTimestamps(instantTime, - sinceInstant, HoodieTimeline.GREATER), - "Instant time(_hoodie_commit_time) in row (" + row + ") was : " + instantTime - + "but expected to be between " + sinceInstant + "(excl) - " - + endInstant + "(incl)"); - Preconditions.checkArgument(HoodieTimeline.compareTimestamps(instantTime, - endInstant, HoodieTimeline.LESSER_OR_EQUAL), - "Instant time(_hoodie_commit_time) in row (" + row + ") was : " + instantTime - + "but expected to be between " + sinceInstant + "(excl) - " + endInstant + "(incl)"); + Preconditions.checkArgument(HoodieTimeline.compareTimestamps(instantTime, sinceInstant, HoodieTimeline.GREATER), + "Instant time(_hoodie_commit_time) in row (" + row + ") was : " + instantTime + "but expected to be between " + + sinceInstant + "(excl) - " + endInstant + "(incl)"); + Preconditions.checkArgument( + HoodieTimeline.compareTimestamps(instantTime, endInstant, HoodieTimeline.LESSER_OR_EQUAL), + "Instant time(_hoodie_commit_time) in row (" + row + ") was : " + instantTime + "but expected to be between " + + sinceInstant + "(excl) - " + endInstant + "(incl)"); } } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/KafkaOffsetGen.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/KafkaOffsetGen.java index 2b0fd8791..558e75710 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/KafkaOffsetGen.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/KafkaOffsetGen.java @@ -58,8 +58,7 @@ public class KafkaOffsetGen { /** * Reconstruct checkpoint from string. */ - public static HashMap strToOffsets( - String checkpointStr) { + public static HashMap strToOffsets(String checkpointStr) { HashMap offsetMap = new HashMap<>(); if (checkpointStr.length() == 0) { return offsetMap; @@ -75,14 +74,15 @@ public class KafkaOffsetGen { } /** - * String representation of checkpoint

    Format: topic1,0:offset0,1:offset1,2:offset2, ..... + * String representation of checkpoint + *

    + * Format: topic1,0:offset0,1:offset1,2:offset2, ..... */ public static String offsetsToStr(OffsetRange[] ranges) { StringBuilder sb = new StringBuilder(); // at least 1 partition will be present. sb.append(ranges[0].topic() + ","); - sb.append(Arrays.stream(ranges) - .map(r -> String.format("%s:%d", r.partition(), r.untilOffset())) + sb.append(Arrays.stream(ranges).map(r -> String.format("%s:%d", r.partition(), r.untilOffset())) .collect(Collectors.joining(","))); return sb.toString(); } @@ -94,10 +94,8 @@ public class KafkaOffsetGen { * @param toOffsetMap offsets of where each partitions is currently at * @param numEvents maximum number of events to read. */ - public static OffsetRange[] computeOffsetRanges( - HashMap fromOffsetMap, - HashMap toOffsetMap, - long numEvents) { + public static OffsetRange[] computeOffsetRanges(HashMap fromOffsetMap, + HashMap toOffsetMap, long numEvents) { Comparator byPartition = Comparator.comparing(OffsetRange::partition); @@ -114,8 +112,8 @@ public class KafkaOffsetGen { // keep going until we have events to allocate and partitions still not exhausted. while (allocedEvents < numEvents && exhaustedPartitions.size() < toOffsetMap.size()) { long remainingEvents = numEvents - allocedEvents; - long eventsPerPartition = (long) Math - .ceil((1.0 * remainingEvents) / (toOffsetMap.size() - exhaustedPartitions.size())); + long eventsPerPartition = + (long) Math.ceil((1.0 * remainingEvents) / (toOffsetMap.size() - exhaustedPartitions.size())); // Allocate the remaining events to non-exhausted partitions, in round robin fashion for (int i = 0; i < ranges.length; i++) { @@ -162,8 +160,7 @@ public class KafkaOffsetGen { * Kafka reset offset strategies */ enum KafkaResetOffsetStrategies { - LARGEST, - SMALLEST + LARGEST, SMALLEST } /** @@ -193,12 +190,11 @@ public class KafkaOffsetGen { // Obtain current metadata for the topic KafkaCluster cluster = new KafkaCluster(ScalaHelpers.toScalaMap(kafkaParams)); - Either, Set> either = cluster.getPartitions( - ScalaHelpers.toScalaSet(new HashSet<>(Collections.singletonList(topicName)))); + Either, Set> either = + cluster.getPartitions(ScalaHelpers.toScalaSet(new HashSet<>(Collections.singletonList(topicName)))); if (either.isLeft()) { // log errors. and bail out. - throw new HoodieDeltaStreamerException("Error obtaining partition metadata", - either.left().get().head()); + throw new HoodieDeltaStreamerException("Error obtaining partition metadata", either.left().get().head()); } Set topicPartitions = either.right().get(); @@ -208,26 +204,25 @@ public class KafkaOffsetGen { if (lastCheckpointStr.isPresent()) { fromOffsets = checkupValidOffsets(cluster, lastCheckpointStr, topicPartitions); } else { - KafkaResetOffsetStrategies autoResetValue = KafkaResetOffsetStrategies.valueOf( - props.getString("auto.offset.reset", Config.DEFAULT_AUTO_RESET_OFFSET.toString()).toUpperCase()); + KafkaResetOffsetStrategies autoResetValue = KafkaResetOffsetStrategies + .valueOf(props.getString("auto.offset.reset", Config.DEFAULT_AUTO_RESET_OFFSET.toString()).toUpperCase()); switch (autoResetValue) { case SMALLEST: - fromOffsets = new HashMap(ScalaHelpers.toJavaMap( - cluster.getEarliestLeaderOffsets(topicPartitions).right().get())); + fromOffsets = + new HashMap(ScalaHelpers.toJavaMap(cluster.getEarliestLeaderOffsets(topicPartitions).right().get())); break; case LARGEST: - fromOffsets = new HashMap( - ScalaHelpers.toJavaMap(cluster.getLatestLeaderOffsets(topicPartitions).right().get())); + fromOffsets = + new HashMap(ScalaHelpers.toJavaMap(cluster.getLatestLeaderOffsets(topicPartitions).right().get())); break; default: - throw new HoodieNotSupportedException( - "Auto reset value must be one of 'smallest' or 'largest' "); + throw new HoodieNotSupportedException("Auto reset value must be one of 'smallest' or 'largest' "); } } // Obtain the latest offsets. - HashMap toOffsets = new HashMap( - ScalaHelpers.toJavaMap(cluster.getLatestLeaderOffsets(topicPartitions).right().get())); + HashMap toOffsets = + new HashMap(ScalaHelpers.toJavaMap(cluster.getLatestLeaderOffsets(topicPartitions).right().get())); // Come up with final set of OffsetRanges to read (account for new partitions, limit number of events) long numEvents = Math.min(DEFAULT_MAX_EVENTS_TO_READ, sourceLimit); @@ -236,22 +231,17 @@ public class KafkaOffsetGen { return offsetRanges; } - // check up checkpoint offsets is valid or not, if true, return checkpoint offsets, + // check up checkpoint offsets is valid or not, if true, return checkpoint offsets, // else return earliest offsets - private HashMap checkupValidOffsets( - KafkaCluster cluster, - Option lastCheckpointStr, - Set topicPartitions) { + private HashMap checkupValidOffsets(KafkaCluster cluster, + Option lastCheckpointStr, Set topicPartitions) { HashMap checkpointOffsets = - CheckpointUtils.strToOffsets(lastCheckpointStr.get()); + CheckpointUtils.strToOffsets(lastCheckpointStr.get()); HashMap earliestOffsets = - new HashMap(ScalaHelpers.toJavaMap( - cluster.getEarliestLeaderOffsets(topicPartitions).right().get())); + new HashMap(ScalaHelpers.toJavaMap(cluster.getEarliestLeaderOffsets(topicPartitions).right().get())); - boolean checkpointOffsetReseter = checkpointOffsets.entrySet() - .stream() - .anyMatch(offset -> offset.getValue().offset() - < earliestOffsets.get(offset.getKey()).offset()); + boolean checkpointOffsetReseter = checkpointOffsets.entrySet().stream() + .anyMatch(offset -> offset.getValue().offset() < earliestOffsets.get(offset.getKey()).offset()); return checkpointOffsetReseter ? earliestOffsets : checkpointOffsets; } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/transform/FlatteningTransformer.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/transform/FlatteningTransformer.java index d029f6c65..858fbd8e0 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/transform/FlatteningTransformer.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/transform/FlatteningTransformer.java @@ -39,18 +39,14 @@ public class FlatteningTransformer implements Transformer { /** Configs supported */ @Override - public Dataset apply( - JavaSparkContext jsc, - SparkSession sparkSession, - Dataset rowDataset, + public Dataset apply(JavaSparkContext jsc, SparkSession sparkSession, Dataset rowDataset, TypedProperties properties) { // tmp table name doesn't like dashes String tmpTable = TMP_TABLE.concat(UUID.randomUUID().toString().replace("-", "_")); log.info("Registering tmp table : " + tmpTable); rowDataset.registerTempTable(tmpTable); - return sparkSession.sql("select " + flattenSchema(rowDataset.schema(), null) - + " from " + tmpTable); + return sparkSession.sql("select " + flattenSchema(rowDataset.schema(), null) + " from " + tmpTable); } public String flattenSchema(StructType schema, String prefix) { @@ -75,7 +71,7 @@ public class FlatteningTransformer implements Transformer { } if (selectSQLQuery.length() > 0) { - selectSQLQuery. deleteCharAt(selectSQLQuery.length() - 1); + selectSQLQuery.deleteCharAt(selectSQLQuery.length() - 1); } return selectSQLQuery.toString(); diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/transform/IdentityTransformer.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/transform/IdentityTransformer.java index 5a2c2c999..ffc0fadc1 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/transform/IdentityTransformer.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/transform/IdentityTransformer.java @@ -30,8 +30,8 @@ import org.apache.spark.sql.SparkSession; public class IdentityTransformer implements Transformer { @Override - public Dataset apply(JavaSparkContext jsc, SparkSession sparkSession, - Dataset rowDataset, TypedProperties properties) { + public Dataset apply(JavaSparkContext jsc, SparkSession sparkSession, Dataset rowDataset, + TypedProperties properties) { return rowDataset; } } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/transform/SqlQueryBasedTransformer.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/transform/SqlQueryBasedTransformer.java index ce78deb1d..b5e2e3733 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/transform/SqlQueryBasedTransformer.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/transform/SqlQueryBasedTransformer.java @@ -48,8 +48,8 @@ public class SqlQueryBasedTransformer implements Transformer { } @Override - public Dataset apply(JavaSparkContext jsc, SparkSession sparkSession, - Dataset rowDataset, TypedProperties properties) { + public Dataset apply(JavaSparkContext jsc, SparkSession sparkSession, Dataset rowDataset, + TypedProperties properties) { String transformerSQL = properties.getString(Config.TRANSFORMER_SQL); if (null == transformerSQL) { throw new IllegalArgumentException("Missing configuration : (" + Config.TRANSFORMER_SQL + ")"); diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/transform/Transformer.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/transform/Transformer.java index 8b3e42a08..7433f6fa9 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/transform/Transformer.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/transform/Transformer.java @@ -38,6 +38,5 @@ public interface Transformer { * @param properties Config properties * @return Transformed Dataset */ - Dataset apply(JavaSparkContext jsc, SparkSession sparkSession, - Dataset rowDataset, TypedProperties properties); + Dataset apply(JavaSparkContext jsc, SparkSession sparkSession, Dataset rowDataset, TypedProperties properties); } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/SchedulerConfGeneratorTest.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/SchedulerConfGeneratorTest.java index 7a249c91b..239564121 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/SchedulerConfGeneratorTest.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/SchedulerConfGeneratorTest.java @@ -43,8 +43,8 @@ public class SchedulerConfGeneratorTest { cfg.continuousMode = true; cfg.storageType = HoodieTableType.COPY_ON_WRITE.name(); configs = SchedulerConfGenerator.getSparkSchedulingConfigs(cfg); - assertNull("storageType is not MERGE_ON_READ", configs.get( - SchedulerConfGenerator.SPARK_SCHEDULER_ALLOCATION_FILE_KEY)); + assertNull("storageType is not MERGE_ON_READ", + configs.get(SchedulerConfGenerator.SPARK_SCHEDULER_ALLOCATION_FILE_KEY)); cfg.storageType = HoodieTableType.MERGE_ON_READ.name(); configs = SchedulerConfGenerator.getSparkSchedulingConfigs(cfg); diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestFlatteningTransformer.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestFlatteningTransformer.java index c5a2ab020..3e7d217ce 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestFlatteningTransformer.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestFlatteningTransformer.java @@ -34,23 +34,20 @@ public class TestFlatteningTransformer { FlatteningTransformer transformer = new FlatteningTransformer(); // Init - StructField[] nestedStructFields = new StructField[]{ - new StructField("nestedIntColumn", DataTypes.IntegerType, true, Metadata.empty()), - new StructField("nestedStringColumn", DataTypes.StringType, true, Metadata.empty()), - }; + StructField[] nestedStructFields = + new StructField[] {new StructField("nestedIntColumn", DataTypes.IntegerType, true, Metadata.empty()), + new StructField("nestedStringColumn", DataTypes.StringType, true, Metadata.empty()),}; - StructField[] structFields = new StructField[]{ - new StructField("intColumn", DataTypes.IntegerType, true, Metadata.empty()), - new StructField("stringColumn", DataTypes.StringType, true, Metadata.empty()), - new StructField("nestedStruct", DataTypes.createStructType(nestedStructFields), true, Metadata.empty()) - }; + StructField[] structFields = + new StructField[] {new StructField("intColumn", DataTypes.IntegerType, true, Metadata.empty()), + new StructField("stringColumn", DataTypes.StringType, true, Metadata.empty()), + new StructField("nestedStruct", DataTypes.createStructType(nestedStructFields), true, Metadata.empty())}; StructType schema = new StructType(structFields); String flattenedSql = transformer.flattenSchema(schema, null); assertEquals("intColumn as intColumn,stringColumn as stringColumn," - + "nestedStruct.nestedIntColumn as nestedStruct_nestedIntColumn," - + "nestedStruct.nestedStringColumn as nestedStruct_nestedStringColumn", - flattenedSql); + + "nestedStruct.nestedIntColumn as nestedStruct_nestedIntColumn," + + "nestedStruct.nestedStringColumn as nestedStruct_nestedStringColumn", flattenedSql); } } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHDFSParquetImporter.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHDFSParquetImporter.java index 6907dbcc7..2c170a264 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHDFSParquetImporter.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHDFSParquetImporter.java @@ -90,8 +90,7 @@ public class TestHDFSParquetImporter implements Serializable { jsc = getJavaSparkContext(); // Test root folder. - String basePath = (new Path(dfsBasePath, - Thread.currentThread().getStackTrace()[1].getMethodName())).toString(); + String basePath = (new Path(dfsBasePath, Thread.currentThread().getStackTrace()[1].getMethodName())).toString(); // Hoodie root folder Path hoodieFolder = new Path(basePath, "testTarget"); @@ -99,13 +98,12 @@ public class TestHDFSParquetImporter implements Serializable { // Create schema file. String schemaFile = new Path(basePath, "file.schema").toString(); - //Create generic records. + // Create generic records. Path srcFolder = new Path(basePath, "testSrc"); createRecords(srcFolder); - HDFSParquetImporter.Config cfg = getHDFSParquetImporterConfig(srcFolder.toString(), - hoodieFolder.toString(), "testTable", "COPY_ON_WRITE", "_row_key", "timestamp", 1, - schemaFile); + HDFSParquetImporter.Config cfg = getHDFSParquetImporterConfig(srcFolder.toString(), hoodieFolder.toString(), + "testTable", "COPY_ON_WRITE", "_row_key", "timestamp", 1, schemaFile); AtomicInteger retry = new AtomicInteger(3); AtomicInteger fileCreated = new AtomicInteger(0); HDFSParquetImporter dataImporter = new HDFSParquetImporter(cfg) { @@ -134,8 +132,7 @@ public class TestHDFSParquetImporter implements Serializable { RemoteIterator hoodieFiles = dfs.listFiles(hoodieFolder, true); while (hoodieFiles.hasNext()) { LocatedFileStatus f = hoodieFiles.next(); - isCommitFilePresent = - isCommitFilePresent || f.getPath().toString().endsWith(HoodieTimeline.COMMIT_EXTENSION); + isCommitFilePresent = isCommitFilePresent || f.getPath().toString().endsWith(HoodieTimeline.COMMIT_EXTENSION); if (f.getPath().toString().endsWith("parquet")) { SQLContext sc = new SQLContext(jsc); @@ -164,14 +161,11 @@ public class TestHDFSParquetImporter implements Serializable { long startTime = HoodieActiveTimeline.COMMIT_FORMATTER.parse("20170203000000").getTime() / 1000; List records = new ArrayList(); for (long recordNum = 0; recordNum < 96; recordNum++) { - records.add(HoodieTestDataGenerator - .generateGenericRecord(Long.toString(recordNum), "rider-" + recordNum, - "driver-" + recordNum, startTime + TimeUnit.HOURS.toSeconds(recordNum))); + records.add(HoodieTestDataGenerator.generateGenericRecord(Long.toString(recordNum), "rider-" + recordNum, + "driver-" + recordNum, startTime + TimeUnit.HOURS.toSeconds(recordNum))); } ParquetWriter writer = AvroParquetWriter.builder(srcFile) - .withSchema(HoodieTestDataGenerator.avroSchema) - .withConf(HoodieTestUtils.getDefaultHadoopConf()) - .build(); + .withSchema(HoodieTestDataGenerator.avroSchema).withConf(HoodieTestUtils.getDefaultHadoopConf()).build(); for (GenericRecord record : records) { writer.write(record); } @@ -194,15 +188,13 @@ public class TestHDFSParquetImporter implements Serializable { jsc = getJavaSparkContext(); // Test root folder. - String basePath = (new Path(dfsBasePath, - Thread.currentThread().getStackTrace()[1].getMethodName())).toString(); + String basePath = (new Path(dfsBasePath, Thread.currentThread().getStackTrace()[1].getMethodName())).toString(); // Hoodie root folder Path hoodieFolder = new Path(basePath, "testTarget"); Path srcFolder = new Path(basePath.toString(), "srcTest"); Path schemaFile = new Path(basePath.toString(), "missingFile.schema"); - HDFSParquetImporter.Config cfg = getHDFSParquetImporterConfig(srcFolder.toString(), - hoodieFolder.toString(), "testTable", "COPY_ON_WRITE", "_row_key", "timestamp", 1, - schemaFile.toString()); + HDFSParquetImporter.Config cfg = getHDFSParquetImporterConfig(srcFolder.toString(), hoodieFolder.toString(), + "testTable", "COPY_ON_WRITE", "_row_key", "timestamp", 1, schemaFile.toString()); HDFSParquetImporter dataImporter = new HDFSParquetImporter(cfg); // Should fail - return : -1. assertEquals(-1, dataImporter.dataImport(jsc, 0)); @@ -228,12 +220,11 @@ public class TestHDFSParquetImporter implements Serializable { jsc = getJavaSparkContext(); // Test root folder. - String basePath = (new Path(dfsBasePath, - Thread.currentThread().getStackTrace()[1].getMethodName())).toString(); + String basePath = (new Path(dfsBasePath, Thread.currentThread().getStackTrace()[1].getMethodName())).toString(); // Hoodie root folder Path hoodieFolder = new Path(basePath, "testTarget"); - //Create generic records. + // Create generic records. Path srcFolder = new Path(basePath, "testSrc"); createRecords(srcFolder); @@ -245,14 +236,14 @@ public class TestHDFSParquetImporter implements Serializable { HDFSParquetImporter.Config cfg; // Check for invalid row key. - cfg = getHDFSParquetImporterConfig(srcFolder.toString(), hoodieFolder.toString(), "testTable", - "COPY_ON_WRITE", "invalidRowKey", "timestamp", 1, schemaFile.toString()); + cfg = getHDFSParquetImporterConfig(srcFolder.toString(), hoodieFolder.toString(), "testTable", "COPY_ON_WRITE", + "invalidRowKey", "timestamp", 1, schemaFile.toString()); dataImporter = new HDFSParquetImporter(cfg); assertEquals(-1, dataImporter.dataImport(jsc, 0)); // Check for invalid partition key. - cfg = getHDFSParquetImporterConfig(srcFolder.toString(), hoodieFolder.toString(), "testTable", - "COPY_ON_WRITE", "_row_key", "invalidTimeStamp", 1, schemaFile.toString()); + cfg = getHDFSParquetImporterConfig(srcFolder.toString(), hoodieFolder.toString(), "testTable", "COPY_ON_WRITE", + "_row_key", "invalidTimeStamp", 1, schemaFile.toString()); dataImporter = new HDFSParquetImporter(cfg); assertEquals(-1, dataImporter.dataImport(jsc, 0)); @@ -263,9 +254,8 @@ public class TestHDFSParquetImporter implements Serializable { } } - private HDFSParquetImporter.Config getHDFSParquetImporterConfig(String srcPath, String targetPath, - String tableName, String tableType, String rowKey, String partitionKey, int parallelism, - String schemaFile) { + private HDFSParquetImporter.Config getHDFSParquetImporterConfig(String srcPath, String targetPath, String tableName, + String tableType, String rowKey, String partitionKey, int parallelism, String schemaFile) { HDFSParquetImporter.Config cfg = new HDFSParquetImporter.Config(); cfg.srcPath = srcPath; cfg.targetPath = targetPath; diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieDeltaStreamer.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieDeltaStreamer.java index 7f513692c..e0fedd6e4 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieDeltaStreamer.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieDeltaStreamer.java @@ -77,8 +77,7 @@ import org.junit.BeforeClass; import org.junit.Test; /** - * Basic tests against {@link HoodieDeltaStreamer}, by issuing bulk_inserts, - * upserts, inserts. Check counts at the end. + * Basic tests against {@link HoodieDeltaStreamer}, by issuing bulk_inserts, upserts, inserts. Check counts at the end. */ public class TestHoodieDeltaStreamer extends UtilitiesTestBase { @@ -124,8 +123,7 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase { // Source schema is the target schema of upstream table downstreamProps.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/target.avsc"); downstreamProps.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/target.avsc"); - UtilitiesTestBase.Helpers.savePropsToDFS(downstreamProps, dfs, - dfsBasePath + "/test-downstream-source.properties"); + UtilitiesTestBase.Helpers.savePropsToDFS(downstreamProps, dfs, dfsBasePath + "/test-downstream-source.properties"); // Properties used for testing invalid key generator TypedProperties invalidProps = new TypedProperties(); @@ -135,8 +133,7 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase { invalidProps.setProperty("hoodie.datasource.write.partitionpath.field", "not_there"); invalidProps.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/source.avsc"); invalidProps.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/target.avsc"); - UtilitiesTestBase.Helpers.savePropsToDFS(invalidProps, dfs, - dfsBasePath + "/" + PROPS_FILENAME_TEST_INVALID); + UtilitiesTestBase.Helpers.savePropsToDFS(invalidProps, dfs, dfsBasePath + "/" + PROPS_FILENAME_TEST_INVALID); } @AfterClass @@ -235,7 +232,7 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase { HoodieTableMetaClient meta = new HoodieTableMetaClient(fs.getConf(), datasetPath); HoodieTimeline timeline = meta.getActiveTimeline().getCommitTimeline().filterCompletedInstants(); log.info("Timeline Instants=" + meta.getActiveTimeline().getInstants().collect(Collectors.toList())); - int numCompactionCommits = (int)timeline.getInstants().count(); + int numCompactionCommits = (int) timeline.getInstants().count(); assertTrue("Got=" + numCompactionCommits + ", exp >=" + minExpected, minExpected <= numCompactionCommits); } @@ -243,7 +240,7 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase { HoodieTableMetaClient meta = new HoodieTableMetaClient(fs.getConf(), datasetPath); HoodieTimeline timeline = meta.getActiveTimeline().getDeltaCommitTimeline().filterCompletedInstants(); log.info("Timeline Instants=" + meta.getActiveTimeline().getInstants().collect(Collectors.toList())); - int numDeltaCommits = (int)timeline.getInstants().count(); + int numDeltaCommits = (int) timeline.getInstants().count(); assertTrue("Got=" + numDeltaCommits + ", exp >=" + minExpected, minExpected <= numDeltaCommits); } @@ -252,8 +249,8 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase { HoodieTableMetaClient meta = new HoodieTableMetaClient(fs.getConf(), datasetPath); HoodieTimeline timeline = meta.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); HoodieInstant lastInstant = timeline.lastInstant().get(); - HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes( - timeline.getInstantDetails(lastInstant).get(), HoodieCommitMetadata.class); + HoodieCommitMetadata commitMetadata = + HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(lastInstant).get(), HoodieCommitMetadata.class); assertEquals(totalCommits, timeline.countInstants()); assertEquals(expected, commitMetadata.getMetadata(HoodieDeltaStreamer.CHECKPOINT_KEY)); return lastInstant.getTimestamp(); @@ -279,28 +276,25 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase { @Test public void testProps() throws IOException { - TypedProperties props = new DFSPropertiesConfiguration( - dfs, new Path(dfsBasePath + "/" + PROPS_FILENAME_TEST_SOURCE)).getConfig(); + TypedProperties props = + new DFSPropertiesConfiguration(dfs, new Path(dfsBasePath + "/" + PROPS_FILENAME_TEST_SOURCE)).getConfig(); assertEquals(2, props.getInteger("hoodie.upsert.shuffle.parallelism")); assertEquals("_row_key", props.getString("hoodie.datasource.write.recordkey.field")); - assertEquals( - "org.apache.hudi.utilities.TestHoodieDeltaStreamer$TestGenerator", - props.getString("hoodie.datasource.write.keygenerator.class") - ); + assertEquals("org.apache.hudi.utilities.TestHoodieDeltaStreamer$TestGenerator", + props.getString("hoodie.datasource.write.keygenerator.class")); } @Test public void testPropsWithInvalidKeyGenerator() throws Exception { try { String datasetBasePath = dfsBasePath + "/test_dataset"; - HoodieDeltaStreamer deltaStreamer = new HoodieDeltaStreamer( - TestHelpers.makeConfig( - datasetBasePath, Operation.BULK_INSERT, TripsWithDistanceTransformer.class.getName(), - PROPS_FILENAME_TEST_INVALID, false), jsc); + HoodieDeltaStreamer deltaStreamer = + new HoodieDeltaStreamer(TestHelpers.makeConfig(datasetBasePath, Operation.BULK_INSERT, + TripsWithDistanceTransformer.class.getName(), PROPS_FILENAME_TEST_INVALID, false), jsc); deltaStreamer.sync(); fail("Should error out when setting the key generator class property to an invalid value"); } catch (IOException e) { - //expected + // expected log.error("Expected error during getting the key generator", e); assertTrue(e.getMessage().contains("Could not load key generator class")); } @@ -310,12 +304,12 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase { public void testDatasetCreation() throws Exception { try { dfs.mkdirs(new Path(dfsBasePath + "/not_a_dataset")); - HoodieDeltaStreamer deltaStreamer = new HoodieDeltaStreamer( - TestHelpers.makeConfig(dfsBasePath + "/not_a_dataset", Operation.BULK_INSERT), jsc); + HoodieDeltaStreamer deltaStreamer = + new HoodieDeltaStreamer(TestHelpers.makeConfig(dfsBasePath + "/not_a_dataset", Operation.BULK_INSERT), jsc); deltaStreamer.sync(); fail("Should error out when pointed out at a dir thats not a dataset"); } catch (DatasetNotFoundException e) { - //expected + // expected log.error("Expected error during dataset creation", e); } } @@ -395,11 +389,9 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase { } /** - * Test Bulk Insert and upserts with hive syncing. Tests Hudi incremental processing using a 2 step pipeline - * The first step involves using a SQL template to transform a source - * TEST-DATA-SOURCE ============================> HUDI TABLE 1 ===============> HUDI TABLE 2 - * (incr-pull with transform) (incr-pull) - * Hudi Table 1 is synced with Hive. + * Test Bulk Insert and upserts with hive syncing. Tests Hudi incremental processing using a 2 step pipeline The first + * step involves using a SQL template to transform a source TEST-DATA-SOURCE ============================> HUDI TABLE + * 1 ===============> HUDI TABLE 2 (incr-pull with transform) (incr-pull) Hudi Table 1 is synced with Hive. */ @Test public void testBulkInsertsAndUpsertsWithSQLBasedTransformerFor2StepPipeline() throws Exception { @@ -467,12 +459,11 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase { // Test Hive integration HoodieHiveClient hiveClient = new HoodieHiveClient(hiveSyncConfig, hiveServer.getHiveConf(), dfs); - assertTrue("Table " + hiveSyncConfig.tableName + " should exist", - hiveClient.doesTableExist()); + assertTrue("Table " + hiveSyncConfig.tableName + " should exist", hiveClient.doesTableExist()); assertEquals("Table partitions should match the number of partitions we wrote", 1, hiveClient.scanTablePartitions().size()); - assertEquals("The last commit that was sycned should be updated in the TBLPROPERTIES", - lastInstantForUpstreamTable, hiveClient.getLastCommitTimeSynced().get()); + assertEquals("The last commit that was sycned should be updated in the TBLPROPERTIES", lastInstantForUpstreamTable, + hiveClient.getLastCommitTimeSynced().get()); } @Test @@ -499,8 +490,7 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase { // Test with empty commits HoodieTableMetaClient mClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), datasetBasePath, true); - HoodieInstant lastFinished = - mClient.getCommitsTimeline().filterCompletedInstants().lastInstant().get(); + HoodieInstant lastFinished = mClient.getCommitsTimeline().filterCompletedInstants().lastInstant().get(); HoodieDeltaStreamer.Config cfg2 = TestHelpers.makeDropAllConfig(datasetBasePath, Operation.UPSERT); cfg2.filterDupes = true; cfg2.sourceLimit = 2000; @@ -509,14 +499,13 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase { HoodieDeltaStreamer ds2 = new HoodieDeltaStreamer(cfg2, jsc); ds2.sync(); mClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), datasetBasePath, true); - HoodieInstant newLastFinished = - mClient.getCommitsTimeline().filterCompletedInstants().lastInstant().get(); + HoodieInstant newLastFinished = mClient.getCommitsTimeline().filterCompletedInstants().lastInstant().get(); Assert.assertTrue(HoodieTimeline.compareTimestamps(newLastFinished.getTimestamp(), lastFinished.getTimestamp(), HoodieTimeline.GREATER)); // Ensure it is empty - HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes( - mClient.getActiveTimeline().getInstantDetails(newLastFinished).get(), HoodieCommitMetadata.class); + HoodieCommitMetadata commitMetadata = HoodieCommitMetadata + .fromBytes(mClient.getActiveTimeline().getInstantDetails(newLastFinished).get(), HoodieCommitMetadata.class); System.out.println("New Commit Metadata=" + commitMetadata); Assert.assertTrue(commitMetadata.getPartitionToWriteStats().isEmpty()); } @@ -527,8 +516,7 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase { props.setProperty(TestSourceConfig.MAX_UNIQUE_RECORDS_PROP, "1000"); props.setProperty(TestSourceConfig.NUM_SOURCE_PARTITIONS_PROP, "1"); props.setProperty(TestSourceConfig.USE_ROCKSDB_FOR_TEST_DATAGEN_KEYS, "true"); - DistributedTestDataSource distributedTestDataSource = new DistributedTestDataSource(props, - jsc, sparkSession, null); + DistributedTestDataSource distributedTestDataSource = new DistributedTestDataSource(props, jsc, sparkSession, null); InputBatch> batch = distributedTestDataSource.fetchNext(Option.empty(), 10000000); batch.getBatch().get().cache(); long c = batch.getBatch().get().count(); @@ -542,13 +530,10 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase { /** * Taken from https://stackoverflow.com/questions/3694380/calculating-distance-between-two-points-using-latitude- - * longitude-what-am-i-doi - * Calculate distance between two points in latitude and longitude taking - * into account height difference. If you are not interested in height - * difference pass 0.0. Uses Haversine method as its base. + * longitude-what-am-i-doi Calculate distance between two points in latitude and longitude taking into account + * height difference. If you are not interested in height difference pass 0.0. Uses Haversine method as its base. * - * lat1, lon1 Start point lat2, lon2 End point el1 Start altitude in meters - * el2 End altitude in meters + * lat1, lon1 Start point lat2, lon2 End point el1 Start altitude in meters el2 End altitude in meters * * @returns Distance in Meters */ @@ -559,9 +544,8 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase { double latDistance = Math.toRadians(lat2 - lat1); double lonDistance = Math.toRadians(lon2 - lon1); - double a = Math.sin(latDistance / 2) * Math.sin(latDistance / 2) - + Math.cos(Math.toRadians(lat1)) * Math.cos(Math.toRadians(lat2)) - * Math.sin(lonDistance / 2) * Math.sin(lonDistance / 2); + double a = Math.sin(latDistance / 2) * Math.sin(latDistance / 2) + Math.cos(Math.toRadians(lat1)) + * Math.cos(Math.toRadians(lat2)) * Math.sin(lonDistance / 2) * Math.sin(lonDistance / 2); double c = 2 * Math.atan2(Math.sqrt(a), Math.sqrt(1 - a)); double distance = R * c * 1000; // convert to meters @@ -579,12 +563,11 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase { public static class TripsWithDistanceTransformer implements Transformer { @Override - public Dataset apply(JavaSparkContext jsc, SparkSession sparkSession, - Dataset rowDataset, TypedProperties properties) { + public Dataset apply(JavaSparkContext jsc, SparkSession sparkSession, Dataset rowDataset, + TypedProperties properties) { rowDataset.sqlContext().udf().register("distance_udf", new DistanceUDF(), DataTypes.DoubleType); - return rowDataset.withColumn("haversine_distance", - functions.callUDF("distance_udf", functions.col("begin_lat"), - functions.col("end_lat"), functions.col("begin_lon"), functions.col("end_lat"))); + return rowDataset.withColumn("haversine_distance", functions.callUDF("distance_udf", functions.col("begin_lat"), + functions.col("end_lat"), functions.col("begin_lon"), functions.col("end_lat"))); } } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieSnapshotCopier.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieSnapshotCopier.java index 1ab1f0420..f64c38bff 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieSnapshotCopier.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieSnapshotCopier.java @@ -77,8 +77,8 @@ public class TestHoodieSnapshotCopier extends HoodieCommonTestHarness { assertFalse(fs.exists(new Path(outputPath + "/_SUCCESS"))); } - //TODO - uncomment this after fixing test failures - //@Test + // TODO - uncomment this after fixing test failures + // @Test public void testSnapshotCopy() throws Exception { // Generate some commits and corresponding parquets String commitTime1 = "20160501010101"; @@ -95,40 +95,30 @@ public class TestHoodieSnapshotCopier extends HoodieCommonTestHarness { new File(basePath + "/2016/05/01/").mkdirs(); new File(basePath + "/2016/05/02/").mkdirs(); new File(basePath + "/2016/05/06/").mkdirs(); - HoodieTestDataGenerator - .writePartitionMetadata(fs, new String[] {"2016/05/01", "2016/05/02", "2016/05/06"}, - basePath); + HoodieTestDataGenerator.writePartitionMetadata(fs, new String[] {"2016/05/01", "2016/05/02", "2016/05/06"}, + basePath); // Make commit1 - File file11 = new File( - basePath + "/2016/05/01/" + FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, "id11")); + File file11 = new File(basePath + "/2016/05/01/" + FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, "id11")); file11.createNewFile(); - File file12 = new File( - basePath + "/2016/05/02/" + FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, "id12")); + File file12 = new File(basePath + "/2016/05/02/" + FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, "id12")); file12.createNewFile(); - File file13 = new File( - basePath + "/2016/05/06/" + FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, "id13")); + File file13 = new File(basePath + "/2016/05/06/" + FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, "id13")); file13.createNewFile(); // Make commit2 - File file21 = new File( - basePath + "/2016/05/01/" + FSUtils.makeDataFileName(commitTime2, TEST_WRITE_TOKEN, "id21")); + File file21 = new File(basePath + "/2016/05/01/" + FSUtils.makeDataFileName(commitTime2, TEST_WRITE_TOKEN, "id21")); file21.createNewFile(); - File file22 = new File( - basePath + "/2016/05/02/" + FSUtils.makeDataFileName(commitTime2, TEST_WRITE_TOKEN, "id22")); + File file22 = new File(basePath + "/2016/05/02/" + FSUtils.makeDataFileName(commitTime2, TEST_WRITE_TOKEN, "id22")); file22.createNewFile(); - File file23 = new File( - basePath + "/2016/05/06/" + FSUtils.makeDataFileName(commitTime2, TEST_WRITE_TOKEN, "id23")); + File file23 = new File(basePath + "/2016/05/06/" + FSUtils.makeDataFileName(commitTime2, TEST_WRITE_TOKEN, "id23")); file23.createNewFile(); // Make commit3 - File file31 = new File( - basePath + "/2016/05/01/" + FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, "id31")); + File file31 = new File(basePath + "/2016/05/01/" + FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, "id31")); file31.createNewFile(); - File file32 = new File( - basePath + "/2016/05/02/" + FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, "id32")); + File file32 = new File(basePath + "/2016/05/02/" + FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, "id32")); file32.createNewFile(); - File file33 = new File( - basePath + "/2016/05/06/" + FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, "id33")); + File file33 = new File(basePath + "/2016/05/06/" + FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, "id33")); file33.createNewFile(); // Do a snapshot copy diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/UtilitiesTestBase.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/UtilitiesTestBase.java index 2aec7dc16..2125483b8 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/UtilitiesTestBase.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/UtilitiesTestBase.java @@ -110,6 +110,7 @@ public class UtilitiesTestBase { /** * Helper to get hive sync config + * * @param basePath * @param tableName * @return @@ -130,6 +131,7 @@ public class UtilitiesTestBase { /** * Initialize Hive DB + * * @throws IOException */ private static void clearHiveDb() throws IOException { @@ -151,8 +153,8 @@ public class UtilitiesTestBase { private static ClassLoader classLoader = Helpers.class.getClassLoader(); public static void copyToDFS(String testResourcePath, FileSystem fs, String targetPath) throws IOException { - BufferedReader reader = new BufferedReader( - new InputStreamReader(classLoader.getResourceAsStream(testResourcePath))); + BufferedReader reader = + new BufferedReader(new InputStreamReader(classLoader.getResourceAsStream(testResourcePath))); PrintStream os = new PrintStream(fs.create(new Path(targetPath), true)); String line; while ((line = reader.readLine()) != null) { diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/AbstractBaseTestSource.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/AbstractBaseTestSource.java index 390454b8f..58a7d580f 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/AbstractBaseTestSource.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/AbstractBaseTestSource.java @@ -69,16 +69,15 @@ public abstract class AbstractBaseTestSource extends AvroSource { dataGeneratorMap.clear(); } - protected AbstractBaseTestSource(TypedProperties props, - JavaSparkContext sparkContext, SparkSession sparkSession, + protected AbstractBaseTestSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession, SchemaProvider schemaProvider) { super(props, sparkContext, sparkSession, schemaProvider); } protected static Stream fetchNextBatch(TypedProperties props, int sourceLimit, String commitTime, int partition) { - int maxUniqueKeys = props.getInteger(TestSourceConfig.MAX_UNIQUE_RECORDS_PROP, - TestSourceConfig.DEFAULT_MAX_UNIQUE_RECORDS); + int maxUniqueKeys = + props.getInteger(TestSourceConfig.MAX_UNIQUE_RECORDS_PROP, TestSourceConfig.DEFAULT_MAX_UNIQUE_RECORDS); HoodieTestDataGenerator dataGenerator = dataGeneratorMap.get(partition); diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/DistributedTestDataSource.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/DistributedTestDataSource.java index 6fa8efd41..8e877678f 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/DistributedTestDataSource.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/DistributedTestDataSource.java @@ -37,12 +37,11 @@ public class DistributedTestDataSource extends AbstractBaseTestSource { private final int numTestSourcePartitions; - public DistributedTestDataSource(TypedProperties props, - JavaSparkContext sparkContext, SparkSession sparkSession, + public DistributedTestDataSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession, SchemaProvider schemaProvider) { super(props, sparkContext, sparkSession, schemaProvider); - this.numTestSourcePartitions = props.getInteger(TestSourceConfig.NUM_SOURCE_PARTITIONS_PROP, - TestSourceConfig.DEFAULT_NUM_SOURCE_PARTITIONS); + this.numTestSourcePartitions = + props.getInteger(TestSourceConfig.NUM_SOURCE_PARTITIONS_PROP, TestSourceConfig.DEFAULT_NUM_SOURCE_PARTITIONS); } @Override @@ -60,20 +59,21 @@ public class DistributedTestDataSource extends AbstractBaseTestSource { newProps.putAll(props); // Set the maxUniqueRecords per partition for TestDataSource - int maxUniqueRecords = props.getInteger(TestSourceConfig.MAX_UNIQUE_RECORDS_PROP, - TestSourceConfig.DEFAULT_MAX_UNIQUE_RECORDS); + int maxUniqueRecords = + props.getInteger(TestSourceConfig.MAX_UNIQUE_RECORDS_PROP, TestSourceConfig.DEFAULT_MAX_UNIQUE_RECORDS); String maxUniqueRecordsPerPartition = String.valueOf(Math.max(1, maxUniqueRecords / numTestSourcePartitions)); newProps.setProperty(TestSourceConfig.MAX_UNIQUE_RECORDS_PROP, maxUniqueRecordsPerPartition); int perPartitionSourceLimit = Math.max(1, (int) (sourceLimit / numTestSourcePartitions)); - JavaRDD avroRDD = sparkContext.parallelize(IntStream.range(0, numTestSourcePartitions).boxed() - .collect(Collectors.toList()), numTestSourcePartitions).mapPartitionsWithIndex((p, idx) -> { - log.info("Initializing source with newProps=" + newProps); - if (!dataGeneratorMap.containsKey(p)) { - initDataGen(newProps, p); - } - Iterator itr = fetchNextBatch(newProps, perPartitionSourceLimit, commitTime, p).iterator(); - return itr; - }, true); + JavaRDD avroRDD = + sparkContext.parallelize(IntStream.range(0, numTestSourcePartitions).boxed().collect(Collectors.toList()), + numTestSourcePartitions).mapPartitionsWithIndex((p, idx) -> { + log.info("Initializing source with newProps=" + newProps); + if (!dataGeneratorMap.containsKey(p)) { + initDataGen(newProps, p); + } + Iterator itr = fetchNextBatch(newProps, perPartitionSourceLimit, commitTime, p).iterator(); + return itr; + }, true); return new InputBatch<>(Option.of(avroRDD), commitTime); } } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestDFSSource.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestDFSSource.java index ff51d126c..9ee32855a 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestDFSSource.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestDFSSource.java @@ -80,16 +80,13 @@ public class TestDFSSource extends UtilitiesTestBase { // 1. Extract without any checkpoint => get all the data, respecting sourceLimit assertEquals(Option.empty(), jsonSource.fetchNewDataInAvroFormat(Option.empty(), Long.MAX_VALUE).getBatch()); - UtilitiesTestBase.Helpers.saveStringsToDFS( - Helpers.jsonifyRecords(dataGenerator.generateInserts("000", 100)), dfs, + UtilitiesTestBase.Helpers.saveStringsToDFS(Helpers.jsonifyRecords(dataGenerator.generateInserts("000", 100)), dfs, dfsBasePath + "/jsonFiles/1.json"); assertEquals(Option.empty(), jsonSource.fetchNewDataInAvroFormat(Option.empty(), 10).getBatch()); - InputBatch> fetch1 = - jsonSource.fetchNewDataInAvroFormat(Option.empty(), 1000000); + InputBatch> fetch1 = jsonSource.fetchNewDataInAvroFormat(Option.empty(), 1000000); assertEquals(100, fetch1.getBatch().get().count()); // Test json -> Row format - InputBatch> fetch1AsRows = - jsonSource.fetchNewDataInRowFormat(Option.empty(), 1000000); + InputBatch> fetch1AsRows = jsonSource.fetchNewDataInRowFormat(Option.empty(), 1000000); assertEquals(100, fetch1AsRows.getBatch().get().count()); // Test Avro -> Row format Dataset fetch1Rows = AvroConversionUtils.createDataFrame(JavaRDD.toRDD(fetch1.getBatch().get()), @@ -97,16 +94,15 @@ public class TestDFSSource extends UtilitiesTestBase { assertEquals(100, fetch1Rows.count()); // 2. Produce new data, extract new data - UtilitiesTestBase.Helpers.saveStringsToDFS( - Helpers.jsonifyRecords(dataGenerator.generateInserts("001", 10000)), - dfs, dfsBasePath + "/jsonFiles/2.json"); - InputBatch> fetch2 = jsonSource.fetchNewDataInRowFormat( - Option.of(fetch1.getCheckpointForNextBatch()), Long.MAX_VALUE); + UtilitiesTestBase.Helpers.saveStringsToDFS(Helpers.jsonifyRecords(dataGenerator.generateInserts("001", 10000)), dfs, + dfsBasePath + "/jsonFiles/2.json"); + InputBatch> fetch2 = + jsonSource.fetchNewDataInRowFormat(Option.of(fetch1.getCheckpointForNextBatch()), Long.MAX_VALUE); assertEquals(10000, fetch2.getBatch().get().count()); // 3. Extract with previous checkpoint => gives same data back (idempotent) - InputBatch> fetch3 = jsonSource.fetchNewDataInRowFormat( - Option.of(fetch1.getCheckpointForNextBatch()), Long.MAX_VALUE); + InputBatch> fetch3 = + jsonSource.fetchNewDataInRowFormat(Option.of(fetch1.getCheckpointForNextBatch()), Long.MAX_VALUE); assertEquals(10000, fetch3.getBatch().get().count()); assertEquals(fetch2.getCheckpointForNextBatch(), fetch3.getCheckpointForNextBatch()); fetch3.getBatch().get().registerTempTable("test_dfs_table"); @@ -114,8 +110,8 @@ public class TestDFSSource extends UtilitiesTestBase { assertEquals(10000, rowDataset.count()); // 4. Extract with latest checkpoint => no new data returned - InputBatch> fetch4 = jsonSource.fetchNewDataInAvroFormat( - Option.of(fetch2.getCheckpointForNextBatch()), Long.MAX_VALUE); + InputBatch> fetch4 = + jsonSource.fetchNewDataInAvroFormat(Option.of(fetch2.getCheckpointForNextBatch()), Long.MAX_VALUE); assertEquals(Option.empty(), fetch4.getBatch()); } } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestDataSource.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestDataSource.java index ec5f7db94..1ba75eab0 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestDataSource.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestDataSource.java @@ -44,8 +44,7 @@ public class TestDataSource extends AbstractBaseTestSource { } @Override - protected InputBatch> fetchNewData(Option lastCheckpointStr, - long sourceLimit) { + protected InputBatch> fetchNewData(Option lastCheckpointStr, long sourceLimit) { int nextCommitNum = lastCheckpointStr.map(s -> Integer.parseInt(s) + 1).orElse(0); String commitTime = String.format("%05d", nextCommitNum); @@ -56,8 +55,8 @@ public class TestDataSource extends AbstractBaseTestSource { return new InputBatch<>(Option.empty(), lastCheckpointStr.orElse(null)); } - List records = fetchNextBatch(props, (int)sourceLimit, commitTime, DEFAULT_PARTITION_NUM) - .collect(Collectors.toList()); + List records = + fetchNextBatch(props, (int) sourceLimit, commitTime, DEFAULT_PARTITION_NUM).collect(Collectors.toList()); JavaRDD avroRDD = sparkContext.parallelize(records, 4); return new InputBatch<>(Option.of(avroRDD), commitTime); } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestKafkaSource.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestKafkaSource.java index 9e31a60ed..9825ae6f7 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestKafkaSource.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestKafkaSource.java @@ -107,13 +107,13 @@ public class TestKafkaSource extends UtilitiesTestBase { // 2. Produce new data, extract new data testUtils.sendMessages(TEST_TOPIC_NAME, Helpers.jsonifyRecords(dataGenerator.generateInserts("001", 1000))); - InputBatch> fetch2 = kafkaSource.fetchNewDataInRowFormat( - Option.of(fetch1.getCheckpointForNextBatch()), Long.MAX_VALUE); + InputBatch> fetch2 = + kafkaSource.fetchNewDataInRowFormat(Option.of(fetch1.getCheckpointForNextBatch()), Long.MAX_VALUE); assertEquals(1100, fetch2.getBatch().get().count()); // 3. Extract with previous checkpoint => gives same data back (idempotent) - InputBatch> fetch3 = kafkaSource.fetchNewDataInAvroFormat( - Option.of(fetch1.getCheckpointForNextBatch()), Long.MAX_VALUE); + InputBatch> fetch3 = + kafkaSource.fetchNewDataInAvroFormat(Option.of(fetch1.getCheckpointForNextBatch()), Long.MAX_VALUE); assertEquals(fetch2.getBatch().get().count(), fetch3.getBatch().get().count()); assertEquals(fetch2.getCheckpointForNextBatch(), fetch3.getCheckpointForNextBatch()); // Same using Row API @@ -123,8 +123,8 @@ public class TestKafkaSource extends UtilitiesTestBase { assertEquals(fetch2.getCheckpointForNextBatch(), fetch3AsRows.getCheckpointForNextBatch()); // 4. Extract with latest checkpoint => no new data returned - InputBatch> fetch4 = kafkaSource.fetchNewDataInAvroFormat( - Option.of(fetch2.getCheckpointForNextBatch()), Long.MAX_VALUE); + InputBatch> fetch4 = + kafkaSource.fetchNewDataInAvroFormat(Option.of(fetch2.getCheckpointForNextBatch()), Long.MAX_VALUE); assertEquals(Option.empty(), fetch4.getBatch()); // Same using Row API InputBatch> fetch4AsRows = @@ -144,26 +144,19 @@ public class TestKafkaSource extends UtilitiesTestBase { @Test public void testComputeOffsetRanges() { // test totalNewMessages() - long totalMsgs = CheckpointUtils.totalNewMessages(new OffsetRange[]{ - OffsetRange.apply(TEST_TOPIC_NAME, 0, 0, 100), - OffsetRange.apply(TEST_TOPIC_NAME, 0, 100, 200) - }); + long totalMsgs = CheckpointUtils.totalNewMessages(new OffsetRange[] {OffsetRange.apply(TEST_TOPIC_NAME, 0, 0, 100), + OffsetRange.apply(TEST_TOPIC_NAME, 0, 100, 200)}); assertEquals(200, totalMsgs); // should consume all the full data - OffsetRange[] ranges = CheckpointUtils.computeOffsetRanges( - makeOffsetMap(new int[]{0, 1}, new long[]{200000, 250000}), - makeOffsetMap(new int[]{0, 1}, new long[]{300000, 350000}), - 1000000L - ); + OffsetRange[] ranges = + CheckpointUtils.computeOffsetRanges(makeOffsetMap(new int[] {0, 1}, new long[] {200000, 250000}), + makeOffsetMap(new int[] {0, 1}, new long[] {300000, 350000}), 1000000L); assertEquals(200000, CheckpointUtils.totalNewMessages(ranges)); // should only consume upto limit - ranges = CheckpointUtils.computeOffsetRanges( - makeOffsetMap(new int[]{0, 1}, new long[]{200000, 250000}), - makeOffsetMap(new int[]{0, 1}, new long[]{300000, 350000}), - 10000 - ); + ranges = CheckpointUtils.computeOffsetRanges(makeOffsetMap(new int[] {0, 1}, new long[] {200000, 250000}), + makeOffsetMap(new int[] {0, 1}, new long[] {300000, 350000}), 10000); assertEquals(10000, CheckpointUtils.totalNewMessages(ranges)); assertEquals(200000, ranges[0].fromOffset()); assertEquals(205000, ranges[0].untilOffset()); @@ -171,30 +164,21 @@ public class TestKafkaSource extends UtilitiesTestBase { assertEquals(255000, ranges[1].untilOffset()); // should also consume from new partitions. - ranges = CheckpointUtils.computeOffsetRanges( - makeOffsetMap(new int[]{0, 1}, new long[]{200000, 250000}), - makeOffsetMap(new int[]{0, 1, 2}, new long[]{300000, 350000, 100000}), - 1000000L - ); + ranges = CheckpointUtils.computeOffsetRanges(makeOffsetMap(new int[] {0, 1}, new long[] {200000, 250000}), + makeOffsetMap(new int[] {0, 1, 2}, new long[] {300000, 350000, 100000}), 1000000L); assertEquals(300000, CheckpointUtils.totalNewMessages(ranges)); assertEquals(3, ranges.length); // for skewed offsets, does not starve any partition & can catch up - ranges = CheckpointUtils.computeOffsetRanges( - makeOffsetMap(new int[]{0, 1}, new long[]{200000, 250000}), - makeOffsetMap(new int[]{0, 1, 2}, new long[]{200010, 350000, 10000}), - 100000 - ); + ranges = CheckpointUtils.computeOffsetRanges(makeOffsetMap(new int[] {0, 1}, new long[] {200000, 250000}), + makeOffsetMap(new int[] {0, 1, 2}, new long[] {200010, 350000, 10000}), 100000); assertEquals(100000, CheckpointUtils.totalNewMessages(ranges)); assertEquals(10, ranges[0].count()); assertEquals(89990, ranges[1].count()); assertEquals(10000, ranges[2].count()); - ranges = CheckpointUtils.computeOffsetRanges( - makeOffsetMap(new int[]{0, 1}, new long[]{200000, 250000}), - makeOffsetMap(new int[]{0, 1, 2}, new long[]{200010, 350000, 10000}), - 1000000 - ); + ranges = CheckpointUtils.computeOffsetRanges(makeOffsetMap(new int[] {0, 1}, new long[] {200000, 250000}), + makeOffsetMap(new int[] {0, 1, 2}, new long[] {200010, 350000, 10000}), 1000000); assertEquals(110010, CheckpointUtils.totalNewMessages(ranges)); assertEquals(10, ranges[0].count()); assertEquals(100000, ranges[1].count()); diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/config/TestSourceConfig.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/config/TestSourceConfig.java index 557e620a3..217c6155a 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/config/TestSourceConfig.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/config/TestSourceConfig.java @@ -21,7 +21,7 @@ package org.apache.hudi.utilities.sources.config; /** * Configurations for Test Data Sources */ -public class TestSourceConfig { +public class TestSourceConfig { // Used by DistributedTestDataSource only. Number of partitions where each partitions generates test-data public static final String NUM_SOURCE_PARTITIONS_PROP = "hoodie.deltastreamer.source.test.num_partitions"; diff --git a/packaging/hudi-hadoop-mr-bundle/pom.xml b/packaging/hudi-hadoop-mr-bundle/pom.xml index bdeb33846..5e4ae3c2e 100644 --- a/packaging/hudi-hadoop-mr-bundle/pom.xml +++ b/packaging/hudi-hadoop-mr-bundle/pom.xml @@ -28,6 +28,7 @@ true + ${project.parent.basedir} diff --git a/packaging/hudi-hive-bundle/pom.xml b/packaging/hudi-hive-bundle/pom.xml index 2ca3a7755..fb6eb9d64 100644 --- a/packaging/hudi-hive-bundle/pom.xml +++ b/packaging/hudi-hive-bundle/pom.xml @@ -28,6 +28,7 @@ true + ${project.parent.basedir} diff --git a/packaging/hudi-presto-bundle/pom.xml b/packaging/hudi-presto-bundle/pom.xml index 2e9e0baf7..2d5635981 100644 --- a/packaging/hudi-presto-bundle/pom.xml +++ b/packaging/hudi-presto-bundle/pom.xml @@ -28,6 +28,7 @@ true + ${project.parent.basedir} diff --git a/packaging/hudi-spark-bundle/pom.xml b/packaging/hudi-spark-bundle/pom.xml index 68baea7c9..ffdc3d569 100644 --- a/packaging/hudi-spark-bundle/pom.xml +++ b/packaging/hudi-spark-bundle/pom.xml @@ -28,6 +28,7 @@ true + ${project.parent.basedir} diff --git a/packaging/hudi-timeline-server-bundle/pom.xml b/packaging/hudi-timeline-server-bundle/pom.xml index f370cb4c1..7deddd273 100644 --- a/packaging/hudi-timeline-server-bundle/pom.xml +++ b/packaging/hudi-timeline-server-bundle/pom.xml @@ -229,6 +229,7 @@ true + ${project.parent.basedir} diff --git a/packaging/hudi-timeline-server-bundle/src/main/java/org/apache/hudi/timeline/server/bundle/Main.java b/packaging/hudi-timeline-server-bundle/src/main/java/org/apache/hudi/timeline/server/bundle/Main.java index c97a7da14..a55f2e130 100644 --- a/packaging/hudi-timeline-server-bundle/src/main/java/org/apache/hudi/timeline/server/bundle/Main.java +++ b/packaging/hudi-timeline-server-bundle/src/main/java/org/apache/hudi/timeline/server/bundle/Main.java @@ -30,7 +30,7 @@ import org.apache.hudi.common.util.ReflectionUtils; */ public class Main { - public static void main(String[] args) { - ReflectionUtils.getTopLevelClassesInClasspath(Main.class).forEach(System.out::println); - } + public static void main(String[] args) { + ReflectionUtils.getTopLevelClassesInClasspath(Main.class).forEach(System.out::println); + } } diff --git a/packaging/hudi-utilities-bundle/pom.xml b/packaging/hudi-utilities-bundle/pom.xml index 091182f5e..fc08ea929 100644 --- a/packaging/hudi-utilities-bundle/pom.xml +++ b/packaging/hudi-utilities-bundle/pom.xml @@ -29,6 +29,7 @@ true + ${project.parent.basedir} diff --git a/pom.xml b/pom.xml index 17d9758a4..43ebdd243 100644 --- a/pom.xml +++ b/pom.xml @@ -99,6 +99,7 @@ ${skipTests} ${skipTests} UTF-8 + ${project.basedir} @@ -155,7 +156,7 @@ false ${project.basedir}/src/main/scala ${project.basedir}/src/test/scala - ${maven.multiModuleProjectDirectory}/style/scalastyle-config.xml + ${main.basedir}/style/scalastyle-config.xml UTF-8 @@ -167,6 +168,33 @@ + + com.diffplug.spotless + spotless-maven-plugin + 1.24.3 + + + + ${main.basedir}/style/eclipse-java-google-style.xml + 4.10.0 + + + + + + + + + + + spotless-check + compile + + check + + + + org.apache.maven.plugins maven-compiler-plugin diff --git a/style/checkstyle-suppressions.xml b/style/checkstyle-suppressions.xml index cca4efc97..30dc51274 100644 --- a/style/checkstyle-suppressions.xml +++ b/style/checkstyle-suppressions.xml @@ -22,6 +22,7 @@ + diff --git a/style/checkstyle.xml b/style/checkstyle.xml index 947d14104..51a291db9 100644 --- a/style/checkstyle.xml +++ b/style/checkstyle.xml @@ -42,7 +42,14 @@ + + + + + + + @@ -55,7 +62,7 @@ - + @@ -193,7 +200,7 @@ - + --> diff --git a/style/eclipse-java-google-style.xml b/style/eclipse-java-google-style.xml index 8b1a67ad4..f99bb9f3d 100644 --- a/style/eclipse-java-google-style.xml +++ b/style/eclipse-java-google-style.xml @@ -15,339 +15,339 @@ See the License for the specific language governing permissions and limitations under the License. --> - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +