
[HUDI-296] Explore use of spotless to auto fix formatting errors (#945)

- Add spotless format checking and fixing to the project
- One-time reformatting of the codebase for conformity
- The build now fails on formatting violations; mvn spotless:apply auto-fixes them
Author: leesf
Authored: 2019-10-10 20:19:40 +08:00
Committed by: vinoth chandar
Parent: 834c591955
Commit: b19bed442d
381 changed files with 7350 additions and 9064 deletions
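
Note: the build integration described above (formatting violations fail the build; mvn spotless:apply rewrites the offending files) is normally wired up through the spotless-maven-plugin in the parent pom. The exact configuration this commit adds is not part of this excerpt, so the sketch below is only an illustrative assumption: the plugin version, formatter file name, and goal binding are guesses, not values copied from the Hudi poms. The ${main.basedir} property that the pom.xml hunks below introduce in each module is the usual way to let such a plugin resolve a shared file at the repository root regardless of module nesting depth.

    <plugin>
      <groupId>com.diffplug.spotless</groupId>
      <artifactId>spotless-maven-plugin</artifactId>
      <!-- assumed version, for illustration only -->
      <version>1.25.1</version>
      <configuration>
        <java>
          <removeUnusedImports/>
          <eclipse>
            <!-- hypothetical path; resolved against the repo root via main.basedir -->
            <file>${main.basedir}/style/eclipse-java-formatter.xml</file>
          </eclipse>
        </java>
      </configuration>
      <executions>
        <execution>
          <goals>
            <!-- spotless:check fails the build on formatting violations -->
            <goal>check</goal>
          </goals>
        </execution>
      </executions>
    </plugin>

With a setup of this shape, mvn spotless:check reproduces the failure locally and mvn spotless:apply performs the one-time (and any subsequent) auto-fix.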

View File

@@ -30,6 +30,7 @@
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <checkstyle.skip>true</checkstyle.skip>
+    <main.basedir>${project.parent.parent.basedir}</main.basedir>
   </properties>

View File

@@ -30,6 +30,7 @@
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <checkstyle.skip>true</checkstyle.skip>
+    <main.basedir>${project.parent.parent.basedir}</main.basedir>
   </properties>
   <dependencyManagement>

View File

@@ -30,6 +30,7 @@
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <checkstyle.skip>true</checkstyle.skip>
+    <main.basedir>${project.parent.parent.basedir}</main.basedir>
   </properties>
   <dependencyManagement>

View File

@@ -30,6 +30,7 @@
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <checkstyle.skip>true</checkstyle.skip>
+    <main.basedir>${project.parent.parent.basedir}</main.basedir>
   </properties>
   <dependencyManagement>

View File

@@ -30,6 +30,7 @@
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <checkstyle.skip>true</checkstyle.skip>
+    <main.basedir>${project.parent.parent.basedir}</main.basedir>
   </properties>
   <dependencyManagement>

View File

@@ -56,6 +56,7 @@
     <docker.presto.version>0.217</docker.presto.version>
     <dockerfile.maven.version>1.4.3</dockerfile.maven.version>
     <checkstyle.skip>true</checkstyle.skip>
+    <main.basedir>${project.parent.basedir}</main.basedir>
   </properties>
   <build>

View File

@@ -32,6 +32,7 @@
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <checkstyle.skip>true</checkstyle.skip>
+    <main.basedir>${project.parent.parent.basedir}</main.basedir>
   </properties>
   <dependencyManagement>

View File

@@ -30,6 +30,7 @@
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <checkstyle.skip>true</checkstyle.skip>
+    <main.basedir>${project.parent.parent.basedir}</main.basedir>
   </properties>
   <dependencyManagement>

View File

@@ -30,6 +30,7 @@
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <checkstyle.skip>true</checkstyle.skip>
+    <main.basedir>${project.parent.parent.basedir}</main.basedir>
   </properties>
   <dependencyManagement>

View File

@@ -30,6 +30,7 @@
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <checkstyle.skip>true</checkstyle.skip>
+    <main.basedir>${project.parent.parent.basedir}</main.basedir>
   </properties>
   <dependencyManagement>

View File

@@ -30,6 +30,7 @@
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <checkstyle.skip>true</checkstyle.skip>
+    <main.basedir>${project.parent.parent.basedir}</main.basedir>
   </properties>
   <dependencyManagement>

View File

@@ -29,6 +29,7 @@
   <properties>
     <spring.shell.version>1.2.0.RELEASE</spring.shell.version>
     <jar.mainclass>org.springframework.shell.Bootstrap</jar.mainclass>
+    <main.basedir>${project.parent.basedir}</main.basedir>
   </properties>
   <repositories>

View File

@@ -52,19 +52,16 @@ public class HoodiePrintHelper {
    * @param rows List of rows
    * @return Serialized form for printing
    */
-  public static String print(TableHeader rowHeader,
-      Map<String, Function<Object, String>> fieldNameToConverterMap,
-      String sortByField, boolean isDescending, Integer limit, boolean headerOnly,
-      List<Comparable[]> rows) {
+  public static String print(TableHeader rowHeader, Map<String, Function<Object, String>> fieldNameToConverterMap,
+      String sortByField, boolean isDescending, Integer limit, boolean headerOnly, List<Comparable[]> rows) {
     if (headerOnly) {
       return HoodiePrintHelper.print(rowHeader);
     }
-    Table table = new Table(rowHeader, fieldNameToConverterMap,
-        Option.ofNullable(sortByField.isEmpty() ? null : sortByField),
-        Option.ofNullable(isDescending),
-        Option.ofNullable(limit <= 0 ? null : limit)).addAllRows(rows).flip();
+    Table table =
+        new Table(rowHeader, fieldNameToConverterMap, Option.ofNullable(sortByField.isEmpty() ? null : sortByField),
+            Option.ofNullable(isDescending), Option.ofNullable(limit <= 0 ? null : limit)).addAllRows(rows).flip();
     return HoodiePrintHelper.print(table);
   }
@@ -79,9 +76,8 @@ public class HoodiePrintHelper {
     String[] header = new String[buffer.getFieldNames().size()];
     buffer.getFieldNames().toArray(header);
-    String[][] rows = buffer.getRenderRows().stream()
-        .map(l -> l.stream().toArray(String[]::new))
-        .toArray(String[][]::new);
+    String[][] rows =
+        buffer.getRenderRows().stream().map(l -> l.stream().toArray(String[]::new)).toArray(String[][]::new);
     return printTextTable(header, rows);
   }

View File

@@ -31,8 +31,7 @@ import java.util.stream.IntStream;
 import org.apache.hudi.common.util.Option;
 /**
- * Table to be rendered. This class takes care of ordering
- * rows and limiting before renderer renders it.
+ * Table to be rendered. This class takes care of ordering rows and limiting before renderer renders it.
  */
 public class Table implements Iterable<List<String>> {
@@ -53,11 +52,8 @@ public class Table implements Iterable<List<String>> {
   // Rows ready for Rendering
   private List<List<String>> renderRows;
-  public Table(TableHeader rowHeader,
-      Map<String, Function<Object, String>> fieldNameToConverterMap,
-      Option<String> orderingFieldNameOptional,
-      Option<Boolean> isDescendingOptional,
-      Option<Integer> limitOptional) {
+  public Table(TableHeader rowHeader, Map<String, Function<Object, String>> fieldNameToConverterMap,
+      Option<String> orderingFieldNameOptional, Option<Boolean> isDescendingOptional, Option<Integer> limitOptional) {
     this.rowHeader = rowHeader;
     this.fieldNameToConverterMap = fieldNameToConverterMap;
     this.orderingFieldNameOptional = orderingFieldNameOptional;
@@ -68,6 +64,7 @@ public class Table implements Iterable<List<String>> {
   /**
    * Main API to add row to the table
+   *
    * @param row Row
    */
   public Table add(List<Comparable> row) {
@@ -86,6 +83,7 @@ public class Table implements Iterable<List<String>> {
   /**
    * Add all rows
+   *
    * @param rows Rows to be aded
    * @return
    */
@@ -96,6 +94,7 @@ public class Table implements Iterable<List<String>> {
   /**
    * Add all rows
+   *
    * @param rows Rows to be added
    * @return
    */
@@ -115,6 +114,7 @@ public class Table implements Iterable<List<String>> {
   /**
    * Sorting of rows by a specified field
+   *
    * @return
    */
   private List<List<Comparable>> orderRows() {

View File

@@ -59,8 +59,8 @@ public class ArchivedCommitsCommand implements CommandMarker {
       @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
       @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
       @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
-      @CliOption(key = {
-          "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly)
+      @CliOption(key = {"headeronly"}, help = "Print Header Only",
+          unspecifiedDefaultValue = "false") final boolean headerOnly)
       throws IOException {
     System.out.println("===============> Showing only " + limit + " archived commits <===============");
     String basePath = HoodieCLI.tableMetadata.getBasePath();
@@ -86,9 +86,8 @@ public class ArchivedCommitsCommand implements CommandMarker {
         .filter(r -> r.get("actionType").toString().equals(HoodieTimeline.COMMIT_ACTION)
             || r.get("actionType").toString().equals(HoodieTimeline.DELTA_COMMIT_ACTION))
         .flatMap(r -> {
-          HoodieCommitMetadata metadata =
-              (HoodieCommitMetadata) SpecificData.get().deepCopy(HoodieCommitMetadata.SCHEMA$,
-                  r.get("hoodieCommitMetadata"));
+          HoodieCommitMetadata metadata = (HoodieCommitMetadata) SpecificData.get()
+              .deepCopy(HoodieCommitMetadata.SCHEMA$, r.get("hoodieCommitMetadata"));
           final String instantTime = r.get("commitTime").toString();
           final String action = r.get("actionType").toString();
           return metadata.getPartitionToWriteStats().values().stream().flatMap(hoodieWriteStats -> {
@@ -118,22 +117,13 @@ public class ArchivedCommitsCommand implements CommandMarker {
       allStats.addAll(readCommits);
       reader.close();
     }
-    TableHeader header = new TableHeader().addTableHeaderField("action")
-        .addTableHeaderField("instant")
-        .addTableHeaderField("partition")
-        .addTableHeaderField("file_id")
-        .addTableHeaderField("prev_instant")
-        .addTableHeaderField("num_writes")
-        .addTableHeaderField("num_inserts")
-        .addTableHeaderField("num_deletes")
-        .addTableHeaderField("num_update_writes")
-        .addTableHeaderField("total_log_files")
-        .addTableHeaderField("total_log_blocks")
-        .addTableHeaderField("total_corrupt_log_blocks")
-        .addTableHeaderField("total_rollback_blocks")
-        .addTableHeaderField("total_log_records")
-        .addTableHeaderField("total_updated_records_compacted")
-        .addTableHeaderField("total_write_bytes")
+    TableHeader header = new TableHeader().addTableHeaderField("action").addTableHeaderField("instant")
+        .addTableHeaderField("partition").addTableHeaderField("file_id").addTableHeaderField("prev_instant")
+        .addTableHeaderField("num_writes").addTableHeaderField("num_inserts").addTableHeaderField("num_deletes")
+        .addTableHeaderField("num_update_writes").addTableHeaderField("total_log_files")
+        .addTableHeaderField("total_log_blocks").addTableHeaderField("total_corrupt_log_blocks")
+        .addTableHeaderField("total_rollback_blocks").addTableHeaderField("total_log_records")
+        .addTableHeaderField("total_updated_records_compacted").addTableHeaderField("total_write_bytes")
         .addTableHeaderField("total_write_errors");
     return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, allStats);
@@ -141,19 +131,19 @@ public class ArchivedCommitsCommand implements CommandMarker {
   @CliCommand(value = "show archived commits", help = "Read commits from archived files and show details")
   public String showCommits(
-      @CliOption(key = {"skipMetadata"}, help = "Skip displaying commit metadata", unspecifiedDefaultValue = "true")
-          boolean skipMetadata,
+      @CliOption(key = {"skipMetadata"}, help = "Skip displaying commit metadata",
+          unspecifiedDefaultValue = "true") boolean skipMetadata,
       @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "10") final Integer limit,
       @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
       @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
-      @CliOption(key = {
-          "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly)
+      @CliOption(key = {"headeronly"}, help = "Print Header Only",
+          unspecifiedDefaultValue = "false") final boolean headerOnly)
       throws IOException {
     System.out.println("===============> Showing only " + limit + " archived commits <===============");
     String basePath = HoodieCLI.tableMetadata.getBasePath();
-    FileStatus[] fsStatuses = FSUtils.getFs(basePath, HoodieCLI.conf)
-        .globStatus(new Path(basePath + "/.hoodie/.commits_.archive*"));
+    FileStatus[] fsStatuses =
+        FSUtils.getFs(basePath, HoodieCLI.conf).globStatus(new Path(basePath + "/.hoodie/.commits_.archive*"));
     List<Comparable[]> allCommits = new ArrayList<>();
     for (FileStatus fs : fsStatuses) {
       // read the archived file
@@ -167,15 +157,13 @@ public class ArchivedCommitsCommand implements CommandMarker {
         List<IndexedRecord> records = blk.getRecords();
         readRecords.addAll(records);
       }
-      List<Comparable[]> readCommits = readRecords.stream().map(r -> (GenericRecord) r).map(r ->
-          readCommit(r, skipMetadata))
-          .collect(Collectors.toList());
+      List<Comparable[]> readCommits = readRecords.stream().map(r -> (GenericRecord) r)
+          .map(r -> readCommit(r, skipMetadata)).collect(Collectors.toList());
       allCommits.addAll(readCommits);
       reader.close();
     }
-    TableHeader header = new TableHeader().addTableHeaderField("CommitTime")
-        .addTableHeaderField("CommitType");
+    TableHeader header = new TableHeader().addTableHeaderField("CommitTime").addTableHeaderField("CommitType");
     if (!skipMetadata) {
       header = header.addTableHeaderField("CommitDetails");

View File

@@ -63,8 +63,8 @@ public class CleansCommand implements CommandMarker {
       @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
       @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
       @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
-      @CliOption(key = {
-          "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly)
+      @CliOption(key = {"headeronly"}, help = "Print Header Only",
+          unspecifiedDefaultValue = "false") final boolean headerOnly)
       throws IOException {
     HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
@@ -74,17 +74,15 @@ public class CleansCommand implements CommandMarker {
     Collections.reverse(cleans);
     for (int i = 0; i < cleans.size(); i++) {
       HoodieInstant clean = cleans.get(i);
-      HoodieCleanMetadata cleanMetadata = AvroUtils
-          .deserializeHoodieCleanMetadata(timeline.getInstantDetails(clean).get());
+      HoodieCleanMetadata cleanMetadata =
+          AvroUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(clean).get());
       rows.add(new Comparable[] {clean.getTimestamp(), cleanMetadata.getEarliestCommitToRetain(),
           cleanMetadata.getTotalFilesDeleted(), cleanMetadata.getTimeTakenInMillis()});
     }
-    TableHeader header = new TableHeader()
-        .addTableHeaderField("CleanTime")
-        .addTableHeaderField("EarliestCommandRetained")
-        .addTableHeaderField("Total Files Deleted")
-        .addTableHeaderField("Total Time Taken");
+    TableHeader header =
+        new TableHeader().addTableHeaderField("CleanTime").addTableHeaderField("EarliestCommandRetained")
+            .addTableHeaderField("Total Files Deleted").addTableHeaderField("Total Time Taken");
     return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows);
   }
@@ -95,13 +93,12 @@ public class CleansCommand implements CommandMarker {
   }
   @CliCommand(value = "clean showpartitions", help = "Show partition level details of a clean")
-  public String showCleanPartitions(
-      @CliOption(key = {"clean"}, help = "clean to show") final String commitTime,
+  public String showCleanPartitions(@CliOption(key = {"clean"}, help = "clean to show") final String commitTime,
       @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
       @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
       @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
-      @CliOption(key = {
-          "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly)
+      @CliOption(key = {"headeronly"}, help = "Print Header Only",
+          unspecifiedDefaultValue = "false") final boolean headerOnly)
       throws Exception {
     HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
@@ -112,8 +109,8 @@ public class CleansCommand implements CommandMarker {
       return "Clean " + commitTime + " not found in metadata " + timeline;
     }
-    HoodieCleanMetadata cleanMetadata = AvroUtils.deserializeHoodieCleanMetadata(
-        timeline.getInstantDetails(cleanInstant).get());
+    HoodieCleanMetadata cleanMetadata =
+        AvroUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(cleanInstant).get());
     List<Comparable[]> rows = new ArrayList<>();
     for (Map.Entry<String, HoodieCleanPartitionMetadata> entry : cleanMetadata.getPartitionMetadata().entrySet()) {
       String path = entry.getKey();
@@ -124,11 +121,8 @@ public class CleansCommand implements CommandMarker {
       rows.add(new Comparable[] {path, policy, totalSuccessDeletedFiles, totalFailedDeletedFiles});
     }
-    TableHeader header = new TableHeader()
-        .addTableHeaderField("Partition Path")
-        .addTableHeaderField("Cleaning policy")
-        .addTableHeaderField("Total Files Successfully Deleted")
-        .addTableHeaderField("Total Failed Deletions");
+    TableHeader header = new TableHeader().addTableHeaderField("Partition Path").addTableHeaderField("Cleaning policy")
+        .addTableHeaderField("Total Files Successfully Deleted").addTableHeaderField("Total Failed Deletions");
     return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows);
   }

View File

@@ -69,12 +69,13 @@ public class CommitsCommand implements CommandMarker {
   }
   @CliCommand(value = "commits show", help = "Show the commits")
-  public String showCommits(@CliOption(key = {
-      "limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
+  public String showCommits(
+      @CliOption(key = {"limit"}, mandatory = false, help = "Limit commits",
+          unspecifiedDefaultValue = "-1") final Integer limit,
       @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
       @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
-      @CliOption(key = {
-          "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly)
+      @CliOption(key = {"headeronly"}, help = "Print Header Only",
+          unspecifiedDefaultValue = "false") final boolean headerOnly)
       throws IOException {
     HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
@@ -84,16 +85,12 @@ public class CommitsCommand implements CommandMarker {
     Collections.reverse(commits);
     for (int i = 0; i < commits.size(); i++) {
       HoodieInstant commit = commits.get(i);
-      HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(commit).get(),
-          HoodieCommitMetadata.class);
-      rows.add(new Comparable[]{commit.getTimestamp(),
-          commitMetadata.fetchTotalBytesWritten(),
-          commitMetadata.fetchTotalFilesInsert(),
-          commitMetadata.fetchTotalFilesUpdated(),
-          commitMetadata.fetchTotalPartitionsWritten(),
-          commitMetadata.fetchTotalRecordsWritten(),
-          commitMetadata.fetchTotalUpdateRecordsWritten(),
-          commitMetadata.fetchTotalWriteErrors()});
+      HoodieCommitMetadata commitMetadata =
+          HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(commit).get(), HoodieCommitMetadata.class);
+      rows.add(new Comparable[] {commit.getTimestamp(), commitMetadata.fetchTotalBytesWritten(),
+          commitMetadata.fetchTotalFilesInsert(), commitMetadata.fetchTotalFilesUpdated(),
+          commitMetadata.fetchTotalPartitionsWritten(), commitMetadata.fetchTotalRecordsWritten(),
+          commitMetadata.fetchTotalUpdateRecordsWritten(), commitMetadata.fetchTotalWriteErrors()});
     }
     Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();
@@ -101,15 +98,10 @@ public class CommitsCommand implements CommandMarker {
       return NumericUtils.humanReadableByteCount((Double.valueOf(entry.toString())));
     });
-    TableHeader header = new TableHeader()
-        .addTableHeaderField("CommitTime")
-        .addTableHeaderField("Total Bytes Written")
-        .addTableHeaderField("Total Files Added")
-        .addTableHeaderField("Total Files Updated")
-        .addTableHeaderField("Total Partitions Written")
-        .addTableHeaderField("Total Records Written")
-        .addTableHeaderField("Total Update Records Written")
-        .addTableHeaderField("Total Errors");
+    TableHeader header = new TableHeader().addTableHeaderField("CommitTime").addTableHeaderField("Total Bytes Written")
+        .addTableHeaderField("Total Files Added").addTableHeaderField("Total Files Updated")
+        .addTableHeaderField("Total Partitions Written").addTableHeaderField("Total Records Written")
+        .addTableHeaderField("Total Update Records Written").addTableHeaderField("Total Errors");
     return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows);
   }
@@ -132,8 +124,8 @@ public class CommitsCommand implements CommandMarker {
     }
     SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
-    sparkLauncher
-        .addAppArgs(SparkMain.SparkCommand.ROLLBACK.toString(), commitTime, HoodieCLI.tableMetadata.getBasePath());
+    sparkLauncher.addAppArgs(SparkMain.SparkCommand.ROLLBACK.toString(), commitTime,
+        HoodieCLI.tableMetadata.getBasePath());
     Process process = sparkLauncher.launch();
     InputStreamConsumer.captureOutput(process);
     int exitCode = process.waitFor();
@@ -146,13 +138,12 @@ public class CommitsCommand implements CommandMarker {
   }
   @CliCommand(value = "commit showpartitions", help = "Show partition level details of a commit")
-  public String showCommitPartitions(
-      @CliOption(key = {"commit"}, help = "Commit to show") final String commitTime,
+  public String showCommitPartitions(@CliOption(key = {"commit"}, help = "Commit to show") final String commitTime,
       @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
       @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
       @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
-      @CliOption(key = {
-          "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly)
+      @CliOption(key = {"headeronly"}, help = "Print Header Only",
+          unspecifiedDefaultValue = "false") final boolean headerOnly)
       throws Exception {
     HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
@@ -185,8 +176,7 @@ public class CommitsCommand implements CommandMarker {
         totalBytesWritten += stat.getTotalWriteBytes();
         totalWriteErrors += stat.getTotalWriteErrors();
       }
-      rows.add(new Comparable[]{path, totalFilesAdded, totalFilesUpdated,
-          totalRecordsInserted, totalRecordsUpdated,
+      rows.add(new Comparable[] {path, totalFilesAdded, totalFilesUpdated, totalRecordsInserted, totalRecordsUpdated,
          totalBytesWritten, totalWriteErrors});
     }
@@ -195,26 +185,21 @@ public class CommitsCommand implements CommandMarker {
       return NumericUtils.humanReadableByteCount((Long.valueOf(entry.toString())));
     });
-    TableHeader header = new TableHeader()
-        .addTableHeaderField("Partition Path")
-        .addTableHeaderField("Total Files Added")
-        .addTableHeaderField("Total Files Updated")
-        .addTableHeaderField("Total Records Inserted")
-        .addTableHeaderField("Total Records Updated")
-        .addTableHeaderField("Total Bytes Written")
-        .addTableHeaderField("Total Errors");
+    TableHeader header = new TableHeader().addTableHeaderField("Partition Path")
+        .addTableHeaderField("Total Files Added").addTableHeaderField("Total Files Updated")
+        .addTableHeaderField("Total Records Inserted").addTableHeaderField("Total Records Updated")
+        .addTableHeaderField("Total Bytes Written").addTableHeaderField("Total Errors");
     return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows);
   }
   @CliCommand(value = "commit showfiles", help = "Show file level details of a commit")
-  public String showCommitFiles(
-      @CliOption(key = {"commit"}, help = "Commit to show") final String commitTime,
+  public String showCommitFiles(@CliOption(key = {"commit"}, help = "Commit to show") final String commitTime,
       @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
       @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
       @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
-      @CliOption(key = {
-          "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly)
+      @CliOption(key = {"headeronly"}, help = "Print Header Only",
+          unspecifiedDefaultValue = "false") final boolean headerOnly)
       throws Exception {
     HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
@@ -232,22 +217,14 @@ public class CommitsCommand implements CommandMarker {
       List<HoodieWriteStat> stats = entry.getValue();
       for (HoodieWriteStat stat : stats) {
         rows.add(new Comparable[] {path, stat.getFileId(), stat.getPrevCommit(), stat.getNumUpdateWrites(),
-            stat.getNumWrites(), stat.getTotalWriteBytes(),
-            stat.getTotalWriteErrors(),
-            stat.getFileSizeInBytes()
-        });
+            stat.getNumWrites(), stat.getTotalWriteBytes(), stat.getTotalWriteErrors(), stat.getFileSizeInBytes()});
       }
     }
-    TableHeader header = new TableHeader()
-        .addTableHeaderField("Partition Path")
-        .addTableHeaderField("File ID")
-        .addTableHeaderField("Previous Commit")
-        .addTableHeaderField("Total Records Updated")
-        .addTableHeaderField("Total Records Written")
-        .addTableHeaderField("Total Bytes Written")
-        .addTableHeaderField("Total Errors")
-        .addTableHeaderField("File Size");
+    TableHeader header = new TableHeader().addTableHeaderField("Partition Path").addTableHeaderField("File ID")
+        .addTableHeaderField("Previous Commit").addTableHeaderField("Total Records Updated")
+        .addTableHeaderField("Total Records Written").addTableHeaderField("Total Bytes Written")
+        .addTableHeaderField("Total Errors").addTableHeaderField("File Size");
     return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows);
   }
@@ -270,8 +247,8 @@ public class CommitsCommand implements CommandMarker {
     String sourceLatestCommit =
         sourceTimeline.getInstants().iterator().hasNext() ? "0" : sourceTimeline.lastInstant().get().getTimestamp();
-    if (sourceLatestCommit != null && HoodieTimeline.compareTimestamps(targetLatestCommit, sourceLatestCommit,
-        HoodieTimeline.GREATER)) {
+    if (sourceLatestCommit != null
+        && HoodieTimeline.compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) {
       // source is behind the target
       List<String> commitsToCatchup = targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE)
          .getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList());

View File

@@ -75,16 +75,15 @@ public class CompactionCommand implements CommandMarker {
   @CliCommand(value = "compactions show all", help = "Shows all compactions that are in active timeline")
   public String compactionsAll(
-      @CliOption(key = {
-          "includeExtraMetadata"}, help = "Include extra metadata", unspecifiedDefaultValue = "false") final
-          boolean includeExtraMetadata,
-      @CliOption(key = {
-          "limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
+      @CliOption(key = {"includeExtraMetadata"}, help = "Include extra metadata",
+          unspecifiedDefaultValue = "false") final boolean includeExtraMetadata,
+      @CliOption(key = {"limit"}, mandatory = false, help = "Limit commits",
+          unspecifiedDefaultValue = "-1") final Integer limit,
       @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
       @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
-      @CliOption(key = {
-          "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final
-          boolean headerOnly) throws IOException {
+      @CliOption(key = {"headeronly"}, help = "Print Header Only",
+          unspecifiedDefaultValue = "false") final boolean headerOnly)
+      throws IOException {
     HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
     HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionTimeline();
     HoodieTimeline commitTimeline = activeTimeline.getCommitTimeline().filterCompletedInstants();
@@ -99,15 +98,14 @@ public class CompactionCommand implements CommandMarker {
       if (!instant.getAction().equals(HoodieTimeline.COMPACTION_ACTION)) {
         try {
           // This could be a completed compaction. Assume a compaction request file is present but skip if fails
-          workload = AvroUtils.deserializeCompactionPlan(
-              activeTimeline.getInstantAuxiliaryDetails(
-                  HoodieTimeline.getCompactionRequestedInstant(instant.getTimestamp())).get());
+          workload = AvroUtils.deserializeCompactionPlan(activeTimeline
+              .getInstantAuxiliaryDetails(HoodieTimeline.getCompactionRequestedInstant(instant.getTimestamp())).get());
         } catch (HoodieIOException ioe) {
           // SKIP
         }
       } else {
-        workload = AvroUtils.deserializeCompactionPlan(activeTimeline.getInstantAuxiliaryDetails(
-            HoodieTimeline.getCompactionRequestedInstant(instant.getTimestamp())).get());
+        workload = AvroUtils.deserializeCompactionPlan(activeTimeline
+            .getInstantAuxiliaryDetails(HoodieTimeline.getCompactionRequestedInstant(instant.getTimestamp())).get());
       }
       if (null != workload) {
@@ -116,22 +114,18 @@ public class CompactionCommand implements CommandMarker {
         state = State.COMPLETED;
       }
       if (includeExtraMetadata) {
-        rows.add(new Comparable[]{instant.getTimestamp(),
-            state.toString(),
+        rows.add(new Comparable[] {instant.getTimestamp(), state.toString(),
            workload.getOperations() == null ? 0 : workload.getOperations().size(),
            workload.getExtraMetadata().toString()});
       } else {
-        rows.add(new Comparable[]{instant.getTimestamp(),
-            state.toString(),
+        rows.add(new Comparable[] {instant.getTimestamp(), state.toString(),
            workload.getOperations() == null ? 0 : workload.getOperations().size()});
       }
     }
     }
     Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();
-    TableHeader header = new TableHeader()
-        .addTableHeaderField("Compaction Instant Time")
-        .addTableHeaderField("State")
+    TableHeader header = new TableHeader().addTableHeaderField("Compaction Instant Time").addTableHeaderField("State")
        .addTableHeaderField("Total FileIds to be Compacted");
     if (includeExtraMetadata) {
       header = header.addTableHeaderField("Extra Metadata");
@@ -141,48 +135,37 @@ public class CompactionCommand implements CommandMarker {
   @CliCommand(value = "compaction show", help = "Shows compaction details for a specific compaction instant")
   public String compactionShow(
-      @CliOption(key = "instant", mandatory = true, help = "Base path for the target hoodie dataset") final
-          String compactionInstantTime,
-      @CliOption(key = {
-          "limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
+      @CliOption(key = "instant", mandatory = true,
+          help = "Base path for the target hoodie dataset") final String compactionInstantTime,
+      @CliOption(key = {"limit"}, mandatory = false, help = "Limit commits",
+          unspecifiedDefaultValue = "-1") final Integer limit,
       @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
       @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
-      @CliOption(key = {
-          "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly)
+      @CliOption(key = {"headeronly"}, help = "Print Header Only",
+          unspecifiedDefaultValue = "false") final boolean headerOnly)
       throws Exception {
     HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
-    HoodieCompactionPlan workload = AvroUtils.deserializeCompactionPlan(
-        activeTimeline.getInstantAuxiliaryDetails(
-            HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime)).get());
+    HoodieCompactionPlan workload = AvroUtils.deserializeCompactionPlan(activeTimeline
+        .getInstantAuxiliaryDetails(HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime)).get());
     List<Comparable[]> rows = new ArrayList<>();
     if ((null != workload) && (null != workload.getOperations())) {
       for (HoodieCompactionOperation op : workload.getOperations()) {
-        rows.add(new Comparable[]{op.getPartitionPath(),
-            op.getFileId(),
-            op.getBaseInstantTime(),
-            op.getDataFilePath(),
-            op.getDeltaFilePaths().size(),
-            op.getMetrics() == null ? "" : op.getMetrics().toString()
-        });
+        rows.add(new Comparable[] {op.getPartitionPath(), op.getFileId(), op.getBaseInstantTime(), op.getDataFilePath(),
+            op.getDeltaFilePaths().size(), op.getMetrics() == null ? "" : op.getMetrics().toString()});
       }
     }
     Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();
-    TableHeader header = new TableHeader()
-        .addTableHeaderField("Partition Path")
-        .addTableHeaderField("File Id")
-        .addTableHeaderField("Base Instant")
-        .addTableHeaderField("Data File Path")
-        .addTableHeaderField("Total Delta Files")
-        .addTableHeaderField("getMetrics");
+    TableHeader header = new TableHeader().addTableHeaderField("Partition Path").addTableHeaderField("File Id")
+        .addTableHeaderField("Base Instant").addTableHeaderField("Data File Path")
+        .addTableHeaderField("Total Delta Files").addTableHeaderField("getMetrics");
     return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows);
   }
   @CliCommand(value = "compaction schedule", help = "Schedule Compaction")
-  public String scheduleCompact(
-      @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "1G", help = "Spark executor memory")
-          final String sparkMemory) throws Exception {
+  public String scheduleCompact(@CliOption(key = "sparkMemory", unspecifiedDefaultValue = "1G",
+      help = "Spark executor memory") final String sparkMemory) throws Exception {
     boolean initialized = HoodieCLI.initConf();
     HoodieCLI.initFS(initialized);
@@ -190,8 +173,8 @@ public class CompactionCommand implements CommandMarker {
     String compactionInstantTime = HoodieActiveTimeline.createNewCommitTime();
     if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
-      String sparkPropertiesPath = Utils.getDefaultPropertiesFile(
-          scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
+      String sparkPropertiesPath =
+          Utils.getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
       SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
       sparkLauncher.addAppArgs(SparkCommand.COMPACT_SCHEDULE.toString(), HoodieCLI.tableMetadata.getBasePath(),
           HoodieCLI.tableMetadata.getTableConfig().getTableName(), compactionInstantTime, sparkMemory);
@@ -209,33 +192,34 @@ public class CompactionCommand implements CommandMarker {
   @CliCommand(value = "compaction run", help = "Run Compaction for given instant time")
   public String compact(
-      @CliOption(key = {"parallelism"}, mandatory = true, help = "Parallelism for hoodie compaction")
-          final String parallelism,
-      @CliOption(key = "schemaFilePath", mandatory = true, help = "Path for Avro schema file")
-          final String schemaFilePath,
-      @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G", help = "Spark executor memory")
-          final String sparkMemory,
-      @CliOption(key = "retry", unspecifiedDefaultValue = "1", help = "Number of retries")
-          final String retry,
-      @CliOption(key = "compactionInstant", mandatory = false, help = "Base path for the target hoodie dataset")
-          String compactionInstantTime) throws Exception {
+      @CliOption(key = {"parallelism"}, mandatory = true,
+          help = "Parallelism for hoodie compaction") final String parallelism,
+      @CliOption(key = "schemaFilePath", mandatory = true,
+          help = "Path for Avro schema file") final String schemaFilePath,
+      @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G",
+          help = "Spark executor memory") final String sparkMemory,
+      @CliOption(key = "retry", unspecifiedDefaultValue = "1", help = "Number of retries") final String retry,
+      @CliOption(key = "compactionInstant", mandatory = false,
+          help = "Base path for the target hoodie dataset") String compactionInstantTime)
+      throws Exception {
     boolean initialized = HoodieCLI.initConf();
     HoodieCLI.initFS(initialized);
     if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
       if (null == compactionInstantTime) {
         // pick outstanding one with lowest timestamp
-        Option<String> firstPendingInstant = HoodieCLI.tableMetadata.reloadActiveTimeline()
-            .filterCompletedAndCompactionInstants().filter(instant -> instant.getAction()
-                .equals(HoodieTimeline.COMPACTION_ACTION)).firstInstant().map(HoodieInstant::getTimestamp);
+        Option<String> firstPendingInstant =
+            HoodieCLI.tableMetadata.reloadActiveTimeline().filterCompletedAndCompactionInstants()
+                .filter(instant -> instant.getAction().equals(HoodieTimeline.COMPACTION_ACTION)).firstInstant()
+                .map(HoodieInstant::getTimestamp);
         if (!firstPendingInstant.isPresent()) {
           return "NO PENDING COMPACTION TO RUN";
         }
         compactionInstantTime = firstPendingInstant.get();
       }
-      String sparkPropertiesPath = Utils.getDefaultPropertiesFile(
-          scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
+      String sparkPropertiesPath =
+          Utils.getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
       SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
       sparkLauncher.addAppArgs(SparkCommand.COMPACT_RUN.toString(), HoodieCLI.tableMetadata.getBasePath(),
           HoodieCLI.tableMetadata.getTableConfig().getTableName(), compactionInstantTime, parallelism, schemaFilePath,
@@ -279,8 +263,8 @@ public class CompactionCommand implements CommandMarker {
       @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit,
       @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField,
       @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending,
-      @CliOption(key = {
-          "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") boolean headerOnly)
+      @CliOption(key = {"headeronly"}, help = "Print Header Only",
+          unspecifiedDefaultValue = "false") boolean headerOnly)
       throws Exception {
     boolean initialized = HoodieCLI.initConf();
     HoodieCLI.initFS(initialized);
@@ -290,12 +274,11 @@ public class CompactionCommand implements CommandMarker {
     String output = null;
     if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
       try {
-        String sparkPropertiesPath = Utils.getDefaultPropertiesFile(
-            scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
+        String sparkPropertiesPath = Utils
+            .getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
         SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
-        sparkLauncher.addAppArgs(SparkCommand.COMPACT_VALIDATE.toString(),
-            HoodieCLI.tableMetadata.getBasePath(), compactionInstant, outputPathStr, parallelism, master,
-            sparkMemory);
+        sparkLauncher.addAppArgs(SparkCommand.COMPACT_VALIDATE.toString(), HoodieCLI.tableMetadata.getBasePath(),
+            compactionInstant, outputPathStr, parallelism, master, sparkMemory);
         Process process = sparkLauncher.launch();
         InputStreamConsumer.captureOutput(process);
         int exitCode = process.waitFor();
@@ -307,8 +290,7 @@ public class CompactionCommand implements CommandMarker {
       String message = "\n\n\t COMPACTION PLAN " + (valid ? "VALID" : "INVALID") + "\n\n";
       List<Comparable[]> rows = new ArrayList<>();
       res.stream().forEach(r -> {
-        Comparable[] row = new Comparable[]{r.getOperation().getFileId(),
-            r.getOperation().getBaseInstantTime(),
+        Comparable[] row = new Comparable[] {r.getOperation().getFileId(), r.getOperation().getBaseInstantTime(),
            r.getOperation().getDataFilePath().isPresent() ? r.getOperation().getDataFilePath().get() : "",
            r.getOperation().getDeltaFilePaths().size(), r.isSuccess(),
            r.getException().isPresent() ? r.getException().get().getMessage() : ""};
@@ -316,12 +298,8 @@ public class CompactionCommand implements CommandMarker {
       });
       Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();
-      TableHeader header = new TableHeader()
-          .addTableHeaderField("File Id")
-          .addTableHeaderField("Base Instant Time")
-          .addTableHeaderField("Base Data File")
-          .addTableHeaderField("Num Delta Files")
-          .addTableHeaderField("Valid")
+      TableHeader header = new TableHeader().addTableHeaderField("File Id").addTableHeaderField("Base Instant Time")
+          .addTableHeaderField("Base Data File").addTableHeaderField("Num Delta Files").addTableHeaderField("Valid")
          .addTableHeaderField("Error");
       output = message + HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit,
@@ -349,8 +327,8 @@ public class CompactionCommand implements CommandMarker {
       @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit,
       @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField,
       @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending,
-      @CliOption(key = {
-          "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") boolean headerOnly)
+      @CliOption(key = {"headeronly"}, help = "Print Header Only",
+          unspecifiedDefaultValue = "false") boolean headerOnly)
       throws Exception {
     boolean initialized = HoodieCLI.initConf();
     HoodieCLI.initFS(initialized);
@@ -360,12 +338,12 @@ public class CompactionCommand implements CommandMarker {
     String output = "";
     if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
       try {
-        String sparkPropertiesPath = Utils.getDefaultPropertiesFile(
-            scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
+        String sparkPropertiesPath = Utils
+            .getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
         SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
-        sparkLauncher.addAppArgs(SparkCommand.COMPACT_UNSCHEDULE_PLAN.toString(),
-            HoodieCLI.tableMetadata.getBasePath(), compactionInstant, outputPathStr, parallelism, master,
-            sparkMemory, Boolean.valueOf(skipV).toString(), Boolean.valueOf(dryRun).toString());
+        sparkLauncher.addAppArgs(SparkCommand.COMPACT_UNSCHEDULE_PLAN.toString(), HoodieCLI.tableMetadata.getBasePath(),
+            compactionInstant, outputPathStr, parallelism, master, sparkMemory, Boolean.valueOf(skipV).toString(),
+            Boolean.valueOf(dryRun).toString());
         Process process = sparkLauncher.launch();
         InputStreamConsumer.captureOutput(process);
         int exitCode = process.waitFor();
@@ -373,8 +351,8 @@ public class CompactionCommand implements CommandMarker {
           return "Failed to unschedule compaction for " + compactionInstant;
         }
         List<RenameOpResult> res = deSerializeOperationResult(outputPathStr, HoodieCLI.fs);
-        output = getRenamesToBePrinted(res, limit, sortByField, descending, headerOnly,
-            "unschedule pending compaction");
+        output =
+            getRenamesToBePrinted(res, limit, sortByField, descending, headerOnly, "unschedule pending compaction");
       } finally {
         // Delete tmp file used to serialize result
         if (HoodieCLI.fs.exists(outputPath)) {
@@ -407,12 +385,12 @@ public class CompactionCommand implements CommandMarker {
     String output = "";
     if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
       try {
-        String sparkPropertiesPath = Utils.getDefaultPropertiesFile(
-            scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
+        String sparkPropertiesPath = Utils
            .getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
         SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
-        sparkLauncher.addAppArgs(SparkCommand.COMPACT_UNSCHEDULE_FILE.toString(),
-            HoodieCLI.tableMetadata.getBasePath(), fileId, outputPathStr, "1", master,
-            sparkMemory, Boolean.valueOf(skipV).toString(), Boolean.valueOf(dryRun).toString());
+        sparkLauncher.addAppArgs(SparkCommand.COMPACT_UNSCHEDULE_FILE.toString(), HoodieCLI.tableMetadata.getBasePath(),
+            fileId, outputPathStr, "1", master, sparkMemory, Boolean.valueOf(skipV).toString(),
+            Boolean.valueOf(dryRun).toString());
         Process process = sparkLauncher.launch();
         InputStreamConsumer.captureOutput(process);
         int exitCode = process.waitFor();
@@ -445,8 +423,8 @@ public class CompactionCommand implements CommandMarker {
       @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit,
       @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField,
       @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending,
-      @CliOption(key = {
-          "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") boolean headerOnly)
+      @CliOption(key = {"headeronly"}, help = "Print Header Only",
+          unspecifiedDefaultValue = "false") boolean headerOnly)
       throws Exception {
     boolean initialized = HoodieCLI.initConf();
     HoodieCLI.initFS(initialized);
@@ -455,12 +433,11 @@ public class CompactionCommand implements CommandMarker {
     String output = "";
     if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
       try {
-        String sparkPropertiesPath = Utils.getDefaultPropertiesFile(
-            scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
+        String sparkPropertiesPath = Utils
+            .getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
sparkLauncher.addAppArgs(SparkCommand.COMPACT_REPAIR.toString(), sparkLauncher.addAppArgs(SparkCommand.COMPACT_REPAIR.toString(), HoodieCLI.tableMetadata.getBasePath(),
HoodieCLI.tableMetadata.getBasePath(), compactionInstant, outputPathStr, parallelism, master, compactionInstant, outputPathStr, parallelism, master, sparkMemory, Boolean.valueOf(dryRun).toString());
sparkMemory, Boolean.valueOf(dryRun).toString());
Process process = sparkLauncher.launch(); Process process = sparkLauncher.launch();
InputStreamConsumer.captureOutput(process); InputStreamConsumer.captureOutput(process);
int exitCode = process.waitFor(); int exitCode = process.waitFor();
@@ -481,41 +458,35 @@ public class CompactionCommand implements CommandMarker {
} }
} }
private String getRenamesToBePrinted(List<RenameOpResult> res, Integer limit, private String getRenamesToBePrinted(List<RenameOpResult> res, Integer limit, String sortByField, boolean descending,
String sortByField, boolean descending, boolean headerOnly, String operation) { boolean headerOnly, String operation) {
Option<Boolean> result = Option.fromJavaOptional( Option<Boolean> result =
res.stream().map(r -> r.isExecuted() && r.isSuccess()).reduce(Boolean::logicalAnd)); Option.fromJavaOptional(res.stream().map(r -> r.isExecuted() && r.isSuccess()).reduce(Boolean::logicalAnd));
if (result.isPresent()) { if (result.isPresent()) {
System.out.println("There were some file renames that needed to be done to " + operation); System.out.println("There were some file renames that needed to be done to " + operation);
if (result.get()) { if (result.get()) {
System.out.println("All renames successfully completed to " + operation + " done !!"); System.out.println("All renames successfully completed to " + operation + " done !!");
} else { } else {
System.out.println("Some renames failed. DataSet could be in inconsistent-state. " System.out
+ "Try running compaction repair"); .println("Some renames failed. DataSet could be in inconsistent-state. " + "Try running compaction repair");
} }
List<Comparable[]> rows = new ArrayList<>(); List<Comparable[]> rows = new ArrayList<>();
res.stream().forEach(r -> { res.stream().forEach(r -> {
Comparable[] row = new Comparable[] { Comparable[] row =
r.getOperation().fileId, r.getOperation().srcPath, r.getOperation().destPath, new Comparable[] {r.getOperation().fileId, r.getOperation().srcPath, r.getOperation().destPath,
r.isExecuted(), r.isSuccess(), r.getException().isPresent() ? r.getException().get().getMessage() : "" r.isExecuted(), r.isSuccess(), r.getException().isPresent() ? r.getException().get().getMessage() : ""};
};
rows.add(row); rows.add(row);
}); });
Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>(); Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();
TableHeader header = new TableHeader() TableHeader header = new TableHeader().addTableHeaderField("File Id").addTableHeaderField("Source File Path")
.addTableHeaderField("File Id") .addTableHeaderField("Destination File Path").addTableHeaderField("Rename Executed?")
.addTableHeaderField("Source File Path") .addTableHeaderField("Rename Succeeded?").addTableHeaderField("Error");
.addTableHeaderField("Destination File Path")
.addTableHeaderField("Rename Executed?")
.addTableHeaderField("Rename Succeeded?")
.addTableHeaderField("Error");
return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows);
limit, headerOnly, rows);
} else { } else {
return "No File renames needed to " + operation + ". Operation successful."; return "No File renames needed to " + operation + ". Operation successful.";
} }
View File
@@ -52,13 +52,12 @@ public class DatasetsCommand implements CommandMarker {
@CliOption(key = {"maxCheckIntervalMs"}, mandatory = false, unspecifiedDefaultValue = "300000", @CliOption(key = {"maxCheckIntervalMs"}, mandatory = false, unspecifiedDefaultValue = "300000",
help = "Max wait time for eventual consistency") final Integer maxConsistencyIntervalMs, help = "Max wait time for eventual consistency") final Integer maxConsistencyIntervalMs,
@CliOption(key = {"maxCheckIntervalMs"}, mandatory = false, unspecifiedDefaultValue = "7", @CliOption(key = {"maxCheckIntervalMs"}, mandatory = false, unspecifiedDefaultValue = "7",
help = "Max checks for eventual consistency") final Integer maxConsistencyChecks) throws IOException { help = "Max checks for eventual consistency") final Integer maxConsistencyChecks)
HoodieCLI.setConsistencyGuardConfig( throws IOException {
ConsistencyGuardConfig.newBuilder() HoodieCLI
.withConsistencyCheckEnabled(eventuallyConsistent) .setConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(eventuallyConsistent)
.withInitialConsistencyCheckIntervalMs(initialConsistencyIntervalMs) .withInitialConsistencyCheckIntervalMs(initialConsistencyIntervalMs)
.withMaxConsistencyCheckIntervalMs(maxConsistencyIntervalMs) .withMaxConsistencyCheckIntervalMs(maxConsistencyIntervalMs).withMaxConsistencyChecks(maxConsistencyChecks)
.withMaxConsistencyChecks(maxConsistencyChecks)
.build()); .build());
HoodieCLI.initConf(); HoodieCLI.initConf();
HoodieCLI.connectTo(path); HoodieCLI.connectTo(path);
@@ -82,7 +81,8 @@ public class DatasetsCommand implements CommandMarker {
@CliOption(key = {"tableType"}, unspecifiedDefaultValue = "COPY_ON_WRITE", @CliOption(key = {"tableType"}, unspecifiedDefaultValue = "COPY_ON_WRITE",
help = "Hoodie Table Type. Must be one of : COPY_ON_WRITE or MERGE_ON_READ") final String tableTypeStr, help = "Hoodie Table Type. Must be one of : COPY_ON_WRITE or MERGE_ON_READ") final String tableTypeStr,
@CliOption(key = {"payloadClass"}, unspecifiedDefaultValue = "org.apache.hudi.common.model.HoodieAvroPayload", @CliOption(key = {"payloadClass"}, unspecifiedDefaultValue = "org.apache.hudi.common.model.HoodieAvroPayload",
help = "Payload Class") final String payloadClass) throws IOException { help = "Payload Class") final String payloadClass)
throws IOException {
boolean initialized = HoodieCLI.initConf(); boolean initialized = HoodieCLI.initConf();
HoodieCLI.initFS(initialized); HoodieCLI.initFS(initialized);
@@ -117,9 +117,7 @@ public class DatasetsCommand implements CommandMarker {
*/ */
@CliCommand(value = "desc", help = "Describle Hoodie Table properties") @CliCommand(value = "desc", help = "Describle Hoodie Table properties")
public String descTable() { public String descTable() {
TableHeader header = new TableHeader() TableHeader header = new TableHeader().addTableHeaderField("Property").addTableHeaderField("Value");
.addTableHeaderField("Property")
.addTableHeaderField("Value");
List<Comparable[]> rows = new ArrayList<>(); List<Comparable[]> rows = new ArrayList<>();
rows.add(new Comparable[] {"basePath", HoodieCLI.tableMetadata.getBasePath()}); rows.add(new Comparable[] {"basePath", HoodieCLI.tableMetadata.getBasePath()});
rows.add(new Comparable[] {"metaPath", HoodieCLI.tableMetadata.getMetaPath()}); rows.add(new Comparable[] {"metaPath", HoodieCLI.tableMetadata.getMetaPath()});
View File
@@ -52,24 +52,23 @@ public class FileSystemViewCommand implements CommandMarker {
@CliCommand(value = "show fsview all", help = "Show entire file-system view") @CliCommand(value = "show fsview all", help = "Show entire file-system view")
public String showAllFileSlices( public String showAllFileSlices(
@CliOption(key = {"pathRegex"}, @CliOption(key = {"pathRegex"}, help = "regex to select files, eg: 2016/08/02",
help = "regex to select files, eg: 2016/08/02", unspecifiedDefaultValue = "*/*/*") String globRegex, unspecifiedDefaultValue = "*/*/*") String globRegex,
@CliOption(key = {"readOptimizedOnly"}, help = "Only display read-optimized view", @CliOption(key = {"readOptimizedOnly"}, help = "Only display read-optimized view",
unspecifiedDefaultValue = "false") boolean readOptimizedOnly, unspecifiedDefaultValue = "false") boolean readOptimizedOnly,
@CliOption(key = {"maxInstant"}, help = "File-Slices upto this instant are displayed", @CliOption(key = {"maxInstant"}, help = "File-Slices upto this instant are displayed",
unspecifiedDefaultValue = "") String maxInstant, unspecifiedDefaultValue = "") String maxInstant,
@CliOption(key = { @CliOption(key = {"includeMax"}, help = "Include Max Instant",
"includeMax"}, help = "Include Max Instant", unspecifiedDefaultValue = "false") boolean includeMaxInstant, unspecifiedDefaultValue = "false") boolean includeMaxInstant,
@CliOption(key = { @CliOption(key = {"includeInflight"}, help = "Include Inflight Instants",
"includeInflight"}, help = "Include Inflight Instants", unspecifiedDefaultValue = "false") unspecifiedDefaultValue = "false") boolean includeInflight,
boolean includeInflight, @CliOption(key = {"excludeCompaction"}, help = "Exclude compaction Instants",
@CliOption(key = {"excludeCompaction"}, help = "Exclude compaction Instants", unspecifiedDefaultValue = "false") unspecifiedDefaultValue = "false") boolean excludeCompaction,
boolean excludeCompaction,
@CliOption(key = {"limit"}, help = "Limit rows to be displayed", unspecifiedDefaultValue = "-1") Integer limit, @CliOption(key = {"limit"}, help = "Limit rows to be displayed", unspecifiedDefaultValue = "-1") Integer limit,
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
@CliOption(key = { @CliOption(key = {"headeronly"}, help = "Print Header Only",
"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly) unspecifiedDefaultValue = "false") final boolean headerOnly)
throws IOException { throws IOException {
HoodieTableFileSystemView fsView = buildFileSystemView(globRegex, maxInstant, readOptimizedOnly, includeMaxInstant, HoodieTableFileSystemView fsView = buildFileSystemView(globRegex, maxInstant, readOptimizedOnly, includeMaxInstant,
@@ -97,15 +96,10 @@ public class FileSystemViewCommand implements CommandMarker {
fieldNameToConverterMap.put("Total Delta File Size", converterFunction); fieldNameToConverterMap.put("Total Delta File Size", converterFunction);
fieldNameToConverterMap.put("Data-File Size", converterFunction); fieldNameToConverterMap.put("Data-File Size", converterFunction);
TableHeader header = new TableHeader() TableHeader header = new TableHeader().addTableHeaderField("Partition").addTableHeaderField("FileId")
.addTableHeaderField("Partition") .addTableHeaderField("Base-Instant").addTableHeaderField("Data-File").addTableHeaderField("Data-File Size");
.addTableHeaderField("FileId")
.addTableHeaderField("Base-Instant")
.addTableHeaderField("Data-File")
.addTableHeaderField("Data-File Size");
if (!readOptimizedOnly) { if (!readOptimizedOnly) {
header = header.addTableHeaderField("Num Delta Files") header = header.addTableHeaderField("Num Delta Files").addTableHeaderField("Total Delta File Size")
.addTableHeaderField("Total Delta File Size")
.addTableHeaderField("Delta Files"); .addTableHeaderField("Delta Files");
} }
return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows); return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows);
@@ -113,25 +107,24 @@ public class FileSystemViewCommand implements CommandMarker {
@CliCommand(value = "show fsview latest", help = "Show latest file-system view") @CliCommand(value = "show fsview latest", help = "Show latest file-system view")
public String showLatestFileSlices( public String showLatestFileSlices(
@CliOption(key = {"partitionPath"}, @CliOption(key = {"partitionPath"}, help = "A valid paritition path", mandatory = true) String partition,
help = "A valid paritition path", mandatory = true) String partition,
@CliOption(key = {"readOptimizedOnly"}, help = "Only display read-optimized view", @CliOption(key = {"readOptimizedOnly"}, help = "Only display read-optimized view",
unspecifiedDefaultValue = "false") boolean readOptimizedOnly, unspecifiedDefaultValue = "false") boolean readOptimizedOnly,
@CliOption(key = {"maxInstant"}, help = "File-Slices upto this instant are displayed", @CliOption(key = {"maxInstant"}, help = "File-Slices upto this instant are displayed",
unspecifiedDefaultValue = "") String maxInstant, unspecifiedDefaultValue = "") String maxInstant,
@CliOption(key = {"merge"}, help = "Merge File Slices due to pending compaction", @CliOption(key = {"merge"}, help = "Merge File Slices due to pending compaction",
unspecifiedDefaultValue = "true") final boolean merge, unspecifiedDefaultValue = "true") final boolean merge,
@CliOption(key = {"includeMax"}, help = "Include Max Instant", unspecifiedDefaultValue = "false") @CliOption(key = {"includeMax"}, help = "Include Max Instant",
boolean includeMaxInstant, unspecifiedDefaultValue = "false") boolean includeMaxInstant,
@CliOption(key = {"includeInflight"}, help = "Include Inflight Instants", unspecifiedDefaultValue = "false") @CliOption(key = {"includeInflight"}, help = "Include Inflight Instants",
boolean includeInflight, unspecifiedDefaultValue = "false") boolean includeInflight,
@CliOption(key = {"excludeCompaction"}, help = "Exclude compaction Instants", unspecifiedDefaultValue = "false") @CliOption(key = {"excludeCompaction"}, help = "Exclude compaction Instants",
boolean excludeCompaction, unspecifiedDefaultValue = "false") boolean excludeCompaction,
@CliOption(key = {"limit"}, help = "Limit rows to be displayed", unspecifiedDefaultValue = "-1") Integer limit, @CliOption(key = {"limit"}, help = "Limit rows to be displayed", unspecifiedDefaultValue = "-1") Integer limit,
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
@CliOption(key = { @CliOption(key = {"headeronly"}, help = "Print Header Only",
"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly) unspecifiedDefaultValue = "false") final boolean headerOnly)
throws IOException { throws IOException {
HoodieTableFileSystemView fsView = buildFileSystemView(partition, maxInstant, readOptimizedOnly, includeMaxInstant, HoodieTableFileSystemView fsView = buildFileSystemView(partition, maxInstant, readOptimizedOnly, includeMaxInstant,
@@ -163,28 +156,25 @@ public class FileSystemViewCommand implements CommandMarker {
if (!readOptimizedOnly) { if (!readOptimizedOnly) {
row[idx++] = fs.getLogFiles().count(); row[idx++] = fs.getLogFiles().count();
row[idx++] = fs.getLogFiles().mapToLong(lf -> lf.getFileSize()).sum(); row[idx++] = fs.getLogFiles().mapToLong(lf -> lf.getFileSize()).sum();
long logFilesScheduledForCompactionTotalSize = fs.getLogFiles() long logFilesScheduledForCompactionTotalSize =
.filter(lf -> lf.getBaseCommitTime().equals(fs.getBaseInstantTime())) fs.getLogFiles().filter(lf -> lf.getBaseCommitTime().equals(fs.getBaseInstantTime()))
.mapToLong(lf -> lf.getFileSize()).sum(); .mapToLong(lf -> lf.getFileSize()).sum();
row[idx++] = logFilesScheduledForCompactionTotalSize; row[idx++] = logFilesScheduledForCompactionTotalSize;
long logFilesUnscheduledTotalSize = fs.getLogFiles() long logFilesUnscheduledTotalSize =
.filter(lf -> !lf.getBaseCommitTime().equals(fs.getBaseInstantTime())) fs.getLogFiles().filter(lf -> !lf.getBaseCommitTime().equals(fs.getBaseInstantTime()))
.mapToLong(lf -> lf.getFileSize()).sum(); .mapToLong(lf -> lf.getFileSize()).sum();
row[idx++] = logFilesUnscheduledTotalSize; row[idx++] = logFilesUnscheduledTotalSize;
double logSelectedForCompactionToBaseRatio = double logSelectedForCompactionToBaseRatio =
dataFileSize > 0 ? logFilesScheduledForCompactionTotalSize / (dataFileSize * 1.0) : -1; dataFileSize > 0 ? logFilesScheduledForCompactionTotalSize / (dataFileSize * 1.0) : -1;
row[idx++] = logSelectedForCompactionToBaseRatio; row[idx++] = logSelectedForCompactionToBaseRatio;
double logUnscheduledToBaseRatio = double logUnscheduledToBaseRatio = dataFileSize > 0 ? logFilesUnscheduledTotalSize / (dataFileSize * 1.0) : -1;
dataFileSize > 0 ? logFilesUnscheduledTotalSize / (dataFileSize * 1.0) : -1;
row[idx++] = logUnscheduledToBaseRatio; row[idx++] = logUnscheduledToBaseRatio;
row[idx++] = fs.getLogFiles() row[idx++] = fs.getLogFiles().filter(lf -> lf.getBaseCommitTime().equals(fs.getBaseInstantTime()))
.filter(lf -> lf.getBaseCommitTime().equals(fs.getBaseInstantTime()))
.collect(Collectors.toList()).toString(); .collect(Collectors.toList()).toString();
row[idx++] = fs.getLogFiles() row[idx++] = fs.getLogFiles().filter(lf -> !lf.getBaseCommitTime().equals(fs.getBaseInstantTime()))
.filter(lf -> !lf.getBaseCommitTime().equals(fs.getBaseInstantTime()))
.collect(Collectors.toList()).toString(); .collect(Collectors.toList()).toString();
} }
rows.add(row); rows.add(row);
@@ -200,16 +190,11 @@ public class FileSystemViewCommand implements CommandMarker {
fieldNameToConverterMap.put("Delta Size - compaction unscheduled", converterFunction); fieldNameToConverterMap.put("Delta Size - compaction unscheduled", converterFunction);
} }
TableHeader header = new TableHeader() TableHeader header = new TableHeader().addTableHeaderField("Partition").addTableHeaderField("FileId")
.addTableHeaderField("Partition") .addTableHeaderField("Base-Instant").addTableHeaderField("Data-File").addTableHeaderField("Data-File Size");
.addTableHeaderField("FileId")
.addTableHeaderField("Base-Instant")
.addTableHeaderField("Data-File")
.addTableHeaderField("Data-File Size");
if (!readOptimizedOnly) { if (!readOptimizedOnly) {
header = header.addTableHeaderField("Num Delta Files") header = header.addTableHeaderField("Num Delta Files").addTableHeaderField("Total Delta Size")
.addTableHeaderField("Total Delta Size")
.addTableHeaderField("Delta Size - compaction scheduled") .addTableHeaderField("Delta Size - compaction scheduled")
.addTableHeaderField("Delta Size - compaction unscheduled") .addTableHeaderField("Delta Size - compaction unscheduled")
.addTableHeaderField("Delta To Base Ratio - compaction scheduled") .addTableHeaderField("Delta To Base Ratio - compaction scheduled")
@@ -222,6 +207,7 @@ public class FileSystemViewCommand implements CommandMarker {
/** /**
* Build File System View * Build File System View
*
* @param globRegex Path Regex * @param globRegex Path Regex
* @param maxInstant Max Instants to be used for displaying file-instants * @param maxInstant Max Instants to be used for displaying file-instants
* @param readOptimizedOnly Include only read optimized view * @param readOptimizedOnly Include only read optimized view
@@ -233,8 +219,8 @@ public class FileSystemViewCommand implements CommandMarker {
*/ */
private HoodieTableFileSystemView buildFileSystemView(String globRegex, String maxInstant, boolean readOptimizedOnly, private HoodieTableFileSystemView buildFileSystemView(String globRegex, String maxInstant, boolean readOptimizedOnly,
boolean includeMaxInstant, boolean includeInflight, boolean excludeCompaction) throws IOException { boolean includeMaxInstant, boolean includeInflight, boolean excludeCompaction) throws IOException {
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(HoodieCLI.tableMetadata.getHadoopConf(), HoodieTableMetaClient metaClient =
HoodieCLI.tableMetadata.getBasePath(), true); new HoodieTableMetaClient(HoodieCLI.tableMetadata.getHadoopConf(), HoodieCLI.tableMetadata.getBasePath(), true);
FileSystem fs = HoodieCLI.fs; FileSystem fs = HoodieCLI.fs;
String globPath = String.format("%s/%s/*", HoodieCLI.tableMetadata.getBasePath(), globRegex); String globPath = String.format("%s/%s/*", HoodieCLI.tableMetadata.getBasePath(), globRegex);
FileStatus[] statuses = fs.globStatus(new Path(globPath)); FileStatus[] statuses = fs.globStatus(new Path(globPath));
View File
@@ -43,17 +43,17 @@ public class HDFSParquetImportCommand implements CommandMarker {
@CliOption(key = "upsert", mandatory = false, unspecifiedDefaultValue = "false", @CliOption(key = "upsert", mandatory = false, unspecifiedDefaultValue = "false",
help = "Uses upsert API instead of the default insert API of WriteClient") boolean useUpsert, help = "Uses upsert API instead of the default insert API of WriteClient") boolean useUpsert,
@CliOption(key = "srcPath", mandatory = true, help = "Base path for the input dataset") final String srcPath, @CliOption(key = "srcPath", mandatory = true, help = "Base path for the input dataset") final String srcPath,
@CliOption(key = "targetPath", mandatory = true, help = "Base path for the target hoodie dataset") final String @CliOption(key = "targetPath", mandatory = true,
targetPath, help = "Base path for the target hoodie dataset") final String targetPath,
@CliOption(key = "tableName", mandatory = true, help = "Table name") final String tableName, @CliOption(key = "tableName", mandatory = true, help = "Table name") final String tableName,
@CliOption(key = "tableType", mandatory = true, help = "Table type") final String tableType, @CliOption(key = "tableType", mandatory = true, help = "Table type") final String tableType,
@CliOption(key = "rowKeyField", mandatory = true, help = "Row key field name") final String rowKeyField, @CliOption(key = "rowKeyField", mandatory = true, help = "Row key field name") final String rowKeyField,
@CliOption(key = "partitionPathField", mandatory = true, help = "Partition path field name") final String @CliOption(key = "partitionPathField", mandatory = true,
partitionPathField, help = "Partition path field name") final String partitionPathField,
@CliOption(key = { @CliOption(key = {"parallelism"}, mandatory = true,
"parallelism"}, mandatory = true, help = "Parallelism for hoodie insert") final String parallelism, help = "Parallelism for hoodie insert") final String parallelism,
@CliOption(key = "schemaFilePath", mandatory = true, help = "Path for Avro schema file") final String @CliOption(key = "schemaFilePath", mandatory = true,
schemaFilePath, help = "Path for Avro schema file") final String schemaFilePath,
@CliOption(key = "format", mandatory = true, help = "Format for the input data") final String format, @CliOption(key = "format", mandatory = true, help = "Format for the input data") final String format,
@CliOption(key = "sparkMemory", mandatory = true, help = "Spark executor memory") final String sparkMemory, @CliOption(key = "sparkMemory", mandatory = true, help = "Spark executor memory") final String sparkMemory,
@CliOption(key = "retry", mandatory = true, help = "Number of retries") final String retry) throws Exception { @CliOption(key = "retry", mandatory = true, help = "Number of retries") final String retry) throws Exception {
@@ -62,8 +62,8 @@ public class HDFSParquetImportCommand implements CommandMarker {
boolean initialized = HoodieCLI.initConf(); boolean initialized = HoodieCLI.initConf();
HoodieCLI.initFS(initialized); HoodieCLI.initFS(initialized);
String sparkPropertiesPath = Utils.getDefaultPropertiesFile( String sparkPropertiesPath =
JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala()); Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala());
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
@@ -72,8 +72,8 @@ public class HDFSParquetImportCommand implements CommandMarker {
cmd = SparkCommand.UPSERT.toString(); cmd = SparkCommand.UPSERT.toString();
} }
sparkLauncher.addAppArgs(cmd, srcPath, targetPath, tableName, tableType, rowKeyField, sparkLauncher.addAppArgs(cmd, srcPath, targetPath, tableName, tableType, rowKeyField, partitionPathField,
partitionPathField, parallelism, schemaFilePath, sparkMemory, retry); parallelism, schemaFilePath, sparkMemory, retry);
Process process = sparkLauncher.launch(); Process process = sparkLauncher.launch();
InputStreamConsumer.captureOutput(process); InputStreamConsumer.captureOutput(process);
int exitCode = process.waitFor(); int exitCode = process.waitFor();
View File
@@ -69,30 +69,29 @@ public class HoodieLogFileCommand implements CommandMarker {
@CliCommand(value = "show logfile metadata", help = "Read commit metadata from log files") @CliCommand(value = "show logfile metadata", help = "Read commit metadata from log files")
public String showLogFileCommits( public String showLogFileCommits(
@CliOption(key = "logFilePathPattern", mandatory = true, help = "Fully qualified path for the log file") final @CliOption(key = "logFilePathPattern", mandatory = true,
String logFilePathPattern, help = "Fully qualified path for the log file") final String logFilePathPattern,
@CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit, @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
@CliOption(key = {"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") @CliOption(key = {"headeronly"}, help = "Print Header Only",
final boolean headerOnly) throws IOException { unspecifiedDefaultValue = "false") final boolean headerOnly)
throws IOException {
FileSystem fs = HoodieCLI.tableMetadata.getFs(); FileSystem fs = HoodieCLI.tableMetadata.getFs();
List<String> logFilePaths = Arrays.stream(fs.globStatus(new Path(logFilePathPattern))) List<String> logFilePaths = Arrays.stream(fs.globStatus(new Path(logFilePathPattern)))
.map(status -> status.getPath().toString()).collect(Collectors.toList()); .map(status -> status.getPath().toString()).collect(Collectors.toList());
Map<String, List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, Map<String, List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer>>> commitCountAndMetadata =
String>>, Integer>>> Maps.newHashMap();
commitCountAndMetadata = Maps.newHashMap();
int totalEntries = 0; int totalEntries = 0;
int numCorruptBlocks = 0; int numCorruptBlocks = 0;
int dummyInstantTimeCount = 0; int dummyInstantTimeCount = 0;
for (String logFilePath : logFilePaths) { for (String logFilePath : logFilePaths) {
FileStatus[] fsStatus = fs.listStatus(new Path(logFilePath)); FileStatus[] fsStatus = fs.listStatus(new Path(logFilePath));
Schema writerSchema = new AvroSchemaConverter().convert( Schema writerSchema = new AvroSchemaConverter()
SchemaUtil.readSchemaFromLogFile(HoodieCLI.tableMetadata.getFs(), new Path(logFilePath))); .convert(SchemaUtil.readSchemaFromLogFile(HoodieCLI.tableMetadata.getFs(), new Path(logFilePath)));
Reader reader = HoodieLogFormat Reader reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(fsStatus[0].getPath()), writerSchema);
.newReader(fs, new HoodieLogFile(fsStatus[0].getPath()), writerSchema);
// read the avro blocks // read the avro blocks
while (reader.hasNext()) { while (reader.hasNext()) {
@@ -126,8 +125,8 @@ public class HoodieLogFileCommand implements CommandMarker {
new Tuple3<>(n.getBlockType(), new Tuple2<>(n.getLogBlockHeader(), n.getLogBlockFooter()), recordCount)); new Tuple3<>(n.getBlockType(), new Tuple2<>(n.getLogBlockHeader(), n.getLogBlockFooter()), recordCount));
totalEntries++; totalEntries++;
} else { } else {
List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer>> list =
Integer>> list = new ArrayList<>(); new ArrayList<>();
list.add( list.add(
new Tuple3<>(n.getBlockType(), new Tuple2<>(n.getLogBlockHeader(), n.getLogBlockFooter()), recordCount)); new Tuple3<>(n.getBlockType(), new Tuple2<>(n.getLogBlockHeader(), n.getLogBlockFooter()), recordCount));
commitCountAndMetadata.put(instantTime, list); commitCountAndMetadata.put(instantTime, list);
@@ -139,12 +138,11 @@ public class HoodieLogFileCommand implements CommandMarker {
List<Comparable[]> rows = new ArrayList<>(); List<Comparable[]> rows = new ArrayList<>();
int i = 0; int i = 0;
ObjectMapper objectMapper = new ObjectMapper(); ObjectMapper objectMapper = new ObjectMapper();
for (Map.Entry<String, List<Tuple3<HoodieLogBlockType, for (Map.Entry<String, List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer>>> entry : commitCountAndMetadata
Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer>>> entry .entrySet()) {
: commitCountAndMetadata.entrySet()) {
String instantTime = entry.getKey().toString(); String instantTime = entry.getKey().toString();
for (Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, for (Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer> tuple3 : entry
Map<HeaderMetadataType, String>>, Integer> tuple3 : entry.getValue()) { .getValue()) {
Comparable[] output = new Comparable[5]; Comparable[] output = new Comparable[5];
output[0] = instantTime; output[0] = instantTime;
output[1] = tuple3._3(); output[1] = tuple3._3();
@@ -156,21 +154,18 @@ public class HoodieLogFileCommand implements CommandMarker {
} }
} }
TableHeader header = new TableHeader() TableHeader header = new TableHeader().addTableHeaderField("InstantTime").addTableHeaderField("RecordCount")
.addTableHeaderField("InstantTime") .addTableHeaderField("BlockType").addTableHeaderField("HeaderMetadata").addTableHeaderField("FooterMetadata");
.addTableHeaderField("RecordCount")
.addTableHeaderField("BlockType")
.addTableHeaderField("HeaderMetadata")
.addTableHeaderField("FooterMetadata");
return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows); return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows);
} }
@CliCommand(value = "show logfile records", help = "Read records from log files") @CliCommand(value = "show logfile records", help = "Read records from log files")
public String showLogFileRecords(@CliOption(key = { public String showLogFileRecords(
"limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10") final Integer limit, @CliOption(key = {"limit"}, mandatory = false, help = "Limit commits",
@CliOption(key = "logFilePathPattern", mandatory = true, help = "Fully qualified paths for the log files") unspecifiedDefaultValue = "10") final Integer limit,
final String logFilePathPattern, @CliOption(key = "logFilePathPattern", mandatory = true,
help = "Fully qualified paths for the log files") final String logFilePathPattern,
@CliOption(key = "mergeRecords", mandatory = false, help = "If the records in the log files should be merged", @CliOption(key = "mergeRecords", mandatory = false, help = "If the records in the log files should be merged",
unspecifiedDefaultValue = "false") final Boolean shouldMerge) unspecifiedDefaultValue = "false") final Boolean shouldMerge)
throws IOException { throws IOException {
@@ -184,17 +179,16 @@ public class HoodieLogFileCommand implements CommandMarker {
// TODO : readerSchema can change across blocks/log files, fix this inside Scanner // TODO : readerSchema can change across blocks/log files, fix this inside Scanner
AvroSchemaConverter converter = new AvroSchemaConverter(); AvroSchemaConverter converter = new AvroSchemaConverter();
// get schema from last log file // get schema from last log file
Schema readerSchema = converter.convert( Schema readerSchema =
SchemaUtil.readSchemaFromLogFile(fs, new Path(logFilePaths.get(logFilePaths.size() - 1)))); converter.convert(SchemaUtil.readSchemaFromLogFile(fs, new Path(logFilePaths.get(logFilePaths.size() - 1))));
List<IndexedRecord> allRecords = new ArrayList<>(); List<IndexedRecord> allRecords = new ArrayList<>();
if (shouldMerge) { if (shouldMerge) {
System.out.println("===========================> MERGING RECORDS <==================="); System.out.println("===========================> MERGING RECORDS <===================");
HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, HoodieMergedLogRecordScanner scanner =
HoodieCLI.tableMetadata.getBasePath(), logFilePaths, readerSchema, new HoodieMergedLogRecordScanner(fs, HoodieCLI.tableMetadata.getBasePath(), logFilePaths, readerSchema,
HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline().lastInstant().get() HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline().lastInstant().get().getTimestamp(),
.getTimestamp(),
Long.valueOf(HoodieMemoryConfig.DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES), Long.valueOf(HoodieMemoryConfig.DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES),
Boolean.valueOf(HoodieCompactionConfig.DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED), Boolean.valueOf(HoodieCompactionConfig.DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED),
Boolean.valueOf(HoodieCompactionConfig.DEFAULT_COMPACTION_REVERSE_LOG_READ_ENABLED), Boolean.valueOf(HoodieCompactionConfig.DEFAULT_COMPACTION_REVERSE_LOG_READ_ENABLED),
@@ -209,10 +203,10 @@ public class HoodieLogFileCommand implements CommandMarker {
} }
} else { } else {
for (String logFile : logFilePaths) { for (String logFile : logFilePaths) {
Schema writerSchema = new AvroSchemaConverter().convert( Schema writerSchema = new AvroSchemaConverter()
SchemaUtil.readSchemaFromLogFile(HoodieCLI.tableMetadata.getFs(), new Path(logFile))); .convert(SchemaUtil.readSchemaFromLogFile(HoodieCLI.tableMetadata.getFs(), new Path(logFile)));
HoodieLogFormat.Reader reader = HoodieLogFormat HoodieLogFormat.Reader reader =
.newReader(fs, new HoodieLogFile(new Path(logFile)), writerSchema); HoodieLogFormat.newReader(fs, new HoodieLogFile(new Path(logFile)), writerSchema);
// read the avro blocks // read the avro blocks
while (reader.hasNext()) { while (reader.hasNext()) {
HoodieLogBlock n = reader.next(); HoodieLogBlock n = reader.next();
View File
@@ -44,19 +44,16 @@ public class HoodieSyncCommand implements CommandMarker {
public String validateSync( public String validateSync(
@CliOption(key = {"mode"}, unspecifiedDefaultValue = "complete", help = "Check mode") final String mode, @CliOption(key = {"mode"}, unspecifiedDefaultValue = "complete", help = "Check mode") final String mode,
@CliOption(key = {"sourceDb"}, unspecifiedDefaultValue = "rawdata", help = "source database") final String srcDb, @CliOption(key = {"sourceDb"}, unspecifiedDefaultValue = "rawdata", help = "source database") final String srcDb,
@CliOption(key = { @CliOption(key = {"targetDb"}, unspecifiedDefaultValue = "dwh_hoodie",
"targetDb"}, unspecifiedDefaultValue = "dwh_hoodie", help = "target database") final String tgtDb, help = "target database") final String tgtDb,
@CliOption(key = { @CliOption(key = {"partitionCount"}, unspecifiedDefaultValue = "5",
"partitionCount"}, unspecifiedDefaultValue = "5", help = "total number of recent partitions to validate") help = "total number of recent partitions to validate") final int partitionCount,
final int partitionCount, @CliOption(key = {"hiveServerUrl"}, mandatory = true,
@CliOption(key = { help = "hiveServerURL to connect to") final String hiveServerUrl,
"hiveServerUrl"}, mandatory = true, help = "hiveServerURL to connect to") final String hiveServerUrl, @CliOption(key = {"hiveUser"}, mandatory = false, unspecifiedDefaultValue = "",
@CliOption(key = { help = "hive username to connect to") final String hiveUser,
"hiveUser"}, mandatory = false, unspecifiedDefaultValue = "", help = "hive username to connect to") final @CliOption(key = {"hivePass"}, mandatory = true, unspecifiedDefaultValue = "",
String hiveUser, help = "hive password to connect to") final String hivePass)
@CliOption(key = {
"hivePass"}, mandatory = true, unspecifiedDefaultValue = "", help = "hive password to connect to") final
String hivePass)
throws Exception { throws Exception {
HoodieTableMetaClient target = HoodieCLI.syncTableMetadata; HoodieTableMetaClient target = HoodieCLI.syncTableMetadata;
HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsTimeline(); HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsTimeline();
@@ -77,8 +74,8 @@ public class HoodieSyncCommand implements CommandMarker {
String sourceLatestCommit = String sourceLatestCommit =
sourceTimeline.getInstants().iterator().hasNext() ? "0" : sourceTimeline.lastInstant().get().getTimestamp(); sourceTimeline.getInstants().iterator().hasNext() ? "0" : sourceTimeline.lastInstant().get().getTimestamp();
if (sourceLatestCommit != null && HoodieTimeline.compareTimestamps(targetLatestCommit, sourceLatestCommit, if (sourceLatestCommit != null
HoodieTimeline.GREATER)) { && HoodieTimeline.compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) {
// source is behind the target // source is behind the target
List<HoodieInstant> commitsToCatchup = targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE) List<HoodieInstant> commitsToCatchup = targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE)
.getInstants().collect(Collectors.toList()); .getInstants().collect(Collectors.toList());
@@ -89,8 +86,8 @@ public class HoodieSyncCommand implements CommandMarker {
long newInserts = CommitUtil.countNewRecords(target, long newInserts = CommitUtil.countNewRecords(target,
commitsToCatchup.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList())); commitsToCatchup.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList()));
return "Count difference now is (count(" + target.getTableConfig().getTableName() + ") - count(" return "Count difference now is (count(" + target.getTableConfig().getTableName() + ") - count("
+ source.getTableConfig().getTableName() + source.getTableConfig().getTableName() + ") == " + (targetCount - sourceCount) + ". Catch up count is "
+ ") == " + (targetCount - sourceCount) + ". Catch up count is " + newInserts; + newInserts;
} }
} else { } else {
List<HoodieInstant> commitsToCatchup = sourceTimeline.findInstantsAfter(targetLatestCommit, Integer.MAX_VALUE) List<HoodieInstant> commitsToCatchup = sourceTimeline.findInstantsAfter(targetLatestCommit, Integer.MAX_VALUE)
@@ -102,8 +99,8 @@ public class HoodieSyncCommand implements CommandMarker {
long newInserts = CommitUtil.countNewRecords(source, long newInserts = CommitUtil.countNewRecords(source,
commitsToCatchup.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList())); commitsToCatchup.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList()));
return "Count difference now is (count(" + source.getTableConfig().getTableName() + ") - count(" return "Count difference now is (count(" + source.getTableConfig().getTableName() + ") - count("
+ target.getTableConfig().getTableName() + target.getTableConfig().getTableName() + ") == " + (sourceCount - targetCount) + ". Catch up count is "
+ ") == " + (sourceCount - targetCount) + ". Catch up count is " + newInserts; + newInserts;
} }
} }
View File
@@ -47,16 +47,15 @@ public class RepairsCommand implements CommandMarker {
return HoodieCLI.tableMetadata != null; return HoodieCLI.tableMetadata != null;
} }
@CliCommand(value = "repair deduplicate", help = "De-duplicate a partition path contains duplicates & produce " @CliCommand(value = "repair deduplicate",
+ "repaired files to replace with") help = "De-duplicate a partition path contains duplicates & produce " + "repaired files to replace with")
public String deduplicate(@CliOption(key = { public String deduplicate(
"duplicatedPartitionPath"}, help = "Partition Path containing the duplicates", mandatory = true) final String @CliOption(key = {"duplicatedPartitionPath"}, help = "Partition Path containing the duplicates",
duplicatedPartitionPath, mandatory = true) final String duplicatedPartitionPath,
@CliOption(key = { @CliOption(key = {"repairedOutputPath"}, help = "Location to place the repaired files",
"repairedOutputPath"}, help = "Location to place the repaired files", mandatory = true) final String mandatory = true) final String repairedOutputPath,
repairedOutputPath, @CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path",
@CliOption(key = { mandatory = true) final String sparkPropertiesPath)
"sparkProperties"}, help = "Spark Properites File Path", mandatory = true) final String sparkPropertiesPath)
throws Exception { throws Exception {
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
sparkLauncher.addAppArgs(SparkMain.SparkCommand.DEDUPLICATE.toString(), duplicatedPartitionPath, repairedOutputPath, sparkLauncher.addAppArgs(SparkMain.SparkCommand.DEDUPLICATE.toString(), duplicatedPartitionPath, repairedOutputPath,
@@ -73,14 +72,15 @@ public class RepairsCommand implements CommandMarker {
@CliCommand(value = "repair addpartitionmeta", help = "Add partition metadata to a dataset, if not present") @CliCommand(value = "repair addpartitionmeta", help = "Add partition metadata to a dataset, if not present")
public String addPartitionMeta(@CliOption(key = { public String addPartitionMeta(
"dryrun"}, help = "Should we actually add or just print what would be done", unspecifiedDefaultValue = "true") @CliOption(key = {"dryrun"}, help = "Should we actually add or just print what would be done",
final boolean dryRun) throws IOException { unspecifiedDefaultValue = "true") final boolean dryRun)
throws IOException {
String latestCommit = HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline().lastInstant().get() String latestCommit =
.getTimestamp(); HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline().lastInstant().get().getTimestamp();
List<String> partitionPaths = FSUtils.getAllPartitionFoldersThreeLevelsDown(HoodieCLI.fs, List<String> partitionPaths =
HoodieCLI.tableMetadata.getBasePath()); FSUtils.getAllPartitionFoldersThreeLevelsDown(HoodieCLI.fs, HoodieCLI.tableMetadata.getBasePath());
Path basePath = new Path(HoodieCLI.tableMetadata.getBasePath()); Path basePath = new Path(HoodieCLI.tableMetadata.getBasePath());
String[][] rows = new String[partitionPaths.size() + 1][]; String[][] rows = new String[partitionPaths.size() + 1][];
@@ -94,8 +94,8 @@ public class RepairsCommand implements CommandMarker {
if (!HoodiePartitionMetadata.hasPartitionMetadata(HoodieCLI.fs, partitionPath)) { if (!HoodiePartitionMetadata.hasPartitionMetadata(HoodieCLI.fs, partitionPath)) {
row[1] = "No"; row[1] = "No";
if (!dryRun) { if (!dryRun) {
HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(HoodieCLI.fs, latestCommit, basePath, HoodiePartitionMetadata partitionMetadata =
partitionPath); new HoodiePartitionMetadata(HoodieCLI.fs, latestCommit, basePath, partitionPath);
partitionMetadata.trySave(0); partitionMetadata.trySave(0);
} }
} }
View File
@@ -50,8 +50,8 @@ public class RollbacksCommand implements CommandMarker {
@CliOption(key = {"limit"}, help = "Limit #rows to be displayed", unspecifiedDefaultValue = "10") Integer limit, @CliOption(key = {"limit"}, help = "Limit #rows to be displayed", unspecifiedDefaultValue = "10") Integer limit,
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
@CliOption(key = { @CliOption(key = {"headeronly"}, help = "Print Header Only",
"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly) unspecifiedDefaultValue = "false") final boolean headerOnly)
throws IOException { throws IOException {
HoodieActiveTimeline activeTimeline = new RollbackTimeline(HoodieCLI.tableMetadata); HoodieActiveTimeline activeTimeline = new RollbackTimeline(HoodieCLI.tableMetadata);
HoodieTimeline rollback = activeTimeline.getRollbackTimeline().filterCompletedInstants(); HoodieTimeline rollback = activeTimeline.getRollbackTimeline().filterCompletedInstants();
@@ -59,8 +59,8 @@ public class RollbacksCommand implements CommandMarker {
final List<Comparable[]> rows = new ArrayList<>(); final List<Comparable[]> rows = new ArrayList<>();
rollback.getInstants().forEach(instant -> { rollback.getInstants().forEach(instant -> {
try { try {
HoodieRollbackMetadata metadata = AvroUtils.deserializeAvroMetadata( HoodieRollbackMetadata metadata = AvroUtils
activeTimeline.getInstantDetails(instant).get(), HoodieRollbackMetadata.class); .deserializeAvroMetadata(activeTimeline.getInstantDetails(instant).get(), HoodieRollbackMetadata.class);
metadata.getCommitsRollback().forEach(c -> { metadata.getCommitsRollback().forEach(c -> {
Comparable[] row = new Comparable[5]; Comparable[] row = new Comparable[5];
row[0] = metadata.getStartRollbackTime(); row[0] = metadata.getStartRollbackTime();
@@ -74,11 +74,8 @@ public class RollbacksCommand implements CommandMarker {
e.printStackTrace(); e.printStackTrace();
} }
}); });
TableHeader header = new TableHeader() TableHeader header = new TableHeader().addTableHeaderField("Instant").addTableHeaderField("Rolledback Instant")
.addTableHeaderField("Instant") .addTableHeaderField("Total Files Deleted").addTableHeaderField("Time taken in millis")
.addTableHeaderField("Rolledback Instant")
.addTableHeaderField("Total Files Deleted")
.addTableHeaderField("Time taken in millis")
.addTableHeaderField("Total Partitions"); .addTableHeaderField("Total Partitions");
return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows); return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows);
} }
@@ -89,16 +86,17 @@ public class RollbacksCommand implements CommandMarker {
@CliOption(key = {"limit"}, help = "Limit #rows to be displayed", unspecifiedDefaultValue = "10") Integer limit, @CliOption(key = {"limit"}, help = "Limit #rows to be displayed", unspecifiedDefaultValue = "10") Integer limit,
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
@CliOption(key = { @CliOption(key = {"headeronly"}, help = "Print Header Only",
"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly) unspecifiedDefaultValue = "false") final boolean headerOnly)
throws IOException { throws IOException {
HoodieActiveTimeline activeTimeline = new RollbackTimeline(HoodieCLI.tableMetadata); HoodieActiveTimeline activeTimeline = new RollbackTimeline(HoodieCLI.tableMetadata);
final List<Comparable[]> rows = new ArrayList<>(); final List<Comparable[]> rows = new ArrayList<>();
HoodieRollbackMetadata metadata = AvroUtils.deserializeAvroMetadata( HoodieRollbackMetadata metadata = AvroUtils.deserializeAvroMetadata(
activeTimeline.getInstantDetails(new HoodieInstant(State.COMPLETED, ROLLBACK_ACTION, rollbackInstant)) activeTimeline.getInstantDetails(new HoodieInstant(State.COMPLETED, ROLLBACK_ACTION, rollbackInstant)).get(),
.get(), HoodieRollbackMetadata.class); HoodieRollbackMetadata.class);
metadata.getPartitionMetadata().entrySet().forEach(e -> { metadata.getPartitionMetadata().entrySet().forEach(e -> {
Stream.concat(e.getValue().getSuccessDeleteFiles().stream().map(f -> Pair.of(f, true)), Stream
.concat(e.getValue().getSuccessDeleteFiles().stream().map(f -> Pair.of(f, true)),
e.getValue().getFailedDeleteFiles().stream().map(f -> Pair.of(f, false))) e.getValue().getFailedDeleteFiles().stream().map(f -> Pair.of(f, false)))
.forEach(fileWithDeleteStatus -> { .forEach(fileWithDeleteStatus -> {
Comparable[] row = new Comparable[5]; Comparable[] row = new Comparable[5];
@@ -111,12 +109,8 @@ public class RollbacksCommand implements CommandMarker {
}); });
}); });
TableHeader header = new TableHeader() TableHeader header = new TableHeader().addTableHeaderField("Instant").addTableHeaderField("Rolledback Instants")
.addTableHeaderField("Instant") .addTableHeaderField("Partition").addTableHeaderField("Deleted File").addTableHeaderField("Succeeded");
.addTableHeaderField("Rolledback Instants")
.addTableHeaderField("Partition")
.addTableHeaderField("Deleted File")
.addTableHeaderField("Succeeded");
return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows); return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows);
} }
View File
@@ -62,8 +62,8 @@ public class SavepointsCommand implements CommandMarker {
@CliAvailabilityIndicator({"savepoint rollback"}) @CliAvailabilityIndicator({"savepoint rollback"})
public boolean isRollbackToSavepointAvailable() { public boolean isRollbackToSavepointAvailable() {
return HoodieCLI.tableMetadata != null && !HoodieCLI.tableMetadata.getActiveTimeline().getSavePointTimeline() return HoodieCLI.tableMetadata != null
.filterCompletedInstants().empty(); && !HoodieCLI.tableMetadata.getActiveTimeline().getSavePointTimeline().filterCompletedInstants().empty();
} }
@CliCommand(value = "savepoints show", help = "Show the savepoints") @CliCommand(value = "savepoints show", help = "Show the savepoints")
@@ -137,8 +137,8 @@ public class SavepointsCommand implements CommandMarker {
} }
private static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath) throws Exception { private static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath) throws Exception {
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withIndexConfig( HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath)
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build(); .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build();
return new HoodieWriteClient(jsc, config, false); return new HoodieWriteClient(jsc, config, false);
} }
View File
@@ -43,8 +43,7 @@ public class SparkMain {
* Commands
*/
enum SparkCommand {
- ROLLBACK, DEDUPLICATE, ROLLBACK_TO_SAVEPOINT, SAVEPOINT, IMPORT, UPSERT, COMPACT_SCHEDULE, COMPACT_RUN,
- COMPACT_UNSCHEDULE_PLAN, COMPACT_UNSCHEDULE_FILE, COMPACT_VALIDATE, COMPACT_REPAIR
+ ROLLBACK, DEDUPLICATE, ROLLBACK_TO_SAVEPOINT, SAVEPOINT, IMPORT, UPSERT, COMPACT_SCHEDULE, COMPACT_RUN, COMPACT_UNSCHEDULE_PLAN, COMPACT_UNSCHEDULE_FILE, COMPACT_VALIDATE, COMPACT_REPAIR
}
public static void main(String[] args) throws Exception {
@@ -76,13 +75,12 @@ public class SparkMain {
break;
case COMPACT_RUN:
assert (args.length == 8);
- returnCode = compact(jsc, args[1], args[2], args[3], Integer.parseInt(args[4]),
- args[5], args[6], Integer.parseInt(args[7]), false);
+ returnCode = compact(jsc, args[1], args[2], args[3], Integer.parseInt(args[4]), args[5], args[6],
+ Integer.parseInt(args[7]), false);
break;
case COMPACT_SCHEDULE:
assert (args.length == 5);
- returnCode = compact(jsc, args[1], args[2], args[3], 1,
- "", args[4], 0, true);
+ returnCode = compact(jsc, args[1], args[2], args[3], 1, "", args[4], 0, true);
break;
case COMPACT_VALIDATE:
assert (args.length == 7);
@@ -113,8 +111,7 @@ public class SparkMain {
System.exit(returnCode);
}
- private static int dataLoad(JavaSparkContext jsc, String command,
- String srcPath, String targetPath, String tableName,
+ private static int dataLoad(JavaSparkContext jsc, String command, String srcPath, String targetPath, String tableName,
String tableType, String rowKey, String partitionKey, int parallelism, String schemaFile, String sparkMaster,
String sparkMemory, int retry) throws Exception {
Config cfg = new Config();
@@ -180,9 +177,9 @@ public class SparkMain {
new HoodieCompactionAdminTool(cfg).run(jsc);
}
- private static void doCompactUnscheduleFile(JavaSparkContext jsc, String basePath, String fileId,
- String outputPath, int parallelism, String sparkMaster, String sparkMemory, boolean skipValidation,
- boolean dryRun) throws Exception {
+ private static void doCompactUnscheduleFile(JavaSparkContext jsc, String basePath, String fileId, String outputPath,
+ int parallelism, String sparkMaster, String sparkMemory, boolean skipValidation, boolean dryRun)
+ throws Exception {
HoodieCompactionAdminTool.Config cfg = new HoodieCompactionAdminTool.Config();
cfg.basePath = basePath;
cfg.operation = Operation.UNSCHEDULE_FILE;
@@ -244,8 +241,8 @@ public class SparkMain {
}
private static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath) throws Exception {
- HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withIndexConfig(
- HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build();
+ HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath)
+ .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build();
return new HoodieWriteClient(jsc, config);
}
}

View File

@@ -63,8 +63,9 @@ public class StatsCommand implements CommandMarker {
@CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
- @CliOption(key = {"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false")
- final boolean headerOnly) throws IOException {
+ @CliOption(key = {"headeronly"}, help = "Print Header Only",
+ unspecifiedDefaultValue = "false") final boolean headerOnly)
+ throws IOException {
long totalRecordsUpserted = 0;
long totalRecordsWritten = 0;
@@ -93,31 +94,26 @@ public class StatsCommand implements CommandMarker {
}
rows.add(new Comparable[] {"Total", totalRecordsUpserted, totalRecordsWritten, waf});
- TableHeader header = new TableHeader()
- .addTableHeaderField("CommitTime")
- .addTableHeaderField("Total Upserted")
- .addTableHeaderField("Total Written")
- .addTableHeaderField("Write Amplifiation Factor");
+ TableHeader header = new TableHeader().addTableHeaderField("CommitTime").addTableHeaderField("Total Upserted")
+ .addTableHeaderField("Total Written").addTableHeaderField("Write Amplifiation Factor");
return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows);
}
private Comparable[] printFileSizeHistogram(String commitTime, Snapshot s) {
- return new Comparable[]{commitTime, s.getMin(),
- s.getValue(0.1), s.getMedian(),
- s.getMean(), s.get95thPercentile(),
- s.getMax(), s.size(),
- s.getStdDev()};
+ return new Comparable[] {commitTime, s.getMin(), s.getValue(0.1), s.getMedian(), s.getMean(), s.get95thPercentile(),
+ s.getMax(), s.size(), s.getStdDev()};
}
@CliCommand(value = "stats filesizes", help = "File Sizes. Display summary stats on sizes of files")
public String fileSizeStats(
- @CliOption(key = {"partitionPath"},
- help = "regex to select files, eg: 2016/08/02", unspecifiedDefaultValue = "*/*/*") final String globRegex,
+ @CliOption(key = {"partitionPath"}, help = "regex to select files, eg: 2016/08/02",
+ unspecifiedDefaultValue = "*/*/*") final String globRegex,
@CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
- @CliOption(key = {"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false")
- final boolean headerOnly) throws IOException {
+ @CliOption(key = {"headeronly"}, help = "Print Header Only",
+ unspecifiedDefaultValue = "false") final boolean headerOnly)
+ throws IOException {
FileSystem fs = HoodieCLI.fs;
String globPath = String.format("%s/%s/*", HoodieCLI.tableMetadata.getBasePath(), globRegex);
@@ -145,8 +141,8 @@ public class StatsCommand implements CommandMarker {
Snapshot s = globalHistogram.getSnapshot();
rows.add(printFileSizeHistogram("ALL", s));
- Function<Object, String> converterFunction = entry ->
- NumericUtils.humanReadableByteCount((Double.valueOf(entry.toString())));
+ Function<Object, String> converterFunction =
+ entry -> NumericUtils.humanReadableByteCount((Double.valueOf(entry.toString())));
Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();
fieldNameToConverterMap.put("Min", converterFunction);
fieldNameToConverterMap.put("10th", converterFunction);
@@ -156,16 +152,9 @@ public class StatsCommand implements CommandMarker {
fieldNameToConverterMap.put("Max", converterFunction);
fieldNameToConverterMap.put("StdDev", converterFunction);
- TableHeader header = new TableHeader()
- .addTableHeaderField("CommitTime")
- .addTableHeaderField("Min")
- .addTableHeaderField("10th")
- .addTableHeaderField("50th")
- .addTableHeaderField("avg")
- .addTableHeaderField("95th")
- .addTableHeaderField("Max")
- .addTableHeaderField("NumFiles")
- .addTableHeaderField("StdDev");
+ TableHeader header = new TableHeader().addTableHeaderField("CommitTime").addTableHeaderField("Min")
+ .addTableHeaderField("10th").addTableHeaderField("50th").addTableHeaderField("avg").addTableHeaderField("95th")
+ .addTableHeaderField("Max").addTableHeaderField("NumFiles").addTableHeaderField("StdDev");
return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows);
}
}

View File

@@ -52,8 +52,7 @@ public class HiveUtil {
stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat");
stmt.execute("set hive.stats.autogather=false");
rs = stmt.executeQuery(
- "select count(`_hoodie_commit_time`) as cnt from " + dbName + "."
- + source.getTableConfig().getTableName());
+ "select count(`_hoodie_commit_time`) as cnt from " + dbName + "." + source.getTableConfig().getTableName());
long count = -1;
if (rs.next()) {
count = rs.getLong("cnt");

View File

@@ -40,8 +40,8 @@ public class SparkUtil {
public static SparkLauncher initLauncher(String propertiesFile) throws URISyntaxException {
String currentJar = new File(SparkUtil.class.getProtectionDomain().getCodeSource().getLocation().toURI().getPath())
.getAbsolutePath();
- SparkLauncher sparkLauncher = new SparkLauncher().setAppResource(currentJar)
- .setMainClass(SparkMain.class.getName());
+ SparkLauncher sparkLauncher =
+ new SparkLauncher().setAppResource(currentJar).setMainClass(SparkMain.class.getName());
if (!StringUtils.isNullOrEmpty(propertiesFile)) {
sparkLauncher.setPropertiesFile(propertiesFile);
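
For orientation, a rough sketch of how the launcher returned by initLauncher might be driven (not part of this commit; the master and memory settings are assumptions, and the import of the Hudi CLI SparkUtil class is omitted):

    import org.apache.spark.launcher.SparkLauncher;

    public class SparkLauncherSketch {
      public static void main(String[] args) throws Exception {
        // SparkUtil.initLauncher resolves the current CLI jar and sets SparkMain as the main class.
        SparkLauncher launcher = SparkUtil.initLauncher(null);
        launcher.setMaster("local[2]"); // assumed master for a local test
        launcher.setConf(SparkLauncher.DRIVER_MEMORY, "1g"); // assumed driver memory
        Process sparkProcess = launcher.launch();
        sparkProcess.waitFor();
      }
    }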

View File

@@ -26,6 +26,10 @@
<artifactId>hudi-client</artifactId>
<packaging>jar</packaging>
+ <properties>
+ <main.basedir>${project.parent.basedir}</main.basedir>
+ </properties>
<build>
<plugins>
<plugin>

View File

@@ -32,8 +32,8 @@ import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaSparkContext;
/**
- * Abstract class taking care of holding common member variables (FileSystem, SparkContext, HoodieConfigs)
- * Also, manages embedded timeline-server if enabled.
+ * Abstract class taking care of holding common member variables (FileSystem, SparkContext, HoodieConfigs) Also, manages
+ * embedded timeline-server if enabled.
*/
public abstract class AbstractHoodieClient implements Serializable, AutoCloseable {
@@ -45,10 +45,9 @@ public abstract class AbstractHoodieClient implements Serializable, AutoCloseabl
protected final String basePath;
/**
- * Timeline Server has the same lifetime as that of Client.
- * Any operations done on the same timeline service will be able to take advantage
- * of the cached file-system view. New completed actions will be synced automatically
- * in an incremental fashion.
+ * Timeline Server has the same lifetime as that of Client. Any operations done on the same timeline service will be
+ * able to take advantage of the cached file-system view. New completed actions will be synced automatically in an
+ * incremental fashion.
*/
private transient Option<EmbeddedTimelineService> timelineServer;
private final boolean shouldStopTimelineServer;

View File

@@ -69,8 +69,7 @@ public class CompactionAdminClient extends AbstractHoodieClient {
super(jsc, HoodieWriteConfig.newBuilder().withPath(basePath).build());
}
- public CompactionAdminClient(JavaSparkContext jsc, String basePath,
- Option<EmbeddedTimelineService> timelineServer) {
+ public CompactionAdminClient(JavaSparkContext jsc, String basePath, Option<EmbeddedTimelineService> timelineServer) {
super(jsc, HoodieWriteConfig.newBuilder().withPath(basePath).build(), timelineServer);
}
@@ -81,8 +80,8 @@ public class CompactionAdminClient extends AbstractHoodieClient {
* @param metaClient Hoodie Table Meta Client
* @param compactionInstant Compaction Instant
*/
- public List<ValidationOpResult> validateCompactionPlan(HoodieTableMetaClient metaClient,
- String compactionInstant, int parallelism) throws IOException {
+ public List<ValidationOpResult> validateCompactionPlan(HoodieTableMetaClient metaClient, String compactionInstant,
+ int parallelism) throws IOException {
HoodieCompactionPlan plan = getCompactionPlan(metaClient, compactionInstant);
HoodieTableFileSystemView fsView =
new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
@@ -112,15 +111,13 @@ public class CompactionAdminClient extends AbstractHoodieClient {
* @param parallelism Parallelism
* @param dryRun Dry Run
*/
- public List<RenameOpResult> unscheduleCompactionPlan(
- String compactionInstant, boolean skipValidation, int parallelism, boolean dryRun) throws Exception {
+ public List<RenameOpResult> unscheduleCompactionPlan(String compactionInstant, boolean skipValidation,
+ int parallelism, boolean dryRun) throws Exception {
HoodieTableMetaClient metaClient = createMetaClient(false);
- List<Pair<HoodieLogFile, HoodieLogFile>> renameActions =
- getRenamingActionsForUnschedulingCompactionPlan(metaClient, compactionInstant, parallelism,
- Option.empty(), skipValidation);
- List<RenameOpResult> res =
- runRenamingOps(metaClient, renameActions, parallelism, dryRun);
+ List<Pair<HoodieLogFile, HoodieLogFile>> renameActions = getRenamingActionsForUnschedulingCompactionPlan(metaClient,
+ compactionInstant, parallelism, Option.empty(), skipValidation);
+ List<RenameOpResult> res = runRenamingOps(metaClient, renameActions, parallelism, dryRun);
Option<Boolean> success =
Option.fromJavaOptional(res.stream().map(r -> (r.isExecuted() && r.isSuccess())).reduce(Boolean::logicalAnd));
@@ -145,8 +142,8 @@ public class CompactionAdminClient extends AbstractHoodieClient {
}
/**
- * Remove a fileId from pending compaction. Removes the associated compaction operation and rename delta-files
- * that were generated for that file-id after the compaction operation was scheduled.
+ * Remove a fileId from pending compaction. Removes the associated compaction operation and rename delta-files that
+ * were generated for that file-id after the compaction operation was scheduled.
*
* This operation MUST be executed with compactions and writer turned OFF.
*
@@ -154,12 +151,11 @@ public class CompactionAdminClient extends AbstractHoodieClient {
* @param skipValidation Skip validation
* @param dryRun Dry Run Mode
*/
- public List<RenameOpResult> unscheduleCompactionFileId(HoodieFileGroupId fgId,
- boolean skipValidation, boolean dryRun) throws Exception {
+ public List<RenameOpResult> unscheduleCompactionFileId(HoodieFileGroupId fgId, boolean skipValidation, boolean dryRun)
+ throws Exception {
HoodieTableMetaClient metaClient = createMetaClient(false);
List<Pair<HoodieLogFile, HoodieLogFile>> renameActions =
- getRenamingActionsForUnschedulingCompactionForFileId(metaClient, fgId,
- Option.empty(), skipValidation);
+ getRenamingActionsForUnschedulingCompactionForFileId(metaClient, fgId, Option.empty(), skipValidation);
List<RenameOpResult> res = runRenamingOps(metaClient, renameActions, 1, dryRun);
@@ -167,15 +163,15 @@ public class CompactionAdminClient extends AbstractHoodieClient {
// Ready to remove this file-Id from compaction request
Pair<String, HoodieCompactionOperation> compactionOperationWithInstant =
CompactionUtils.getAllPendingCompactionOperations(metaClient).get(fgId);
- HoodieCompactionPlan plan = CompactionUtils
- .getCompactionPlan(metaClient, compactionOperationWithInstant.getKey());
+ HoodieCompactionPlan plan =
+ CompactionUtils.getCompactionPlan(metaClient, compactionOperationWithInstant.getKey());
- List<HoodieCompactionOperation> newOps = plan.getOperations().stream()
- .filter(op -> (!op.getFileId().equals(fgId.getFileId()))
- && (!op.getPartitionPath().equals(fgId.getPartitionPath()))).collect(Collectors.toList());
+ List<HoodieCompactionOperation> newOps = plan.getOperations().stream().filter(
+ op -> (!op.getFileId().equals(fgId.getFileId())) && (!op.getPartitionPath().equals(fgId.getPartitionPath())))
+ .collect(Collectors.toList());
HoodieCompactionPlan newPlan =
HoodieCompactionPlan.newBuilder().setOperations(newOps).setExtraMetadata(plan.getExtraMetadata()).build();
- HoodieInstant inflight = new HoodieInstant(State.INFLIGHT, COMPACTION_ACTION,
- compactionOperationWithInstant.getLeft());
+ HoodieInstant inflight =
+ new HoodieInstant(State.INFLIGHT, COMPACTION_ACTION, compactionOperationWithInstant.getLeft());
Path inflightPath = new Path(metaClient.getMetaPath(), inflight.getFileName());
if (metaClient.getFs().exists(inflightPath)) {
// revert if in inflight state
@@ -189,28 +185,28 @@ public class CompactionAdminClient extends AbstractHoodieClient {
}
/**
- * Renames delta files to make file-slices consistent with the timeline as dictated by Hoodie metadata.
- * Use when compaction unschedule fails partially.
+ * Renames delta files to make file-slices consistent with the timeline as dictated by Hoodie metadata. Use when
+ * compaction unschedule fails partially.
*
* This operation MUST be executed with compactions and writer turned OFF.
*
* @param compactionInstant Compaction Instant to be repaired
* @param dryRun Dry Run Mode
*/
- public List<RenameOpResult> repairCompaction(String compactionInstant,
- int parallelism, boolean dryRun) throws Exception {
+ public List<RenameOpResult> repairCompaction(String compactionInstant, int parallelism, boolean dryRun)
+ throws Exception {
HoodieTableMetaClient metaClient = createMetaClient(false);
- List<ValidationOpResult> validationResults =
- validateCompactionPlan(metaClient, compactionInstant, parallelism);
- List<ValidationOpResult> failed = validationResults.stream()
- .filter(v -> !v.isSuccess()).collect(Collectors.toList());
+ List<ValidationOpResult> validationResults = validateCompactionPlan(metaClient, compactionInstant, parallelism);
+ List<ValidationOpResult> failed =
+ validationResults.stream().filter(v -> !v.isSuccess()).collect(Collectors.toList());
if (failed.isEmpty()) {
return new ArrayList<>();
}
- final HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient,
- metaClient.getCommitsAndCompactionTimeline());
+ final HoodieTableFileSystemView fsView =
+ new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
- List<Pair<HoodieLogFile, HoodieLogFile>> renameActions = failed.stream().flatMap(v ->
- getRenamingActionsToAlignWithCompactionOperation(metaClient, compactionInstant,
+ List<Pair<HoodieLogFile, HoodieLogFile>> renameActions =
+ failed.stream().flatMap(v -> getRenamingActionsToAlignWithCompactionOperation(metaClient, compactionInstant,
v.getOperation(), Option.of(fsView)).stream()).collect(Collectors.toList());
return runRenamingOps(metaClient, renameActions, parallelism, dryRun);
}
@@ -218,11 +214,10 @@ public class CompactionAdminClient extends AbstractHoodieClient {
/**
* Construction Compaction Plan from compaction instant
*/
- private static HoodieCompactionPlan getCompactionPlan(HoodieTableMetaClient metaClient,
- String compactionInstant) throws IOException {
+ private static HoodieCompactionPlan getCompactionPlan(HoodieTableMetaClient metaClient, String compactionInstant)
+ throws IOException {
- HoodieCompactionPlan compactionPlan = AvroUtils.deserializeCompactionPlan(
- metaClient.getActiveTimeline().getInstantAuxiliaryDetails(
- HoodieTimeline.getCompactionRequestedInstant(compactionInstant)).get());
+ HoodieCompactionPlan compactionPlan = AvroUtils.deserializeCompactionPlan(metaClient.getActiveTimeline()
+ .getInstantAuxiliaryDetails(HoodieTimeline.getCompactionRequestedInstant(compactionInstant)).get());
return compactionPlan;
}
@@ -238,20 +233,18 @@ public class CompactionAdminClient extends AbstractHoodieClient {
protected static List<Pair<HoodieLogFile, HoodieLogFile>> getRenamingActionsToAlignWithCompactionOperation(
HoodieTableMetaClient metaClient, String compactionInstant, CompactionOperation op,
Option<HoodieTableFileSystemView> fsViewOpt) {
- HoodieTableFileSystemView fileSystemView = fsViewOpt.isPresent() ? fsViewOpt.get() :
- new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
+ HoodieTableFileSystemView fileSystemView = fsViewOpt.isPresent() ? fsViewOpt.get()
+ : new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
HoodieInstant lastInstant = metaClient.getCommitsAndCompactionTimeline().lastInstant().get();
FileSlice merged =
fileSystemView.getLatestMergedFileSlicesBeforeOrOn(op.getPartitionPath(), lastInstant.getTimestamp())
.filter(fs -> fs.getFileId().equals(op.getFileId())).findFirst().get();
- final int maxVersion =
- op.getDeltaFilePaths().stream().map(lf -> FSUtils.getFileVersionFromLog(new Path(lf)))
+ final int maxVersion = op.getDeltaFilePaths().stream().map(lf -> FSUtils.getFileVersionFromLog(new Path(lf)))
.reduce((x, y) -> x > y ? x : y).orElse(0);
List<HoodieLogFile> logFilesToBeMoved =
merged.getLogFiles().filter(lf -> lf.getLogVersion() > maxVersion).collect(Collectors.toList());
return logFilesToBeMoved.stream().map(lf -> {
- Preconditions.checkArgument(lf.getLogVersion() - maxVersion > 0,
- "Expect new log version to be sane");
+ Preconditions.checkArgument(lf.getLogVersion() - maxVersion > 0, "Expect new log version to be sane");
HoodieLogFile newLogFile = new HoodieLogFile(new Path(lf.getPath().getParent(),
FSUtils.makeLogFileName(lf.getFileId(), "." + FSUtils.getFileExtensionFromLog(lf.getPath()),
compactionInstant, lf.getLogVersion() - maxVersion, HoodieLogFormat.UNKNOWN_WRITE_TOKEN)));
@@ -285,11 +278,10 @@ public class CompactionAdminClient extends AbstractHoodieClient {
* @param operation Compaction Operation
* @param fsViewOpt File System View
*/
- private ValidationOpResult validateCompactionOperation(HoodieTableMetaClient metaClient,
- String compactionInstant, CompactionOperation operation, Option<HoodieTableFileSystemView> fsViewOpt)
- throws IOException {
- HoodieTableFileSystemView fileSystemView = fsViewOpt.isPresent() ? fsViewOpt.get() :
- new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
+ private ValidationOpResult validateCompactionOperation(HoodieTableMetaClient metaClient, String compactionInstant,
+ CompactionOperation operation, Option<HoodieTableFileSystemView> fsViewOpt) throws IOException {
+ HoodieTableFileSystemView fileSystemView = fsViewOpt.isPresent() ? fsViewOpt.get()
+ : new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
Option<HoodieInstant> lastInstant = metaClient.getCommitsAndCompactionTimeline().lastInstant();
try {
if (lastInstant.isPresent()) {
@@ -300,16 +292,15 @@ public class CompactionAdminClient extends AbstractHoodieClient {
FileSlice fs = fileSliceOptional.get();
Option<HoodieDataFile> df = fs.getDataFile();
if (operation.getDataFilePath().isPresent()) {
- String expPath = metaClient.getFs().getFileStatus(new Path(operation.getDataFilePath().get())).getPath()
- .toString();
+ String expPath =
+ metaClient.getFs().getFileStatus(new Path(operation.getDataFilePath().get())).getPath().toString();
- Preconditions.checkArgument(df.isPresent(), "Data File must be present. File Slice was : "
- + fs + ", operation :" + operation);
+ Preconditions.checkArgument(df.isPresent(),
+ "Data File must be present. File Slice was : " + fs + ", operation :" + operation);
Preconditions.checkArgument(df.get().getPath().equals(expPath),
"Base Path in operation is specified as " + expPath + " but got path " + df.get().getPath());
}
Set<HoodieLogFile> logFilesInFileSlice = fs.getLogFiles().collect(Collectors.toSet());
- Set<HoodieLogFile> logFilesInCompactionOp = operation.getDeltaFilePaths().stream()
- .map(dp -> {
+ Set<HoodieLogFile> logFilesInCompactionOp = operation.getDeltaFilePaths().stream().map(dp -> {
try {
FileStatus[] fileStatuses = metaClient.getFs().listStatus(new Path(dp));
Preconditions.checkArgument(fileStatuses.length == 1, "Expect only 1 file-status");
@@ -320,25 +311,23 @@ public class CompactionAdminClient extends AbstractHoodieClient {
throw new HoodieIOException(ioe.getMessage(), ioe);
}
}).collect(Collectors.toSet());
- Set<HoodieLogFile> missing =
- logFilesInCompactionOp.stream().filter(lf -> !logFilesInFileSlice.contains(lf))
+ Set<HoodieLogFile> missing = logFilesInCompactionOp.stream().filter(lf -> !logFilesInFileSlice.contains(lf))
.collect(Collectors.toSet());
Preconditions.checkArgument(missing.isEmpty(),
- "All log files specified in compaction operation is not present. Missing :" + missing
- + ", Exp :" + logFilesInCompactionOp + ", Got :" + logFilesInFileSlice);
+ "All log files specified in compaction operation is not present. Missing :" + missing + ", Exp :"
+ + logFilesInCompactionOp + ", Got :" + logFilesInFileSlice);
- Set<HoodieLogFile> diff =
- logFilesInFileSlice.stream().filter(lf -> !logFilesInCompactionOp.contains(lf))
+ Set<HoodieLogFile> diff = logFilesInFileSlice.stream().filter(lf -> !logFilesInCompactionOp.contains(lf))
.collect(Collectors.toSet());
Preconditions.checkArgument(diff.stream().allMatch(lf -> lf.getBaseCommitTime().equals(compactionInstant)),
"There are some log-files which are neither specified in compaction plan "
+ "nor present after compaction request instant. Some of these :" + diff);
} else {
- throw new CompactionValidationException("Unable to find file-slice for file-id (" + operation.getFileId()
- + " Compaction operation is invalid.");
+ throw new CompactionValidationException(
+ "Unable to find file-slice for file-id (" + operation.getFileId() + " Compaction operation is invalid.");
}
} else {
- throw new CompactionValidationException("Unable to find any committed instant. Compaction Operation may "
- + "be pointing to stale file-slices");
+ throw new CompactionValidationException(
+ "Unable to find any committed instant. Compaction Operation may " + "be pointing to stale file-slices");
}
} catch (CompactionValidationException | IllegalArgumentException e) {
return new ValidationOpResult(operation, false, Option.of(e));
@@ -374,8 +363,7 @@ public class CompactionAdminClient extends AbstractHoodieClient {
}).collect();
} else {
log.info("Dry-Run Mode activated for rename operations");
- return renameActions.parallelStream()
- .map(lfPair -> new RenameOpResult(lfPair, false, false, Option.empty()))
+ return renameActions.parallelStream().map(lfPair -> new RenameOpResult(lfPair, false, false, Option.empty()))
.collect(Collectors.toList());
}
}
@@ -395,18 +383,18 @@ public class CompactionAdminClient extends AbstractHoodieClient {
protected List<Pair<HoodieLogFile, HoodieLogFile>> getRenamingActionsForUnschedulingCompactionPlan(
HoodieTableMetaClient metaClient, String compactionInstant, int parallelism,
Option<HoodieTableFileSystemView> fsViewOpt, boolean skipValidation) throws IOException {
- HoodieTableFileSystemView fsView = fsViewOpt.isPresent() ? fsViewOpt.get() :
- new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
+ HoodieTableFileSystemView fsView = fsViewOpt.isPresent() ? fsViewOpt.get()
+ : new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
HoodieCompactionPlan plan = getCompactionPlan(metaClient, compactionInstant);
if (plan.getOperations() != null) {
- log.info("Number of Compaction Operations :" + plan.getOperations().size()
- + " for instant :" + compactionInstant);
+ log.info(
+ "Number of Compaction Operations :" + plan.getOperations().size() + " for instant :" + compactionInstant);
List<CompactionOperation> ops = plan.getOperations().stream()
.map(CompactionOperation::convertFromAvroRecordInstance).collect(Collectors.toList());
return jsc.parallelize(ops, parallelism).flatMap(op -> {
try {
- return getRenamingActionsForUnschedulingCompactionOperation(metaClient, compactionInstant,
- op, Option.of(fsView), skipValidation).iterator();
+ return getRenamingActionsForUnschedulingCompactionOperation(metaClient, compactionInstant, op,
+ Option.of(fsView), skipValidation).iterator();
} catch (IOException ioe) {
throw new HoodieIOException(ioe.getMessage(), ioe);
} catch (CompactionValidationException ve) {
@@ -434,8 +422,8 @@ public class CompactionAdminClient extends AbstractHoodieClient {
HoodieTableMetaClient metaClient, String compactionInstant, CompactionOperation operation,
Option<HoodieTableFileSystemView> fsViewOpt, boolean skipValidation) throws IOException {
List<Pair<HoodieLogFile, HoodieLogFile>> result = new ArrayList<>();
- HoodieTableFileSystemView fileSystemView = fsViewOpt.isPresent() ? fsViewOpt.get() :
- new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
+ HoodieTableFileSystemView fileSystemView = fsViewOpt.isPresent() ? fsViewOpt.get()
+ : new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
if (!skipValidation) {
validateCompactionOperation(metaClient, compactionInstant, operation, Option.of(fileSystemView));
}
@@ -445,13 +433,11 @@ public class CompactionAdminClient extends AbstractHoodieClient {
.filter(fs -> fs.getFileId().equals(operation.getFileId())).findFirst().get();
List<HoodieLogFile> logFilesToRepair =
merged.getLogFiles().filter(lf -> lf.getBaseCommitTime().equals(compactionInstant))
- .sorted(HoodieLogFile.getLogFileComparator())
- .collect(Collectors.toList());
+ .sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList());
FileSlice fileSliceForCompaction =
fileSystemView.getLatestFileSlicesBeforeOrOn(operation.getPartitionPath(), operation.getBaseInstantTime(), true)
.filter(fs -> fs.getFileId().equals(operation.getFileId())).findFirst().get();
- int maxUsedVersion =
- fileSliceForCompaction.getLogFiles().findFirst().map(HoodieLogFile::getLogVersion)
+ int maxUsedVersion = fileSliceForCompaction.getLogFiles().findFirst().map(HoodieLogFile::getLogVersion)
.orElse(HoodieLogFile.LOGFILE_BASE_VERSION - 1);
String logExtn = fileSliceForCompaction.getLogFiles().findFirst().map(lf -> "." + lf.getFileExtension())
.orElse(HoodieLogFile.DELTA_EXTENSION);
@@ -479,8 +465,8 @@ public class CompactionAdminClient extends AbstractHoodieClient {
* compaction.
*/
public List<Pair<HoodieLogFile, HoodieLogFile>> getRenamingActionsForUnschedulingCompactionForFileId(
- HoodieTableMetaClient metaClient, HoodieFileGroupId fgId,
- Option<HoodieTableFileSystemView> fsViewOpt, boolean skipValidation) throws IOException {
+ HoodieTableMetaClient metaClient, HoodieFileGroupId fgId, Option<HoodieTableFileSystemView> fsViewOpt,
+ boolean skipValidation) throws IOException {
Map<HoodieFileGroupId, Pair<String, HoodieCompactionOperation>> allPendingCompactions =
CompactionUtils.getAllPendingCompactionOperations(metaClient);
if (allPendingCompactions.containsKey(fgId)) {
@@ -496,20 +482,19 @@ public class CompactionAdminClient extends AbstractHoodieClient {
*/
public static class RenameOpResult extends OperationResult<RenameInfo> {
- public RenameOpResult() {
- }
- public RenameOpResult(Pair<HoodieLogFile, HoodieLogFile> op, boolean success,
- Option<Exception> exception) {
- super(new RenameInfo(op.getKey().getFileId(), op.getKey().getPath().toString(),
- op.getRight().getPath().toString()), success, exception);
- }
- public RenameOpResult(
- Pair<HoodieLogFile, HoodieLogFile> op, boolean executed, boolean success,
- Option<Exception> exception) {
- super(new RenameInfo(op.getKey().getFileId(), op.getKey().getPath().toString(),
- op.getRight().getPath().toString()), executed, success, exception);
- }
+ public RenameOpResult() {}
+ public RenameOpResult(Pair<HoodieLogFile, HoodieLogFile> op, boolean success, Option<Exception> exception) {
+ super(
+ new RenameInfo(op.getKey().getFileId(), op.getKey().getPath().toString(), op.getRight().getPath().toString()),
+ success, exception);
+ }
+ public RenameOpResult(Pair<HoodieLogFile, HoodieLogFile> op, boolean executed, boolean success,
+ Option<Exception> exception) {
+ super(
+ new RenameInfo(op.getKey().getFileId(), op.getKey().getPath().toString(), op.getRight().getPath().toString()),
+ executed, success, exception);
+ }
}
@@ -518,11 +503,9 @@ public class CompactionAdminClient extends AbstractHoodieClient {
*/
public static class ValidationOpResult extends OperationResult<CompactionOperation> {
- public ValidationOpResult() {
- }
+ public ValidationOpResult() {}
- public ValidationOpResult(
- CompactionOperation operation, boolean success, Option<Exception> exception) {
+ public ValidationOpResult(CompactionOperation operation, boolean success, Option<Exception> exception) {
super(operation, success, exception);
}
}
@@ -533,8 +516,7 @@ public class CompactionAdminClient extends AbstractHoodieClient {
public String srcPath;
public String destPath;
- public RenameInfo() {
- }
+ public RenameInfo() {}
public RenameInfo(String fileId, String srcPath, String destPath) {
this.fileId = fileId;
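
A hypothetical dry-run walk-through of the admin operations reformatted above (not part of this commit; the Spark master, base path, instant time, and parallelism are made up, and imports are omitted):

    // Sketch only: exercises the unschedule/repair signatures shown in the hunks above.
    static void dryRunCompactionAdmin() throws Exception {
      JavaSparkContext jsc = new JavaSparkContext("local[2]", "compaction-admin-sketch"); // assumed local master
      CompactionAdminClient admin = new CompactionAdminClient(jsc, "/tmp/hoodie/sample-dataset"); // made-up base path
      // Dry-run (last argument) an unschedule of a pending compaction and print the proposed log-file renames.
      List<RenameOpResult> proposedRenames = admin.unscheduleCompactionPlan("20190101010101", false, 2, true);
      proposedRenames.forEach(System.out::println);
      // If an earlier unschedule stopped halfway, repairCompaction realigns leftover delta files (dry-run again).
      List<RenameOpResult> repairs = admin.repairCompaction("20190101010101", 2, true);
      repairs.forEach(System.out::println);
      admin.close();
    }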

View File

@@ -58,9 +58,8 @@ public class HoodieReadClient<T extends HoodieRecordPayload> extends AbstractHoo
private static final Logger logger = LogManager.getLogger(HoodieReadClient.class);
/**
- * TODO: We need to persist the index type into hoodie.properties and be able to access the index
- * just with a simple basepath pointing to the dataset. Until, then just always assume a
- * BloomIndex
+ * TODO: We need to persist the index type into hoodie.properties and be able to access the index just with a simple
+ * basepath pointing to the dataset. Until, then just always assume a BloomIndex
*/
private final transient HoodieIndex<T> index;
private final HoodieTimeline commitTimeline;
@@ -70,13 +69,11 @@ public class HoodieReadClient<T extends HoodieRecordPayload> extends AbstractHoo
/**
* @param basePath path to Hoodie dataset
*/
- public HoodieReadClient(JavaSparkContext jsc, String basePath,
- Option<EmbeddedTimelineService> timelineService) {
+ public HoodieReadClient(JavaSparkContext jsc, String basePath, Option<EmbeddedTimelineService> timelineService) {
this(jsc, HoodieWriteConfig.newBuilder().withPath(basePath)
// by default we use HoodieBloomIndex
- .withIndexConfig(
- HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
- .build(), timelineService);
+ .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build(),
+ timelineService);
}
/**
@@ -130,8 +127,7 @@ public class HoodieReadClient<T extends HoodieRecordPayload> extends AbstractHoo
private void assertSqlContext() {
if (!sqlContextOpt.isPresent()) {
- throw new IllegalStateException(
- "SQLContext must be set, when performing dataframe operations");
+ throw new IllegalStateException("SQLContext must be set, when performing dataframe operations");
}
}
@@ -152,17 +148,16 @@ public class HoodieReadClient<T extends HoodieRecordPayload> extends AbstractHoo
*/
public Dataset<Row> readROView(JavaRDD<HoodieKey> hoodieKeys, int parallelism) {
assertSqlContext();
- JavaPairRDD<HoodieKey, Option<Pair<String, String>>> lookupResultRDD = index
- .fetchRecordLocation(hoodieKeys, jsc, hoodieTable);
+ JavaPairRDD<HoodieKey, Option<Pair<String, String>>> lookupResultRDD =
+ index.fetchRecordLocation(hoodieKeys, jsc, hoodieTable);
- JavaPairRDD<HoodieKey, Option<String>> keyToFileRDD = lookupResultRDD
- .mapToPair(r -> new Tuple2<>(r._1, convertToDataFilePath(r._2)));
+ JavaPairRDD<HoodieKey, Option<String>> keyToFileRDD =
+ lookupResultRDD.mapToPair(r -> new Tuple2<>(r._1, convertToDataFilePath(r._2)));
List<String> paths = keyToFileRDD.filter(keyFileTuple -> keyFileTuple._2().isPresent())
.map(keyFileTuple -> keyFileTuple._2().get()).collect();
// record locations might be same for multiple keys, so need a unique list
Set<String> uniquePaths = new HashSet<>(paths);
- Dataset<Row> originalDF = sqlContextOpt.get().read()
- .parquet(uniquePaths.toArray(new String[uniquePaths.size()]));
+ Dataset<Row> originalDF = sqlContextOpt.get().read().parquet(uniquePaths.toArray(new String[uniquePaths.size()]));
StructType schema = originalDF.schema();
JavaPairRDD<HoodieKey, Row> keyRowRDD = originalDF.javaRDD().mapToPair(row -> {
HoodieKey key = new HoodieKey(row.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD),
@@ -176,18 +171,16 @@ public class HoodieReadClient<T extends HoodieRecordPayload> extends AbstractHoo
}
/**
- * Checks if the given [Keys] exists in the hoodie table and returns [Key, Option[FullFilePath]]
- * If the optional FullFilePath value is not present, then the key is not found. If the
- * FullFilePath value is present, it is the path component (without scheme) of the URI underlying
- * file
+ * Checks if the given [Keys] exists in the hoodie table and returns [Key, Option[FullFilePath]] If the optional
+ * FullFilePath value is not present, then the key is not found. If the FullFilePath value is present, it is the path
+ * component (without scheme) of the URI underlying file
*/
public JavaPairRDD<HoodieKey, Option<String>> checkExists(JavaRDD<HoodieKey> hoodieKeys) {
return index.fetchRecordLocation(hoodieKeys, jsc, hoodieTable);
}
/**
- * Filter out HoodieRecords that already exists in the output folder. This is useful in
- * deduplication.
+ * Filter out HoodieRecords that already exists in the output folder. This is useful in deduplication.
*
* @param hoodieRecords Input RDD of Hoodie records.
* @return A subset of hoodieRecords RDD, with existing records filtered out.
@@ -198,27 +191,27 @@ public class HoodieReadClient<T extends HoodieRecordPayload> extends AbstractHoo
}
/**
- * Looks up the index and tags each incoming record with a location of a file that contains the
- * row (if it is actually present). Input RDD should contain no duplicates if needed.
+ * Looks up the index and tags each incoming record with a location of a file that contains the row (if it is actually
+ * present). Input RDD should contain no duplicates if needed.
*
* @param hoodieRecords Input RDD of Hoodie records
* @return Tagged RDD of Hoodie records
*/
- public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> hoodieRecords)
- throws HoodieIndexException {
+ public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> hoodieRecords) throws HoodieIndexException {
return index.tagLocation(hoodieRecords, jsc, hoodieTable);
}
/**
* Return all pending compactions with instant time for clients to decide what to compact next.
*
* @return
*/
public List<Pair<String, HoodieCompactionPlan>> getPendingCompactions() {
- HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
- hoodieTable.getMetaClient().getBasePath(), true);
+ HoodieTableMetaClient metaClient =
+ new HoodieTableMetaClient(jsc.hadoopConfiguration(), hoodieTable.getMetaClient().getBasePath(), true);
return CompactionUtils.getAllPendingCompactionPlans(metaClient).stream()
- .map(instantWorkloadPair ->
- Pair.of(instantWorkloadPair.getKey().getTimestamp(), instantWorkloadPair.getValue()))
+ .map(
+ instantWorkloadPair -> Pair.of(instantWorkloadPair.getKey().getTimestamp(), instantWorkloadPair.getValue()))
.collect(Collectors.toList());
}
}
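
A hypothetical read-path sketch against the methods shown above (not part of this commit; the Spark master, base path, key, and partition values are made up, and imports are omitted):

    // Sketch only: key existence check plus pending-compaction listing via HoodieReadClient.
    static void readClientSketch() throws Exception {
      JavaSparkContext jsc = new JavaSparkContext("local[2]", "read-client-sketch"); // assumed local master
      HoodieReadClient<HoodieAvroPayload> readClient =
          new HoodieReadClient<>(jsc, "/tmp/hoodie/sample-dataset", Option.empty()); // no embedded timeline service
      JavaRDD<HoodieKey> keys =
          jsc.parallelize(Collections.singletonList(new HoodieKey("uuid-0001", "2016/08/02")));
      // Key -> Option<full file path>; an absent value means the key is not in the dataset.
      readClient.checkExists(keys).collect().forEach(System.out::println);
      // Pending compactions, keyed by instant time, for deciding what to compact next.
      readClient.getPendingCompactions().forEach(p -> System.out.println(p.getKey()));
      readClient.close();
    }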

View File

@@ -64,14 +64,11 @@ public class WriteStatus implements Serializable {
}
/**
- * Mark write as success, optionally using given parameters for the purpose of calculating some
- * aggregate metrics. This method is not meant to cache passed arguments, since WriteStatus
- * objects are collected in Spark Driver.
+ * Mark write as success, optionally using given parameters for the purpose of calculating some aggregate metrics.
+ * This method is not meant to cache passed arguments, since WriteStatus objects are collected in Spark Driver.
*
- * @param record deflated {@code HoodieRecord} containing information that uniquely identifies
- * it.
- * @param optionalRecordMetadata optional metadata related to data contained in {@link
- * HoodieRecord} before deflation.
+ * @param record deflated {@code HoodieRecord} containing information that uniquely identifies it.
+ * @param optionalRecordMetadata optional metadata related to data contained in {@link HoodieRecord} before deflation.
*/
public void markSuccess(HoodieRecord record, Option<Map<String, String>> optionalRecordMetadata) {
if (trackSuccessRecords) {
@@ -81,14 +78,11 @@ public class WriteStatus implements Serializable {
}
/**
- * Mark write as failed, optionally using given parameters for the purpose of calculating some
- * aggregate metrics. This method is not meant to cache passed arguments, since WriteStatus
- * objects are collected in Spark Driver.
+ * Mark write as failed, optionally using given parameters for the purpose of calculating some aggregate metrics. This
+ * method is not meant to cache passed arguments, since WriteStatus objects are collected in Spark Driver.
*
- * @param record deflated {@code HoodieRecord} containing information that uniquely identifies
- * it.
- * @param optionalRecordMetadata optional metadata related to data contained in {@link
- * HoodieRecord} before deflation.
+ * @param record deflated {@code HoodieRecord} containing information that uniquely identifies it.
+ * @param optionalRecordMetadata optional metadata related to data contained in {@link HoodieRecord} before deflation.
*/
public void markFailure(HoodieRecord record, Throwable t, Option<Map<String, String>> optionalRecordMetadata) {
if (failedRecords.isEmpty() || (random.nextDouble() <= failureFraction)) {
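
A loose sketch of the bookkeeping described in the javadoc above (not part of this commit; the record payload and the (trackSuccessRecords, failureFraction) constructor arguments are assumptions inferred from the fields referenced in this hunk):

    // Sketch only: record one success and one failure on a WriteStatus instance.
    WriteStatus status = new WriteStatus(true, 0.1); // assumed args: trackSuccessRecords, failureFraction
    HoodieRecord record =
        new HoodieRecord<>(new HoodieKey("uuid-0001", "2016/08/02"), new HoodieAvroPayload(Option.empty()));
    status.markSuccess(record, Option.empty()); // no per-record metadata
    status.markFailure(record, new IOException("simulated write error"), Option.empty());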

View File

@@ -40,10 +40,8 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
// Turn on inline compaction - after fw delta commits a inline compaction will be run
public static final String INLINE_COMPACT_PROP = "hoodie.compact.inline";
// Run a compaction every N delta commits
- public static final String INLINE_COMPACT_NUM_DELTA_COMMITS_PROP =
- "hoodie.compact.inline.max" + ".delta.commits";
- public static final String CLEANER_FILE_VERSIONS_RETAINED_PROP =
- "hoodie.cleaner.fileversions" + ".retained";
+ public static final String INLINE_COMPACT_NUM_DELTA_COMMITS_PROP = "hoodie.compact.inline.max" + ".delta.commits";
+ public static final String CLEANER_FILE_VERSIONS_RETAINED_PROP = "hoodie.cleaner.fileversions" + ".retained";
public static final String CLEANER_COMMITS_RETAINED_PROP = "hoodie.cleaner.commits.retained";
public static final String MAX_COMMITS_TO_KEEP_PROP = "hoodie.keep.max.commits";
public static final String MIN_COMMITS_TO_KEEP_PROP = "hoodie.keep.min.commits";
@@ -56,25 +54,21 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
* Configs related to specific table types
**/
// Number of inserts, that will be put each partition/bucket for writing
- public static final String COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE =
- "hoodie.copyonwrite.insert" + ".split.size";
+ public static final String COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE = "hoodie.copyonwrite.insert" + ".split.size";
// The rationale to pick the insert parallelism is the following. Writing out 100MB files,
// with atleast 1kb records, means 100K records per file. we just overprovision to 500K
public static final String DEFAULT_COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE = String.valueOf(500000);
// Config to control whether we control insert split sizes automatically based on average
// record sizes
- public static final String COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS =
- "hoodie.copyonwrite.insert" + ".auto.split";
+ public static final String COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS = "hoodie.copyonwrite.insert" + ".auto.split";
// its off by default
public static final String DEFAULT_COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS = String.valueOf(true);
// This value is used as a guessimate for the record size, if we can't determine this from
// previous commits
- public static final String COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE =
- "hoodie.copyonwrite" + ".record.size.estimate";
+ public static final String COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = "hoodie.copyonwrite" + ".record.size.estimate";
// Used to determine how much more can be packed into a small file, before it exceeds the size
// limit.
- public static final String DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = String
- .valueOf(1024);
+ public static final String DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = String.valueOf(1024);
public static final String CLEANER_PARALLELISM = "hoodie.cleaner.parallelism";
public static final String DEFAULT_CLEANER_PARALLELISM = String.valueOf(200);
public static final String TARGET_IO_PER_COMPACTION_IN_MB_PROP = "hoodie.compaction.target.io";
@@ -82,8 +76,7 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
public static final String DEFAULT_TARGET_IO_PER_COMPACTION_IN_MB = String.valueOf(500 * 1024);
public static final String COMPACTION_STRATEGY_PROP = "hoodie.compaction.strategy";
// 200GB of target IO per compaction
- public static final String DEFAULT_COMPACTION_STRATEGY = LogFileSizeBasedCompactionStrategy.class
- .getName();
+ public static final String DEFAULT_COMPACTION_STRATEGY = LogFileSizeBasedCompactionStrategy.class.getName();
// used to merge records written to log file
public static final String DEFAULT_PAYLOAD_CLASS = HoodieAvroPayload.class.getName();
public static final String PAYLOAD_CLASS_PROP = "hoodie.compaction.payload.class";
@@ -91,15 +84,12 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
// used to choose a trade off between IO vs Memory when performing compaction process
// Depending on outputfile_size and memory provided, choose true to avoid OOM for large file
// size + small memory
- public static final String COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP =
- "hoodie.compaction.lazy" + ".block.read";
+ public static final String COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP = "hoodie.compaction.lazy" + ".block.read";
public static final String DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED = "false";
// used to choose whether to enable reverse log reading (reverse log traversal)
- public static final String COMPACTION_REVERSE_LOG_READ_ENABLED_PROP =
- "hoodie.compaction" + ".reverse.log.read";
+ public static final String COMPACTION_REVERSE_LOG_READ_ENABLED_PROP = "hoodie.compaction" + ".reverse.log.read";
public static final String DEFAULT_COMPACTION_REVERSE_LOG_READ_ENABLED = "false";
- private static final String DEFAULT_CLEANER_POLICY = HoodieCleaningPolicy.KEEP_LATEST_COMMITS
- .name();
+ private static final String DEFAULT_CLEANER_POLICY = HoodieCleaningPolicy.KEEP_LATEST_COMMITS.name();
private static final String DEFAULT_AUTO_CLEAN = "true";
private static final String DEFAULT_INLINE_COMPACT = "false";
private static final String DEFAULT_INLINE_COMPACT_NUM_DELTA_COMMITS = "1";
@@ -108,8 +98,8 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
private static final String DEFAULT_MAX_COMMITS_TO_KEEP = "30";
private static final String DEFAULT_MIN_COMMITS_TO_KEEP = "20";
private static final String DEFAULT_COMMITS_ARCHIVAL_BATCH_SIZE = String.valueOf(10);
- public static final String TARGET_PARTITIONS_PER_DAYBASED_COMPACTION_PROP = "hoodie.compaction.daybased.target"
- + ".partitions";
+ public static final String TARGET_PARTITIONS_PER_DAYBASED_COMPACTION_PROP =
+ "hoodie.compaction.daybased.target" + ".partitions";
// 500GB of target IO per compaction (both read and write)
public static final String DEFAULT_TARGET_PARTITIONS_PER_DAYBASED_COMPACTION = String.valueOf(10);
@@ -188,14 +178,12 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
} }
public Builder autoTuneInsertSplits(boolean autoTuneInsertSplits) { public Builder autoTuneInsertSplits(boolean autoTuneInsertSplits) {
props.setProperty(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS, props.setProperty(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS, String.valueOf(autoTuneInsertSplits));
String.valueOf(autoTuneInsertSplits));
return this; return this;
} }
public Builder approxRecordSize(int recordSizeEstimate) { public Builder approxRecordSize(int recordSizeEstimate) {
props.setProperty(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE, props.setProperty(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE, String.valueOf(recordSizeEstimate));
String.valueOf(recordSizeEstimate));
return this; return this;
} }
@@ -215,32 +203,27 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
} }
public Builder withTargetIOPerCompactionInMB(long targetIOPerCompactionInMB) { public Builder withTargetIOPerCompactionInMB(long targetIOPerCompactionInMB) {
props.setProperty(TARGET_IO_PER_COMPACTION_IN_MB_PROP, props.setProperty(TARGET_IO_PER_COMPACTION_IN_MB_PROP, String.valueOf(targetIOPerCompactionInMB));
String.valueOf(targetIOPerCompactionInMB));
return this; return this;
} }
public Builder withMaxNumDeltaCommitsBeforeCompaction(int maxNumDeltaCommitsBeforeCompaction) { public Builder withMaxNumDeltaCommitsBeforeCompaction(int maxNumDeltaCommitsBeforeCompaction) {
props.setProperty(INLINE_COMPACT_NUM_DELTA_COMMITS_PROP, props.setProperty(INLINE_COMPACT_NUM_DELTA_COMMITS_PROP, String.valueOf(maxNumDeltaCommitsBeforeCompaction));
String.valueOf(maxNumDeltaCommitsBeforeCompaction));
return this; return this;
} }
public Builder withCompactionLazyBlockReadEnabled(Boolean compactionLazyBlockReadEnabled) { public Builder withCompactionLazyBlockReadEnabled(Boolean compactionLazyBlockReadEnabled) {
props.setProperty(COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP, props.setProperty(COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP, String.valueOf(compactionLazyBlockReadEnabled));
String.valueOf(compactionLazyBlockReadEnabled));
return this; return this;
} }
public Builder withCompactionReverseLogReadEnabled(Boolean compactionReverseLogReadEnabled) { public Builder withCompactionReverseLogReadEnabled(Boolean compactionReverseLogReadEnabled) {
props.setProperty(COMPACTION_REVERSE_LOG_READ_ENABLED_PROP, props.setProperty(COMPACTION_REVERSE_LOG_READ_ENABLED_PROP, String.valueOf(compactionReverseLogReadEnabled));
String.valueOf(compactionReverseLogReadEnabled));
return this; return this;
} }
public Builder withTargetPartitionsPerDayBasedCompaction(int targetPartitionsPerCompaction) { public Builder withTargetPartitionsPerDayBasedCompaction(int targetPartitionsPerCompaction) {
props.setProperty(TARGET_PARTITIONS_PER_DAYBASED_COMPACTION_PROP, props.setProperty(TARGET_PARTITIONS_PER_DAYBASED_COMPACTION_PROP, String.valueOf(targetPartitionsPerCompaction));
String.valueOf(targetPartitionsPerCompaction));
return this; return this;
} }
@@ -251,8 +234,7 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
public HoodieCompactionConfig build() { public HoodieCompactionConfig build() {
HoodieCompactionConfig config = new HoodieCompactionConfig(props); HoodieCompactionConfig config = new HoodieCompactionConfig(props);
setDefaultOnCondition(props, !props.containsKey(AUTO_CLEAN_PROP), AUTO_CLEAN_PROP, setDefaultOnCondition(props, !props.containsKey(AUTO_CLEAN_PROP), AUTO_CLEAN_PROP, DEFAULT_AUTO_CLEAN);
DEFAULT_AUTO_CLEAN);
setDefaultOnCondition(props, !props.containsKey(INLINE_COMPACT_PROP), INLINE_COMPACT_PROP, setDefaultOnCondition(props, !props.containsKey(INLINE_COMPACT_PROP), INLINE_COMPACT_PROP,
DEFAULT_INLINE_COMPACT); DEFAULT_INLINE_COMPACT);
setDefaultOnCondition(props, !props.containsKey(INLINE_COMPACT_NUM_DELTA_COMMITS_PROP), setDefaultOnCondition(props, !props.containsKey(INLINE_COMPACT_NUM_DELTA_COMMITS_PROP),
@@ -261,27 +243,25 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
DEFAULT_CLEANER_POLICY); DEFAULT_CLEANER_POLICY);
setDefaultOnCondition(props, !props.containsKey(CLEANER_FILE_VERSIONS_RETAINED_PROP), setDefaultOnCondition(props, !props.containsKey(CLEANER_FILE_VERSIONS_RETAINED_PROP),
CLEANER_FILE_VERSIONS_RETAINED_PROP, DEFAULT_CLEANER_FILE_VERSIONS_RETAINED); CLEANER_FILE_VERSIONS_RETAINED_PROP, DEFAULT_CLEANER_FILE_VERSIONS_RETAINED);
setDefaultOnCondition(props, !props.containsKey(CLEANER_COMMITS_RETAINED_PROP), setDefaultOnCondition(props, !props.containsKey(CLEANER_COMMITS_RETAINED_PROP), CLEANER_COMMITS_RETAINED_PROP,
CLEANER_COMMITS_RETAINED_PROP, DEFAULT_CLEANER_COMMITS_RETAINED); DEFAULT_CLEANER_COMMITS_RETAINED);
setDefaultOnCondition(props, !props.containsKey(MAX_COMMITS_TO_KEEP_PROP), MAX_COMMITS_TO_KEEP_PROP, setDefaultOnCondition(props, !props.containsKey(MAX_COMMITS_TO_KEEP_PROP), MAX_COMMITS_TO_KEEP_PROP,
DEFAULT_MAX_COMMITS_TO_KEEP); DEFAULT_MAX_COMMITS_TO_KEEP);
setDefaultOnCondition(props, !props.containsKey(MIN_COMMITS_TO_KEEP_PROP), MIN_COMMITS_TO_KEEP_PROP, setDefaultOnCondition(props, !props.containsKey(MIN_COMMITS_TO_KEEP_PROP), MIN_COMMITS_TO_KEEP_PROP,
DEFAULT_MIN_COMMITS_TO_KEEP); DEFAULT_MIN_COMMITS_TO_KEEP);
setDefaultOnCondition(props, !props.containsKey(PARQUET_SMALL_FILE_LIMIT_BYTES), setDefaultOnCondition(props, !props.containsKey(PARQUET_SMALL_FILE_LIMIT_BYTES), PARQUET_SMALL_FILE_LIMIT_BYTES,
PARQUET_SMALL_FILE_LIMIT_BYTES, DEFAULT_PARQUET_SMALL_FILE_LIMIT_BYTES); DEFAULT_PARQUET_SMALL_FILE_LIMIT_BYTES);
setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE), setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE),
COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE, DEFAULT_COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE); COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE, DEFAULT_COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE);
setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS), setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS),
COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS, DEFAULT_COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS); COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS, DEFAULT_COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS);
setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE), setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE),
COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE, COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE, DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE);
DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE);
setDefaultOnCondition(props, !props.containsKey(CLEANER_PARALLELISM), CLEANER_PARALLELISM, setDefaultOnCondition(props, !props.containsKey(CLEANER_PARALLELISM), CLEANER_PARALLELISM,
DEFAULT_CLEANER_PARALLELISM); DEFAULT_CLEANER_PARALLELISM);
setDefaultOnCondition(props, !props.containsKey(COMPACTION_STRATEGY_PROP), setDefaultOnCondition(props, !props.containsKey(COMPACTION_STRATEGY_PROP), COMPACTION_STRATEGY_PROP,
COMPACTION_STRATEGY_PROP, DEFAULT_COMPACTION_STRATEGY); DEFAULT_COMPACTION_STRATEGY);
setDefaultOnCondition(props, !props.containsKey(PAYLOAD_CLASS_PROP), setDefaultOnCondition(props, !props.containsKey(PAYLOAD_CLASS_PROP), PAYLOAD_CLASS_PROP, DEFAULT_PAYLOAD_CLASS);
PAYLOAD_CLASS_PROP, DEFAULT_PAYLOAD_CLASS);
setDefaultOnCondition(props, !props.containsKey(TARGET_IO_PER_COMPACTION_IN_MB_PROP), setDefaultOnCondition(props, !props.containsKey(TARGET_IO_PER_COMPACTION_IN_MB_PROP),
TARGET_IO_PER_COMPACTION_IN_MB_PROP, DEFAULT_TARGET_IO_PER_COMPACTION_IN_MB); TARGET_IO_PER_COMPACTION_IN_MB_PROP, DEFAULT_TARGET_IO_PER_COMPACTION_IN_MB);
setDefaultOnCondition(props, !props.containsKey(COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP), setDefaultOnCondition(props, !props.containsKey(COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP),
@@ -299,13 +279,15 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
// commit instant on timeline, that still has not been cleaned. Could miss some data via incr pull // commit instant on timeline, that still has not been cleaned. Could miss some data via incr pull
int minInstantsToKeep = Integer.parseInt(props.getProperty(HoodieCompactionConfig.MIN_COMMITS_TO_KEEP_PROP)); int minInstantsToKeep = Integer.parseInt(props.getProperty(HoodieCompactionConfig.MIN_COMMITS_TO_KEEP_PROP));
int maxInstantsToKeep = Integer.parseInt(props.getProperty(HoodieCompactionConfig.MAX_COMMITS_TO_KEEP_PROP)); int maxInstantsToKeep = Integer.parseInt(props.getProperty(HoodieCompactionConfig.MAX_COMMITS_TO_KEEP_PROP));
int cleanerCommitsRetained = Integer int cleanerCommitsRetained =
.parseInt(props.getProperty(HoodieCompactionConfig.CLEANER_COMMITS_RETAINED_PROP)); Integer.parseInt(props.getProperty(HoodieCompactionConfig.CLEANER_COMMITS_RETAINED_PROP));
Preconditions.checkArgument(maxInstantsToKeep > minInstantsToKeep); Preconditions.checkArgument(maxInstantsToKeep > minInstantsToKeep);
Preconditions.checkArgument(minInstantsToKeep > cleanerCommitsRetained, Preconditions.checkArgument(minInstantsToKeep > cleanerCommitsRetained,
String.format("Increase %s=%d to be greater than %s=%d. Otherwise, there is risk of incremental pull " String.format(
+ "missing data from few instants.", HoodieCompactionConfig.MIN_COMMITS_TO_KEEP_PROP, "Increase %s=%d to be greater than %s=%d. Otherwise, there is risk of incremental pull "
minInstantsToKeep, HoodieCompactionConfig.CLEANER_COMMITS_RETAINED_PROP, cleanerCommitsRetained)); + "missing data from few instants.",
HoodieCompactionConfig.MIN_COMMITS_TO_KEEP_PROP, minInstantsToKeep,
HoodieCompactionConfig.CLEANER_COMMITS_RETAINED_PROP, cleanerCommitsRetained));
return config; return config;
} }
} }
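
A minimal usage sketch of the Builder shown above, with illustrative values that are not part of this commit; build() falls back to the DEFAULT_* constants for anything left unset, and the Preconditions checks require maxCommitsToKeep > minCommitsToKeep > cleanerCommitsRetained.

HoodieCompactionConfig compactionConfig = HoodieCompactionConfig.newBuilder()
    .withTargetIOPerCompactionInMB(500 * 1024)        // same value as DEFAULT_TARGET_IO_PER_COMPACTION_IN_MB
    .withMaxNumDeltaCommitsBeforeCompaction(5)        // compact after 5 delta commits when inline compaction runs
    .withTargetPartitionsPerDayBasedCompaction(10)
    .autoTuneInsertSplits(true)                       // size insert splits from previous commit metadata
    .approxRecordSize(1024)                           // fallback per-record size estimate, in bytes
    .build();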


@@ -32,8 +32,8 @@ public class HoodieHBaseIndexConfig extends DefaultHoodieConfig {
public static final String HBASE_GET_BATCH_SIZE_PROP = "hoodie.index.hbase.get.batch.size";
public static final String HBASE_ZK_ZNODEPARENT = "hoodie.index.hbase.zknode.path";
/**
 * Note that if HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE_PROP is set to true, this batch size will not be honored for HBase
 * Puts
 */
public static final String HBASE_PUT_BATCH_SIZE_PROP = "hoodie.index.hbase.put.batch.size";
@@ -48,18 +48,16 @@ public class HoodieHBaseIndexConfig extends DefaultHoodieConfig {
public static final String HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE_PROP = "hoodie.index.hbase.put.batch.size.autocompute";
public static final String DEFAULT_HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE = "false";
/**
 * Property to set the fraction of the global share of QPS that should be allocated to this job. Let's say there are 3
 * jobs which have input size in terms of number of rows required for HbaseIndexing as x, 2x, 3x respectively. Then
 * this fraction for the jobs would be (0.17) 1/6, 0.33 (2/6) and 0.5 (3/6) respectively.
 */
public static final String HBASE_QPS_FRACTION_PROP = "hoodie.index.hbase.qps.fraction";
/**
 * Property to set maximum QPS allowed per Region Server. This should be same across various jobs. This is intended to
 * limit the aggregate QPS generated across various jobs to an Hbase Region Server. It is recommended to set this
 * value based on global indexing throughput needs and most importantly, how much the HBase installation in use is
 * able to tolerate without Region Servers going down.
 */
public static String HBASE_MAX_QPS_PER_REGION_SERVER_PROP = "hoodie.index.hbase.max.qps.per.region.server";
/**
@@ -71,8 +69,7 @@ public class HoodieHBaseIndexConfig extends DefaultHoodieConfig {
 */
public static final int DEFAULT_HBASE_MAX_QPS_PER_REGION_SERVER = 1000;
/**
 * Default is 50%, which means a total of 2 jobs can run using HbaseIndex without overwhelming Region Servers
 */
public static final float DEFAULT_HBASE_QPS_FRACTION = 0.5f;
@@ -218,18 +215,15 @@ public class HoodieHBaseIndexConfig extends DefaultHoodieConfig {
/**
 * <p>
 * Method to set maximum QPS allowed per Region Server. This should be same across various jobs. This is intended to
 * limit the aggregate QPS generated across various jobs to an Hbase Region Server.
 * </p>
 * <p>
 * It is recommended to set this value based on your global indexing throughput needs and most importantly, how much
 * your HBase installation is able to tolerate without Region Servers going down.
 * </p>
 */
public HoodieHBaseIndexConfig.Builder hbaseIndexMaxQPSPerRegionServer(int maxQPSPerRegionServer) {
// This should be same across various jobs
props.setProperty(HoodieHBaseIndexConfig.HBASE_MAX_QPS_PER_REGION_SERVER_PROP,
    String.valueOf(maxQPSPerRegionServer));
@@ -238,30 +232,30 @@ public class HoodieHBaseIndexConfig extends DefaultHoodieConfig {
public HoodieHBaseIndexConfig build() {
HoodieHBaseIndexConfig config = new HoodieHBaseIndexConfig(props);
setDefaultOnCondition(props, !props.containsKey(HBASE_GET_BATCH_SIZE_PROP), HBASE_GET_BATCH_SIZE_PROP,
    String.valueOf(DEFAULT_HBASE_BATCH_SIZE));
setDefaultOnCondition(props, !props.containsKey(HBASE_PUT_BATCH_SIZE_PROP), HBASE_PUT_BATCH_SIZE_PROP,
    String.valueOf(DEFAULT_HBASE_BATCH_SIZE));
setDefaultOnCondition(props, !props.containsKey(HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE_PROP),
    HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE_PROP, String.valueOf(DEFAULT_HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE));
setDefaultOnCondition(props, !props.containsKey(HBASE_QPS_FRACTION_PROP), HBASE_QPS_FRACTION_PROP,
    String.valueOf(DEFAULT_HBASE_QPS_FRACTION));
setDefaultOnCondition(props, !props.containsKey(HBASE_MAX_QPS_PER_REGION_SERVER_PROP),
    HBASE_MAX_QPS_PER_REGION_SERVER_PROP, String.valueOf(DEFAULT_HBASE_MAX_QPS_PER_REGION_SERVER));
setDefaultOnCondition(props, !props.containsKey(HOODIE_INDEX_COMPUTE_QPS_DYNAMICALLY),
    HOODIE_INDEX_COMPUTE_QPS_DYNAMICALLY, String.valueOf(DEFAULT_HOODIE_INDEX_COMPUTE_QPS_DYNAMICALLY));
setDefaultOnCondition(props, !props.containsKey(HBASE_INDEX_QPS_ALLOCATOR_CLASS), HBASE_INDEX_QPS_ALLOCATOR_CLASS,
    String.valueOf(DEFAULT_HBASE_INDEX_QPS_ALLOCATOR_CLASS));
setDefaultOnCondition(props, !props.containsKey(HOODIE_INDEX_DESIRED_PUTS_TIME_IN_SECS),
    HOODIE_INDEX_DESIRED_PUTS_TIME_IN_SECS, String.valueOf(DEFAULT_HOODIE_INDEX_DESIRED_PUTS_TIME_IN_SECS));
setDefaultOnCondition(props, !props.containsKey(HBASE_ZK_PATH_QPS_ROOT), HBASE_ZK_PATH_QPS_ROOT,
    String.valueOf(DEFAULT_HBASE_ZK_PATH_QPS_ROOT));
setDefaultOnCondition(props, !props.containsKey(HOODIE_INDEX_HBASE_ZK_SESSION_TIMEOUT_MS),
    HOODIE_INDEX_HBASE_ZK_SESSION_TIMEOUT_MS, String.valueOf(DEFAULT_ZK_SESSION_TIMEOUT_MS));
setDefaultOnCondition(props, !props.containsKey(HOODIE_INDEX_HBASE_ZK_CONNECTION_TIMEOUT_MS),
    HOODIE_INDEX_HBASE_ZK_CONNECTION_TIMEOUT_MS, String.valueOf(DEFAULT_ZK_CONNECTION_TIMEOUT_MS));
setDefaultOnCondition(props, !props.containsKey(HBASE_INDEX_QPS_ALLOCATOR_CLASS), HBASE_INDEX_QPS_ALLOCATOR_CLASS,
    String.valueOf(DEFAULT_HBASE_INDEX_QPS_ALLOCATOR_CLASS));
return config;
}
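
To make the qps.fraction Javadoc above concrete, a small worked example with hypothetical job sizes (illustrative only, not part of this commit):

// Three jobs index x, 2x and 3x rows against the same HBase cluster, so they should be given
// 1/6, 2/6 and 3/6 of the shared QPS budget respectively.
double totalUnits = 1 + 2 + 3;
double fractionSmallJob = 1 / totalUnits;   // ~0.17
double fractionLargeJob = 3 / totalUnits;   // 0.5, the DEFAULT_HBASE_QPS_FRACTION
// With the default 1000 QPS per Region Server, the largest job gets at most 500 QPS per Region Server.
int largeJobQpsPerRegionServer = (int) (fractionLargeJob * DEFAULT_HBASE_MAX_QPS_PER_REGION_SERVER);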


@@ -42,8 +42,7 @@ public class HoodieIndexConfig extends DefaultHoodieConfig {
public static final String BLOOM_INDEX_PARALLELISM_PROP = "hoodie.bloom.index.parallelism";
// Disable explicit bloom index parallelism setting by default - hoodie auto computes
public static final String DEFAULT_BLOOM_INDEX_PARALLELISM = "0";
public static final String BLOOM_INDEX_PRUNE_BY_RANGES_PROP = "hoodie.bloom.index.prune.by" + ".ranges";
public static final String DEFAULT_BLOOM_INDEX_PRUNE_BY_RANGES = "true";
public static final String BLOOM_INDEX_USE_CACHING_PROP = "hoodie.bloom.index.use.caching";
public static final String DEFAULT_BLOOM_INDEX_USE_CACHING = "true";
@@ -67,8 +66,7 @@ public class HoodieIndexConfig extends DefaultHoodieConfig {
public static final String DEFAULT_HBASE_BATCH_SIZE = "100";
public static final String BLOOM_INDEX_INPUT_STORAGE_LEVEL = "hoodie.bloom.index.input.storage" + ".level";
public static final String DEFAULT_BLOOM_INDEX_INPUT_STORAGE_LEVEL = "MEMORY_AND_DISK_SER";
private HoodieIndexConfig(Properties props) {
@@ -175,20 +173,18 @@ public class HoodieIndexConfig extends DefaultHoodieConfig {
public HoodieIndexConfig build() {
HoodieIndexConfig config = new HoodieIndexConfig(props);
setDefaultOnCondition(props, !props.containsKey(INDEX_TYPE_PROP), INDEX_TYPE_PROP, DEFAULT_INDEX_TYPE);
setDefaultOnCondition(props, !props.containsKey(BLOOM_FILTER_NUM_ENTRIES), BLOOM_FILTER_NUM_ENTRIES,
    DEFAULT_BLOOM_FILTER_NUM_ENTRIES);
setDefaultOnCondition(props, !props.containsKey(BLOOM_FILTER_FPP), BLOOM_FILTER_FPP, DEFAULT_BLOOM_FILTER_FPP);
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_PARALLELISM_PROP), BLOOM_INDEX_PARALLELISM_PROP,
    DEFAULT_BLOOM_INDEX_PARALLELISM);
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_PRUNE_BY_RANGES_PROP),
    BLOOM_INDEX_PRUNE_BY_RANGES_PROP, DEFAULT_BLOOM_INDEX_PRUNE_BY_RANGES);
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_USE_CACHING_PROP), BLOOM_INDEX_USE_CACHING_PROP,
    DEFAULT_BLOOM_INDEX_USE_CACHING);
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_INPUT_STORAGE_LEVEL), BLOOM_INDEX_INPUT_STORAGE_LEVEL,
    DEFAULT_BLOOM_INDEX_INPUT_STORAGE_LEVEL);
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_TREE_BASED_FILTER_PROP),
    BLOOM_INDEX_TREE_BASED_FILTER_PROP, DEFAULT_BLOOM_INDEX_TREE_BASED_FILTER);
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_BUCKETIZED_CHECKING_PROP),
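
A short sketch of overriding the bloom-index keys declared in this class through the fromProperties path that the write config uses below; the values are illustrative only and java.util.Properties is assumed to be imported.

Properties indexProps = new Properties();
indexProps.setProperty("hoodie.bloom.index.parallelism", "150");             // default "0" lets Hudi auto-compute
indexProps.setProperty("hoodie.bloom.index.prune.by.ranges", "true");
indexProps.setProperty("hoodie.bloom.index.use.caching", "true");
indexProps.setProperty("hoodie.bloom.index.input.storage.level", "MEMORY_AND_DISK_SER");
HoodieIndexConfig indexConfig = HoodieIndexConfig.newBuilder().fromProperties(indexProps).build();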


@@ -41,8 +41,7 @@ public class HoodieMemoryConfig extends DefaultHoodieConfig {
// Default max memory fraction during compaction, excess spills to disk
public static final String DEFAULT_MAX_MEMORY_FRACTION_FOR_COMPACTION = String.valueOf(0.6);
// Default memory size per compaction (used if SparkEnv is absent), excess spills to disk
public static final long DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES = 1024 * 1024 * 1024L; // 1GB
// Property to set the max memory for merge
public static final String MAX_MEMORY_FOR_MERGE_PROP = "hoodie.memory.merge.max.size";
// Property to set the max memory for compaction
@@ -88,20 +87,17 @@ public class HoodieMemoryConfig extends DefaultHoodieConfig {
}
public Builder withMaxMemoryFractionPerPartitionMerge(double maxMemoryFractionPerPartitionMerge) {
props.setProperty(MAX_MEMORY_FRACTION_FOR_MERGE_PROP, String.valueOf(maxMemoryFractionPerPartitionMerge));
return this;
}
public Builder withMaxMemoryFractionPerCompaction(double maxMemoryFractionPerCompaction) {
props.setProperty(MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP, String.valueOf(maxMemoryFractionPerCompaction));
return this;
}
public Builder withMaxDFSStreamBufferSize(int maxStreamBufferSize) {
props.setProperty(MAX_DFS_STREAM_BUFFER_SIZE_PROP, String.valueOf(maxStreamBufferSize));
return this;
}
@@ -130,19 +126,16 @@ public class HoodieMemoryConfig extends DefaultHoodieConfig {
if (SparkEnv.get() != null) {
// 1 GB is the default conf used by Spark, look at SparkContext.scala
long executorMemoryInBytes = Utils.memoryStringToMb(
    SparkEnv.get().conf().get(SPARK_EXECUTOR_MEMORY_PROP, DEFAULT_SPARK_EXECUTOR_MEMORY_MB)) * 1024 * 1024L;
// 0.6 is the default value used by Spark,
// look at {@link
// https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/SparkConf.scala#L507}
double memoryFraction = Double.valueOf(
    SparkEnv.get().conf().get(SPARK_EXECUTOR_MEMORY_FRACTION_PROP, DEFAULT_SPARK_EXECUTOR_MEMORY_FRACTION));
double maxMemoryFractionForMerge = Double.valueOf(maxMemoryFraction);
double userAvailableMemory = executorMemoryInBytes * (1 - memoryFraction);
long maxMemoryForMerge = (long) Math.floor(userAvailableMemory * maxMemoryFractionForMerge);
return maxMemoryForMerge;
} else {
return DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES;
@@ -151,29 +144,19 @@ public class HoodieMemoryConfig extends DefaultHoodieConfig {
public HoodieMemoryConfig build() {
HoodieMemoryConfig config = new HoodieMemoryConfig(props);
setDefaultOnCondition(props, !props.containsKey(MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP),
    MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP, DEFAULT_MAX_MEMORY_FRACTION_FOR_COMPACTION);
setDefaultOnCondition(props, !props.containsKey(MAX_MEMORY_FRACTION_FOR_MERGE_PROP),
    MAX_MEMORY_FRACTION_FOR_MERGE_PROP, DEFAULT_MAX_MEMORY_FRACTION_FOR_MERGE);
setDefaultOnCondition(props, !props.containsKey(MAX_MEMORY_FOR_MERGE_PROP), MAX_MEMORY_FOR_MERGE_PROP,
    String.valueOf(getMaxMemoryAllowedForMerge(props.getProperty(MAX_MEMORY_FRACTION_FOR_MERGE_PROP))));
setDefaultOnCondition(props, !props.containsKey(MAX_MEMORY_FOR_COMPACTION_PROP), MAX_MEMORY_FOR_COMPACTION_PROP,
    String.valueOf(getMaxMemoryAllowedForMerge(props.getProperty(MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP))));
setDefaultOnCondition(props, !props.containsKey(MAX_DFS_STREAM_BUFFER_SIZE_PROP), MAX_DFS_STREAM_BUFFER_SIZE_PROP,
    String.valueOf(DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE));
setDefaultOnCondition(props, !props.containsKey(SPILLABLE_MAP_BASE_PATH_PROP), SPILLABLE_MAP_BASE_PATH_PROP,
    DEFAULT_SPILLABLE_MAP_BASE_PATH);
setDefaultOnCondition(props, !props.containsKey(WRITESTATUS_FAILURE_FRACTION_PROP),
    WRITESTATUS_FAILURE_FRACTION_PROP, String.valueOf(DEFAULT_WRITESTATUS_FAILURE_FRACTION));
return config;
}
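
A worked example of the getMaxMemoryAllowedForMerge arithmetic above, using hypothetical Spark settings; the 0.6 merge fraction is chosen purely for illustration, mirroring the compaction default declared in this class.

long executorMemoryInBytes = 4L * 1024 * 1024 * 1024;                              // spark.executor.memory = 4g
double sparkMemoryFraction = 0.6;                                                  // fraction reserved by Spark itself
double mergeFraction = 0.6;                                                        // MAX_MEMORY_FRACTION_FOR_MERGE_PROP, illustrative
double userAvailableMemory = executorMemoryInBytes * (1 - sparkMemoryFraction);    // ~1.6 GB left for user code
long maxMemoryForMerge = (long) Math.floor(userAvailableMemory * mergeFraction);   // ~0.96 GB for the spillable map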


@@ -35,8 +35,7 @@ public class HoodieMetricsConfig extends DefaultHoodieConfig {
public static final String METRICS_ON = METRIC_PREFIX + ".on";
public static final boolean DEFAULT_METRICS_ON = false;
public static final String METRICS_REPORTER_TYPE = METRIC_PREFIX + ".reporter.type";
public static final MetricsReporterType DEFAULT_METRICS_REPORTER_TYPE = MetricsReporterType.GRAPHITE;
// Graphite
public static final String GRAPHITE_PREFIX = METRIC_PREFIX + ".graphite";
@@ -103,8 +102,7 @@ public class HoodieMetricsConfig extends DefaultHoodieConfig {
public HoodieMetricsConfig build() {
HoodieMetricsConfig config = new HoodieMetricsConfig(props);
setDefaultOnCondition(props, !props.containsKey(METRICS_ON), METRICS_ON, String.valueOf(DEFAULT_METRICS_ON));
setDefaultOnCondition(props, !props.containsKey(METRICS_REPORTER_TYPE), METRICS_REPORTER_TYPE,
    DEFAULT_METRICS_REPORTER_TYPE.name());
setDefaultOnCondition(props, !props.containsKey(GRAPHITE_SERVER_HOST), GRAPHITE_SERVER_HOST,


@@ -38,8 +38,7 @@ public class HoodieStorageConfig extends DefaultHoodieConfig {
public static final String DEFAULT_PARQUET_PAGE_SIZE_BYTES = String.valueOf(1 * 1024 * 1024);
// used to size log files
public static final String LOGFILE_SIZE_MAX_BYTES = "hoodie.logfile.max.size";
public static final String DEFAULT_LOGFILE_SIZE_MAX_BYTES = String.valueOf(1024 * 1024 * 1024); // 1 GB
// used to size data blocks in log file
public static final String LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES = "hoodie.logfile.data.block.max.size";
public static final String DEFAULT_LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES = String.valueOf(256 * 1024 * 1024); // 256 MB
@@ -122,20 +121,20 @@ public class HoodieStorageConfig extends DefaultHoodieConfig {
public HoodieStorageConfig build() {
HoodieStorageConfig config = new HoodieStorageConfig(props);
setDefaultOnCondition(props, !props.containsKey(PARQUET_FILE_MAX_BYTES), PARQUET_FILE_MAX_BYTES,
    DEFAULT_PARQUET_FILE_MAX_BYTES);
setDefaultOnCondition(props, !props.containsKey(PARQUET_BLOCK_SIZE_BYTES), PARQUET_BLOCK_SIZE_BYTES,
    DEFAULT_PARQUET_BLOCK_SIZE_BYTES);
setDefaultOnCondition(props, !props.containsKey(PARQUET_PAGE_SIZE_BYTES), PARQUET_PAGE_SIZE_BYTES,
    DEFAULT_PARQUET_PAGE_SIZE_BYTES);
setDefaultOnCondition(props, !props.containsKey(LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES),
    LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES, DEFAULT_LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES);
setDefaultOnCondition(props, !props.containsKey(LOGFILE_SIZE_MAX_BYTES), LOGFILE_SIZE_MAX_BYTES,
    DEFAULT_LOGFILE_SIZE_MAX_BYTES);
setDefaultOnCondition(props, !props.containsKey(PARQUET_COMPRESSION_RATIO), PARQUET_COMPRESSION_RATIO,
    DEFAULT_STREAM_COMPRESSION_RATIO);
setDefaultOnCondition(props, !props.containsKey(PARQUET_COMPRESSION_CODEC), PARQUET_COMPRESSION_CODEC,
    DEFAULT_PARQUET_COMPRESSION_CODEC);
setDefaultOnCondition(props, !props.containsKey(LOGFILE_TO_PARQUET_COMPRESSION_RATIO),
    LOGFILE_TO_PARQUET_COMPRESSION_RATIO, DEFAULT_LOGFILE_TO_PARQUET_COMPRESSION_RATIO);
return config;
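
A small sketch of overriding the log-file sizing keys declared above, using the property names visible in this class; values are illustrative and anything left unset keeps its DEFAULT_* value in build().

Properties storageProps = new Properties();
storageProps.setProperty("hoodie.logfile.max.size", String.valueOf(512 * 1024 * 1024));             // 512 MB log files
storageProps.setProperty("hoodie.logfile.data.block.max.size", String.valueOf(128 * 1024 * 1024));  // 128 MB data blocks
HoodieStorageConfig storageConfig = HoodieStorageConfig.newBuilder().fromProperties(storageProps).build();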


@@ -61,8 +61,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
private static final String DEFAULT_WRITE_STATUS_STORAGE_LEVEL = "MEMORY_AND_DISK_SER";
private static final String HOODIE_AUTO_COMMIT_PROP = "hoodie.auto.commit";
private static final String DEFAULT_HOODIE_AUTO_COMMIT = "true";
private static final String HOODIE_ASSUME_DATE_PARTITIONING_PROP = "hoodie.assume.date" + ".partitioning";
private static final String DEFAULT_ASSUME_DATE_PARTITIONING = "false";
private static final String HOODIE_WRITE_STATUS_CLASS_PROP = "hoodie.writestatus.class";
private static final String DEFAULT_HOODIE_WRITE_STATUS_CLASS = WriteStatus.class.getName();
@@ -143,8 +142,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
}
public int getWriteBufferLimitBytes() {
return Integer.parseInt(props.getProperty(WRITE_BUFFER_LIMIT_BYTES, DEFAULT_WRITE_BUFFER_LIMIT_BYTES));
}
public boolean shouldCombineBeforeInsert() {
@@ -191,18 +189,15 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
 * compaction properties
 **/
public HoodieCleaningPolicy getCleanerPolicy() {
return HoodieCleaningPolicy.valueOf(props.getProperty(HoodieCompactionConfig.CLEANER_POLICY_PROP));
}
public int getCleanerFileVersionsRetained() {
return Integer.parseInt(props.getProperty(HoodieCompactionConfig.CLEANER_FILE_VERSIONS_RETAINED_PROP));
}
public int getCleanerCommitsRetained() {
return Integer.parseInt(props.getProperty(HoodieCompactionConfig.CLEANER_COMMITS_RETAINED_PROP));
}
public int getMaxCommitsToKeep() {
@@ -214,23 +209,19 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
}
public int getParquetSmallFileLimit() {
return Integer.parseInt(props.getProperty(HoodieCompactionConfig.PARQUET_SMALL_FILE_LIMIT_BYTES));
}
public int getCopyOnWriteInsertSplitSize() {
return Integer.parseInt(props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE));
}
public int getCopyOnWriteRecordSizeEstimate() {
return Integer.parseInt(props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE));
}
public boolean shouldAutoTuneInsertSplits() {
return Boolean.parseBoolean(props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS));
}
public int getCleanerParallelism() {
@@ -246,28 +237,23 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
}
public int getInlineCompactDeltaCommitMax() {
return Integer.parseInt(props.getProperty(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS_PROP));
}
public CompactionStrategy getCompactionStrategy() {
return ReflectionUtils.loadClass(props.getProperty(HoodieCompactionConfig.COMPACTION_STRATEGY_PROP));
}
public Long getTargetIOPerCompactionInMB() {
return Long.parseLong(props.getProperty(HoodieCompactionConfig.TARGET_IO_PER_COMPACTION_IN_MB_PROP));
}
public Boolean getCompactionLazyBlockReadEnabled() {
return Boolean.valueOf(props.getProperty(HoodieCompactionConfig.COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP));
}
public Boolean getCompactionReverseLogReadEnabled() {
return Boolean.valueOf(props.getProperty(HoodieCompactionConfig.COMPACTION_REVERSE_LOG_READ_ENABLED_PROP));
}
public String getPayloadClass() {
@@ -275,13 +261,11 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
}
public int getTargetPartitionsPerDayBasedCompaction() {
return Integer.parseInt(props.getProperty(HoodieCompactionConfig.TARGET_PARTITIONS_PER_DAYBASED_COMPACTION_PROP));
}
public int getCommitArchivalBatchSize() {
return Integer.parseInt(props.getProperty(HoodieCompactionConfig.COMMITS_ARCHIVAL_BATCH_SIZE_PROP));
}
/**
@@ -352,9 +336,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
}
/**
 * Fraction of the global share of QPS that should be allocated to this job. Let's say there are 3 jobs which have
 * input size in terms of number of rows required for HbaseIndexing as x, 2x, 3x respectively. Then this fraction for
 * the jobs would be (0.17) 1/6, 0.33 (2/6) and 0.5 (3/6) respectively.
 */
public float getHbaseIndexQPSFraction() {
@@ -370,8 +353,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
}
/**
 * This should be same across various jobs. This is intended to limit the aggregate QPS generated across various
 * Hoodie jobs to an Hbase Region Server
 */
public int getHbaseIndexMaxQPSPerRegionServer() {
return Integer.parseInt(props.getProperty(HoodieHBaseIndexConfig.HBASE_MAX_QPS_PER_REGION_SERVER_PROP));
@@ -382,8 +365,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
}
public boolean getBloomIndexPruneByRanges() {
return Boolean.parseBoolean(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_PRUNE_BY_RANGES_PROP));
}
public boolean getBloomIndexUseCaching() {
@@ -403,8 +385,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
}
public StorageLevel getBloomIndexInputStorageLevel() {
return StorageLevel.fromString(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_INPUT_STORAGE_LEVEL));
}
/**
@@ -423,8 +404,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
}
public int getLogFileDataBlockMaxSize() {
return Integer.parseInt(props.getProperty(HoodieStorageConfig.LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES));
}
public int getLogFileMaxSize() {
@@ -451,8 +431,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
}
public MetricsReporterType getMetricsReporterType() {
return MetricsReporterType.valueOf(props.getProperty(HoodieMetricsConfig.METRICS_REPORTER_TYPE));
}
public String getGraphiteServerHost() {
@@ -475,9 +454,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
}
public Double getMaxMemoryFractionPerCompaction() {
return Double.valueOf(props.getProperty(HoodieMemoryConfig.MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP));
}
public Long getMaxMemoryPerPartitionMerge() {
@@ -637,8 +614,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
}
public Builder withAssumeDatePartitioning(boolean assumeDatePartitioning) {
props.setProperty(HOODIE_ASSUME_DATE_PARTITIONING_PROP, String.valueOf(assumeDatePartitioning));
return this;
}
@@ -671,48 +647,42 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
public HoodieWriteConfig build() {
// Check for mandatory properties
setDefaultOnCondition(props, !props.containsKey(INSERT_PARALLELISM), INSERT_PARALLELISM, DEFAULT_PARALLELISM);
setDefaultOnCondition(props, !props.containsKey(BULKINSERT_PARALLELISM), BULKINSERT_PARALLELISM,
    DEFAULT_PARALLELISM);
setDefaultOnCondition(props, !props.containsKey(UPSERT_PARALLELISM), UPSERT_PARALLELISM, DEFAULT_PARALLELISM);
setDefaultOnCondition(props, !props.containsKey(COMBINE_BEFORE_INSERT_PROP), COMBINE_BEFORE_INSERT_PROP,
    DEFAULT_COMBINE_BEFORE_INSERT);
setDefaultOnCondition(props, !props.containsKey(COMBINE_BEFORE_UPSERT_PROP), COMBINE_BEFORE_UPSERT_PROP,
    DEFAULT_COMBINE_BEFORE_UPSERT);
setDefaultOnCondition(props, !props.containsKey(WRITE_STATUS_STORAGE_LEVEL), WRITE_STATUS_STORAGE_LEVEL,
    DEFAULT_WRITE_STATUS_STORAGE_LEVEL);
setDefaultOnCondition(props, !props.containsKey(HOODIE_AUTO_COMMIT_PROP), HOODIE_AUTO_COMMIT_PROP,
    DEFAULT_HOODIE_AUTO_COMMIT);
setDefaultOnCondition(props, !props.containsKey(HOODIE_ASSUME_DATE_PARTITIONING_PROP),
    HOODIE_ASSUME_DATE_PARTITIONING_PROP, DEFAULT_ASSUME_DATE_PARTITIONING);
setDefaultOnCondition(props, !props.containsKey(HOODIE_WRITE_STATUS_CLASS_PROP), HOODIE_WRITE_STATUS_CLASS_PROP,
    DEFAULT_HOODIE_WRITE_STATUS_CLASS);
setDefaultOnCondition(props, !props.containsKey(FINALIZE_WRITE_PARALLELISM), FINALIZE_WRITE_PARALLELISM,
    DEFAULT_FINALIZE_WRITE_PARALLELISM);
setDefaultOnCondition(props, !props.containsKey(EMBEDDED_TIMELINE_SERVER_ENABLED),
    EMBEDDED_TIMELINE_SERVER_ENABLED, DEFAULT_EMBEDDED_TIMELINE_SERVER_ENABLED);
setDefaultOnCondition(props, !props.containsKey(INITIAL_CONSISTENCY_CHECK_INTERVAL_MS_PROP),
    INITIAL_CONSISTENCY_CHECK_INTERVAL_MS_PROP, String.valueOf(DEFAULT_INITIAL_CONSISTENCY_CHECK_INTERVAL_MS));
setDefaultOnCondition(props, !props.containsKey(MAX_CONSISTENCY_CHECK_INTERVAL_MS_PROP),
    MAX_CONSISTENCY_CHECK_INTERVAL_MS_PROP, String.valueOf(DEFAULT_MAX_CONSISTENCY_CHECK_INTERVAL_MS));
setDefaultOnCondition(props, !props.containsKey(MAX_CONSISTENCY_CHECKS_PROP), MAX_CONSISTENCY_CHECKS_PROP,
    String.valueOf(DEFAULT_MAX_CONSISTENCY_CHECKS));
setDefaultOnCondition(props, !props.containsKey(FAIL_ON_TIMELINE_ARCHIVING_ENABLED_PROP),
    FAIL_ON_TIMELINE_ARCHIVING_ENABLED_PROP, DEFAULT_FAIL_ON_TIMELINE_ARCHIVING_ENABLED);
// Make sure the props is propagated
setDefaultOnCondition(props, !isIndexConfigSet, HoodieIndexConfig.newBuilder().fromProperties(props).build());
setDefaultOnCondition(props, !isStorageConfigSet, HoodieStorageConfig.newBuilder().fromProperties(props).build());
setDefaultOnCondition(props, !isCompactionConfigSet,
    HoodieCompactionConfig.newBuilder().fromProperties(props).build());
setDefaultOnCondition(props, !isMetricsConfigSet, HoodieMetricsConfig.newBuilder().fromProperties(props).build());
setDefaultOnCondition(props, !isMemoryConfigSet, HoodieMemoryConfig.newBuilder().fromProperties(props).build());
setDefaultOnCondition(props, !isViewConfigSet,
    FileSystemViewStorageConfig.newBuilder().fromProperties(props).build());
setDefaultOnCondition(props, !isConsistencyGuardSet,
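
For context, a sketch of how these pieces typically come together. HoodieWriteConfig.newBuilder() is assumed here by analogy with the other config classes in this diff and is not itself shown above.

HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()   // assumed factory method
    .withAssumeDatePartitioning(false)                           // shown above: sets hoodie.assume.date.partitioning
    .build();
// build() back-fills every unset key with its DEFAULT_* value and, when the is*ConfigSet flags are
// false, pulls in the index, storage, compaction, metrics, memory and view configs through their
// own fromProperties(props) builders, so one Properties object carries the whole configuration.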


@@ -19,8 +19,9 @@
package org.apache.hudi.exception;
/**
 * <p>
 * Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a delta commit
 * </p>
 */
public class HoodieAppendException extends HoodieException {

View File

@@ -19,7 +19,8 @@
package org.apache.hudi.exception;
/**
- * <p> Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a Commit
+ * <p>
+ * Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a Commit
* </p>
*/
public class HoodieCommitException extends HoodieException {

View File

@@ -20,7 +20,9 @@ package org.apache.hudi.exception;
/**
- * <p> Exception thrown when dependent system is not available </p>
+ * <p>
+ * Exception thrown when dependent system is not available
+ * </p>
*/
public class HoodieDependentSystemUnavailableException extends HoodieException {

View File

@@ -19,8 +19,9 @@
package org.apache.hudi.exception;
/**
- * <p> Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a bulk
- * insert </p>
+ * <p>
+ * Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a bulk insert
+ * </p>
*/
public class HoodieInsertException extends HoodieException {

View File

@@ -19,8 +19,9 @@
package org.apache.hudi.exception;
/**
- * <p> Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a
- * incremental upsert </p>
+ * <p>
+ * Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a incremental upsert
+ * </p>
*/
public class HoodieUpsertException extends HoodieException {

View File

@@ -31,16 +31,16 @@ import org.apache.spark.api.java.function.Function2;
/**
* Map function that handles a sorted stream of HoodieRecords
*/
- public class BulkInsertMapFunction<T extends HoodieRecordPayload> implements
- Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<List<WriteStatus>>> {
+ public class BulkInsertMapFunction<T extends HoodieRecordPayload>
+ implements Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<List<WriteStatus>>> {
private String commitTime;
private HoodieWriteConfig config;
private HoodieTable<T> hoodieTable;
private List<String> fileIDPrefixes;
- public BulkInsertMapFunction(String commitTime, HoodieWriteConfig config,
- HoodieTable<T> hoodieTable, List<String> fileIDPrefixes) {
+ public BulkInsertMapFunction(String commitTime, HoodieWriteConfig config, HoodieTable<T> hoodieTable,
+ List<String> fileIDPrefixes) {
this.commitTime = commitTime;
this.config = config;
this.hoodieTable = hoodieTable;

View File

@@ -37,11 +37,10 @@ import org.apache.hudi.io.HoodieWriteHandle;
import org.apache.hudi.table.HoodieTable;
/**
- * Lazy Iterable, that writes a stream of HoodieRecords sorted by the partitionPath, into new
- * files.
+ * Lazy Iterable, that writes a stream of HoodieRecords sorted by the partitionPath, into new files.
*/
- public class CopyOnWriteLazyInsertIterable<T extends HoodieRecordPayload> extends
- LazyIterableIterator<HoodieRecord<T>, List<WriteStatus>> {
+ public class CopyOnWriteLazyInsertIterable<T extends HoodieRecordPayload>
+ extends LazyIterableIterator<HoodieRecord<T>, List<WriteStatus>> {
protected final HoodieWriteConfig hoodieConfig;
protected final String commitTime;
@@ -80,25 +79,23 @@ public class CopyOnWriteLazyInsertIterable<T extends HoodieRecordPayload> extend
* Transformer function to help transform a HoodieRecord. This transformer is used by BufferedIterator to offload some
* expensive operations of transformation to the reader thread.
*/
- static <T extends HoodieRecordPayload> Function<HoodieRecord<T>,
- HoodieInsertValueGenResult<HoodieRecord>> getTransformFunction(Schema schema) {
+ static <T extends HoodieRecordPayload> Function<HoodieRecord<T>, HoodieInsertValueGenResult<HoodieRecord>> getTransformFunction(
+ Schema schema) {
return hoodieRecord -> new HoodieInsertValueGenResult(hoodieRecord, schema);
}
@Override
- protected void start() {
- }
+ protected void start() {}
@Override
protected List<WriteStatus> computeNext() {
// Executor service used for launching writer thread.
- BoundedInMemoryExecutor<HoodieRecord<T>,
- HoodieInsertValueGenResult<HoodieRecord>, List<WriteStatus>> bufferedIteratorExecutor = null;
+ BoundedInMemoryExecutor<HoodieRecord<T>, HoodieInsertValueGenResult<HoodieRecord>, List<WriteStatus>> bufferedIteratorExecutor =
+ null;
try {
final Schema schema = new Schema.Parser().parse(hoodieConfig.getSchema());
bufferedIteratorExecutor =
- new SparkBoundedInMemoryExecutor<>(hoodieConfig, inputItr,
- getInsertHandler(), getTransformFunction(schema));
+ new SparkBoundedInMemoryExecutor<>(hoodieConfig, inputItr, getInsertHandler(), getTransformFunction(schema));
final List<WriteStatus> result = bufferedIteratorExecutor.execute();
assert result != null && !result.isEmpty() && !bufferedIteratorExecutor.isRemaining();
return result;
@@ -112,8 +109,7 @@ public class CopyOnWriteLazyInsertIterable<T extends HoodieRecordPayload> extend
}
@Override
- protected void end() {
- }
+ protected void end() {}
protected String getNextFileId(String idPfx) {
return String.format("%s-%d", idPfx, numFilesWritten++);
@@ -124,11 +120,10 @@ public class CopyOnWriteLazyInsertIterable<T extends HoodieRecordPayload> extend
}
/**
- * Consumes stream of hoodie records from in-memory queue and
- * writes to one or more create-handles
+ * Consumes stream of hoodie records from in-memory queue and writes to one or more create-handles
*/
- protected class CopyOnWriteInsertHandler extends
- BoundedInMemoryQueueConsumer<HoodieInsertValueGenResult<HoodieRecord>, List<WriteStatus>> {
+ protected class CopyOnWriteInsertHandler
+ extends BoundedInMemoryQueueConsumer<HoodieInsertValueGenResult<HoodieRecord>, List<WriteStatus>> {
protected final List<WriteStatus> statuses = new ArrayList<>();
protected HoodieWriteHandle handle;

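The iterable above hands each incoming record to a bounded in-memory queue so that the reader thread absorbs the expensive record transformation before a consumer writes the result. The stand-alone sketch below shows that producer/consumer hand-off with plain java.util.concurrent types; the class name, queue capacity and transform are illustrative assumptions, not Hudi's BoundedInMemoryExecutor API.

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.function.Function;

// Illustrative bounded hand-off: the producer thread applies the (potentially
// expensive) transform, the consumer only drains results, mirroring how the
// iterable above offloads work to the reader thread.
public class BoundedTransformPipeline<I, O> {

  private final BlockingQueue<O> queue;
  private final Function<I, O> transform;

  public BoundedTransformPipeline(int capacity, Function<I, O> transform) {
    this.queue = new ArrayBlockingQueue<>(capacity);
    this.transform = transform;
  }

  public List<O> run(Iterator<I> input) throws InterruptedException {
    List<O> results = new ArrayList<>();
    Thread producer = new Thread(() -> {
      while (input.hasNext()) {
        try {
          queue.put(transform.apply(input.next())); // blocks when the queue is full
        } catch (InterruptedException e) {
          Thread.currentThread().interrupt();
          return;
        }
      }
    });
    producer.start();
    // Consumer side: drain until the producer is done and nothing is left queued.
    while (producer.isAlive() || !queue.isEmpty()) {
      O item = queue.poll(10, TimeUnit.MILLISECONDS);
      if (item != null) {
        results.add(item); // a Hudi write handle would persist the record here
      }
    }
    producer.join();
    return results;
  }
}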
View File

@@ -21,16 +21,15 @@ package org.apache.hudi.func;
import java.util.Iterator;
/**
- * (NOTE: Adapted from Apache SystemML) This class is a generic base class for lazy, single pass
- * inputItr classes in order to simplify the implementation of lazy iterators for mapPartitions use
- * cases. Note [SPARK-3369], which gives the reasons for backwards compatibility with regard to the
- * iterable API despite Spark's single pass nature.
+ * (NOTE: Adapted from Apache SystemML) This class is a generic base class for lazy, single pass inputItr classes in
+ * order to simplify the implementation of lazy iterators for mapPartitions use cases. Note [SPARK-3369], which gives
+ * the reasons for backwards compatibility with regard to the iterable API despite Spark's single pass nature.
* <p>
* Provide a way to obtain a inputItr of type O (output), out of an inputItr of type I (input)
* <p>
- * Things to remember: - Assumes Spark calls hasNext() to check for elements, before calling next()
- * to obtain them - Assumes hasNext() gets called atleast once. - Concrete Implementation is
- * responsible for calling inputIterator.next() and doing the processing in computeNext()
+ * Things to remember: - Assumes Spark calls hasNext() to check for elements, before calling next() to obtain them -
+ * Assumes hasNext() gets called atleast once. - Concrete Implementation is responsible for calling inputIterator.next()
+ * and doing the processing in computeNext()
*/
public abstract class LazyIterableIterator<I, O> implements Iterable<O>, Iterator<O> {

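The contract described in that javadoc (hasNext() consulted before next(), concrete classes doing their work in computeNext(), a single pass over the input) can be illustrated with a minimal stand-alone version. The class below is a simplified sketch, not the Hudi LazyIterableIterator itself.

import java.util.Iterator;

// Simplified sketch of a lazy, single-pass transforming iterator: output-side
// hasNext()/next() pull from the input iterator on demand.
public abstract class SimpleLazyIterator<I, O> implements Iterator<O> {

  protected final Iterator<I> inputItr;

  protected SimpleLazyIterator(Iterator<I> inputItr) {
    this.inputItr = inputItr;
  }

  /** Concrete implementations consume inputItr.next() and produce one output element. */
  protected abstract O computeNext();

  @Override
  public boolean hasNext() {
    return inputItr.hasNext();
  }

  @Override
  public O next() {
    // Assumes hasNext() was called first, as the javadoc above notes for Spark.
    return computeNext();
  }

  // Example usage: an iterator that upper-cases strings lazily.
  public static SimpleLazyIterator<String, String> upperCasing(Iterator<String> in) {
    return new SimpleLazyIterator<String, String>(in) {
      @Override
      protected String computeNext() {
        return inputItr.next().toUpperCase();
      }
    };
  }
}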
View File

@@ -29,11 +29,9 @@ import org.apache.hudi.io.HoodieAppendHandle;
import org.apache.hudi.table.HoodieTable;
/**
- * Lazy Iterable, that writes a stream of HoodieRecords sorted by the partitionPath, into new
- * log files.
+ * Lazy Iterable, that writes a stream of HoodieRecords sorted by the partitionPath, into new log files.
*/
- public class MergeOnReadLazyInsertIterable<T extends HoodieRecordPayload> extends
- CopyOnWriteLazyInsertIterable<T> {
+ public class MergeOnReadLazyInsertIterable<T extends HoodieRecordPayload> extends CopyOnWriteLazyInsertIterable<T> {
public MergeOnReadLazyInsertIterable(Iterator<HoodieRecord<T>> sortedRecordItr, HoodieWriteConfig config,
String commitTime, HoodieTable<T> hoodieTable, String idPfx) {

View File

@@ -32,8 +32,7 @@ public class OperationResult<T> implements Serializable {
private boolean success;
private Option<Exception> exception;
- public OperationResult() {
- }
+ public OperationResult() {}
public OperationResult(T operation, boolean success, Option<Exception> exception) {
this.operation = operation;
@@ -67,11 +66,7 @@ public class OperationResult<T> implements Serializable {
@Override
public String toString() {
- return "OperationResult{"
- + "operation=" + operation
- + ", executed=" + executed
- + ", success=" + success
- + ", exception=" + exception
- + '}';
+ return "OperationResult{" + "operation=" + operation + ", executed=" + executed + ", success=" + success
+ + ", exception=" + exception + '}';
}
}

View File

@@ -25,8 +25,8 @@ import org.apache.hudi.exception.HoodieIOException;
import org.apache.parquet.hadoop.ParquetReader;
/**
- * This class wraps a parquet reader and provides an iterator based api to
- * read from a parquet file. This is used in {@link BoundedInMemoryQueue}
+ * This class wraps a parquet reader and provides an iterator based api to read from a parquet file. This is used in
+ * {@link BoundedInMemoryQueue}
*/
public class ParquetReaderIterator<T> implements Iterator<T> {

View File

@@ -36,17 +36,13 @@ public class SparkBoundedInMemoryExecutor<I, O, E> extends BoundedInMemoryExecut
final TaskContext sparkThreadTaskContext;
public SparkBoundedInMemoryExecutor(final HoodieWriteConfig hoodieConfig, final Iterator<I> inputItr,
- BoundedInMemoryQueueConsumer<O, E> consumer,
- Function<I, O> bufferedIteratorTransform) {
+ BoundedInMemoryQueueConsumer<O, E> consumer, Function<I, O> bufferedIteratorTransform) {
this(hoodieConfig, new IteratorBasedQueueProducer<>(inputItr), consumer, bufferedIteratorTransform);
}
- public SparkBoundedInMemoryExecutor(final HoodieWriteConfig hoodieConfig,
- BoundedInMemoryQueueProducer<I> producer,
- BoundedInMemoryQueueConsumer<O, E> consumer,
- Function<I, O> bufferedIteratorTransform) {
- super(hoodieConfig.getWriteBufferLimitBytes(), producer,
- Option.of(consumer), bufferedIteratorTransform);
+ public SparkBoundedInMemoryExecutor(final HoodieWriteConfig hoodieConfig, BoundedInMemoryQueueProducer<I> producer,
+ BoundedInMemoryQueueConsumer<O, E> consumer, Function<I, O> bufferedIteratorTransform) {
+ super(hoodieConfig.getWriteBufferLimitBytes(), producer, Option.of(consumer), bufferedIteratorTransform);
this.sparkThreadTaskContext = TaskContext.get();
}

View File

@@ -65,18 +65,18 @@ public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Seri
}
/**
- * Checks if the given [Keys] exists in the hoodie table and returns [Key, Option[partitionPath, fileID]]
- * If the optional is empty, then the key is not found.
+ * Checks if the given [Keys] exists in the hoodie table and returns [Key, Option[partitionPath, fileID]] If the
+ * optional is empty, then the key is not found.
*/
public abstract JavaPairRDD<HoodieKey, Option<Pair<String, String>>> fetchRecordLocation(
JavaRDD<HoodieKey> hoodieKeys, final JavaSparkContext jsc, HoodieTable<T> hoodieTable);
/**
- * Looks up the index and tags each incoming record with a location of a file that contains the
- * row (if it is actually present)
+ * Looks up the index and tags each incoming record with a location of a file that contains the row (if it is actually
+ * present)
*/
- public abstract JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD,
- JavaSparkContext jsc, HoodieTable<T> hoodieTable) throws HoodieIndexException;
+ public abstract JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, JavaSparkContext jsc,
+ HoodieTable<T> hoodieTable) throws HoodieIndexException;
/**
* Extracts the location of written records, and updates the index.
@@ -84,8 +84,7 @@ public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Seri
* TODO(vc): We may need to propagate the record as well in a WriteStatus class
*/
public abstract JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD, JavaSparkContext jsc,
- HoodieTable<T> hoodieTable)
- throws HoodieIndexException;
+ HoodieTable<T> hoodieTable) throws HoodieIndexException;
/**
* Rollback the efffects of the commit made at commitTime.
@@ -93,17 +92,17 @@ public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Seri
public abstract boolean rollbackCommit(String commitTime);
/**
- * An index is `global` if {@link HoodieKey} to fileID mapping, does not depend on the
- * `partitionPath`. Such an implementation is able to obtain the same mapping, for two hoodie keys
- * with same `recordKey` but different `partitionPath`
+ * An index is `global` if {@link HoodieKey} to fileID mapping, does not depend on the `partitionPath`. Such an
+ * implementation is able to obtain the same mapping, for two hoodie keys with same `recordKey` but different
+ * `partitionPath`
*
* @return whether or not, the index implementation is global in nature
*/
public abstract boolean isGlobal();
/**
- * This is used by storage to determine, if its safe to send inserts, straight to the log, i.e
- * having a {@link FileSlice}, with no data file.
+ * This is used by storage to determine, if its safe to send inserts, straight to the log, i.e having a
+ * {@link FileSlice}, with no data file.
*
* @return Returns true/false depending on whether the impl has this capability
*/
@@ -111,8 +110,8 @@ public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Seri
/**
- * An index is "implicit" with respect to storage, if just writing new data to a file slice,
- * updates the index as well. This is used by storage, to save memory footprint in certain cases.
+ * An index is "implicit" with respect to storage, if just writing new data to a file slice, updates the index as
+ * well. This is used by storage, to save memory footprint in certain cases.
*/
public abstract boolean isImplicitWithStorage();

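To make the index contract above concrete, here is a toy, non-Spark illustration of the same ideas: fetch a known location, tag incoming keys as inserts or updates, record the location after a write, and "global" meaning the mapping ignores partitionPath. The class name and the map-based storage are assumptions for the sketch only; the real implementations operate on JavaRDDs.

import java.util.HashMap;
import java.util.Map;
import java.util.Optional;

// Toy, in-memory stand-in for the index contract described above.
public class ToyGlobalIndex {

  // recordKey -> "partitionPath/fileId"; keyed by record key only, so it is "global".
  private final Map<String, String> keyToLocation = new HashMap<>();

  /** Looks up the index: an empty result means the key was never written. */
  public Optional<String> fetchRecordLocation(String recordKey) {
    return Optional.ofNullable(keyToLocation.get(recordKey));
  }

  /** Tags an incoming record as an update (known location) or an insert (unknown). */
  public String tagLocation(String recordKey) {
    return keyToLocation.containsKey(recordKey) ? "UPDATE@" + keyToLocation.get(recordKey) : "INSERT";
  }

  /** Called after writing, so later commits can find the record again. */
  public void updateLocation(String recordKey, String partitionPath, String fileId) {
    keyToLocation.put(recordKey, partitionPath + "/" + fileId);
  }

  /** Global: the mapping does not depend on partitionPath. */
  public boolean isGlobal() {
    return true;
  }
}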
View File

@@ -40,7 +40,9 @@ import org.apache.spark.api.java.function.Function2;
/**
- * Hoodie Index implementation backed by an in-memory Hash map. <p> ONLY USE FOR LOCAL TESTING
+ * Hoodie Index implementation backed by an in-memory Hash map.
+ * <p>
+ * ONLY USE FOR LOCAL TESTING
*/
public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
@@ -122,12 +124,10 @@ public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieInde
/**
* Function that tags each HoodieRecord with an existing location, if known.
*/
- class LocationTagFunction implements
- Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<HoodieRecord<T>>> {
+ class LocationTagFunction implements Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<HoodieRecord<T>>> {
@Override
- public Iterator<HoodieRecord<T>> call(Integer partitionNum,
- Iterator<HoodieRecord<T>> hoodieRecordIterator) {
+ public Iterator<HoodieRecord<T>> call(Integer partitionNum, Iterator<HoodieRecord<T>> hoodieRecordIterator) {
List<HoodieRecord<T>> taggedRecords = new ArrayList<>();
while (hoodieRecordIterator.hasNext()) {
HoodieRecord<T> rec = hoodieRecordIterator.next();

View File

@@ -35,6 +35,7 @@ import org.apache.spark.Partitioner;
* Partitions bloom filter checks by spreading out comparisons across buckets of work.
*
* Each bucket incurs the following cost
+ *
* <pre>
* 1) Read bloom filter from file footer
* 2) Check keys against bloom filter
@@ -47,6 +48,7 @@ import org.apache.spark.Partitioner;
* could bound the amount of skew to std_dev(numberOfBucketsPerPartition) * cost of (3), lower than sort partitioning.
*
* Approach has two goals :
+ *
* <pre>
* 1) Pack as many buckets from same file group into same partition, to amortize cost of (1) and (2) further
* 2) Spread buckets across partitions evenly to achieve skew reduction
@@ -76,8 +78,7 @@ public class BucketizedBloomCheckPartitioner extends Partitioner {
Map<String, Integer> bucketsPerFileGroup = new HashMap<>();
// Compute the buckets needed per file group, using simple uniform distribution
- fileGroupToComparisons.forEach((f, c) ->
- bucketsPerFileGroup.put(f, (int) Math.ceil((c * 1.0) / keysPerBucket)));
+ fileGroupToComparisons.forEach((f, c) -> bucketsPerFileGroup.put(f, (int) Math.ceil((c * 1.0) / keysPerBucket)));
int totalBuckets = bucketsPerFileGroup.values().stream().mapToInt(i -> i).sum();
// If totalBuckets > targetPartitions, no need to have extra partitions
this.partitions = Math.min(targetPartitions, totalBuckets);

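The hunk above sizes each file group at ceil(comparisons / keysPerBucket) buckets and caps the partition count at the total number of buckets. A self-contained sketch of that sizing arithmetic, with made-up input numbers:

import java.util.HashMap;
import java.util.Map;

// Sketch of the bucket sizing logic described above: each file group gets
// ceil(comparisons / keysPerBucket) buckets, and the number of partitions is
// capped at the total bucket count.
public class BucketSizingExample {

  public static void main(String[] args) {
    Map<String, Long> fileGroupToComparisons = new HashMap<>();
    fileGroupToComparisons.put("fg-1", 250L);
    fileGroupToComparisons.put("fg-2", 40L);
    int keysPerBucket = 100;
    int targetPartitions = 10;

    Map<String, Integer> bucketsPerFileGroup = new HashMap<>();
    fileGroupToComparisons.forEach(
        (f, c) -> bucketsPerFileGroup.put(f, (int) Math.ceil((c * 1.0) / keysPerBucket)));

    int totalBuckets = bucketsPerFileGroup.values().stream().mapToInt(i -> i).sum();
    int partitions = Math.min(targetPartitions, totalBuckets);

    // fg-1 -> 3 buckets, fg-2 -> 1 bucket, 4 buckets total, 4 partitions used.
    System.out.println(bucketsPerFileGroup + " totalBuckets=" + totalBuckets + " partitions=" + partitions);
  }
}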
View File

@@ -78,12 +78,12 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
}
// Step 1: Extract out thinner JavaPairRDD of (partitionPath, recordKey)
- JavaPairRDD<String, String> partitionRecordKeyPairRDD = recordRDD
- .mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey()));
+ JavaPairRDD<String, String> partitionRecordKeyPairRDD =
+ recordRDD.mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey()));
// Lookup indexes for all the partition/recordkey pair
- JavaPairRDD<HoodieKey, HoodieRecordLocation> keyFilenamePairRDD = lookupIndex(partitionRecordKeyPairRDD, jsc,
- hoodieTable);
+ JavaPairRDD<HoodieKey, HoodieRecordLocation> keyFilenamePairRDD =
+ lookupIndex(partitionRecordKeyPairRDD, jsc, hoodieTable);
// Cache the result, for subsequent stages.
if (config.getBloomIndexUseCaching()) {
@@ -96,8 +96,7 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
// Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys
// Cost: 4 sec.
- JavaRDD<HoodieRecord<T>> taggedRecordRDD = tagLocationBacktoRecords(keyFilenamePairRDD,
- recordRDD);
+ JavaRDD<HoodieRecord<T>> taggedRecordRDD = tagLocationBacktoRecords(keyFilenamePairRDD, recordRDD);
if (config.getBloomIndexUseCaching()) {
recordRDD.unpersist(); // unpersist the input Record RDD
@@ -108,8 +107,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
}
/**
- * Returns an RDD mapping each HoodieKey with a partitionPath/fileID which contains it. Option.Empty if the key is
- * not found.
+ * Returns an RDD mapping each HoodieKey with a partitionPath/fileID which contains it. Option.Empty if the key is not
+ * found.
*
* @param hoodieKeys keys to lookup
* @param jsc spark context
@@ -118,12 +117,12 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
@Override
public JavaPairRDD<HoodieKey, Option<Pair<String, String>>> fetchRecordLocation(JavaRDD<HoodieKey> hoodieKeys,
JavaSparkContext jsc, HoodieTable<T> hoodieTable) {
- JavaPairRDD<String, String> partitionRecordKeyPairRDD = hoodieKeys
- .mapToPair(key -> new Tuple2<>(key.getPartitionPath(), key.getRecordKey()));
+ JavaPairRDD<String, String> partitionRecordKeyPairRDD =
+ hoodieKeys.mapToPair(key -> new Tuple2<>(key.getPartitionPath(), key.getRecordKey()));
// Lookup indexes for all the partition/recordkey pair
- JavaPairRDD<HoodieKey, HoodieRecordLocation> recordKeyLocationRDD = lookupIndex(partitionRecordKeyPairRDD, jsc,
- hoodieTable);
+ JavaPairRDD<HoodieKey, HoodieRecordLocation> recordKeyLocationRDD =
+ lookupIndex(partitionRecordKeyPairRDD, jsc, hoodieTable);
JavaPairRDD<HoodieKey, String> keyHoodieKeyPairRDD = hoodieKeys.mapToPair(key -> new Tuple2<>(key, null));
return keyHoodieKeyPairRDD.leftOuterJoin(recordKeyLocationRDD).mapToPair(keyLoc -> {
@@ -149,19 +148,19 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
List<String> affectedPartitionPathList = new ArrayList<>(recordsPerPartition.keySet());
// Step 2: Load all involved files as <Partition, filename> pairs
- List<Tuple2<String, BloomIndexFileInfo>> fileInfoList = loadInvolvedFiles(affectedPartitionPathList, jsc,
- hoodieTable);
- final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo = fileInfoList.stream()
- .collect(groupingBy(Tuple2::_1, mapping(Tuple2::_2, toList())));
+ List<Tuple2<String, BloomIndexFileInfo>> fileInfoList =
+ loadInvolvedFiles(affectedPartitionPathList, jsc, hoodieTable);
+ final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo =
+ fileInfoList.stream().collect(groupingBy(Tuple2::_1, mapping(Tuple2::_2, toList())));
// Step 3: Obtain a RDD, for each incoming record, that already exists, with the file id,
// that contains it.
- Map<String, Long> comparisonsPerFileGroup = computeComparisonsPerFileGroup(recordsPerPartition, partitionToFileInfo,
- partitionRecordKeyPairRDD);
+ Map<String, Long> comparisonsPerFileGroup =
+ computeComparisonsPerFileGroup(recordsPerPartition, partitionToFileInfo, partitionRecordKeyPairRDD);
int safeParallelism = computeSafeParallelism(recordsPerPartition, comparisonsPerFileGroup);
int joinParallelism = determineParallelism(partitionRecordKeyPairRDD.partitions().size(), safeParallelism);
- return findMatchingFilesForRecordKeys(partitionToFileInfo, partitionRecordKeyPairRDD, joinParallelism,
- hoodieTable, comparisonsPerFileGroup);
+ return findMatchingFilesForRecordKeys(partitionToFileInfo, partitionRecordKeyPairRDD, joinParallelism, hoodieTable,
+ comparisonsPerFileGroup);
}
/**
@@ -175,8 +174,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
if (config.getBloomIndexPruneByRanges()) {
// we will just try exploding the input and then count to determine comparisons
// FIX(vc): Only do sampling here and extrapolate?
- fileToComparisons = explodeRecordRDDWithFileComparisons(partitionToFileInfo,
- partitionRecordKeyPairRDD).mapToPair(t -> t).countByKey();
+ fileToComparisons = explodeRecordRDDWithFileComparisons(partitionToFileInfo, partitionRecordKeyPairRDD)
+ .mapToPair(t -> t).countByKey();
} else {
fileToComparisons = new HashMap<>();
partitionToFileInfo.entrySet().stream().forEach(e -> {
@@ -191,34 +190,41 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
/**
* Compute the minimum parallelism needed to play well with the spark 2GB limitation.. The index lookup can be skewed
- * in three dimensions : #files, #partitions, #records <p> To be able to smoothly handle skews, we need to compute how
- * to split each partitions into subpartitions. We do it here, in a way that keeps the amount of each Spark join
- * partition to < 2GB. <p> If {@link HoodieIndexConfig#BLOOM_INDEX_PARALLELISM_PROP} is
- * specified as a NON-zero number, then that is used explicitly.
+ * in three dimensions : #files, #partitions, #records
+ * <p>
+ * To be able to smoothly handle skews, we need to compute how to split each partitions into subpartitions. We do it
+ * here, in a way that keeps the amount of each Spark join partition to < 2GB.
+ * <p>
+ * If {@link HoodieIndexConfig#BLOOM_INDEX_PARALLELISM_PROP} is specified as a NON-zero number, then that is used
+ * explicitly.
*/
int computeSafeParallelism(Map<String, Long> recordsPerPartition, Map<String, Long> comparisonsPerFileGroup) {
long totalComparisons = comparisonsPerFileGroup.values().stream().mapToLong(Long::longValue).sum();
long totalFiles = comparisonsPerFileGroup.size();
long totalRecords = recordsPerPartition.values().stream().mapToLong(Long::longValue).sum();
int parallelism = (int) (totalComparisons / MAX_ITEMS_PER_SHUFFLE_PARTITION + 1);
- logger.info(String.format("TotalRecords %d, TotalFiles %d, TotalAffectedPartitions %d, TotalComparisons %d, "
- + "SafeParallelism %d", totalRecords, totalFiles, recordsPerPartition.size(), totalComparisons, parallelism));
+ logger.info(String.format(
+ "TotalRecords %d, TotalFiles %d, TotalAffectedPartitions %d, TotalComparisons %d, " + "SafeParallelism %d",
+ totalRecords, totalFiles, recordsPerPartition.size(), totalComparisons, parallelism));
return parallelism;
}
/**
- * Its crucial to pick the right parallelism. <p> totalSubPartitions : this is deemed safe limit, to be nice with
- * Spark. inputParallelism : typically number of input file splits <p> We pick the max such that, we are always safe,
- * but go higher if say a there are a lot of input files. (otherwise, we will fallback to number of partitions in
- * input and end up with slow performance)
+ * Its crucial to pick the right parallelism.
+ * <p>
+ * totalSubPartitions : this is deemed safe limit, to be nice with Spark. inputParallelism : typically number of input
+ * file splits
+ * <p>
+ * We pick the max such that, we are always safe, but go higher if say a there are a lot of input files. (otherwise,
+ * we will fallback to number of partitions in input and end up with slow performance)
*/
private int determineParallelism(int inputParallelism, int totalSubPartitions) {
// If bloom index parallelism is set, use it to to check against the input parallelism and
// take the max
int indexParallelism = Math.max(inputParallelism, config.getBloomIndexParallelism());
int joinParallelism = Math.max(totalSubPartitions, indexParallelism);
- logger.info("InputParallelism: ${" + inputParallelism + "}, " + "IndexParallelism: ${" + config
- .getBloomIndexParallelism() + "}, " + "TotalSubParts: ${" + totalSubPartitions + "}, "
+ logger.info("InputParallelism: ${" + inputParallelism + "}, " + "IndexParallelism: ${"
+ + config.getBloomIndexParallelism() + "}, " + "TotalSubParts: ${" + totalSubPartitions + "}, "
+ "Join Parallelism set to : " + joinParallelism);
return joinParallelism;
}
@@ -231,11 +237,10 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
final HoodieTable hoodieTable) {
// Obtain the latest data files from all the partitions.
- List<Pair<String, String>> partitionPathFileIDList = jsc
- .parallelize(partitions, Math.max(partitions.size(), 1))
- .flatMap(partitionPath -> {
- Option<HoodieInstant> latestCommitTime = hoodieTable.getMetaClient().getCommitsTimeline()
- .filterCompletedInstants().lastInstant();
+ List<Pair<String, String>> partitionPathFileIDList =
+ jsc.parallelize(partitions, Math.max(partitions.size(), 1)).flatMap(partitionPath -> {
+ Option<HoodieInstant> latestCommitTime =
+ hoodieTable.getMetaClient().getCommitsTimeline().filterCompletedInstants().lastInstant();
List<Pair<String, String>> filteredFiles = new ArrayList<>();
if (latestCommitTime.isPresent()) {
filteredFiles = hoodieTable.getROFileSystemView()
@@ -259,8 +264,7 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
}).collect();
} else {
return partitionPathFileIDList.stream()
- .map(pf -> new Tuple2<>(pf.getKey(), new BloomIndexFileInfo(pf.getValue())))
- .collect(toList());
+ .map(pf -> new Tuple2<>(pf.getKey(), new BloomIndexFileInfo(pf.getValue()))).collect(toList());
}
}
@@ -307,8 +311,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
JavaRDD<Tuple2<String, HoodieKey>> explodeRecordRDDWithFileComparisons(
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
JavaPairRDD<String, String> partitionRecordKeyPairRDD) {
- IndexFileFilter indexFileFilter = config.useBloomIndexTreebasedFilter()
- ? new IntervalTreeBasedIndexFileFilter(partitionToFileIndexInfo)
+ IndexFileFilter indexFileFilter =
+ config.useBloomIndexTreebasedFilter() ? new IntervalTreeBasedIndexFileFilter(partitionToFileIndexInfo)
: new ListBasedIndexFileFilter(partitionToFileIndexInfo);
return partitionRecordKeyPairRDD.map(partitionRecordKeyPair -> {
@@ -322,10 +326,12 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
}
/**
- * Find out <RowKey, filename> pair. All workload grouped by file-level. <p> Join PairRDD(PartitionPath, RecordKey)
- * and PairRDD(PartitionPath, File) & then repartition such that each RDD partition is a file, then for each file, we
- * do (1) load bloom filter, (2) load rowKeys, (3) Tag rowKey <p> Make sure the parallelism is atleast the groupby
- * parallelism for tagging location
+ * Find out <RowKey, filename> pair. All workload grouped by file-level.
+ * <p>
+ * Join PairRDD(PartitionPath, RecordKey) and PairRDD(PartitionPath, File) & then repartition such that each RDD
+ * partition is a file, then for each file, we do (1) load bloom filter, (2) load rowKeys, (3) Tag rowKey
+ * <p>
+ * Make sure the parallelism is atleast the groupby parallelism for tagging location
*/
@VisibleForTesting
JavaPairRDD<HoodieKey, HoodieRecordLocation> findMatchingFilesForRecordKeys(
@@ -336,33 +342,24 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD);
if (config.useBloomIndexBucketizedChecking()) {
- Partitioner partitioner = new BucketizedBloomCheckPartitioner(
- shuffleParallelism,
- fileGroupToComparisons,
- config.getBloomIndexKeysPerBucket()
- );
- fileComparisonsRDD = fileComparisonsRDD
- .mapToPair(t -> new Tuple2<>(Pair.of(t._1, t._2.getRecordKey()), t))
- .repartitionAndSortWithinPartitions(partitioner)
- .map(Tuple2::_2);
+ Partitioner partitioner = new BucketizedBloomCheckPartitioner(shuffleParallelism, fileGroupToComparisons,
+ config.getBloomIndexKeysPerBucket());
+ fileComparisonsRDD = fileComparisonsRDD.mapToPair(t -> new Tuple2<>(Pair.of(t._1, t._2.getRecordKey()), t))
+ .repartitionAndSortWithinPartitions(partitioner).map(Tuple2::_2);
} else {
fileComparisonsRDD = fileComparisonsRDD.sortBy(Tuple2::_1, true, shuffleParallelism);
}
- return fileComparisonsRDD
- .mapPartitionsWithIndex(new HoodieBloomIndexCheckFunction(hoodieTable, config), true)
- .flatMap(List::iterator)
- .filter(lr -> lr.getMatchingRecordKeys().size() > 0)
+ return fileComparisonsRDD.mapPartitionsWithIndex(new HoodieBloomIndexCheckFunction(hoodieTable, config), true)
+ .flatMap(List::iterator).filter(lr -> lr.getMatchingRecordKeys().size() > 0)
.flatMapToPair(lookupResult -> lookupResult.getMatchingRecordKeys().stream()
.map(recordKey -> new Tuple2<>(new HoodieKey(recordKey, lookupResult.getPartitionPath()),
new HoodieRecordLocation(lookupResult.getBaseInstantTime(), lookupResult.getFileId())))
- .collect(Collectors.toList())
- .iterator());
+ .collect(Collectors.toList()).iterator());
}
- HoodieRecord<T> getTaggedRecord(HoodieRecord<T> inputRecord,
- Option<HoodieRecordLocation> location) {
+ HoodieRecord<T> getTaggedRecord(HoodieRecord<T> inputRecord, Option<HoodieRecordLocation> location) {
HoodieRecord<T> record = inputRecord;
if (location.isPresent()) {
// When you have a record in multiple files in the same partition, then rowKeyRecordPairRDD
@@ -383,12 +380,12 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
*/
protected JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(
JavaPairRDD<HoodieKey, HoodieRecordLocation> keyFilenamePairRDD, JavaRDD<HoodieRecord<T>> recordRDD) {
- JavaPairRDD<HoodieKey, HoodieRecord<T>> keyRecordPairRDD = recordRDD
- .mapToPair(record -> new Tuple2<>(record.getKey(), record));
+ JavaPairRDD<HoodieKey, HoodieRecord<T>> keyRecordPairRDD =
+ recordRDD.mapToPair(record -> new Tuple2<>(record.getKey(), record));
// Here as the recordRDD might have more data than rowKeyRDD (some rowKeys' fileId is null),
// so we do left outer join.
- return keyRecordPairRDD.leftOuterJoin(keyFilenamePairRDD).values().map(
- v1 -> getTaggedRecord(v1._1, Option.ofNullable(v1._2.orNull())));
+ return keyRecordPairRDD.leftOuterJoin(keyFilenamePairRDD).values()
+ .map(v1 -> getTaggedRecord(v1._1, Option.ofNullable(v1._2.orNull())));
}
@Override

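The parallelism rules documented in the javadocs above reduce to two max/divide steps: a safe parallelism from the total comparison count, and a join parallelism taken as the max of input, configured and safe values. A stand-alone sketch, with an assumed per-partition item limit (the real constant lives in HoodieBloomIndex):

// Sketch of the parallelism arithmetic described above; the limit value here is
// an assumption for illustration only.
public class BloomIndexParallelismExample {

  private static final long MAX_ITEMS_PER_SHUFFLE_PARTITION = 40_000_000L; // assumed

  /** Minimum parallelism that keeps each shuffle partition under the item limit. */
  static int computeSafeParallelism(long totalComparisons) {
    return (int) (totalComparisons / MAX_ITEMS_PER_SHUFFLE_PARTITION + 1);
  }

  /** Max of input parallelism, configured index parallelism and the safe sub-partition count. */
  static int determineParallelism(int inputParallelism, int configuredIndexParallelism, int totalSubPartitions) {
    int indexParallelism = Math.max(inputParallelism, configuredIndexParallelism);
    return Math.max(totalSubPartitions, indexParallelism);
  }

  public static void main(String[] args) {
    long totalComparisons = 120_000_000L;
    int safe = computeSafeParallelism(totalComparisons); // 4
    int join = determineParallelism(200, 0, safe);       // 200: input parallelism dominates here
    System.out.println("safeParallelism=" + safe + ", joinParallelism=" + join);
  }
}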
View File

@@ -34,11 +34,10 @@ import org.apache.spark.api.java.function.Function2;
import scala.Tuple2;
/**
- * Function performing actual checking of RDD partition containing (fileId, hoodieKeys) against the
- * actual files
+ * Function performing actual checking of RDD partition containing (fileId, hoodieKeys) against the actual files
*/
- public class HoodieBloomIndexCheckFunction implements
- Function2<Integer, Iterator<Tuple2<String, HoodieKey>>, Iterator<List<KeyLookupResult>>> {
+ public class HoodieBloomIndexCheckFunction
+ implements Function2<Integer, Iterator<Tuple2<String, HoodieKey>>, Iterator<List<KeyLookupResult>>> {
private final HoodieTable hoodieTable;
@@ -59,14 +58,12 @@ public class HoodieBloomIndexCheckFunction implements
private HoodieKeyLookupHandle keyLookupHandle;
- LazyKeyCheckIterator(
- Iterator<Tuple2<String, HoodieKey>> filePartitionRecordKeyTripletItr) {
+ LazyKeyCheckIterator(Iterator<Tuple2<String, HoodieKey>> filePartitionRecordKeyTripletItr) {
super(filePartitionRecordKeyTripletItr);
}
@Override
- protected void start() {
- }
+ protected void start() {}
@Override
protected List<HoodieKeyLookupHandle.KeyLookupResult> computeNext() {
@@ -113,7 +110,6 @@ public class HoodieBloomIndexCheckFunction implements
}
@Override
- protected void end() {
- }
+ protected void end() {}
}
}

View File

@@ -59,8 +59,7 @@ public class HoodieGlobalBloomIndex<T extends HoodieRecordPayload> extends Hoodi
final HoodieTable hoodieTable) {
HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
try {
- List<String> allPartitionPaths = FSUtils
- .getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(),
+ List<String> allPartitionPaths = FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(),
config.shouldAssumeDatePartitioning());
return super.loadInvolvedFiles(allPartitionPaths, jsc, hoodieTable);
} catch (IOException e) {
@@ -88,8 +87,8 @@ public class HoodieGlobalBloomIndex<T extends HoodieRecordPayload> extends Hoodi
entry.getValue().forEach(indexFile -> indexToPartitionMap.put(indexFile.getFileId(), entry.getKey()));
}
- IndexFileFilter indexFileFilter = config.getBloomIndexPruneByRanges()
- ? new IntervalTreeBasedGlobalIndexFileFilter(partitionToFileIndexInfo)
+ IndexFileFilter indexFileFilter =
+ config.getBloomIndexPruneByRanges() ? new IntervalTreeBasedGlobalIndexFileFilter(partitionToFileIndexInfo)
: new ListBasedGlobalIndexFileFilter(partitionToFileIndexInfo);
return partitionRecordKeyPairRDD.map(partitionRecordKeyPair -> {
@@ -109,8 +108,8 @@ public class HoodieGlobalBloomIndex<T extends HoodieRecordPayload> extends Hoodi
@Override
protected JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(
JavaPairRDD<HoodieKey, HoodieRecordLocation> keyFilenamePairRDD, JavaRDD<HoodieRecord<T>> recordRDD) {
- JavaPairRDD<String, HoodieRecord<T>> rowKeyRecordPairRDD = recordRDD
- .mapToPair(record -> new Tuple2<>(record.getRecordKey(), record));
+ JavaPairRDD<String, HoodieRecord<T>> rowKeyRecordPairRDD =
+ recordRDD.mapToPair(record -> new Tuple2<>(record.getRecordKey(), record));
// Here as the recordRDD might have more data than rowKeyRDD (some rowKeys' fileId is null),
// so we do left outer join.

View File

@@ -41,16 +41,16 @@ class IntervalTreeBasedGlobalIndexFileFilter implements IndexFileFilter {
* @param partitionToFileIndexInfo Map of partition to List of {@link BloomIndexFileInfo}s
*/
IntervalTreeBasedGlobalIndexFileFilter(final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo) {
- List<BloomIndexFileInfo> allIndexFiles = partitionToFileIndexInfo.values().stream().flatMap(Collection::stream)
- .collect(Collectors.toList());
+ List<BloomIndexFileInfo> allIndexFiles =
+ partitionToFileIndexInfo.values().stream().flatMap(Collection::stream).collect(Collectors.toList());
// Note that the interval tree implementation doesn't have auto-balancing to ensure logN search time.
// So, we are shuffling the input here hoping the tree will not have any skewness. If not, the tree could be skewed
// which could result in N search time instead of NlogN.
Collections.shuffle(allIndexFiles);
allIndexFiles.forEach(indexFile -> {
if (indexFile.hasKeyRanges()) {
- indexLookUpTree.insert(new KeyRangeNode(indexFile.getMinRecordKey(),
- indexFile.getMaxRecordKey(), indexFile.getFileId()));
+ indexLookUpTree
+ .insert(new KeyRangeNode(indexFile.getMinRecordKey(), indexFile.getMaxRecordKey(), indexFile.getFileId()));
} else {
filesWithNoRanges.add(indexFile.getFileId());
}

View File

@@ -48,8 +48,8 @@ class IntervalTreeBasedIndexFileFilter implements IndexFileFilter {
KeyRangeLookupTree lookUpTree = new KeyRangeLookupTree();
bloomIndexFiles.forEach(indexFileInfo -> {
if (indexFileInfo.hasKeyRanges()) {
- lookUpTree.insert(new KeyRangeNode(indexFileInfo.getMinRecordKey(),
- indexFileInfo.getMaxRecordKey(), indexFileInfo.getFileId()));
+ lookUpTree.insert(new KeyRangeNode(indexFileInfo.getMinRecordKey(), indexFileInfo.getMaxRecordKey(),
+ indexFileInfo.getFileId()));
} else {
if (!partitionToFilesWithNoRanges.containsKey(partition)) {
partitionToFilesWithNoRanges.put(partition, new HashSet<>());

View File

@@ -50,25 +50,16 @@ class KeyRangeLookupTree implements Serializable {
*
* If no root exists, make {@code newNode} as the root and return the new root.
*
- * If current root and newNode matches with min record key and max record key,
- * merge two nodes. In other words, add files from {@code newNode} to current root.
- * Return current root.
+ * If current root and newNode matches with min record key and max record key, merge two nodes. In other words, add
+ * files from {@code newNode} to current root. Return current root.
*
- * If current root is < newNode
- * if current root has no right sub tree
- * update current root's right sub tree max and min
- * set newNode as right sub tree
- * else
- * update root's right sub tree min and max with newNode's min and max record key as applicable
- * recursively call insert() with root's right subtree as new root
+ * If current root is < newNode if current root has no right sub tree update current root's right sub tree max and min
+ * set newNode as right sub tree else update root's right sub tree min and max with newNode's min and max record key
+ * as applicable recursively call insert() with root's right subtree as new root
*
- * else // current root is >= newNode
- * if current root has no left sub tree
- * update current root's left sub tree max and min
- * set newNode as left sub tree
- * else
- * update root's left sub tree min and max with newNode's min and max record key as applicable
- * recursively call insert() with root's left subtree as new root
+ * else // current root is >= newNode if current root has no left sub tree update current root's left sub tree max and
+ * min set newNode as left sub tree else update root's left sub tree min and max with newNode's min and max record key
+ * as applicable recursively call insert() with root's left subtree as new root
*
* @param root refers to the current root of the look up tree
* @param newNode newNode the new {@link KeyRangeNode} to be inserted

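The insert() rules spelled out in that javadoc reduce to: merge equal ranges, send larger ranges to the right sub tree and smaller ranges to the left. A compact sketch with String keys follows; it is a simplified illustration, and it omits the file-list merging and the sub-tree min/max bookkeeping that the Hudi tree also maintains for lookup pruning.

// Simplified sketch of the range-tree insert described above.
public class SimpleKeyRangeTree {

  static class Node {
    String min;
    String max;
    Node left;
    Node right;

    Node(String min, String max) {
      this.min = min;
      this.max = max;
    }

    // Compare by min record key first, then by max record key.
    int compareTo(Node that) {
      int byMin = this.min.compareTo(that.min);
      return byMin != 0 ? byMin : this.max.compareTo(that.max);
    }
  }

  private Node root;

  public void insert(String minKey, String maxKey) {
    root = insert(root, new Node(minKey, maxKey));
  }

  private Node insert(Node node, Node newNode) {
    if (node == null) {
      return newNode; // no root exists: the new node becomes the root of this sub tree
    }
    int cmp = node.compareTo(newNode);
    if (cmp == 0) {
      // Same min/max range: in Hudi this is where the file lists would be merged.
      return node;
    }
    if (cmp < 0) {
      node.right = insert(node.right, newNode); // current root is < newNode
    } else {
      node.left = insert(node.left, newNode);   // current root is >= newNode
    }
    return node;
  }
}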
View File

@@ -62,15 +62,10 @@ class KeyRangeNode implements Comparable<KeyRangeNode>, Serializable {
@Override
public String toString() {
- return "KeyRangeNode{"
- + "minRecordKey='" + minRecordKey + '\''
- + ", maxRecordKey='" + maxRecordKey + '\''
- + ", fileNameList=" + fileNameList
- + ", rightSubTreeMax='" + rightSubTreeMax + '\''
- + ", leftSubTreeMax='" + leftSubTreeMax + '\''
- + ", rightSubTreeMin='" + rightSubTreeMin + '\''
- + ", leftSubTreeMin='" + leftSubTreeMin + '\''
- + '}';
+ return "KeyRangeNode{" + "minRecordKey='" + minRecordKey + '\'' + ", maxRecordKey='" + maxRecordKey + '\''
+ + ", fileNameList=" + fileNameList + ", rightSubTreeMax='" + rightSubTreeMax + '\'' + ", leftSubTreeMax='"
+ + leftSubTreeMax + '\'' + ", rightSubTreeMin='" + rightSubTreeMin + '\'' + ", leftSubTreeMin='" + leftSubTreeMin
+ + '\'' + '}';
}
/**
@@ -78,8 +73,8 @@ class KeyRangeNode implements Comparable<KeyRangeNode>, Serializable {
*
* @param that the {@link KeyRangeNode} to be compared with
* @return the result of comparison. 0 if both min and max are equal in both. 1 if this {@link KeyRangeNode} is
- * greater than the {@code that} keyRangeNode. -1 if {@code that} keyRangeNode is greater than this {@link
- * KeyRangeNode}
+ * greater than the {@code that} keyRangeNode. -1 if {@code that} keyRangeNode is greater than this
+ * {@link KeyRangeNode}
*/
@Override
public int compareTo(KeyRangeNode that) {

View File

@@ -30,8 +30,7 @@ class ListBasedGlobalIndexFileFilter extends ListBasedIndexFileFilter {
*
* @param partitionToFileIndexInfo Map of partition to List of {@link BloomIndexFileInfo}
*/
- ListBasedGlobalIndexFileFilter(
- Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo) {
+ ListBasedGlobalIndexFileFilter(Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo) {
super(partitionToFileIndexInfo);
}

View File

@@ -68,10 +68,8 @@ import scala.Tuple2;
*/
public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
- public static final String DEFAULT_SPARK_EXECUTOR_INSTANCES_CONFIG_NAME =
- "spark.executor.instances";
- public static final String DEFAULT_SPARK_DYNAMIC_ALLOCATION_ENABLED_CONFIG_NAME =
- "spark.dynamicAllocation.enabled";
+ public static final String DEFAULT_SPARK_EXECUTOR_INSTANCES_CONFIG_NAME = "spark.executor.instances";
+ public static final String DEFAULT_SPARK_DYNAMIC_ALLOCATION_ENABLED_CONFIG_NAME = "spark.dynamicAllocation.enabled";
public static final String DEFAULT_SPARK_DYNAMIC_ALLOCATION_MAX_EXECUTORS_CONFIG_NAME =
"spark.dynamicAllocation.maxExecutors";
@@ -114,9 +112,8 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
public HBaseIndexQPSResourceAllocator createQPSResourceAllocator(HoodieWriteConfig config) {
try {
logger.info("createQPSResourceAllocator :" + config.getHBaseQPSResourceAllocatorClass());
- final HBaseIndexQPSResourceAllocator resourceAllocator =
- (HBaseIndexQPSResourceAllocator) ReflectionUtils.loadClass(
- config.getHBaseQPSResourceAllocatorClass(), config);
+ final HBaseIndexQPSResourceAllocator resourceAllocator = (HBaseIndexQPSResourceAllocator) ReflectionUtils
+ .loadClass(config.getHBaseQPSResourceAllocatorClass(), config);
return resourceAllocator;
} catch (Exception e) {
logger.warn("error while instantiating HBaseIndexQPSResourceAllocator", e);
@@ -143,14 +140,14 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
try {
return ConnectionFactory.createConnection(hbaseConfig);
} catch (IOException e) {
- throw new HoodieDependentSystemUnavailableException(
- HoodieDependentSystemUnavailableException.HBASE, quorum + ":" + port);
+ throw new HoodieDependentSystemUnavailableException(HoodieDependentSystemUnavailableException.HBASE,
+ quorum + ":" + port);
}
}
/**
- * Since we are sharing the HbaseConnection across tasks in a JVM, make sure the HbaseConnectio is
- * closed when JVM exits
+ * Since we are sharing the HbaseConnection across tasks in a JVM, make sure the HbaseConnectio is closed when JVM
+ * exits
*/
private void addShutDownHook() {
Runtime.getRuntime().addShutdownHook(new Thread() {
@@ -172,31 +169,28 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
}
private Get generateStatement(String key) throws IOException {
- return new Get(Bytes.toBytes(key)).setMaxVersions(1)
- .addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN)
- .addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN)
- .addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN);
+ return new Get(Bytes.toBytes(key)).setMaxVersions(1).addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN)
+ .addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN).addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN);
}
private boolean checkIfValidCommit(HoodieTableMetaClient metaClient, String commitTs) {
HoodieTimeline commitTimeline = metaClient.getActiveTimeline().filterCompletedInstants();
// Check if the last commit ts for this row is 1) present in the timeline or
// 2) is less than the first commit ts in the timeline
- return !commitTimeline.empty() && (commitTimeline
- .containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTs))
- || HoodieTimeline
- .compareTimestamps(commitTimeline.firstInstant().get().getTimestamp(), commitTs,
+ return !commitTimeline.empty()
+ && (commitTimeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTs))
+ || HoodieTimeline.compareTimestamps(commitTimeline.firstInstant().get().getTimestamp(), commitTs,
HoodieTimeline.GREATER));
}
/**
* Function that tags each HoodieRecord with an existing location, if known.
*/
- private Function2<Integer, Iterator<HoodieRecord<T>>,
- Iterator<HoodieRecord<T>>> locationTagFunction(HoodieTableMetaClient metaClient) {
- return (Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<HoodieRecord<T>>>)
- (partitionNum, hoodieRecordIterator) -> {
+ private Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<HoodieRecord<T>>> locationTagFunction(
+ HoodieTableMetaClient metaClient) {
+ return (Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<HoodieRecord<T>>>) (partitionNum,
+ hoodieRecordIterator) -> {
Integer multiGetBatchSize = config.getHbaseIndexGetBatchSize();
@@ -228,16 +222,12 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
HoodieRecord currentRecord = currentBatchOfRecords.remove(0);
if (result.getRow() != null) {
String keyFromResult = Bytes.toString(result.getRow());
- String commitTs = Bytes
- .toString(result.getValue(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN));
- String fileId = Bytes
- .toString(result.getValue(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN));
- String partitionPath = Bytes
- .toString(result.getValue(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN));
+ String commitTs = Bytes.toString(result.getValue(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN));
+ String fileId = Bytes.toString(result.getValue(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN));
+ String partitionPath = Bytes.toString(result.getValue(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN));
if (checkIfValidCommit(metaClient, commitTs)) {
- currentRecord = new HoodieRecord(
- new HoodieKey(currentRecord.getRecordKey(), partitionPath),
+ currentRecord = new HoodieRecord(new HoodieKey(currentRecord.getRecordKey(), partitionPath),
currentRecord.getData());
currentRecord.unseal();
currentRecord.setCurrentLocation(new HoodieRecordLocation(commitTs, fileId));
@@ -255,8 +245,7 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
}
}
} catch (IOException e) {
- throw new HoodieIndexException(
- "Failed to Tag indexed locations because of exception with HBase Client", e);
+ throw new HoodieIndexException("Failed to Tag indexed locations because of exception with HBase Client", e);
} finally {
if (hTable != null) {
try {
@@ -310,12 +299,9 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
continue;
}
Put put = new Put(Bytes.toBytes(rec.getRecordKey()));
- put.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN,
- Bytes.toBytes(loc.get().getInstantTime()));
- put.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN,
- Bytes.toBytes(loc.get().getFileId()));
- put.addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN,
- Bytes.toBytes(rec.getPartitionPath()));
+ put.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN, Bytes.toBytes(loc.get().getInstantTime()));
+ put.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN, Bytes.toBytes(loc.get().getFileId()));
+ put.addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN, Bytes.toBytes(rec.getPartitionPath()));
puts.add(put);
} else {
// Delete existing index for a deleted record
@@ -338,8 +324,7 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
writeStatusList.add(writeStatus); writeStatusList.add(writeStatus);
} }
} catch (IOException e) { } catch (IOException e) {
throw new HoodieIndexException( throw new HoodieIndexException("Failed to Update Index locations because of exception with HBase Client", e);
"Failed to Update Index locations because of exception with HBase Client", e);
} finally { } finally {
if (hTable != null) { if (hTable != null) {
try { try {
@@ -356,8 +341,7 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
/** /**
* Helper method to facilitate performing puts and deletes in Hbase * Helper method to facilitate performing puts and deletes in Hbase
*/ */
private void doPutsAndDeletes(HTable hTable, List<Put> puts, List<Delete> deletes) private void doPutsAndDeletes(HTable hTable, List<Put> puts, List<Delete> deletes) throws IOException {
throws IOException {
if (puts.size() > 0) { if (puts.size() > 0) {
hTable.put(puts); hTable.put(puts);
} }
@@ -385,58 +369,49 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
final HBaseIndexQPSResourceAllocator hBaseIndexQPSResourceAllocator = createQPSResourceAllocator(this.config); final HBaseIndexQPSResourceAllocator hBaseIndexQPSResourceAllocator = createQPSResourceAllocator(this.config);
setPutBatchSize(writeStatusRDD, hBaseIndexQPSResourceAllocator, jsc); setPutBatchSize(writeStatusRDD, hBaseIndexQPSResourceAllocator, jsc);
logger.info("multiPutBatchSize: before hbase puts" + multiPutBatchSize); logger.info("multiPutBatchSize: before hbase puts" + multiPutBatchSize);
JavaRDD<WriteStatus> writeStatusJavaRDD = writeStatusRDD.mapPartitionsWithIndex( JavaRDD<WriteStatus> writeStatusJavaRDD = writeStatusRDD.mapPartitionsWithIndex(updateLocationFunction(), true);
updateLocationFunction(), true);
// caching the index updated status RDD // caching the index updated status RDD
writeStatusJavaRDD = writeStatusJavaRDD.persist(config.getWriteStatusStorageLevel()); writeStatusJavaRDD = writeStatusJavaRDD.persist(config.getWriteStatusStorageLevel());
return writeStatusJavaRDD; return writeStatusJavaRDD;
} }
private void setPutBatchSize(JavaRDD<WriteStatus> writeStatusRDD, private void setPutBatchSize(JavaRDD<WriteStatus> writeStatusRDD,
HBaseIndexQPSResourceAllocator hBaseIndexQPSResourceAllocator, HBaseIndexQPSResourceAllocator hBaseIndexQPSResourceAllocator, final JavaSparkContext jsc) {
final JavaSparkContext jsc) {
if (config.getHbaseIndexPutBatchSizeAutoCompute()) { if (config.getHbaseIndexPutBatchSizeAutoCompute()) {
SparkConf conf = jsc.getConf(); SparkConf conf = jsc.getConf();
int maxExecutors = conf.getInt(DEFAULT_SPARK_EXECUTOR_INSTANCES_CONFIG_NAME, 1); int maxExecutors = conf.getInt(DEFAULT_SPARK_EXECUTOR_INSTANCES_CONFIG_NAME, 1);
if (conf.getBoolean(DEFAULT_SPARK_DYNAMIC_ALLOCATION_ENABLED_CONFIG_NAME, false)) { if (conf.getBoolean(DEFAULT_SPARK_DYNAMIC_ALLOCATION_ENABLED_CONFIG_NAME, false)) {
maxExecutors = Math.max(maxExecutors, conf.getInt( maxExecutors =
DEFAULT_SPARK_DYNAMIC_ALLOCATION_MAX_EXECUTORS_CONFIG_NAME, 1)); Math.max(maxExecutors, conf.getInt(DEFAULT_SPARK_DYNAMIC_ALLOCATION_MAX_EXECUTORS_CONFIG_NAME, 1));
} }
/* /*
Each writeStatus represents status information from a write done in one of the IOHandles. * Each writeStatus represents status information from a write done in one of the IOHandles. If a writeStatus has
If a writeStatus has any insert, it implies that the corresponding task contacts HBase for * any insert, it implies that the corresponding task contacts HBase for doing puts, since we only do puts for
doing puts, since we only do puts for inserts from HBaseIndex. * inserts from HBaseIndex.
*/ */
final Tuple2<Long, Integer> numPutsParallelismTuple = getHBasePutAccessParallelism(writeStatusRDD); final Tuple2<Long, Integer> numPutsParallelismTuple = getHBasePutAccessParallelism(writeStatusRDD);
final long numPuts = numPutsParallelismTuple._1; final long numPuts = numPutsParallelismTuple._1;
final int hbasePutsParallelism = numPutsParallelismTuple._2; final int hbasePutsParallelism = numPutsParallelismTuple._2;
this.numRegionServersForTable = getNumRegionServersAliveForTable(); this.numRegionServersForTable = getNumRegionServersAliveForTable();
final float desiredQPSFraction = hBaseIndexQPSResourceAllocator final float desiredQPSFraction =
.calculateQPSFractionForPutsTime(numPuts, this.numRegionServersForTable); hBaseIndexQPSResourceAllocator.calculateQPSFractionForPutsTime(numPuts, this.numRegionServersForTable);
logger.info("Desired QPSFraction :" + desiredQPSFraction); logger.info("Desired QPSFraction :" + desiredQPSFraction);
logger.info("Number HBase puts :" + numPuts); logger.info("Number HBase puts :" + numPuts);
logger.info("Hbase Puts Parallelism :" + hbasePutsParallelism); logger.info("Hbase Puts Parallelism :" + hbasePutsParallelism);
final float availableQpsFraction = hBaseIndexQPSResourceAllocator final float availableQpsFraction =
.acquireQPSResources(desiredQPSFraction, numPuts); hBaseIndexQPSResourceAllocator.acquireQPSResources(desiredQPSFraction, numPuts);
logger.info("Allocated QPS Fraction :" + availableQpsFraction); logger.info("Allocated QPS Fraction :" + availableQpsFraction);
multiPutBatchSize = putBatchSizeCalculator multiPutBatchSize = putBatchSizeCalculator.getBatchSize(numRegionServersForTable, maxQpsPerRegionServer,
.getBatchSize( hbasePutsParallelism, maxExecutors, SLEEP_TIME_MILLISECONDS, availableQpsFraction);
numRegionServersForTable,
maxQpsPerRegionServer,
hbasePutsParallelism,
maxExecutors,
SLEEP_TIME_MILLISECONDS,
availableQpsFraction);
logger.info("multiPutBatchSize :" + multiPutBatchSize); logger.info("multiPutBatchSize :" + multiPutBatchSize);
} }
} }
@VisibleForTesting @VisibleForTesting
public Tuple2<Long, Integer> getHBasePutAccessParallelism(final JavaRDD<WriteStatus> writeStatusRDD) { public Tuple2<Long, Integer> getHBasePutAccessParallelism(final JavaRDD<WriteStatus> writeStatusRDD) {
final JavaPairRDD<Long, Integer> insertOnlyWriteStatusRDD = final JavaPairRDD<Long, Integer> insertOnlyWriteStatusRDD = writeStatusRDD
writeStatusRDD.filter(w -> w.getStat().getNumInserts() > 0) .filter(w -> w.getStat().getNumInserts() > 0).mapToPair(w -> new Tuple2<>(w.getStat().getNumInserts(), 1));
.mapToPair(w -> new Tuple2<>(w.getStat().getNumInserts(), 1));
return insertOnlyWriteStatusRDD.fold(new Tuple2<>(0L, 0), (w, c) -> new Tuple2<>(w._1 + c._1, w._2 + c._2)); return insertOnlyWriteStatusRDD.fold(new Tuple2<>(0L, 0), (w, c) -> new Tuple2<>(w._1 + c._1, w._2 + c._2));
} }
@@ -460,21 +435,25 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
* 16000. We assume requests get distributed to Region Servers uniformly, so each RS gets 1600 requests which * 16000. We assume requests get distributed to Region Servers uniformly, so each RS gets 1600 requests which
* happens to be 10% of 16667 (maxQPSPerRegionServer), as expected. * happens to be 10% of 16667 (maxQPSPerRegionServer), as expected.
* </p> * </p>
* <p> Assumptions made here <li> In a batch, writes get evenly distributed to each RS for that * <p>
* table. Since we do writes only in the case of inserts and not updates, for this assumption to fail, inserts would * Assumptions made here
* have to be skewed towards few RS, likelihood of which is less if Hbase table is pre-split and rowKeys are UUIDs * <li>In a batch, writes get evenly distributed to each RS for that table. Since we do writes only in the case of
* (random strings). If this assumption fails, then it is possible for some RS to receive more than * inserts and not updates, for this assumption to fail, inserts would have to be skewed towards few RS, likelihood
* maxQpsPerRegionServer QPS, but for simplicity, we are going ahead with this model, since this is meant to be a * of which is less if Hbase table is pre-split and rowKeys are UUIDs (random strings). If this assumption fails,
* lightweight distributed throttling mechanism without maintaining a global context. So if this assumption breaks, * then it is possible for some RS to receive more than maxQpsPerRegionServer QPS, but for simplicity, we are going
* we are hoping the HBase Master relocates hot-spot regions to new Region Servers. * ahead with this model, since this is meant to be a lightweight distributed throttling mechanism without
* maintaining a global context. So if this assumption breaks, we are hoping the HBase Master relocates hot-spot
* regions to new Region Servers.
* *
* </li> <li> For Region Server stability, throttling at a second level granularity is fine. * </li>
* Although, within a second, the sum of queries might be within maxQpsPerRegionServer, there could be peaks at some * <li>For Region Server stability, throttling at a second level granularity is fine. Although, within a second, the
* sub second intervals. So, the assumption is that these peaks are tolerated by the Region Server (which at max can * sum of queries might be within maxQpsPerRegionServer, there could be peaks at some sub second intervals. So, the
* be maxQpsPerRegionServer). </li> </p> * assumption is that these peaks are tolerated by the Region Server (which at max can be maxQpsPerRegionServer).
* </li>
* </p>
*/ */
public int getBatchSize(int numRegionServersForTable, int maxQpsPerRegionServer, public int getBatchSize(int numRegionServersForTable, int maxQpsPerRegionServer, int numTasksDuringPut,
int numTasksDuringPut, int maxExecutors, int sleepTimeMs, float qpsFraction) { int maxExecutors, int sleepTimeMs, float qpsFraction) {
int numRSAlive = numRegionServersForTable; int numRSAlive = numRegionServersForTable;
int maxReqPerSec = (int) (qpsFraction * numRSAlive * maxQpsPerRegionServer); int maxReqPerSec = (int) (qpsFraction * numRSAlive * maxQpsPerRegionServer);
int numTasks = numTasksDuringPut; int numTasks = numTasksDuringPut;
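
As a rough illustration of the throttling budget described in the comment above, with assumed inputs (only the maxReqPerSec line mirrors the code in this hunk; the remaining steps of getBatchSize fall outside the shown context):

// Assumed example values, not taken from this patch
int numRegionServers = 10;
int maxQpsPerRegionServer = 16667;
float qpsFraction = 0.1f;   // let this index job use ~10% of each region server's capacity
int maxReqPerSec = (int) (qpsFraction * numRegionServers * maxQpsPerRegionServer);
// maxReqPerSec is about 16667 requests/sec for the whole table, i.e. roughly 1666 requests/sec
// per region server, which is the per-second budget the batch size is then derived from.
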
@@ -499,11 +478,9 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
// from the driver, so ok to use a local connection variable. // from the driver, so ok to use a local connection variable.
if (numRegionServersForTable == null) { if (numRegionServersForTable == null) {
try (Connection conn = getHBaseConnection()) { try (Connection conn = getHBaseConnection()) {
RegionLocator regionLocator = conn RegionLocator regionLocator = conn.getRegionLocator(TableName.valueOf(tableName));
.getRegionLocator(TableName.valueOf(tableName)); numRegionServersForTable = Math
numRegionServersForTable = Math.toIntExact( .toIntExact(regionLocator.getAllRegionLocations().stream().map(e -> e.getServerName()).distinct().count());
regionLocator.getAllRegionLocations().stream().map(e -> e.getServerName()).distinct()
.count());
return numRegionServersForTable; return numRegionServersForTable;
} catch (IOException e) { } catch (IOException e) {
logger.error(e); logger.error(e);
@@ -26,8 +26,8 @@ import java.io.Serializable;
public interface HBaseIndexQPSResourceAllocator extends Serializable { public interface HBaseIndexQPSResourceAllocator extends Serializable {
/** /**
* This method returns the QPS Fraction value that needs to be acquired such that the respective * This method returns the QPS Fraction value that needs to be acquired such that the respective HBase index operation
* HBase index operation can be completed in desiredPutsTime. * can be completed in desiredPutsTime.
* *
* @param numPuts Number of inserts to be written to HBase index * @param numPuts Number of inserts to be written to HBase index
* @param desiredPutsTimeInSecs Total expected time for the HBase inserts operation * @param desiredPutsTimeInSecs Total expected time for the HBase inserts operation
@@ -96,8 +96,8 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieWri
// Total number of new records inserted into the delta file // Total number of new records inserted into the delta file
private long insertRecordsWritten = 0; private long insertRecordsWritten = 0;
public HoodieAppendHandle(HoodieWriteConfig config, String commitTime, HoodieTable<T> hoodieTable, public HoodieAppendHandle(HoodieWriteConfig config, String commitTime, HoodieTable<T> hoodieTable, String fileId,
String fileId, Iterator<HoodieRecord<T>> recordItr) { Iterator<HoodieRecord<T>> recordItr) {
super(config, commitTime, fileId, hoodieTable); super(config, commitTime, fileId, hoodieTable);
writeStatus.setStat(new HoodieDeltaWriteStat()); writeStatus.setStat(new HoodieDeltaWriteStat());
this.fileId = fileId; this.fileId = fileId;
@@ -137,10 +137,8 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieWri
} catch (Exception e) { } catch (Exception e) {
logger.error("Error in update task at commit " + instantTime, e); logger.error("Error in update task at commit " + instantTime, e);
writeStatus.setGlobalError(e); writeStatus.setGlobalError(e);
throw new HoodieUpsertException( throw new HoodieUpsertException("Failed to initialize HoodieAppendHandle for FileId: " + fileId + " on commit "
"Failed to initialize HoodieAppendHandle for FileId: " + fileId + " on commit " + instantTime + " on HDFS path " + hoodieTable.getMetaClient().getBasePath() + partitionPath, e);
+ instantTime + " on HDFS path " + hoodieTable.getMetaClient().getBasePath()
+ partitionPath, e);
} }
Path path = new Path(partitionPath, writer.getLogFile().getFileName()); Path path = new Path(partitionPath, writer.getLogFile().getFileName());
writeStatus.getStat().setPath(path.toString()); writeStatus.getStat().setPath(path.toString());
@@ -155,13 +153,11 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieWri
if (avroRecord.isPresent()) { if (avroRecord.isPresent()) {
// Convert GenericRecord to GenericRecord with hoodie commit metadata in schema // Convert GenericRecord to GenericRecord with hoodie commit metadata in schema
avroRecord = Option.of(rewriteRecord((GenericRecord) avroRecord.get())); avroRecord = Option.of(rewriteRecord((GenericRecord) avroRecord.get()));
String seqId = HoodieRecord.generateSequenceId(instantTime, TaskContext.getPartitionId(), String seqId =
recordIndex.getAndIncrement()); HoodieRecord.generateSequenceId(instantTime, TaskContext.getPartitionId(), recordIndex.getAndIncrement());
HoodieAvroUtils HoodieAvroUtils.addHoodieKeyToRecord((GenericRecord) avroRecord.get(), hoodieRecord.getRecordKey(),
.addHoodieKeyToRecord((GenericRecord) avroRecord.get(), hoodieRecord.getRecordKey(),
hoodieRecord.getPartitionPath(), fileId); hoodieRecord.getPartitionPath(), fileId);
HoodieAvroUtils HoodieAvroUtils.addCommitMetadataToRecord((GenericRecord) avroRecord.get(), instantTime, seqId);
.addCommitMetadataToRecord((GenericRecord) avroRecord.get(), instantTime, seqId);
// If currentLocation is present, then this is an update // If currentLocation is present, then this is an update
if (hoodieRecord.getCurrentLocation() != null) { if (hoodieRecord.getCurrentLocation() != null) {
updatedRecordsWritten++; updatedRecordsWritten++;
@@ -208,20 +204,18 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieWri
recordList.clear(); recordList.clear();
} }
if (keysToDelete.size() > 0) { if (keysToDelete.size() > 0) {
writer = writer.appendBlock( writer = writer.appendBlock(new HoodieDeleteBlock(keysToDelete.stream().toArray(HoodieKey[]::new), header));
new HoodieDeleteBlock(keysToDelete.stream().toArray(HoodieKey[]::new), header));
keysToDelete.clear(); keysToDelete.clear();
} }
} catch (Exception e) { } catch (Exception e) {
throw new HoodieAppendException( throw new HoodieAppendException("Failed while appending records to " + currentLogFile.getPath(), e);
"Failed while appending records to " + currentLogFile.getPath(), e);
} }
} }
@Override @Override
public boolean canWrite(HoodieRecord record) { public boolean canWrite(HoodieRecord record) {
return config.getParquetMaxFileSize() >= estimatedNumberOfBytesWritten * config return config.getParquetMaxFileSize() >= estimatedNumberOfBytesWritten
.getLogFileToParquetCompressionRatio(); * config.getLogFileToParquetCompressionRatio();
} }
@Override @Override
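
The canWrite check above converts the bytes appended to the log file into an estimated parquet footprint before comparing it against the configured base-file limit. A rough worked example with assumed values (none of the numbers come from this commit):

long parquetMaxFileSize = 120L * 1024 * 1024;     // assumed 120 MB base-file limit
double logToParquetRatio = 0.35;                  // assumed log-to-parquet compression ratio
long estimatedBytesWritten = 200L * 1024 * 1024;  // log bytes appended so far

boolean canWrite = parquetMaxFileSize >= estimatedBytesWritten * logToParquetRatio;
// 125829120 >= 73400320.0 -> true: the handle keeps accepting records; once the log grows
// past roughly 343 MB (120 MB / 0.35) the estimate crosses the limit and the handle is full.
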
@@ -262,8 +256,8 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieWri
runtimeStats.setTotalUpsertTime(timer.endTimer()); runtimeStats.setTotalUpsertTime(timer.endTimer());
stat.setRuntimeStats(runtimeStats); stat.setRuntimeStats(runtimeStats);
logger.info(String.format("AppendHandle for partitionPath %s fileID %s, took %d ms.", logger.info(String.format("AppendHandle for partitionPath %s fileID %s, took %d ms.", stat.getPartitionPath(),
stat.getPartitionPath(), stat.getFileId(), runtimeStats.getTotalUpsertTime())); stat.getFileId(), runtimeStats.getTotalUpsertTime()));
return writeStatus; return writeStatus;
} catch (IOException e) { } catch (IOException e) {
@@ -282,13 +276,11 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieWri
return HoodieLogFormat.newWriterBuilder() return HoodieLogFormat.newWriterBuilder()
.onParentPath(FSUtils.getPartitionPath(hoodieTable.getMetaClient().getBasePath(), partitionPath)) .onParentPath(FSUtils.getPartitionPath(hoodieTable.getMetaClient().getBasePath(), partitionPath))
.withFileId(fileId).overBaseCommit(baseCommitTime).withLogVersion( .withFileId(fileId).overBaseCommit(baseCommitTime)
latestLogFile.map(HoodieLogFile::getLogVersion).orElse(HoodieLogFile.LOGFILE_BASE_VERSION)) .withLogVersion(latestLogFile.map(HoodieLogFile::getLogVersion).orElse(HoodieLogFile.LOGFILE_BASE_VERSION))
.withSizeThreshold(config.getLogFileMaxSize()).withFs(fs) .withSizeThreshold(config.getLogFileMaxSize()).withFs(fs)
.withLogWriteToken( .withLogWriteToken(latestLogFile.map(x -> FSUtils.getWriteTokenFromLogPath(x.getPath())).orElse(writeToken))
latestLogFile.map(x -> FSUtils.getWriteTokenFromLogPath(x.getPath())).orElse(writeToken)) .withRolloverLogWriteToken(writeToken).withFileExtension(HoodieLogFile.DELTA_EXTENSION).build();
.withRolloverLogWriteToken(writeToken)
.withFileExtension(HoodieLogFile.DELTA_EXTENSION).build();
} }
private void writeToBuffer(HoodieRecord<T> record) { private void writeToBuffer(HoodieRecord<T> record) {
@@ -45,9 +45,12 @@ import org.apache.log4j.Logger;
/** /**
* Cleaner is responsible for garbage collecting older files in a given partition path, such that * Cleaner is responsible for garbage collecting older files in a given partition path, such that
* <p> 1) It provides sufficient time for existing queries running on older versions, to close <p> * <p>
* 2) It bounds the growth of the files in the file system <p> TODO: Should all cleaning be done * 1) It provides sufficient time for existing queries running on older versions, to close
* based on {@link HoodieCommitMetadata} * <p>
* 2) It bounds the growth of the files in the file system
* <p>
* TODO: Should all cleaning be done based on {@link HoodieCommitMetadata}
*/ */
public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> { public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
@@ -66,22 +69,21 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
this.config = config; this.config = config;
this.fgIdToPendingCompactionOperations = this.fgIdToPendingCompactionOperations =
((SyncableFileSystemView) hoodieTable.getRTFileSystemView()).getPendingCompactionOperations() ((SyncableFileSystemView) hoodieTable.getRTFileSystemView()).getPendingCompactionOperations()
.map(entry -> Pair.of(new HoodieFileGroupId(entry.getValue().getPartitionPath(), .map(entry -> Pair.of(
entry.getValue().getFileId()), entry.getValue())) new HoodieFileGroupId(entry.getValue().getPartitionPath(), entry.getValue().getFileId()),
entry.getValue()))
.collect(Collectors.toMap(Pair::getKey, Pair::getValue)); .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
} }
/** /**
* Selects the older versions of files for cleaning, such that it bounds the number of versions of * Selects the older versions of files for cleaning, such that it bounds the number of versions of each file. This
* each file. This policy is useful, if you are simply interested in querying the table, and you * policy is useful, if you are simply interested in querying the table, and you don't want too many versions for a
* don't want too many versions for a single file (i.e run it with versionsRetained = 1) * single file (i.e run it with versionsRetained = 1)
*/ */
private List<String> getFilesToCleanKeepingLatestVersions(String partitionPath) private List<String> getFilesToCleanKeepingLatestVersions(String partitionPath) throws IOException {
throws IOException { logger.info("Cleaning " + partitionPath + ", retaining latest " + config.getCleanerFileVersionsRetained()
logger.info("Cleaning " + partitionPath + ", retaining latest " + config + " file versions. ");
.getCleanerFileVersionsRetained() + " file versions. "); List<HoodieFileGroup> fileGroups = fileSystemView.getAllFileGroups(partitionPath).collect(Collectors.toList());
List<HoodieFileGroup> fileGroups = fileSystemView.getAllFileGroups(partitionPath)
.collect(Collectors.toList());
List<String> deletePaths = new ArrayList<>(); List<String> deletePaths = new ArrayList<>();
// Collect all the datafiles savepointed by all the savepoints // Collect all the datafiles savepointed by all the savepoints
List<String> savepointedFiles = hoodieTable.getSavepoints().stream() List<String> savepointedFiles = hoodieTable.getSavepoints().stream()
@@ -90,8 +92,8 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
for (HoodieFileGroup fileGroup : fileGroups) { for (HoodieFileGroup fileGroup : fileGroups) {
int keepVersions = config.getCleanerFileVersionsRetained(); int keepVersions = config.getCleanerFileVersionsRetained();
// do not cleanup slice required for pending compaction // do not cleanup slice required for pending compaction
Iterator<FileSlice> fileSliceIterator = fileGroup.getAllFileSlices() Iterator<FileSlice> fileSliceIterator =
.filter(fs -> !isFileSliceNeededForPendingCompaction(fs)).iterator(); fileGroup.getAllFileSlices().filter(fs -> !isFileSliceNeededForPendingCompaction(fs)).iterator();
if (isFileGroupInPendingCompaction(fileGroup)) { if (isFileGroupInPendingCompaction(fileGroup)) {
// We have already saved the last version of file-groups for pending compaction Id // We have already saved the last version of file-groups for pending compaction Id
keepVersions--; keepVersions--;
@@ -116,8 +118,8 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
} }
if (hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) { if (hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) {
// If merge on read, then clean the log files for the commits as well // If merge on read, then clean the log files for the commits as well
deletePaths.addAll(nextSlice.getLogFiles().map(file -> file.getPath().toString()) deletePaths
.collect(Collectors.toList())); .addAll(nextSlice.getLogFiles().map(file -> file.getPath().toString()).collect(Collectors.toList()));
} }
} }
} }
@@ -126,21 +128,21 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
/** /**
* Selects the versions for file for cleaning, such that it <p> - Leaves the latest version of the * Selects the versions for file for cleaning, such that it
* file untouched - For older versions, - It leaves all the commits untouched which has occured in * <p>
* last <code>config.getCleanerCommitsRetained()</code> commits - It leaves ONE commit before this * - Leaves the latest version of the file untouched - For older versions, - It leaves all the commits untouched which
* window. We assume that the max(query execution time) == commit_batch_time * * has occured in last <code>config.getCleanerCommitsRetained()</code> commits - It leaves ONE commit before this
* config.getCleanerCommitsRetained(). This is 12 hours by default. This is essential to leave the * window. We assume that the max(query execution time) == commit_batch_time * config.getCleanerCommitsRetained().
* file used by the query thats running for the max time. <p> This provides the effect of having * This is 12 hours by default. This is essential to leave the file used by the query thats running for the max time.
* lookback into all changes that happened in the last X commits. (eg: if you retain 24 commits, * <p>
* and commit batch time is 30 mins, then you have 12 hrs of lookback) <p> This policy is the * This provides the effect of having lookback into all changes that happened in the last X commits. (eg: if you
* default. * retain 24 commits, and commit batch time is 30 mins, then you have 12 hrs of lookback)
* <p>
* This policy is the default.
*/ */
private List<String> getFilesToCleanKeepingLatestCommits(String partitionPath) private List<String> getFilesToCleanKeepingLatestCommits(String partitionPath) throws IOException {
throws IOException {
int commitsRetained = config.getCleanerCommitsRetained(); int commitsRetained = config.getCleanerCommitsRetained();
logger logger.info("Cleaning " + partitionPath + ", retaining latest " + commitsRetained + " commits. ");
.info("Cleaning " + partitionPath + ", retaining latest " + commitsRetained + " commits. ");
List<String> deletePaths = new ArrayList<>(); List<String> deletePaths = new ArrayList<>();
// Collect all the datafiles savepointed by all the savepoints // Collect all the datafiles savepointed by all the savepoints
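
Both retention policies documented above are driven by the writer configuration rather than by this class directly. A minimal sketch of how such a config might be built (builder and enum names are assumed from the org.apache.hudi.config / org.apache.hudi.common.model packages and are not part of this patch):

import org.apache.hudi.common.model.HoodieCleaningPolicy;
import org.apache.hudi.config.HoodieCompactionConfig;
import org.apache.hudi.config.HoodieWriteConfig;

// Keep files referenced by the last 24 commits (the default KEEP_LATEST_COMMITS policy)
HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder()
    .withPath("/tmp/hoodie/sample_table")                       // hypothetical base path
    .withCompactionConfig(HoodieCompactionConfig.newBuilder()
        .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS)
        .retainCommits(24)                                      // ~12h of lookback at 30-minute commit batches
        .build())
    .build();

// Alternative: keep only the latest versions of each file group
// HoodieCompactionConfig.newBuilder()
//     .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS)
//     .retainFileVersions(1).build();
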
@@ -150,8 +152,7 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
// determine if we have enough commits, to start cleaning. // determine if we have enough commits, to start cleaning.
if (commitTimeline.countInstants() > commitsRetained) { if (commitTimeline.countInstants() > commitsRetained) {
HoodieInstant earliestCommitToRetain = getEarliestCommitToRetain().get(); HoodieInstant earliestCommitToRetain = getEarliestCommitToRetain().get();
List<HoodieFileGroup> fileGroups = fileSystemView.getAllFileGroups(partitionPath) List<HoodieFileGroup> fileGroups = fileSystemView.getAllFileGroups(partitionPath).collect(Collectors.toList());
.collect(Collectors.toList());
for (HoodieFileGroup fileGroup : fileGroups) { for (HoodieFileGroup fileGroup : fileGroups) {
List<FileSlice> fileSliceList = fileGroup.getAllFileSlices().collect(Collectors.toList()); List<FileSlice> fileSliceList = fileGroup.getAllFileSlices().collect(Collectors.toList());
@@ -160,8 +161,8 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
} }
String lastVersion = fileSliceList.get(0).getBaseInstantTime(); String lastVersion = fileSliceList.get(0).getBaseInstantTime();
String lastVersionBeforeEarliestCommitToRetain = getLatestVersionBeforeCommit(fileSliceList, String lastVersionBeforeEarliestCommitToRetain =
earliestCommitToRetain); getLatestVersionBeforeCommit(fileSliceList, earliestCommitToRetain);
// Ensure there are more than 1 version of the file (we only clean old files from updates) // Ensure there are more than 1 version of the file (we only clean old files from updates)
// i.e always spare the last commit. // i.e always spare the last commit.
@@ -183,16 +184,14 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
} }
// Always keep the last commit // Always keep the last commit
if (!isFileSliceNeededForPendingCompaction(aSlice) if (!isFileSliceNeededForPendingCompaction(aSlice) && HoodieTimeline
&& HoodieTimeline .compareTimestamps(earliestCommitToRetain.getTimestamp(), fileCommitTime, HoodieTimeline.GREATER)) {
.compareTimestamps(earliestCommitToRetain.getTimestamp(), fileCommitTime,
HoodieTimeline.GREATER)) {
// this is a commit, that should be cleaned. // this is a commit, that should be cleaned.
aFile.ifPresent(hoodieDataFile -> deletePaths.add(hoodieDataFile.getPath())); aFile.ifPresent(hoodieDataFile -> deletePaths.add(hoodieDataFile.getPath()));
if (hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) { if (hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) {
// If merge on read, then clean the log files for the commits as well // If merge on read, then clean the log files for the commits as well
deletePaths.addAll(aSlice.getLogFiles().map(file -> file.getPath().toString()) deletePaths
.collect(Collectors.toList())); .addAll(aSlice.getLogFiles().map(file -> file.getPath().toString()).collect(Collectors.toList()));
} }
} }
} }
@@ -205,12 +204,10 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
/** /**
* Gets the latest version < commitTime. This version file could still be used by queries. * Gets the latest version < commitTime. This version file could still be used by queries.
*/ */
private String getLatestVersionBeforeCommit(List<FileSlice> fileSliceList, private String getLatestVersionBeforeCommit(List<FileSlice> fileSliceList, HoodieInstant commitTime) {
HoodieInstant commitTime) {
for (FileSlice file : fileSliceList) { for (FileSlice file : fileSliceList) {
String fileCommitTime = file.getBaseInstantTime(); String fileCommitTime = file.getBaseInstantTime();
if (HoodieTimeline if (HoodieTimeline.compareTimestamps(commitTime.getTimestamp(), fileCommitTime, HoodieTimeline.GREATER)) {
.compareTimestamps(commitTime.getTimestamp(), fileCommitTime, HoodieTimeline.GREATER)) {
// fileList is sorted on the reverse, so the first commit we find <= commitTime is the // fileList is sorted on the reverse, so the first commit we find <= commitTime is the
// one we want // one we want
return fileCommitTime; return fileCommitTime;
@@ -246,14 +243,14 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
int commitsRetained = config.getCleanerCommitsRetained(); int commitsRetained = config.getCleanerCommitsRetained();
if (config.getCleanerPolicy() == HoodieCleaningPolicy.KEEP_LATEST_COMMITS if (config.getCleanerPolicy() == HoodieCleaningPolicy.KEEP_LATEST_COMMITS
&& commitTimeline.countInstants() > commitsRetained) { && commitTimeline.countInstants() > commitsRetained) {
earliestCommitToRetain = commitTimeline earliestCommitToRetain = commitTimeline.nthInstant(commitTimeline.countInstants() - commitsRetained);
.nthInstant(commitTimeline.countInstants() - commitsRetained);
} }
return earliestCommitToRetain; return earliestCommitToRetain;
} }
/** /**
* Determine if file slice needed to be preserved for pending compaction * Determine if file slice needed to be preserved for pending compaction
*
* @param fileSlice File Slice * @param fileSlice File Slice
* @return true if file slice needs to be preserved, false otherwise. * @return true if file slice needs to be preserved, false otherwise.
*/ */
@@ -83,9 +83,8 @@ public class HoodieCommitArchiveLog {
try { try {
if (this.writer == null) { if (this.writer == null) {
return HoodieLogFormat.newWriterBuilder().onParentPath(archiveFilePath.getParent()) return HoodieLogFormat.newWriterBuilder().onParentPath(archiveFilePath.getParent())
.withFileId(archiveFilePath.getName()) .withFileId(archiveFilePath.getName()).withFileExtension(HoodieArchivedLogFile.ARCHIVE_EXTENSION)
.withFileExtension(HoodieArchivedLogFile.ARCHIVE_EXTENSION).withFs(metaClient.getFs()) .withFs(metaClient.getFs()).overBaseCommit("").build();
.overBaseCommit("").build();
} else { } else {
return this.writer; return this.writer;
} }
@@ -137,8 +136,7 @@ public class HoodieCommitArchiveLog {
// TODO: Handle ROLLBACK_ACTION in future // TODO: Handle ROLLBACK_ACTION in future
// ROLLBACK_ACTION is currently not defined in HoodieActiveTimeline // ROLLBACK_ACTION is currently not defined in HoodieActiveTimeline
HoodieTimeline cleanAndRollbackTimeline = table.getActiveTimeline() HoodieTimeline cleanAndRollbackTimeline = table.getActiveTimeline()
.getTimelineOfActions(Sets.newHashSet(HoodieTimeline.CLEAN_ACTION)) .getTimelineOfActions(Sets.newHashSet(HoodieTimeline.CLEAN_ACTION)).filterCompletedInstants();
.filterCompletedInstants();
Stream<HoodieInstant> instants = cleanAndRollbackTimeline.getInstants() Stream<HoodieInstant> instants = cleanAndRollbackTimeline.getInstants()
.collect(Collectors.groupingBy(s -> s.getAction())).entrySet().stream().map(i -> { .collect(Collectors.groupingBy(s -> s.getAction())).entrySet().stream().map(i -> {
if (i.getValue().size() > maxCommitsToKeep) { if (i.getValue().size() > maxCommitsToKeep) {
@@ -159,20 +157,16 @@ public class HoodieCommitArchiveLog {
Option<HoodieInstant> firstSavepoint = table.getCompletedSavepointTimeline().firstInstant(); Option<HoodieInstant> firstSavepoint = table.getCompletedSavepointTimeline().firstInstant();
if (!commitTimeline.empty() && commitTimeline.countInstants() > maxCommitsToKeep) { if (!commitTimeline.empty() && commitTimeline.countInstants() > maxCommitsToKeep) {
// Actually do the commits // Actually do the commits
instants = Stream.concat(instants, commitTimeline.getInstants() instants = Stream.concat(instants, commitTimeline.getInstants().filter(s -> {
.filter(s -> {
// if no savepoint present, then dont filter // if no savepoint present, then dont filter
return !(firstSavepoint.isPresent() && HoodieTimeline return !(firstSavepoint.isPresent() && HoodieTimeline.compareTimestamps(firstSavepoint.get().getTimestamp(),
.compareTimestamps(firstSavepoint.get().getTimestamp(), s.getTimestamp(), s.getTimestamp(), HoodieTimeline.LESSER_OR_EQUAL));
HoodieTimeline.LESSER_OR_EQUAL)); }).filter(s -> {
})
.filter(s -> {
// Ensure commits >= oldest pending compaction commit is retained // Ensure commits >= oldest pending compaction commit is retained
return oldestPendingCompactionInstant.map(instant -> { return oldestPendingCompactionInstant.map(instant -> {
return HoodieTimeline.compareTimestamps(instant.getTimestamp(), s.getTimestamp(), HoodieTimeline.GREATER); return HoodieTimeline.compareTimestamps(instant.getTimestamp(), s.getTimestamp(), HoodieTimeline.GREATER);
}).orElse(true); }).orElse(true);
}) }).limit(commitTimeline.countInstants() - minCommitsToKeep));
.limit(commitTimeline.countInstants() - minCommitsToKeep));
} }
return instants; return instants;
@@ -194,12 +188,9 @@ public class HoodieCommitArchiveLog {
} }
// Remove older meta-data from auxiliary path too // Remove older meta-data from auxiliary path too
Option<HoodieInstant> latestCommitted = Option<HoodieInstant> latestCommitted = Option.fromJavaOptional(archivedInstants.stream().filter(i -> {
Option.fromJavaOptional(archivedInstants.stream() return i.isCompleted() && (i.getAction().equals(HoodieTimeline.COMMIT_ACTION)
.filter(i -> { || (i.getAction().equals(HoodieTimeline.DELTA_COMMIT_ACTION)));
return i.isCompleted()
&& (i.getAction().equals(HoodieTimeline.COMMIT_ACTION) || (i.getAction().equals(
HoodieTimeline.DELTA_COMMIT_ACTION)));
}).max(Comparator.comparing(HoodieInstant::getTimestamp))); }).max(Comparator.comparing(HoodieInstant::getTimestamp)));
if (latestCommitted.isPresent()) { if (latestCommitted.isPresent()) {
success &= deleteAllInstantsOlderorEqualsInAuxMetaFolder(latestCommitted.get()); success &= deleteAllInstantsOlderorEqualsInAuxMetaFolder(latestCommitted.get());
@@ -214,12 +205,9 @@ public class HoodieCommitArchiveLog {
* @return success if all eligible file deleted successfully * @return success if all eligible file deleted successfully
* @throws IOException in case of error * @throws IOException in case of error
*/ */
private boolean deleteAllInstantsOlderorEqualsInAuxMetaFolder(HoodieInstant thresholdInstant) private boolean deleteAllInstantsOlderorEqualsInAuxMetaFolder(HoodieInstant thresholdInstant) throws IOException {
throws IOException { List<HoodieInstant> instants = HoodieTableMetaClient.scanHoodieInstantsFromFileSystem(metaClient.getFs(),
List<HoodieInstant> instants = new Path(metaClient.getMetaAuxiliaryPath()), HoodieActiveTimeline.VALID_EXTENSIONS_IN_ACTIVE_TIMELINE);
HoodieTableMetaClient.scanHoodieInstantsFromFileSystem(metaClient.getFs(),
new Path(metaClient.getMetaAuxiliaryPath()),
HoodieActiveTimeline.VALID_EXTENSIONS_IN_ACTIVE_TIMELINE);
List<HoodieInstant> instantsToBeDeleted = List<HoodieInstant> instantsToBeDeleted =
instants.stream().filter(instant1 -> HoodieTimeline.compareTimestamps(instant1.getTimestamp(), instants.stream().filter(instant1 -> HoodieTimeline.compareTimestamps(instant1.getTimestamp(),
@@ -239,8 +227,7 @@ public class HoodieCommitArchiveLog {
public void archive(List<HoodieInstant> instants) throws HoodieCommitException { public void archive(List<HoodieInstant> instants) throws HoodieCommitException {
try { try {
HoodieTimeline commitTimeline = metaClient.getActiveTimeline().getAllCommitsTimeline() HoodieTimeline commitTimeline = metaClient.getActiveTimeline().getAllCommitsTimeline().filterCompletedInstants();
.filterCompletedInstants();
Schema wrapperSchema = HoodieArchivedMetaEntry.getClassSchema(); Schema wrapperSchema = HoodieArchivedMetaEntry.getClassSchema();
log.info("Wrapper schema " + wrapperSchema.toString()); log.info("Wrapper schema " + wrapperSchema.toString());
List<IndexedRecord> records = new ArrayList<>(); List<IndexedRecord> records = new ArrayList<>();
@@ -277,15 +264,14 @@ public class HoodieCommitArchiveLog {
} }
} }
private IndexedRecord convertToAvroRecord(HoodieTimeline commitTimeline, private IndexedRecord convertToAvroRecord(HoodieTimeline commitTimeline, HoodieInstant hoodieInstant)
HoodieInstant hoodieInstant) throws IOException { throws IOException {
HoodieArchivedMetaEntry archivedMetaWrapper = new HoodieArchivedMetaEntry(); HoodieArchivedMetaEntry archivedMetaWrapper = new HoodieArchivedMetaEntry();
archivedMetaWrapper.setCommitTime(hoodieInstant.getTimestamp()); archivedMetaWrapper.setCommitTime(hoodieInstant.getTimestamp());
switch (hoodieInstant.getAction()) { switch (hoodieInstant.getAction()) {
case HoodieTimeline.CLEAN_ACTION: { case HoodieTimeline.CLEAN_ACTION: {
archivedMetaWrapper.setHoodieCleanMetadata(AvroUtils archivedMetaWrapper.setHoodieCleanMetadata(AvroUtils
.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(), .deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(), HoodieCleanMetadata.class));
HoodieCleanMetadata.class));
archivedMetaWrapper.setActionType(ActionType.clean.name()); archivedMetaWrapper.setActionType(ActionType.clean.name());
break; break;
} }
@@ -297,16 +283,14 @@ public class HoodieCommitArchiveLog {
break; break;
} }
case HoodieTimeline.ROLLBACK_ACTION: { case HoodieTimeline.ROLLBACK_ACTION: {
archivedMetaWrapper.setHoodieRollbackMetadata(AvroUtils archivedMetaWrapper.setHoodieRollbackMetadata(AvroUtils.deserializeAvroMetadata(
.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(), commitTimeline.getInstantDetails(hoodieInstant).get(), HoodieRollbackMetadata.class));
HoodieRollbackMetadata.class));
archivedMetaWrapper.setActionType(ActionType.rollback.name()); archivedMetaWrapper.setActionType(ActionType.rollback.name());
break; break;
} }
case HoodieTimeline.SAVEPOINT_ACTION: { case HoodieTimeline.SAVEPOINT_ACTION: {
archivedMetaWrapper.setHoodieSavePointMetadata(AvroUtils archivedMetaWrapper.setHoodieSavePointMetadata(AvroUtils.deserializeAvroMetadata(
.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(), commitTimeline.getInstantDetails(hoodieInstant).get(), HoodieSavepointMetadata.class));
HoodieSavepointMetadata.class));
archivedMetaWrapper.setActionType(ActionType.savepoint.name()); archivedMetaWrapper.setActionType(ActionType.savepoint.name());
break; break;
} }
@@ -328,8 +312,8 @@ public class HoodieCommitArchiveLog {
ObjectMapper mapper = new ObjectMapper(); ObjectMapper mapper = new ObjectMapper();
// Need this to ignore other public get() methods // Need this to ignore other public get() methods
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
org.apache.hudi.avro.model.HoodieCommitMetadata avroMetaData = mapper org.apache.hudi.avro.model.HoodieCommitMetadata avroMetaData =
.convertValue(hoodieCommitMetadata, org.apache.hudi.avro.model.HoodieCommitMetadata.class); mapper.convertValue(hoodieCommitMetadata, org.apache.hudi.avro.model.HoodieCommitMetadata.class);
// Do not archive Rolling Stats, cannot set to null since AVRO will throw null pointer // Do not archive Rolling Stats, cannot set to null since AVRO will throw null pointer
avroMetaData.getExtraMetadata().put(HoodieRollingStatMetadata.ROLLING_STAT_METADATA_KEY, ""); avroMetaData.getExtraMetadata().put(HoodieRollingStatMetadata.ROLLING_STAT_METADATA_KEY, "");
return avroMetaData; return avroMetaData;
@@ -66,11 +66,10 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieWri
new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath)); new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath));
partitionMetadata.trySave(TaskContext.getPartitionId()); partitionMetadata.trySave(TaskContext.getPartitionId());
createMarkerFile(partitionPath); createMarkerFile(partitionPath);
this.storageWriter = HoodieStorageWriterFactory this.storageWriter =
.getStorageWriter(commitTime, path, hoodieTable, config, writerSchema); HoodieStorageWriterFactory.getStorageWriter(commitTime, path, hoodieTable, config, writerSchema);
} catch (IOException e) { } catch (IOException e) {
throw new HoodieInsertException( throw new HoodieInsertException("Failed to initialize HoodieStorageWriter for path " + path, e);
"Failed to initialize HoodieStorageWriter for path " + path, e);
} }
logger.info("New CreateHandle for partition :" + partitionPath + " with fileId " + fileId); logger.info("New CreateHandle for partition :" + partitionPath + " with fileId " + fileId);
} }
@@ -136,8 +135,7 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieWri
} }
} }
} catch (IOException io) { } catch (IOException io) {
throw new HoodieInsertException( throw new HoodieInsertException("Failed to insert records for path " + path, io);
"Failed to insert records for path " + path, io);
} }
} }
@@ -151,8 +149,8 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieWri
*/ */
@Override @Override
public WriteStatus close() { public WriteStatus close() {
logger.info("Closing the file " + writeStatus.getFileId() + " as we are done with all the records " logger
+ recordsWritten); .info("Closing the file " + writeStatus.getFileId() + " as we are done with all the records " + recordsWritten);
try { try {
storageWriter.close(); storageWriter.close();
@@ -174,8 +172,8 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieWri
stat.setRuntimeStats(runtimeStats); stat.setRuntimeStats(runtimeStats);
writeStatus.setStat(stat); writeStatus.setStat(stat);
logger.info(String.format("CreateHandle for partitionPath %s fileID %s, took %d ms.", logger.info(String.format("CreateHandle for partitionPath %s fileID %s, took %d ms.", stat.getPartitionPath(),
stat.getPartitionPath(), stat.getFileId(), runtimeStats.getTotalCreateTime())); stat.getFileId(), runtimeStats.getTotalCreateTime()));
return writeStatus; return writeStatus;
} catch (IOException e) { } catch (IOException e) {
@@ -67,15 +67,15 @@ public class HoodieKeyLookupHandle<T extends HoodieRecordPayload> extends Hoodie
/** /**
* Given a list of row keys and one file, return only row keys existing in that file. * Given a list of row keys and one file, return only row keys existing in that file.
*/ */
public static List<String> checkCandidatesAgainstFile(Configuration configuration, public static List<String> checkCandidatesAgainstFile(Configuration configuration, List<String> candidateRecordKeys,
List<String> candidateRecordKeys, Path filePath) throws HoodieIndexException { Path filePath) throws HoodieIndexException {
List<String> foundRecordKeys = new ArrayList<>(); List<String> foundRecordKeys = new ArrayList<>();
try { try {
// Load all rowKeys from the file, to double-confirm // Load all rowKeys from the file, to double-confirm
if (!candidateRecordKeys.isEmpty()) { if (!candidateRecordKeys.isEmpty()) {
HoodieTimer timer = new HoodieTimer().startTimer(); HoodieTimer timer = new HoodieTimer().startTimer();
Set<String> fileRowKeys = ParquetUtils.filterParquetRowKeys(configuration, filePath, Set<String> fileRowKeys =
new HashSet<>(candidateRecordKeys)); ParquetUtils.filterParquetRowKeys(configuration, filePath, new HashSet<>(candidateRecordKeys));
foundRecordKeys.addAll(fileRowKeys); foundRecordKeys.addAll(fileRowKeys);
logger.info(String.format("Checked keys against file %s, in %d ms. #candidates (%d) #found (%d)", filePath, logger.info(String.format("Checked keys against file %s, in %d ms. #candidates (%d) #found (%d)", filePath,
timer.endTimer(), candidateRecordKeys.size(), foundRecordKeys.size())); timer.endTimer(), candidateRecordKeys.size(), foundRecordKeys.size()));
@@ -112,11 +112,11 @@ public class HoodieKeyLookupHandle<T extends HoodieRecordPayload> extends Hoodie
} }
HoodieDataFile dataFile = getLatestDataFile(); HoodieDataFile dataFile = getLatestDataFile();
List<String> matchingKeys = checkCandidatesAgainstFile(hoodieTable.getHadoopConf(), candidateRecordKeys, List<String> matchingKeys =
new Path(dataFile.getPath())); checkCandidatesAgainstFile(hoodieTable.getHadoopConf(), candidateRecordKeys, new Path(dataFile.getPath()));
logger.info(String.format("Total records (%d), bloom filter candidates (%d)/fp(%d), actual matches (%d)", logger.info(
totalKeysChecked, candidateRecordKeys.size(), candidateRecordKeys.size() - matchingKeys.size(), String.format("Total records (%d), bloom filter candidates (%d)/fp(%d), actual matches (%d)", totalKeysChecked,
matchingKeys.size())); candidateRecordKeys.size(), candidateRecordKeys.size() - matchingKeys.size(), matchingKeys.size()));
return new KeyLookupResult(partitionPathFilePair.getRight(), partitionPathFilePair.getLeft(), return new KeyLookupResult(partitionPathFilePair.getRight(), partitionPathFilePair.getLeft(),
dataFile.getCommitTime(), matchingKeys); dataFile.getCommitTime(), matchingKeys);
} }
@@ -71,8 +71,7 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
Iterator<HoodieRecord<T>> recordItr, String fileId) { Iterator<HoodieRecord<T>> recordItr, String fileId) {
super(config, commitTime, fileId, hoodieTable); super(config, commitTime, fileId, hoodieTable);
String partitionPath = init(fileId, recordItr); String partitionPath = init(fileId, recordItr);
init(fileId, partitionPath, init(fileId, partitionPath, hoodieTable.getROFileSystemView().getLatestDataFile(partitionPath, fileId).get());
hoodieTable.getROFileSystemView().getLatestDataFile(partitionPath, fileId).get());
} }
/** /**
@@ -83,8 +82,8 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
super(config, commitTime, fileId, hoodieTable); super(config, commitTime, fileId, hoodieTable);
this.keyToNewRecords = keyToNewRecords; this.keyToNewRecords = keyToNewRecords;
this.useWriterSchema = true; this.useWriterSchema = true;
init(fileId, keyToNewRecords.get(keyToNewRecords.keySet().stream().findFirst().get()) init(fileId, keyToNewRecords.get(keyToNewRecords.keySet().stream().findFirst().get()).getPartitionPath(),
.getPartitionPath(), dataFileToBeMerged); dataFileToBeMerged);
} }
@@ -160,14 +159,12 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath)); new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath));
partitionMetadata.trySave(TaskContext.getPartitionId()); partitionMetadata.trySave(TaskContext.getPartitionId());
oldFilePath = new Path( oldFilePath = new Path(config.getBasePath() + "/" + partitionPath + "/" + latestValidFilePath);
config.getBasePath() + "/" + partitionPath + "/" + latestValidFilePath);
String relativePath = new Path((partitionPath.isEmpty() ? "" : partitionPath + "/") String relativePath = new Path((partitionPath.isEmpty() ? "" : partitionPath + "/")
+ FSUtils.makeDataFileName(instantTime, writeToken, fileId)).toString(); + FSUtils.makeDataFileName(instantTime, writeToken, fileId)).toString();
newFilePath = new Path(config.getBasePath(), relativePath); newFilePath = new Path(config.getBasePath(), relativePath);
logger.info(String logger.info(String.format("Merging new data into oldPath %s, as newPath %s", oldFilePath.toString(),
.format("Merging new data into oldPath %s, as newPath %s", oldFilePath.toString(),
newFilePath.toString())); newFilePath.toString()));
// file name is same for all records, in this bunch // file name is same for all records, in this bunch
writeStatus.setFileId(fileId); writeStatus.setFileId(fileId);
@@ -180,13 +177,12 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
createMarkerFile(partitionPath); createMarkerFile(partitionPath);
// Create the writer for writing the new version file // Create the writer for writing the new version file
storageWriter = HoodieStorageWriterFactory storageWriter =
.getStorageWriter(instantTime, newFilePath, hoodieTable, config, writerSchema); HoodieStorageWriterFactory.getStorageWriter(instantTime, newFilePath, hoodieTable, config, writerSchema);
} catch (IOException io) { } catch (IOException io) {
logger.error("Error in update task at commit " + instantTime, io); logger.error("Error in update task at commit " + instantTime, io);
writeStatus.setGlobalError(io); writeStatus.setGlobalError(io);
throw new HoodieUpsertException( throw new HoodieUpsertException("Failed to initialize HoodieUpdateHandle for FileId: " + fileId + " on commit "
"Failed to initialize HoodieUpdateHandle for FileId: " + fileId + " on commit "
+ instantTime + " on path " + hoodieTable.getMetaClient().getBasePath(), io); + instantTime + " on path " + hoodieTable.getMetaClient().getBasePath(), io);
} }
} }
@@ -217,10 +213,8 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
logger.info("Number of entries in MemoryBasedMap => " logger.info("Number of entries in MemoryBasedMap => "
+ ((ExternalSpillableMap) keyToNewRecords).getInMemoryMapNumEntries() + ((ExternalSpillableMap) keyToNewRecords).getInMemoryMapNumEntries()
+ "Total size in bytes of MemoryBasedMap => " + "Total size in bytes of MemoryBasedMap => "
+ ((ExternalSpillableMap) keyToNewRecords).getCurrentInMemoryMapSize() + ((ExternalSpillableMap) keyToNewRecords).getCurrentInMemoryMapSize() + "Number of entries in DiskBasedMap => "
+ "Number of entries in DiskBasedMap => " + ((ExternalSpillableMap) keyToNewRecords).getDiskBasedMapNumEntries() + "Size of file spilled to disk => "
+ ((ExternalSpillableMap) keyToNewRecords).getDiskBasedMapNumEntries()
+ "Size of file spilled to disk => "
+ ((ExternalSpillableMap) keyToNewRecords).getSizeOfFileOnDiskInBytes()); + ((ExternalSpillableMap) keyToNewRecords).getSizeOfFileOnDiskInBytes());
return partitionPath; return partitionPath;
} }
@@ -258,8 +252,7 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
} }
/** /**
* Go through an old record. Here if we detect a newer version shows up, we write the new one to * Go through an old record. Here if we detect a newer version shows up, we write the new one to the file.
* the file.
*/ */
public void write(GenericRecord oldRecord) { public void write(GenericRecord oldRecord) {
String key = oldRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); String key = oldRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
@@ -269,12 +262,12 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
// writing the first record. So make a copy of the record to be merged // writing the first record. So make a copy of the record to be merged
HoodieRecord<T> hoodieRecord = new HoodieRecord<>(keyToNewRecords.get(key)); HoodieRecord<T> hoodieRecord = new HoodieRecord<>(keyToNewRecords.get(key));
try { try {
Option<IndexedRecord> combinedAvroRecord = hoodieRecord.getData() Option<IndexedRecord> combinedAvroRecord =
.combineAndGetUpdateValue(oldRecord, useWriterSchema ? writerSchema : originalSchema); hoodieRecord.getData().combineAndGetUpdateValue(oldRecord, useWriterSchema ? writerSchema : originalSchema);
if (writeUpdateRecord(hoodieRecord, combinedAvroRecord)) { if (writeUpdateRecord(hoodieRecord, combinedAvroRecord)) {
/* ONLY WHEN /*
* 1) we have an update for this key AND * ONLY WHEN 1) we have an update for this key AND 2) We are able to successfully write the the combined new
* 2) We are able to successfully write the the combined new value * value
* *
* We no longer need to copy the old record over. * We no longer need to copy the old record over.
*/ */
@@ -282,26 +275,24 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
} }
writtenRecordKeys.add(key); writtenRecordKeys.add(key);
} catch (Exception e) { } catch (Exception e) {
throw new HoodieUpsertException( throw new HoodieUpsertException("Failed to combine/merge new record with old value in storage, for new record {"
"Failed to combine/merge new record with old value in storage, for new record {"
+ keyToNewRecords.get(key) + "}, old value {" + oldRecord + "}", e); + keyToNewRecords.get(key) + "}, old value {" + oldRecord + "}", e);
} }
} }
if (copyOldRecord) { if (copyOldRecord) {
// this should work as it is, since this is an existing record // this should work as it is, since this is an existing record
String errMsg = "Failed to merge old record into new file for key " + key + " from old file " String errMsg = "Failed to merge old record into new file for key " + key + " from old file " + getOldFilePath()
+ getOldFilePath() + " to new file " + newFilePath; + " to new file " + newFilePath;
try { try {
storageWriter.writeAvro(key, oldRecord); storageWriter.writeAvro(key, oldRecord);
} catch (ClassCastException e) { } catch (ClassCastException e) {
logger.error("Schema mismatch when rewriting old record " + oldRecord + " from file " logger.error("Schema mismatch when rewriting old record " + oldRecord + " from file " + getOldFilePath()
+ getOldFilePath() + " to file " + newFilePath + " with writerSchema " + writerSchema + " to file " + newFilePath + " with writerSchema " + writerSchema.toString(true));
.toString(true));
throw new HoodieUpsertException(errMsg, e); throw new HoodieUpsertException(errMsg, e);
} catch (IOException e) { } catch (IOException e) {
logger.error("Failed to merge old record into new file for key " + key + " from old file " logger.error("Failed to merge old record into new file for key " + key + " from old file " + getOldFilePath()
+ getOldFilePath() + " to new file " + newFilePath, e); + " to new file " + newFilePath, e);
throw new HoodieUpsertException(errMsg, e); throw new HoodieUpsertException(errMsg, e);
} }
recordsWritten++; recordsWritten++;
@@ -344,8 +335,8 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
runtimeStats.setTotalUpsertTime(timer.endTimer()); runtimeStats.setTotalUpsertTime(timer.endTimer());
stat.setRuntimeStats(runtimeStats); stat.setRuntimeStats(runtimeStats);
logger.info(String.format("MergeHandle for partitionPath %s fileID %s, took %d ms.", logger.info(String.format("MergeHandle for partitionPath %s fileID %s, took %d ms.", stat.getPartitionPath(),
stat.getPartitionPath(), stat.getFileId(), runtimeStats.getTotalUpsertTime())); stat.getFileId(), runtimeStats.getTotalUpsertTime()));
return writeStatus; return writeStatus;
} catch (IOException e) { } catch (IOException e) {
View File
@@ -61,8 +61,7 @@ public abstract class HoodieWriteHandle<T extends HoodieRecordPayload> extends H
this.writerSchema = createHoodieWriteSchema(originalSchema); this.writerSchema = createHoodieWriteSchema(originalSchema);
this.timer = new HoodieTimer().startTimer(); this.timer = new HoodieTimer().startTimer();
this.writeStatus = (WriteStatus) ReflectionUtils.loadClass(config.getWriteStatusClassName(), this.writeStatus = (WriteStatus) ReflectionUtils.loadClass(config.getWriteStatusClassName(),
!hoodieTable.getIndex().isImplicitWithStorage(), !hoodieTable.getIndex().isImplicitWithStorage(), config.getWriteStatusFailureFraction());
config.getWriteStatusFailureFraction());
} }
/** /**
View File
@@ -45,15 +45,12 @@ public interface HoodieCompactor extends Serializable {
* @return Compaction Plan * @return Compaction Plan
* @throws IOException when encountering errors * @throws IOException when encountering errors
*/ */
HoodieCompactionPlan generateCompactionPlan(JavaSparkContext jsc, HoodieCompactionPlan generateCompactionPlan(JavaSparkContext jsc, HoodieTable hoodieTable, HoodieWriteConfig config,
HoodieTable hoodieTable, HoodieWriteConfig config, String compactionCommitTime, String compactionCommitTime, Set<HoodieFileGroupId> fgIdsInPendingCompactions) throws IOException;
Set<HoodieFileGroupId> fgIdsInPendingCompactions)
throws IOException;
/** /**
* Execute compaction operations and report back status * Execute compaction operations and report back status
*/ */
JavaRDD<WriteStatus> compact(JavaSparkContext jsc, JavaRDD<WriteStatus> compact(JavaSparkContext jsc, HoodieCompactionPlan compactionPlan, HoodieTable hoodieTable,
HoodieCompactionPlan compactionPlan, HoodieTable hoodieTable, HoodieWriteConfig config, HoodieWriteConfig config, String compactionInstantTime) throws IOException;
String compactionInstantTime) throws IOException;
} }
View File
@@ -63,9 +63,9 @@ import org.apache.spark.util.AccumulatorV2;
import org.apache.spark.util.LongAccumulator; import org.apache.spark.util.LongAccumulator;
/** /**
* HoodieRealtimeTableCompactor compacts a hoodie table with merge on read storage. Computes all * HoodieRealtimeTableCompactor compacts a hoodie table with merge on read storage. Computes all possible compactions,
* possible compactions, passes it through a CompactionFilter and executes all the compactions and * passes it through a CompactionFilter and executes all the compactions and writes a new version of base files and make
* writes a new version of base files and make a normal commit * a normal commit
* *
* @see HoodieCompactor * @see HoodieCompactor
*/ */
@@ -78,9 +78,8 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
private AccumulatorV2<Long, Long> totalFileSlices; private AccumulatorV2<Long, Long> totalFileSlices;
@Override @Override
public JavaRDD<WriteStatus> compact(JavaSparkContext jsc, public JavaRDD<WriteStatus> compact(JavaSparkContext jsc, HoodieCompactionPlan compactionPlan,
HoodieCompactionPlan compactionPlan, HoodieTable hoodieTable, HoodieWriteConfig config, HoodieTable hoodieTable, HoodieWriteConfig config, String compactionInstantTime) throws IOException {
String compactionInstantTime) throws IOException {
if (compactionPlan == null || (compactionPlan.getOperations() == null) if (compactionPlan == null || (compactionPlan.getOperations() == null)
|| (compactionPlan.getOperations().isEmpty())) { || (compactionPlan.getOperations().isEmpty())) {
return jsc.emptyRDD(); return jsc.emptyRDD();
@@ -88,41 +87,36 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
// Compacting is very similar to applying updates to existing file // Compacting is very similar to applying updates to existing file
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc); HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc);
List<CompactionOperation> operations = compactionPlan.getOperations().stream().map( List<CompactionOperation> operations = compactionPlan.getOperations().stream()
CompactionOperation::convertFromAvroRecordInstance).collect(toList()); .map(CompactionOperation::convertFromAvroRecordInstance).collect(toList());
log.info("Compactor compacting " + operations + " files"); log.info("Compactor compacting " + operations + " files");
return jsc.parallelize(operations, operations.size()) return jsc.parallelize(operations, operations.size())
.map(s -> compact(table, metaClient, config, s, compactionInstantTime)) .map(s -> compact(table, metaClient, config, s, compactionInstantTime)).flatMap(List::iterator);
.flatMap(List::iterator);
} }
private List<WriteStatus> compact(HoodieCopyOnWriteTable hoodieCopyOnWriteTable, HoodieTableMetaClient metaClient, private List<WriteStatus> compact(HoodieCopyOnWriteTable hoodieCopyOnWriteTable, HoodieTableMetaClient metaClient,
HoodieWriteConfig config, HoodieWriteConfig config, CompactionOperation operation, String commitTime) throws IOException {
CompactionOperation operation, String commitTime) throws IOException {
FileSystem fs = metaClient.getFs(); FileSystem fs = metaClient.getFs();
Schema readerSchema = HoodieAvroUtils Schema readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()));
.addMetadataFields(new Schema.Parser().parse(config.getSchema()));
log.info("Compacting base " + operation.getDataFilePath() + " with delta files " + operation log.info("Compacting base " + operation.getDataFilePath() + " with delta files " + operation.getDeltaFilePaths()
.getDeltaFilePaths() + " for commit " + commitTime); + " for commit " + commitTime);
// TODO - FIX THIS // TODO - FIX THIS
// Reads the entire avro file. Always only specific blocks should be read from the avro file // Reads the entire avro file. Always only specific blocks should be read from the avro file
// (failure recover). // (failure recover).
// Load all the delta commits since the last compaction commit and get all the blocks to be // Load all the delta commits since the last compaction commit and get all the blocks to be
// loaded and load it using CompositeAvroLogReader // loaded and load it using CompositeAvroLogReader
// Since a DeltaCommit is not defined yet, reading all the records. revisit this soon. // Since a DeltaCommit is not defined yet, reading all the records. revisit this soon.
String maxInstantTime = metaClient.getActiveTimeline() String maxInstantTime = metaClient
.getTimelineOfActions( .getActiveTimeline().getTimelineOfActions(Sets.newHashSet(HoodieTimeline.COMMIT_ACTION,
Sets.newHashSet(HoodieTimeline.COMMIT_ACTION, HoodieTimeline.ROLLBACK_ACTION, HoodieTimeline.ROLLBACK_ACTION, HoodieTimeline.DELTA_COMMIT_ACTION))
HoodieTimeline.DELTA_COMMIT_ACTION))
.filterCompletedInstants().lastInstant().get().getTimestamp(); .filterCompletedInstants().lastInstant().get().getTimestamp();
log.info("MaxMemoryPerCompaction => " + config.getMaxMemoryPerCompaction()); log.info("MaxMemoryPerCompaction => " + config.getMaxMemoryPerCompaction());
HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, metaClient.getBasePath(),
metaClient.getBasePath(), operation.getDeltaFilePaths(), readerSchema, maxInstantTime, operation.getDeltaFilePaths(), readerSchema, maxInstantTime, config.getMaxMemoryPerCompaction(),
config.getMaxMemoryPerCompaction(), config.getCompactionLazyBlockReadEnabled(), config.getCompactionLazyBlockReadEnabled(), config.getCompactionReverseLogReadEnabled(),
config.getCompactionReverseLogReadEnabled(), config.getMaxDFSStreamBufferSize(), config.getMaxDFSStreamBufferSize(), config.getSpillableMapBasePath());
config.getSpillableMapBasePath());
if (!scanner.iterator().hasNext()) { if (!scanner.iterator().hasNext()) {
return Lists.<WriteStatus>newArrayList(); return Lists.<WriteStatus>newArrayList();
} }
@@ -134,21 +128,20 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
// If the dataFile is present, there is a base parquet file present, perform updates else perform inserts into a // If the dataFile is present, there is a base parquet file present, perform updates else perform inserts into a
// new base parquet file. // new base parquet file.
if (oldDataFileOpt.isPresent()) { if (oldDataFileOpt.isPresent()) {
result = hoodieCopyOnWriteTable result = hoodieCopyOnWriteTable.handleUpdate(commitTime, operation.getFileId(), scanner.getRecords(),
.handleUpdate(commitTime, operation.getFileId(), scanner.getRecords(), oldDataFileOpt.get()); oldDataFileOpt.get());
} else { } else {
result = hoodieCopyOnWriteTable result = hoodieCopyOnWriteTable.handleInsert(commitTime, operation.getPartitionPath(), operation.getFileId(),
.handleInsert(commitTime, operation.getPartitionPath(), operation.getFileId(), scanner.iterator()); scanner.iterator());
} }
Iterable<List<WriteStatus>> resultIterable = () -> result; Iterable<List<WriteStatus>> resultIterable = () -> result;
return StreamSupport.stream(resultIterable.spliterator(), false).flatMap(Collection::stream) return StreamSupport.stream(resultIterable.spliterator(), false).flatMap(Collection::stream).peek(s -> {
.peek(s -> {
s.getStat().setTotalUpdatedRecordsCompacted(scanner.getNumMergedRecordsInLog()); s.getStat().setTotalUpdatedRecordsCompacted(scanner.getNumMergedRecordsInLog());
s.getStat().setTotalLogFilesCompacted(scanner.getTotalLogFiles()); s.getStat().setTotalLogFilesCompacted(scanner.getTotalLogFiles());
s.getStat().setTotalLogRecords(scanner.getTotalLogRecords()); s.getStat().setTotalLogRecords(scanner.getTotalLogRecords());
s.getStat().setPartitionPath(operation.getPartitionPath()); s.getStat().setPartitionPath(operation.getPartitionPath());
s.getStat().setTotalLogSizeCompacted(operation.getMetrics().get( s.getStat()
CompactionStrategy.TOTAL_LOG_FILE_SIZE).longValue()); .setTotalLogSizeCompacted(operation.getMetrics().get(CompactionStrategy.TOTAL_LOG_FILE_SIZE).longValue());
s.getStat().setTotalLogBlocks(scanner.getTotalLogBlocks()); s.getStat().setTotalLogBlocks(scanner.getTotalLogBlocks());
s.getStat().setTotalCorruptLogBlock(scanner.getTotalCorruptBlocks()); s.getStat().setTotalCorruptLogBlock(scanner.getTotalCorruptBlocks());
s.getStat().setTotalRollbackBlocks(scanner.getTotalRollbacks()); s.getStat().setTotalRollbackBlocks(scanner.getTotalRollbacks());
@@ -159,27 +152,24 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
} }
@Override @Override
public HoodieCompactionPlan generateCompactionPlan(JavaSparkContext jsc, public HoodieCompactionPlan generateCompactionPlan(JavaSparkContext jsc, HoodieTable hoodieTable,
HoodieTable hoodieTable, HoodieWriteConfig config, String compactionCommitTime, HoodieWriteConfig config, String compactionCommitTime, Set<HoodieFileGroupId> fgIdsInPendingCompactions)
Set<HoodieFileGroupId> fgIdsInPendingCompactions) throws IOException { throws IOException {
totalLogFiles = new LongAccumulator(); totalLogFiles = new LongAccumulator();
totalFileSlices = new LongAccumulator(); totalFileSlices = new LongAccumulator();
jsc.sc().register(totalLogFiles); jsc.sc().register(totalLogFiles);
jsc.sc().register(totalFileSlices); jsc.sc().register(totalFileSlices);
Preconditions Preconditions.checkArgument(hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ,
.checkArgument(hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ, "HoodieRealtimeTableCompactor can only compact table of type " + HoodieTableType.MERGE_ON_READ + " and not "
"HoodieRealtimeTableCompactor can only compact table of type " + hoodieTable.getMetaClient().getTableType().name());
+ HoodieTableType.MERGE_ON_READ + " and not " + hoodieTable.getMetaClient()
.getTableType().name());
// TODO : check if maxMemory is not greater than JVM or spark.executor memory // TODO : check if maxMemory is not greater than JVM or spark.executor memory
// TODO - rollback any compactions in flight // TODO - rollback any compactions in flight
HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
log.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommitTime); log.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommitTime);
List<String> partitionPaths = FSUtils List<String> partitionPaths = FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(),
.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(),
config.shouldAssumeDatePartitioning()); config.shouldAssumeDatePartitioning());
// filter the partition paths if needed to reduce list status // filter the partition paths if needed to reduce list status
@@ -192,16 +182,12 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
RealtimeView fileSystemView = hoodieTable.getRTFileSystemView(); RealtimeView fileSystemView = hoodieTable.getRTFileSystemView();
log.info("Compaction looking for files to compact in " + partitionPaths + " partitions"); log.info("Compaction looking for files to compact in " + partitionPaths + " partitions");
List<HoodieCompactionOperation> operations = List<HoodieCompactionOperation> operations = jsc.parallelize(partitionPaths, partitionPaths.size())
jsc.parallelize(partitionPaths, partitionPaths.size())
.flatMap((FlatMapFunction<String, CompactionOperation>) partitionPath -> fileSystemView .flatMap((FlatMapFunction<String, CompactionOperation>) partitionPath -> fileSystemView
.getLatestFileSlices(partitionPath) .getLatestFileSlices(partitionPath)
.filter(slice -> .filter(slice -> !fgIdsInPendingCompactions.contains(slice.getFileGroupId())).map(s -> {
!fgIdsInPendingCompactions.contains(slice.getFileGroupId())) List<HoodieLogFile> logFiles =
.map( s.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList());
s -> {
List<HoodieLogFile> logFiles = s.getLogFiles().sorted(HoodieLogFile
.getLogFileComparator()).collect(Collectors.toList());
totalLogFiles.add((long) logFiles.size()); totalLogFiles.add((long) logFiles.size());
totalFileSlices.add(1L); totalFileSlices.add(1L);
// Avro generated classes are not inheriting Serializable. Using CompactionOperation POJO // Avro generated classes are not inheriting Serializable. Using CompactionOperation POJO
@@ -210,10 +196,8 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
Option<HoodieDataFile> dataFile = s.getDataFile(); Option<HoodieDataFile> dataFile = s.getDataFile();
return new CompactionOperation(dataFile, partitionPath, logFiles, return new CompactionOperation(dataFile, partitionPath, logFiles,
config.getCompactionStrategy().captureMetrics(config, dataFile, partitionPath, logFiles)); config.getCompactionStrategy().captureMetrics(config, dataFile, partitionPath, logFiles));
}) }).filter(c -> !c.getDeltaFilePaths().isEmpty()).collect(toList()).iterator())
.filter(c -> !c.getDeltaFilePaths().isEmpty()) .collect().stream().map(CompactionUtils::buildHoodieCompactionOperation).collect(toList());
.collect(toList()).iterator()).collect().stream().map(CompactionUtils::buildHoodieCompactionOperation)
.collect(toList());
log.info("Total of " + operations.size() + " compactions are retrieved"); log.info("Total of " + operations.size() + " compactions are retrieved");
log.info("Total number of latest files slices " + totalFileSlices.value()); log.info("Total number of latest files slices " + totalFileSlices.value());
log.info("Total number of log files " + totalLogFiles.value()); log.info("Total number of log files " + totalLogFiles.value());
@@ -222,11 +206,11 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
// compactions only // compactions only
HoodieCompactionPlan compactionPlan = config.getCompactionStrategy().generateCompactionPlan(config, operations, HoodieCompactionPlan compactionPlan = config.getCompactionStrategy().generateCompactionPlan(config, operations,
CompactionUtils.getAllPendingCompactionPlans(metaClient).stream().map(Pair::getValue).collect(toList())); CompactionUtils.getAllPendingCompactionPlans(metaClient).stream().map(Pair::getValue).collect(toList()));
Preconditions.checkArgument(compactionPlan.getOperations().stream().noneMatch( Preconditions.checkArgument(
compactionPlan.getOperations().stream().noneMatch(
op -> fgIdsInPendingCompactions.contains(new HoodieFileGroupId(op.getPartitionPath(), op.getFileId()))), op -> fgIdsInPendingCompactions.contains(new HoodieFileGroupId(op.getPartitionPath(), op.getFileId()))),
"Bad Compaction Plan. FileId MUST NOT have multiple pending compactions. " "Bad Compaction Plan. FileId MUST NOT have multiple pending compactions. "
+ "Please fix your strategy implementation." + "Please fix your strategy implementation." + "FileIdsWithPendingCompactions :" + fgIdsInPendingCompactions
+ "FileIdsWithPendingCompactions :" + fgIdsInPendingCompactions
+ ", Selected workload :" + compactionPlan); + ", Selected workload :" + compactionPlan);
if (compactionPlan.getOperations().isEmpty()) { if (compactionPlan.getOperations().isEmpty()) {
log.warn("After filtering, Nothing to compact for " + metaClient.getBasePath()); log.warn("After filtering, Nothing to compact for " + metaClient.getBasePath());
View File
@@ -25,8 +25,8 @@ import org.apache.hudi.avro.model.HoodieCompactionPlan;
import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.config.HoodieWriteConfig;
/** /**
* CompactionStrategy which looks at total IO to be done for the compaction (read + write) and * CompactionStrategy which looks at total IO to be done for the compaction (read + write) and limits the list of
* limits the list of compactions to be under a configured limit on the IO * compactions to be under a configured limit on the IO
* *
* @see CompactionStrategy * @see CompactionStrategy
*/ */
View File
@@ -32,11 +32,10 @@ import org.apache.hudi.config.HoodieWriteConfig;
/** /**
* This strategy ensures that the last N partitions are picked up even if there are later partitions created for the * This strategy ensures that the last N partitions are picked up even if there are later partitions created for the
* dataset. lastNPartitions is defined as the N partitions before the currentDate. * dataset. lastNPartitions is defined as the N partitions before the currentDate. currentDay = 2018/01/01 The dataset
* currentDay = 2018/01/01 * has partitions for 2018/02/02 and 2018/03/03 beyond the currentDay This strategy will pick up the following
* The dataset has partitions for 2018/02/02 and 2018/03/03 beyond the currentDay * partitions for compaction : (2018/01/01, allPartitionsInRange[(2018/01/01 - lastNPartitions) to 2018/01/01),
* This strategy will pick up the following partitions for compaction : * 2018/02/02, 2018/03/03)
* (2018/01/01, allPartitionsInRange[(2018/01/01 - lastNPartitions) to 2018/01/01), 2018/02/02, 2018/03/03)
*/ */
public class BoundedPartitionAwareCompactionStrategy extends DayBasedCompactionStrategy { public class BoundedPartitionAwareCompactionStrategy extends DayBasedCompactionStrategy {
@@ -46,15 +45,14 @@ public class BoundedPartitionAwareCompactionStrategy extends DayBasedCompactionS
public List<HoodieCompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig, public List<HoodieCompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig,
List<HoodieCompactionOperation> operations, List<HoodieCompactionPlan> pendingCompactionPlans) { List<HoodieCompactionOperation> operations, List<HoodieCompactionPlan> pendingCompactionPlans) {
// The earliest partition to compact - current day minus the target partitions limit // The earliest partition to compact - current day minus the target partitions limit
String earliestPartitionPathToCompact = dateFormat.format( String earliestPartitionPathToCompact =
getDateAtOffsetFromToday(-1 * writeConfig.getTargetPartitionsPerDayBasedCompaction())); dateFormat.format(getDateAtOffsetFromToday(-1 * writeConfig.getTargetPartitionsPerDayBasedCompaction()));
// Filter out all partitions greater than earliestPartitionPathToCompact // Filter out all partitions greater than earliestPartitionPathToCompact
List<HoodieCompactionOperation> eligibleCompactionOperations = operations.stream() List<HoodieCompactionOperation> eligibleCompactionOperations =
.collect(Collectors.groupingBy(HoodieCompactionOperation::getPartitionPath)).entrySet().stream() operations.stream().collect(Collectors.groupingBy(HoodieCompactionOperation::getPartitionPath)).entrySet()
.sorted(Map.Entry.comparingByKey(comparator)) .stream().sorted(Map.Entry.comparingByKey(comparator))
.filter(e -> comparator.compare(earliestPartitionPathToCompact, e.getKey()) >= 0) .filter(e -> comparator.compare(earliestPartitionPathToCompact, e.getKey()) >= 0)
.flatMap(e -> e.getValue().stream()) .flatMap(e -> e.getValue().stream()).collect(Collectors.toList());
.collect(Collectors.toList());
return eligibleCompactionOperations; return eligibleCompactionOperations;
} }
@@ -62,13 +60,12 @@ public class BoundedPartitionAwareCompactionStrategy extends DayBasedCompactionS
@Override @Override
public List<String> filterPartitionPaths(HoodieWriteConfig writeConfig, List<String> partitionPaths) { public List<String> filterPartitionPaths(HoodieWriteConfig writeConfig, List<String> partitionPaths) {
// The earliest partition to compact - current day minus the target partitions limit // The earliest partition to compact - current day minus the target partitions limit
String earliestPartitionPathToCompact = dateFormat.format( String earliestPartitionPathToCompact =
getDateAtOffsetFromToday(-1 * writeConfig.getTargetPartitionsPerDayBasedCompaction())); dateFormat.format(getDateAtOffsetFromToday(-1 * writeConfig.getTargetPartitionsPerDayBasedCompaction()));
// Get all partitions and sort them // Get all partitions and sort them
List<String> filteredPartitionPaths = partitionPaths.stream().map(partition -> partition.replace("/", "-")) List<String> filteredPartitionPaths = partitionPaths.stream().map(partition -> partition.replace("/", "-"))
.sorted(Comparator.reverseOrder()).map(partitionPath -> partitionPath.replace("-", "/")) .sorted(Comparator.reverseOrder()).map(partitionPath -> partitionPath.replace("-", "/"))
.filter(e -> comparator.compare(earliestPartitionPathToCompact, e) >= 0) .filter(e -> comparator.compare(earliestPartitionPathToCompact, e) >= 0).collect(Collectors.toList());
.collect(Collectors.toList());
return filteredPartitionPaths; return filteredPartitionPaths;
} }
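The two hunks above only re-wrap the partition-window logic of BoundedPartitionAwareCompactionStrategy without changing behavior. For readers skimming the diff, here is a minimal standalone sketch of that logic; the class name, sample partitions, and hard-coded cutoff date are illustrative stand-ins, not part of the commit.

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.stream.Collectors;

public class PartitionWindowSketch {

  // Same "yyyy/MM/dd" partition layout used by the day-based strategies in this diff.
  private static final String DATE_PARTITION_FORMAT = "yyyy/MM/dd";

  // Newest partition sorts first; parse failures surface as runtime exceptions, as in the hunk above.
  private static final Comparator<String> NEWEST_FIRST = (left, right) -> {
    try {
      Date l = new SimpleDateFormat(DATE_PARTITION_FORMAT, Locale.ENGLISH).parse(left);
      Date r = new SimpleDateFormat(DATE_PARTITION_FORMAT, Locale.ENGLISH).parse(right);
      return l.after(r) ? -1 : r.after(l) ? 1 : 0;
    } catch (ParseException e) {
      throw new IllegalArgumentException("Invalid partition date format", e);
    }
  };

  public static void main(String[] args) {
    // Stand-in for "today minus getTargetPartitionsPerDayBasedCompaction() days".
    String earliestPartitionToCompact = "2018/03/01";
    List<String> partitions = Arrays.asList("2018/01/01", "2018/02/15", "2018/03/01", "2018/04/01");

    // Keep only partitions at or after the cutoff, newest first (mirrors the filter in the hunk).
    List<String> eligible = partitions.stream()
        .sorted(NEWEST_FIRST)
        .filter(p -> NEWEST_FIRST.compare(earliestPartitionToCompact, p) >= 0)
        .collect(Collectors.toList());

    System.out.println("Eligible for compaction: " + eligible); // [2018/04/01, 2018/03/01]
  }
}

Running it prints the partitions at or after the cutoff, newest first, which is the same ordering and filtering the re-wrapped stream pipeline performs.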
View File
@@ -32,11 +32,10 @@ import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.io.compact.HoodieRealtimeTableCompactor; import org.apache.hudi.io.compact.HoodieRealtimeTableCompactor;
/** /**
* Strategy for compaction. Pluggable implementation to define how compaction should be done. The * Strategy for compaction. Pluggable implementation to define how compaction should be done. The over-ridden
* over-ridden implementations of this abstract class can capture the relevant metrics to order * implementations of this abstract class can capture the relevant metrics to order and filter the final list of
* and filter the final list of compaction operation to run in a single compaction. * compaction operation to run in a single compaction. Implementation of CompactionStrategy cannot hold any state.
* Implementation of CompactionStrategy cannot hold any state. Difference instantiations can be * Difference instantiations can be passed in every time
* passed in every time
* *
* @see HoodieRealtimeTableCompactor * @see HoodieRealtimeTableCompactor
*/ */
@@ -49,8 +48,8 @@ public abstract class CompactionStrategy implements Serializable {
public static final String TOTAL_LOG_FILES = "TOTAL_LOG_FILES"; public static final String TOTAL_LOG_FILES = "TOTAL_LOG_FILES";
/** /**
* Callback hook when a HoodieCompactionOperation is created. Individual strategies can capture the * Callback hook when a HoodieCompactionOperation is created. Individual strategies can capture the metrics they need
* metrics they need to decide on the priority. * to decide on the priority.
* *
* @param dataFile - Base file to compact * @param dataFile - Base file to compact
* @param partitionPath - Partition path * @param partitionPath - Partition path
@@ -65,11 +64,11 @@ public abstract class CompactionStrategy implements Serializable {
Long totalLogFileSize = logFiles.stream().map(HoodieLogFile::getFileSize).filter(size -> size >= 0) Long totalLogFileSize = logFiles.stream().map(HoodieLogFile::getFileSize).filter(size -> size >= 0)
.reduce((size1, size2) -> size1 + size2).orElse(0L); .reduce((size1, size2) -> size1 + size2).orElse(0L);
// Total read will be the base file + all the log files // Total read will be the base file + all the log files
Long totalIORead = FSUtils.getSizeInMB((dataFile.isPresent() ? dataFile.get().getFileSize() : 0L) Long totalIORead =
+ totalLogFileSize); FSUtils.getSizeInMB((dataFile.isPresent() ? dataFile.get().getFileSize() : 0L) + totalLogFileSize);
// Total write will be similar to the size of the base file // Total write will be similar to the size of the base file
Long totalIOWrite = FSUtils Long totalIOWrite =
.getSizeInMB(dataFile.isPresent() ? dataFile.get().getFileSize() : defaultMaxParquetFileSize); FSUtils.getSizeInMB(dataFile.isPresent() ? dataFile.get().getFileSize() : defaultMaxParquetFileSize);
// Total IO will the the IO for read + write // Total IO will the the IO for read + write
Long totalIO = totalIORead + totalIOWrite; Long totalIO = totalIORead + totalIOWrite;
// Save these metrics and we will use during the filter // Save these metrics and we will use during the filter
@@ -95,8 +94,7 @@ public abstract class CompactionStrategy implements Serializable {
List<HoodieCompactionOperation> operations, List<HoodieCompactionPlan> pendingCompactionPlans) { List<HoodieCompactionOperation> operations, List<HoodieCompactionPlan> pendingCompactionPlans) {
// Strategy implementation can overload this method to set specific compactor-id // Strategy implementation can overload this method to set specific compactor-id
return HoodieCompactionPlan.newBuilder() return HoodieCompactionPlan.newBuilder()
.setOperations(orderAndFilter(writeConfig, operations, pendingCompactionPlans)) .setOperations(orderAndFilter(writeConfig, operations, pendingCompactionPlans)).build();
.build();
} }
/** /**
@@ -109,13 +107,13 @@ public abstract class CompactionStrategy implements Serializable {
* @return list of compactions to perform in this run * @return list of compactions to perform in this run
*/ */
public List<HoodieCompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig, public List<HoodieCompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig,
List<HoodieCompactionOperation> operations, List<HoodieCompactionOperation> operations, List<HoodieCompactionPlan> pendingCompactionPlans) {
List<HoodieCompactionPlan> pendingCompactionPlans) {
return operations; return operations;
} }
/** /**
* Filter the partition paths based on compaction strategy * Filter the partition paths based on compaction strategy
*
* @param writeConfig * @param writeConfig
* @param allPartitionPaths * @param allPartitionPaths
* @return * @return
View File
@@ -34,21 +34,18 @@ import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieException;
/** /**
* This strategy orders compactions in reverse order of creation of Hive Partitions. It helps to * This strategy orders compactions in reverse order of creation of Hive Partitions. It helps to compact data in latest
* compact data in latest partitions first and then older capped at the Total_IO allowed. * partitions first and then older capped at the Total_IO allowed.
*/ */
public class DayBasedCompactionStrategy extends CompactionStrategy { public class DayBasedCompactionStrategy extends CompactionStrategy {
// For now, use SimpleDateFormat as default partition format // For now, use SimpleDateFormat as default partition format
protected static String datePartitionFormat = "yyyy/MM/dd"; protected static String datePartitionFormat = "yyyy/MM/dd";
// Sorts compaction in LastInFirstCompacted order // Sorts compaction in LastInFirstCompacted order
protected static Comparator<String> comparator = (String leftPartition, protected static Comparator<String> comparator = (String leftPartition, String rightPartition) -> {
String rightPartition) -> {
try { try {
Date left = new SimpleDateFormat(datePartitionFormat, Locale.ENGLISH) Date left = new SimpleDateFormat(datePartitionFormat, Locale.ENGLISH).parse(leftPartition);
.parse(leftPartition); Date right = new SimpleDateFormat(datePartitionFormat, Locale.ENGLISH).parse(rightPartition);
Date right = new SimpleDateFormat(datePartitionFormat, Locale.ENGLISH)
.parse(rightPartition);
return left.after(right) ? -1 : right.after(left) ? 1 : 0; return left.after(right) ? -1 : right.after(left) ? 1 : 0;
} catch (ParseException e) { } catch (ParseException e) {
throw new HoodieException("Invalid Partition Date Format", e); throw new HoodieException("Invalid Partition Date Format", e);
@@ -68,8 +65,7 @@ public class DayBasedCompactionStrategy extends CompactionStrategy {
List<HoodieCompactionOperation> filteredList = operations.stream() List<HoodieCompactionOperation> filteredList = operations.stream()
.collect(Collectors.groupingBy(HoodieCompactionOperation::getPartitionPath)).entrySet().stream() .collect(Collectors.groupingBy(HoodieCompactionOperation::getPartitionPath)).entrySet().stream()
.sorted(Map.Entry.comparingByKey(comparator)).limit(writeConfig.getTargetPartitionsPerDayBasedCompaction()) .sorted(Map.Entry.comparingByKey(comparator)).limit(writeConfig.getTargetPartitionsPerDayBasedCompaction())
.flatMap(e -> e.getValue().stream()) .flatMap(e -> e.getValue().stream()).collect(Collectors.toList());
.collect(Collectors.toList());
return filteredList; return filteredList;
} }
View File
@@ -30,14 +30,14 @@ import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.config.HoodieWriteConfig;
/** /**
* LogFileSizeBasedCompactionStrategy orders the compactions based on the total log files size and * LogFileSizeBasedCompactionStrategy orders the compactions based on the total log files size and limits the
* limits the compactions within a configured IO bound * compactions within a configured IO bound
* *
* @see BoundedIOCompactionStrategy * @see BoundedIOCompactionStrategy
* @see CompactionStrategy * @see CompactionStrategy
*/ */
public class LogFileSizeBasedCompactionStrategy extends BoundedIOCompactionStrategy implements public class LogFileSizeBasedCompactionStrategy extends BoundedIOCompactionStrategy
Comparator<HoodieCompactionOperation> { implements Comparator<HoodieCompactionOperation> {
private static final String TOTAL_LOG_FILE_SIZE = "TOTAL_LOG_FILE_SIZE"; private static final String TOTAL_LOG_FILE_SIZE = "TOTAL_LOG_FILE_SIZE";
@@ -47,9 +47,8 @@ public class LogFileSizeBasedCompactionStrategy extends BoundedIOCompactionStrat
Map<String, Double> metrics = super.captureMetrics(config, dataFile, partitionPath, logFiles); Map<String, Double> metrics = super.captureMetrics(config, dataFile, partitionPath, logFiles);
// Total size of all the log files // Total size of all the log files
Long totalLogFileSize = logFiles.stream().map(HoodieLogFile::getFileSize) Long totalLogFileSize = logFiles.stream().map(HoodieLogFile::getFileSize).filter(size -> size >= 0)
.filter(size -> size >= 0).reduce((size1, size2) -> size1 + size2) .reduce((size1, size2) -> size1 + size2).orElse(0L);
.orElse(0L);
// save the metrics needed during the order // save the metrics needed during the order
metrics.put(TOTAL_LOG_FILE_SIZE, totalLogFileSize.doubleValue()); metrics.put(TOTAL_LOG_FILE_SIZE, totalLogFileSize.doubleValue());
return metrics; return metrics;
@@ -59,9 +58,8 @@ public class LogFileSizeBasedCompactionStrategy extends BoundedIOCompactionStrat
public List<HoodieCompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig, public List<HoodieCompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig,
List<HoodieCompactionOperation> operations, List<HoodieCompactionPlan> pendingCompactionPlans) { List<HoodieCompactionOperation> operations, List<HoodieCompactionPlan> pendingCompactionPlans) {
// Order the operations based on the reverse size of the logs and limit them by the IO // Order the operations based on the reverse size of the logs and limit them by the IO
return super return super.orderAndFilter(writeConfig, operations.stream().sorted(this).collect(Collectors.toList()),
.orderAndFilter(writeConfig, pendingCompactionPlans);
operations.stream().sorted(this).collect(Collectors.toList()), pendingCompactionPlans);
} }
@Override @Override
View File
@@ -24,9 +24,8 @@ import org.apache.hudi.avro.model.HoodieCompactionPlan;
import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.config.HoodieWriteConfig;
/** /**
* UnBoundedCompactionStrategy will not change ordering or filter any compaction. It is a * UnBoundedCompactionStrategy will not change ordering or filter any compaction. It is a pass-through and will compact
* pass-through and will compact all the base files which has a log file. This usually means * all the base files which has a log file. This usually means no-intelligence on compaction.
* no-intelligence on compaction.
* *
* @see CompactionStrategy * @see CompactionStrategy
*/ */
View File
@@ -27,12 +27,11 @@ import org.apache.hudi.avro.model.HoodieCompactionPlan;
import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.config.HoodieWriteConfig;
/** /**
* UnBoundedPartitionAwareCompactionStrategy is a custom UnBounded Strategy. * UnBoundedPartitionAwareCompactionStrategy is a custom UnBounded Strategy. This will filter all the partitions that
* This will filter all the partitions that are eligible to be compacted by a * are eligible to be compacted by a {@link BoundedPartitionAwareCompactionStrategy} and return the result. This is done
* {@link BoundedPartitionAwareCompactionStrategy} and return the result. * so that a long running UnBoundedPartitionAwareCompactionStrategy does not step over partitions in a shorter running
* This is done so that a long running UnBoundedPartitionAwareCompactionStrategy does not step over partitions * BoundedPartitionAwareCompactionStrategy. Essentially, this is an inverse of the partitions chosen in
* in a shorter running BoundedPartitionAwareCompactionStrategy. Essentially, this is an inverse of the * BoundedPartitionAwareCompactionStrategy
* partitions chosen in BoundedPartitionAwareCompactionStrategy
* *
* @see CompactionStrategy * @see CompactionStrategy
*/ */
@@ -41,10 +40,10 @@ public class UnBoundedPartitionAwareCompactionStrategy extends CompactionStrateg
@Override @Override
public List<HoodieCompactionOperation> orderAndFilter(HoodieWriteConfig config, public List<HoodieCompactionOperation> orderAndFilter(HoodieWriteConfig config,
final List<HoodieCompactionOperation> operations, final List<HoodieCompactionPlan> pendingCompactionWorkloads) { final List<HoodieCompactionOperation> operations, final List<HoodieCompactionPlan> pendingCompactionWorkloads) {
BoundedPartitionAwareCompactionStrategy boundedPartitionAwareCompactionStrategy BoundedPartitionAwareCompactionStrategy boundedPartitionAwareCompactionStrategy =
= new BoundedPartitionAwareCompactionStrategy(); new BoundedPartitionAwareCompactionStrategy();
List<HoodieCompactionOperation> operationsToExclude = boundedPartitionAwareCompactionStrategy List<HoodieCompactionOperation> operationsToExclude =
.orderAndFilter(config, operations, pendingCompactionWorkloads); boundedPartitionAwareCompactionStrategy.orderAndFilter(config, operations, pendingCompactionWorkloads);
List<HoodieCompactionOperation> allOperations = new ArrayList<>(operations); List<HoodieCompactionOperation> allOperations = new ArrayList<>(operations);
allOperations.removeAll(operationsToExclude); allOperations.removeAll(operationsToExclude);
return allOperations; return allOperations;
@@ -52,13 +51,13 @@ public class UnBoundedPartitionAwareCompactionStrategy extends CompactionStrateg
@Override @Override
public List<String> filterPartitionPaths(HoodieWriteConfig writeConfig, List<String> partitionPaths) { public List<String> filterPartitionPaths(HoodieWriteConfig writeConfig, List<String> partitionPaths) {
List<String> allPartitionPaths = partitionPaths.stream().map(partition -> partition.replace("/", "-")) List<String> allPartitionPaths =
.sorted(Comparator.reverseOrder()).map(partitionPath -> partitionPath.replace("-", "/")) partitionPaths.stream().map(partition -> partition.replace("/", "-")).sorted(Comparator.reverseOrder())
.collect(Collectors.toList()); .map(partitionPath -> partitionPath.replace("-", "/")).collect(Collectors.toList());
BoundedPartitionAwareCompactionStrategy boundedPartitionAwareCompactionStrategy BoundedPartitionAwareCompactionStrategy boundedPartitionAwareCompactionStrategy =
= new BoundedPartitionAwareCompactionStrategy(); new BoundedPartitionAwareCompactionStrategy();
List<String> partitionsToExclude = boundedPartitionAwareCompactionStrategy.filterPartitionPaths(writeConfig, List<String> partitionsToExclude =
partitionPaths); boundedPartitionAwareCompactionStrategy.filterPartitionPaths(writeConfig, partitionPaths);
allPartitionPaths.removeAll(partitionsToExclude); allPartitionPaths.removeAll(partitionsToExclude);
return allPartitionPaths; return allPartitionPaths;
} }
View File
@@ -32,9 +32,8 @@ public class HoodieParquetConfig {
private Configuration hadoopConf; private Configuration hadoopConf;
private double compressionRatio; private double compressionRatio;
public HoodieParquetConfig(HoodieAvroWriteSupport writeSupport, public HoodieParquetConfig(HoodieAvroWriteSupport writeSupport, CompressionCodecName compressionCodecName,
CompressionCodecName compressionCodecName, int blockSize, int pageSize, long maxFileSize, int blockSize, int pageSize, long maxFileSize, Configuration hadoopConf, double compressionRatio) {
Configuration hadoopConf, double compressionRatio) {
this.writeSupport = writeSupport; this.writeSupport = writeSupport;
this.compressionCodecName = compressionCodecName; this.compressionCodecName = compressionCodecName;
this.blockSize = blockSize; this.blockSize = blockSize;
View File
@@ -36,11 +36,11 @@ import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.spark.TaskContext; import org.apache.spark.TaskContext;
/** /**
* HoodieParquetWriter extends the ParquetWriter to help limit the size of underlying file. Provides * HoodieParquetWriter extends the ParquetWriter to help limit the size of underlying file. Provides a way to check if
* a way to check if the current file can take more records with the <code>canWrite()</code> * the current file can take more records with the <code>canWrite()</code>
*/ */
public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends IndexedRecord> extends public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends IndexedRecord>
ParquetWriter<IndexedRecord> implements HoodieStorageWriter<R> { extends ParquetWriter<IndexedRecord> implements HoodieStorageWriter<R> {
private static AtomicLong recordIndex = new AtomicLong(1); private static AtomicLong recordIndex = new AtomicLong(1);
@@ -52,24 +52,22 @@ public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends Indexe
private final Schema schema; private final Schema schema;
public HoodieParquetWriter(String commitTime, Path file, HoodieParquetConfig parquetConfig, public HoodieParquetWriter(String commitTime, Path file, HoodieParquetConfig parquetConfig, Schema schema)
Schema schema) throws IOException { throws IOException {
super(HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()), super(HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()),
ParquetFileWriter.Mode.CREATE, parquetConfig.getWriteSupport(), ParquetFileWriter.Mode.CREATE, parquetConfig.getWriteSupport(), parquetConfig.getCompressionCodecName(),
parquetConfig.getCompressionCodecName(), parquetConfig.getBlockSize(), parquetConfig.getBlockSize(), parquetConfig.getPageSize(), parquetConfig.getPageSize(),
parquetConfig.getPageSize(), parquetConfig.getPageSize(),
ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED, ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED, ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED, ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED,
ParquetWriter.DEFAULT_WRITER_VERSION, ParquetWriter.DEFAULT_WRITER_VERSION, registerFileSystem(file, parquetConfig.getHadoopConf()));
registerFileSystem(file, parquetConfig.getHadoopConf()));
this.file = HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()); this.file = HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf());
this.fs = (HoodieWrapperFileSystem) this.file this.fs =
.getFileSystem(registerFileSystem(file, parquetConfig.getHadoopConf())); (HoodieWrapperFileSystem) this.file.getFileSystem(registerFileSystem(file, parquetConfig.getHadoopConf()));
// We cannot accurately measure the snappy compressed output file size. We are choosing a // We cannot accurately measure the snappy compressed output file size. We are choosing a
// conservative 10% // conservative 10%
// TODO - compute this compression ratio dynamically by looking at the bytes written to the // TODO - compute this compression ratio dynamically by looking at the bytes written to the
// stream and the actual file size reported by HDFS // stream and the actual file size reported by HDFS
this.maxFileSize = parquetConfig.getMaxFileSize() + Math this.maxFileSize = parquetConfig.getMaxFileSize()
.round(parquetConfig.getMaxFileSize() * parquetConfig.getCompressionRatio()); + Math.round(parquetConfig.getMaxFileSize() * parquetConfig.getCompressionRatio());
this.writeSupport = parquetConfig.getWriteSupport(); this.writeSupport = parquetConfig.getWriteSupport();
this.commitTime = commitTime; this.commitTime = commitTime;
this.schema = schema; this.schema = schema;
@@ -85,10 +83,10 @@ public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends Indexe
@Override @Override
public void writeAvroWithMetadata(R avroRecord, HoodieRecord record) throws IOException { public void writeAvroWithMetadata(R avroRecord, HoodieRecord record) throws IOException {
String seqId = HoodieRecord.generateSequenceId(commitTime, TaskContext.getPartitionId(), String seqId =
recordIndex.getAndIncrement()); HoodieRecord.generateSequenceId(commitTime, TaskContext.getPartitionId(), recordIndex.getAndIncrement());
HoodieAvroUtils.addHoodieKeyToRecord((GenericRecord) avroRecord, record.getRecordKey(), HoodieAvroUtils.addHoodieKeyToRecord((GenericRecord) avroRecord, record.getRecordKey(), record.getPartitionPath(),
record.getPartitionPath(), file.getName()); file.getName());
HoodieAvroUtils.addCommitMetadataToRecord((GenericRecord) avroRecord, commitTime, seqId); HoodieAvroUtils.addCommitMetadataToRecord((GenericRecord) avroRecord, commitTime, seqId);
super.write(avroRecord); super.write(avroRecord);
writeSupport.add(record.getRecordKey()); writeSupport.add(record.getRecordKey());
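The constructor hunk above keeps the same size-budget arithmetic while re-wrapping it: the configured max Parquet file size is padded by the compression-ratio headroom because the snappy-compressed output cannot be measured accurately mid-write. A tiny self-contained sketch of that arithmetic follows; all numbers and the counter variable are hypothetical, not taken from the commit.

public class ParquetSizeBudgetSketch {

  public static void main(String[] args) {
    long configuredMaxFileSize = 120L * 1024 * 1024; // hypothetical 120 MB target base-file size
    double compressionRatio = 0.1;                   // the conservative 10% headroom noted in the hunk

    // Effective budget = configured max + headroom for the unmeasurable compressed output.
    long effectiveMaxFileSize =
        configuredMaxFileSize + Math.round(configuredMaxFileSize * compressionRatio);

    long bytesWrittenSoFar = 100L * 1024 * 1024;     // hypothetical counter from the wrapper file system
    boolean canWrite = bytesWrittenSoFar < effectiveMaxFileSize;

    System.out.printf("budget=%d bytes, written=%d bytes, canWrite=%b%n",
        effectiveMaxFileSize, bytesWrittenSoFar, canWrite);
  }
}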
View File
@@ -36,8 +36,8 @@ import org.apache.parquet.avro.AvroSchemaConverter;
public class HoodieStorageWriterFactory { public class HoodieStorageWriterFactory {
public static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieStorageWriter<R> getStorageWriter( public static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieStorageWriter<R> getStorageWriter(
String commitTime, Path path, HoodieTable<T> hoodieTable, String commitTime, Path path, HoodieTable<T> hoodieTable, HoodieWriteConfig config, Schema schema)
HoodieWriteConfig config, Schema schema) throws IOException { throws IOException {
final String name = path.getName(); final String name = path.getName();
final String extension = FSUtils.isLogFile(path) ? HOODIE_LOG.getFileExtension() : FSUtils.getFileExtension(name); final String extension = FSUtils.isLogFile(path) ? HOODIE_LOG.getFileExtension() : FSUtils.getFileExtension(name);
if (PARQUET.getFileExtension().equals(extension)) { if (PARQUET.getFileExtension().equals(extension)) {
@@ -46,19 +46,16 @@ public class HoodieStorageWriterFactory {
throw new UnsupportedOperationException(extension + " format not supported yet."); throw new UnsupportedOperationException(extension + " format not supported yet.");
} }
private static <T extends HoodieRecordPayload, private static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieStorageWriter<R> newParquetStorageWriter(
R extends IndexedRecord> HoodieStorageWriter<R> newParquetStorageWriter(String commitTime, Path path, String commitTime, Path path, HoodieWriteConfig config, Schema schema, HoodieTable hoodieTable)
HoodieWriteConfig config, Schema schema, HoodieTable hoodieTable) throws IOException { throws IOException {
BloomFilter filter = new BloomFilter(config.getBloomFilterNumEntries(), BloomFilter filter = new BloomFilter(config.getBloomFilterNumEntries(), config.getBloomFilterFPP());
config.getBloomFilterFPP()); HoodieAvroWriteSupport writeSupport =
HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport( new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter);
new AvroSchemaConverter().convert(schema), schema, filter);
HoodieParquetConfig parquetConfig = HoodieParquetConfig parquetConfig = new HoodieParquetConfig(writeSupport, config.getParquetCompressionCodec(),
new HoodieParquetConfig(writeSupport, config.getParquetCompressionCodec(), config.getParquetBlockSize(), config.getParquetPageSize(), config.getParquetMaxFileSize(),
config.getParquetBlockSize(), config.getParquetPageSize(), hoodieTable.getHadoopConf(), config.getParquetCompressionRatio());
config.getParquetMaxFileSize(), hoodieTable.getHadoopConf(),
config.getParquetCompressionRatio());
return new HoodieParquetWriter<>(commitTime, path, parquetConfig, schema); return new HoodieParquetWriter<>(commitTime, path, parquetConfig, schema);
} }
View File
@@ -118,8 +118,8 @@ public class HoodieMetrics {
return indexTimer == null ? null : indexTimer.time(); return indexTimer == null ? null : indexTimer.time();
} }
public void updateCommitMetrics(long commitEpochTimeInMs, long durationInMs, public void updateCommitMetrics(long commitEpochTimeInMs, long durationInMs, HoodieCommitMetadata metadata,
HoodieCommitMetadata metadata, String actionType) { String actionType) {
if (config.isMetricsOn()) { if (config.isMetricsOn()) {
long totalPartitionsWritten = metadata.fetchTotalPartitionsWritten(); long totalPartitionsWritten = metadata.fetchTotalPartitionsWritten();
long totalFilesInsert = metadata.fetchTotalFilesInsert(); long totalFilesInsert = metadata.fetchTotalFilesInsert();
@@ -154,9 +154,8 @@ public class HoodieMetrics {
public void updateRollbackMetrics(long durationInMs, long numFilesDeleted) { public void updateRollbackMetrics(long durationInMs, long numFilesDeleted) {
if (config.isMetricsOn()) { if (config.isMetricsOn()) {
logger.info(String logger.info(
.format("Sending rollback metrics (duration=%d, numFilesDeleted=%d)", durationInMs, String.format("Sending rollback metrics (duration=%d, numFilesDeleted=%d)", durationInMs, numFilesDeleted));
numFilesDeleted));
Metrics.registerGauge(getMetricsName("rollback", "duration"), durationInMs); Metrics.registerGauge(getMetricsName("rollback", "duration"), durationInMs);
Metrics.registerGauge(getMetricsName("rollback", "numFilesDeleted"), numFilesDeleted); Metrics.registerGauge(getMetricsName("rollback", "numFilesDeleted"), numFilesDeleted);
} }
@@ -164,9 +163,8 @@ public class HoodieMetrics {
public void updateCleanMetrics(long durationInMs, int numFilesDeleted) { public void updateCleanMetrics(long durationInMs, int numFilesDeleted) {
if (config.isMetricsOn()) { if (config.isMetricsOn()) {
logger.info(String logger.info(
.format("Sending clean metrics (duration=%d, numFilesDeleted=%d)", durationInMs, String.format("Sending clean metrics (duration=%d, numFilesDeleted=%d)", durationInMs, numFilesDeleted));
numFilesDeleted));
Metrics.registerGauge(getMetricsName("clean", "duration"), durationInMs); Metrics.registerGauge(getMetricsName("clean", "duration"), durationInMs);
Metrics.registerGauge(getMetricsName("clean", "numFilesDeleted"), numFilesDeleted); Metrics.registerGauge(getMetricsName("clean", "numFilesDeleted"), numFilesDeleted);
} }
@@ -174,9 +172,8 @@ public class HoodieMetrics {
public void updateFinalizeWriteMetrics(long durationInMs, long numFilesFinalized) { public void updateFinalizeWriteMetrics(long durationInMs, long numFilesFinalized) {
if (config.isMetricsOn()) { if (config.isMetricsOn()) {
logger.info(String logger.info(String.format("Sending finalize write metrics (duration=%d, numFilesFinalized=%d)", durationInMs,
.format("Sending finalize write metrics (duration=%d, numFilesFinalized=%d)", numFilesFinalized));
durationInMs, numFilesFinalized));
Metrics.registerGauge(getMetricsName("finalize", "duration"), durationInMs); Metrics.registerGauge(getMetricsName("finalize", "duration"), durationInMs);
Metrics.registerGauge(getMetricsName("finalize", "numFilesFinalized"), numFilesFinalized); Metrics.registerGauge(getMetricsName("finalize", "numFilesFinalized"), numFilesFinalized);
} }
@@ -184,10 +181,8 @@ public class HoodieMetrics {
public void updateIndexMetrics(final String action, final long durationInMs) { public void updateIndexMetrics(final String action, final long durationInMs) {
if (config.isMetricsOn()) { if (config.isMetricsOn()) {
logger.info(String logger.info(String.format("Sending index metrics (%s.duration, %d)", action, durationInMs));
.format("Sending index metrics (%s.duration, %d)",action, durationInMs)); Metrics.registerGauge(getMetricsName("index", String.format("%s.duration", action)), durationInMs);
Metrics.registerGauge(getMetricsName("index", String.format("%s.duration", action)),
durationInMs);
} }
} }
View File
@@ -26,12 +26,10 @@ import java.io.Closeable;
public class InMemoryMetricsReporter extends MetricsReporter { public class InMemoryMetricsReporter extends MetricsReporter {
@Override @Override
public void start() { public void start() {}
}
@Override @Override
public void report() { public void report() {}
}
@Override @Override
public Closeable getReporter() { public Closeable getReporter() {
View File
@@ -30,8 +30,7 @@ import org.apache.log4j.LogManager;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
/** /**
* Implementation of Graphite reporter, which connects to the Graphite server, and send metrics to * Implementation of Graphite reporter, which connects to the Graphite server, and send metrics to that server.
* that server.
*/ */
public class MetricsGraphiteReporter extends MetricsReporter { public class MetricsGraphiteReporter extends MetricsReporter {
@@ -50,8 +49,7 @@ public class MetricsGraphiteReporter extends MetricsReporter {
this.serverHost = config.getGraphiteServerHost(); this.serverHost = config.getGraphiteServerHost();
this.serverPort = config.getGraphiteServerPort(); this.serverPort = config.getGraphiteServerPort();
if (serverHost == null || serverPort == 0) { if (serverHost == null || serverPort == 0) {
throw new RuntimeException(String throw new RuntimeException(String.format("Graphite cannot be initialized with serverHost[%s] and serverPort[%s].",
.format("Graphite cannot be initialized with serverHost[%s] and serverPort[%s].",
serverHost, serverPort)); serverHost, serverPort));
} }
@@ -84,8 +82,7 @@ public class MetricsGraphiteReporter extends MetricsReporter {
private GraphiteReporter createGraphiteReport() { private GraphiteReporter createGraphiteReport() {
Graphite graphite = new Graphite(new InetSocketAddress(serverHost, serverPort)); Graphite graphite = new Graphite(new InetSocketAddress(serverHost, serverPort));
String reporterPrefix = config.getGraphiteMetricPrefix(); String reporterPrefix = config.getGraphiteMetricPrefix();
return GraphiteReporter.forRegistry(registry).prefixedWith(reporterPrefix) return GraphiteReporter.forRegistry(registry).prefixedWith(reporterPrefix).convertRatesTo(TimeUnit.SECONDS)
.convertRatesTo(TimeUnit.SECONDS).convertDurationsTo(TimeUnit.MILLISECONDS) .convertDurationsTo(TimeUnit.MILLISECONDS).filter(MetricFilter.ALL).build(graphite);
.filter(MetricFilter.ALL).build(graphite);
} }
} }
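The reporter construction above is only re-wrapped by this commit; it is the standard Dropwizard Metrics Graphite builder chain. A self-contained sketch of that chain follows; the registry contents, host/port, prefix, and reporting interval are illustrative assumptions, not values from Hudi's configuration.

import com.codahale.metrics.MetricFilter;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.graphite.Graphite;
import com.codahale.metrics.graphite.GraphiteReporter;

import java.net.InetSocketAddress;
import java.util.concurrent.TimeUnit;

public class GraphiteReporterSketch {

  public static void main(String[] args) {
    MetricRegistry registry = new MetricRegistry();
    registry.counter("commits").inc(); // example metric so the reporter has something to ship

    // Same builder chain the hunk re-wraps: rates in seconds, durations in milliseconds, no filtering.
    Graphite graphite = new Graphite(new InetSocketAddress("localhost", 2003));
    GraphiteReporter reporter = GraphiteReporter.forRegistry(registry)
        .prefixedWith("hoodie.sketch")
        .convertRatesTo(TimeUnit.SECONDS)
        .convertDurationsTo(TimeUnit.MILLISECONDS)
        .filter(MetricFilter.ALL)
        .build(graphite);

    reporter.start(30, TimeUnit.SECONDS); // report every 30 seconds
  }
}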
View File
@@ -19,8 +19,7 @@
package org.apache.hudi.metrics; package org.apache.hudi.metrics;
/** /**
* Types of the reporter. Right now we only support Graphite. We can include JMX and CSV in the * Types of the reporter. Right now we only support Graphite. We can include JMX and CSV in the future.
* future.
*/ */
public enum MetricsReporterType { public enum MetricsReporterType {
GRAPHITE, INMEMORY GRAPHITE, INMEMORY
View File
@@ -82,8 +82,7 @@ import scala.Tuple2;
 /**
  * Implementation of a very heavily read-optimized Hoodie Table where
  * <p>
- * INSERTS - Produce new files, block aligned to desired size (or) Merge with the smallest existing
- * file, to expand it
+ * INSERTS - Produce new files, block aligned to desired size (or) Merge with the smallest existing file, to expand it
  * <p>
  * UPDATES - Produce a new version of the file, just replacing the updated records with new values
  */
@@ -95,11 +94,9 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
     super(config, jsc);
   }
-  private static PairFlatMapFunction<Iterator<Tuple2<String, String>>, String,
-      PartitionCleanStat> deleteFilesFunc(
+  private static PairFlatMapFunction<Iterator<Tuple2<String, String>>, String, PartitionCleanStat> deleteFilesFunc(
       HoodieTable table) {
-    return (PairFlatMapFunction<Iterator<Tuple2<String, String>>, String, PartitionCleanStat>)
-        iter -> {
+    return (PairFlatMapFunction<Iterator<Tuple2<String, String>>, String, PartitionCleanStat>) iter -> {
       Map<String, PartitionCleanStat> partitionCleanStatMap = new HashMap<>();
       FileSystem fs = table.getMetaClient().getFs();
@@ -116,8 +113,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
         partitionCleanStat.addDeletedFileResult(deletePathStr, deletedFileResult);
       }
-      return partitionCleanStatMap.entrySet().stream()
-          .map(e -> new Tuple2<>(e.getKey(), e.getValue()))
+      return partitionCleanStatMap.entrySet().stream().map(e -> new Tuple2<>(e.getKey(), e.getValue()))
           .collect(Collectors.toList()).iterator();
     };
   }
@@ -131,8 +127,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
     };
   }
-  private static Boolean deleteFileAndGetResult(FileSystem fs, String deletePathStr)
-      throws IOException {
+  private static Boolean deleteFileAndGetResult(FileSystem fs, String deletePathStr) throws IOException {
     Path deletePath = new Path(deletePathStr);
     logger.debug("Working on delete path :" + deletePath);
     boolean deleteResult = fs.delete(deletePath, false);
@@ -171,8 +166,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
     throw new HoodieNotSupportedException("Compaction is not supported from a CopyOnWrite table");
   }
-  public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileId,
-      Iterator<HoodieRecord<T>> recordItr) throws IOException {
+  public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileId, Iterator<HoodieRecord<T>> recordItr)
+      throws IOException {
     // This is needed since sometimes some buckets are never picked in getPartition() and end up with 0 records
     if (!recordItr.hasNext()) {
       logger.info("Empty partition with fileId => " + fileId);
@@ -190,17 +185,16 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
     return handleUpdateInternal(upsertHandle, commitTime, fileId);
   }
-  protected Iterator<List<WriteStatus>> handleUpdateInternal(HoodieMergeHandle upsertHandle,
-      String commitTime, String fileId)
-      throws IOException {
+  protected Iterator<List<WriteStatus>> handleUpdateInternal(HoodieMergeHandle upsertHandle, String commitTime,
+      String fileId) throws IOException {
     if (upsertHandle.getOldFilePath() == null) {
       throw new HoodieUpsertException(
           "Error in finding the old file path at commit " + commitTime + " for fileId: " + fileId);
     } else {
       AvroReadSupport.setAvroReadSchema(getHadoopConf(), upsertHandle.getWriterSchema());
       BoundedInMemoryExecutor<GenericRecord, GenericRecord, Void> wrapper = null;
-      try (ParquetReader<IndexedRecord> reader = AvroParquetReader.<IndexedRecord>builder(upsertHandle.getOldFilePath())
-          .withConf(getHadoopConf()).build()) {
+      try (ParquetReader<IndexedRecord> reader =
+          AvroParquetReader.<IndexedRecord>builder(upsertHandle.getOldFilePath()).withConf(getHadoopConf()).build()) {
         wrapper = new SparkBoundedInMemoryExecutor(config, new ParquetReaderIterator(reader),
             new UpdateHandler(upsertHandle), x -> x);
         wrapper.execute();
@@ -216,15 +210,13 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
       // TODO(vc): This needs to be revisited
       if (upsertHandle.getWriteStatus().getPartitionPath() == null) {
-        logger.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath()
-            + ", " + upsertHandle.getWriteStatus());
+        logger.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath() + ", "
+            + upsertHandle.getWriteStatus());
       }
-      return Collections.singletonList(Collections.singletonList(upsertHandle.getWriteStatus()))
-          .iterator();
+      return Collections.singletonList(Collections.singletonList(upsertHandle.getWriteStatus())).iterator();
     }
-  protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileId,
-      Iterator<HoodieRecord<T>> recordItr) {
+  protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileId, Iterator<HoodieRecord<T>> recordItr) {
     return new HoodieMergeHandle<>(config, commitTime, this, recordItr, fileId);
   }
@@ -233,8 +225,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
     return new HoodieMergeHandle<>(config, commitTime, this, keyToNewRecords, fileId, dataFileToBeMerged);
   }
-  public Iterator<List<WriteStatus>> handleInsert(String commitTime, String idPfx,
-      Iterator<HoodieRecord<T>> recordItr) throws Exception {
+  public Iterator<List<WriteStatus>> handleInsert(String commitTime, String idPfx, Iterator<HoodieRecord<T>> recordItr)
+      throws Exception {
     // This is needed since sometimes some buckets are never picked in getPartition() and end up with 0 records
     if (!recordItr.hasNext()) {
       logger.info("Empty partition");
@@ -245,16 +237,16 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
   public Iterator<List<WriteStatus>> handleInsert(String commitTime, String partitionPath, String fileId,
       Iterator<HoodieRecord<T>> recordItr) {
-    HoodieCreateHandle createHandle = new HoodieCreateHandle(config, commitTime, this, partitionPath, fileId,
-        recordItr);
+    HoodieCreateHandle createHandle =
+        new HoodieCreateHandle(config, commitTime, this, partitionPath, fileId, recordItr);
     createHandle.write();
     return Collections.singletonList(Collections.singletonList(createHandle.close())).iterator();
   }
   @SuppressWarnings("unchecked")
   @Override
-  public Iterator<List<WriteStatus>> handleUpsertPartition(String commitTime, Integer partition,
-      Iterator recordItr, Partitioner partitioner) {
+  public Iterator<List<WriteStatus>> handleUpsertPartition(String commitTime, Integer partition, Iterator recordItr,
+      Partitioner partitioner) {
     UpsertPartitioner upsertPartitioner = (UpsertPartitioner) partitioner;
     BucketInfo binfo = upsertPartitioner.getBucketInfo(partition);
     BucketType btype = binfo.bucketType;
@@ -264,8 +256,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
       } else if (btype.equals(BucketType.UPDATE)) {
         return handleUpdate(commitTime, binfo.fileIdPrefix, recordItr);
       } else {
-        throw new HoodieUpsertException(
-            "Unknown bucketType " + btype + " for partition :" + partition);
+        throw new HoodieUpsertException("Unknown bucketType " + btype + " for partition :" + partition);
       }
     } catch (Throwable t) {
       String msg = "Error upserting bucketType " + btype + " for partition :" + partition;
@@ -275,15 +266,14 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
   }
   @Override
-  public Iterator<List<WriteStatus>> handleInsertPartition(String commitTime, Integer partition,
-      Iterator recordItr, Partitioner partitioner) {
+  public Iterator<List<WriteStatus>> handleInsertPartition(String commitTime, Integer partition, Iterator recordItr,
+      Partitioner partitioner) {
     return handleUpsertPartition(commitTime, partition, recordItr, partitioner);
   }
   /**
-   * Performs cleaning of partition paths according to cleaning policy and returns the number of
-   * files cleaned. Handles skews in partitions to clean by making files to clean as the unit of
-   * task distribution.
+   * Performs cleaning of partition paths according to cleaning policy and returns the number of files cleaned. Handles
+   * skews in partitions to clean by making files to clean as the unit of task distribution.
    *
    * @throws IllegalArgumentException if unknown cleaning policy is provided
    */
@@ -291,11 +281,9 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
   public List<HoodieCleanStat> clean(JavaSparkContext jsc) {
     try {
       FileSystem fs = getMetaClient().getFs();
-      List<String> partitionsToClean = FSUtils
-          .getAllPartitionPaths(fs, getMetaClient().getBasePath(),
-              config.shouldAssumeDatePartitioning());
-      logger.info("Partitions to clean up : " + partitionsToClean + ", with policy " + config
-          .getCleanerPolicy());
+      List<String> partitionsToClean =
+          FSUtils.getAllPartitionPaths(fs, getMetaClient().getBasePath(), config.shouldAssumeDatePartitioning());
+      logger.info("Partitions to clean up : " + partitionsToClean + ", with policy " + config.getCleanerPolicy());
       if (partitionsToClean.isEmpty()) {
         logger.info("Nothing to clean here mom. It is already clean");
         return Collections.emptyList();
@@ -307,12 +295,10 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
   }
   /**
-   * Common method used for cleaning out parquet files under a partition path during rollback of a
-   * set of commits
+   * Common method used for cleaning out parquet files under a partition path during rollback of a set of commits
    */
   protected Map<FileStatus, Boolean> deleteCleanedFiles(Map<FileStatus, Boolean> results, String partitionPath,
-      PathFilter filter)
-      throws IOException {
+      PathFilter filter) throws IOException {
     logger.info("Cleaning path " + partitionPath);
     FileSystem fs = getMetaClient().getFs();
     FileStatus[] toBeDeleted = fs.listStatus(FSUtils.getPartitionPath(config.getBasePath(), partitionPath), filter);
@@ -325,12 +311,10 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
   }
   /**
-   * Common method used for cleaning out parquet files under a partition path during rollback of a
-   * set of commits
+   * Common method used for cleaning out parquet files under a partition path during rollback of a set of commits
    */
-  protected Map<FileStatus, Boolean> deleteCleanedFiles(Map<FileStatus, Boolean> results, String commit, String
-      partitionPath)
-      throws IOException {
+  protected Map<FileStatus, Boolean> deleteCleanedFiles(Map<FileStatus, Boolean> results, String commit,
+      String partitionPath) throws IOException {
     logger.info("Cleaning path " + partitionPath);
     FileSystem fs = getMetaClient().getFs();
     PathFilter filter = (path) -> {
@@ -354,8 +338,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
       throws IOException {
     String actionType = metaClient.getCommitActionType();
     HoodieActiveTimeline activeTimeline = this.getActiveTimeline();
-    List<String> inflights = this.getInflightCommitTimeline().getInstants()
-        .map(HoodieInstant::getTimestamp).collect(Collectors.toList());
+    List<String> inflights =
+        this.getInflightCommitTimeline().getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList());
     // Atomically unpublish the commits
     if (!inflights.contains(commit)) {
       activeTimeline.revertToInflight(new HoodieInstant(false, actionType, commit));
@@ -364,10 +348,9 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
     // delete all the data files for this commit
     logger.info("Clean out all parquet files generated for commit: " + commit);
-    List<HoodieRollbackStat> stats = jsc.parallelize(FSUtils
-        .getAllPartitionPaths(metaClient.getFs(), getMetaClient().getBasePath(),
-            config.shouldAssumeDatePartitioning()))
-        .map((Function<String, HoodieRollbackStat>) partitionPath -> {
+    List<HoodieRollbackStat> stats =
+        jsc.parallelize(FSUtils.getAllPartitionPaths(metaClient.getFs(), getMetaClient().getBasePath(),
+            config.shouldAssumeDatePartitioning())).map((Function<String, HoodieRollbackStat>) partitionPath -> {
           // Scan all partitions files with this commit time
           final Map<FileStatus, Boolean> filesToDeletedStatus = new HashMap<>();
           deleteCleanedFiles(filesToDeletedStatus, commit, partitionPath);
@@ -376,13 +359,13 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
         }).collect();
     // Delete Inflight instant if enabled
-    deleteInflightInstant(deleteInstants, activeTimeline,
-        new HoodieInstant(true, actionType, commit));
+    deleteInflightInstant(deleteInstants, activeTimeline, new HoodieInstant(true, actionType, commit));
     return stats;
   }
   /**
    * Delete Inflight instant if enabled
+   *
    * @param deleteInstant Enable Deletion of Inflight instant
    * @param activeTimeline Hoodie active timeline
    * @param instantToBeDeleted Instant to be deleted
@@ -401,30 +384,27 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
     }
   }
-  private List<HoodieCleanStat> cleanPartitionPaths(List<String> partitionsToClean,
-      JavaSparkContext jsc) {
+  private List<HoodieCleanStat> cleanPartitionPaths(List<String> partitionsToClean, JavaSparkContext jsc) {
     int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism());
     logger.info("Using cleanerParallelism: " + cleanerParallelism);
     List<Tuple2<String, PartitionCleanStat>> partitionCleanStats = jsc
-        .parallelize(partitionsToClean, cleanerParallelism)
-        .flatMapToPair(getFilesToDeleteFunc(this, config))
+        .parallelize(partitionsToClean, cleanerParallelism).flatMapToPair(getFilesToDeleteFunc(this, config))
         .repartition(cleanerParallelism) // repartition to remove skews
         .mapPartitionsToPair(deleteFilesFunc(this)).reduceByKey(
             // merge partition level clean stats below
-            (Function2<PartitionCleanStat, PartitionCleanStat, PartitionCleanStat>) (e1, e2) -> e1
-                .merge(e2)).collect();
-    Map<String, PartitionCleanStat> partitionCleanStatsMap = partitionCleanStats.stream()
-        .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2));
+            (Function2<PartitionCleanStat, PartitionCleanStat, PartitionCleanStat>) (e1, e2) -> e1.merge(e2))
+        .collect();
+    Map<String, PartitionCleanStat> partitionCleanStatsMap =
+        partitionCleanStats.stream().collect(Collectors.toMap(Tuple2::_1, Tuple2::_2));
     HoodieCleanHelper cleaner = new HoodieCleanHelper(this, config);
     // Return PartitionCleanStat for each partition passed.
     return partitionsToClean.stream().map(partitionPath -> {
       PartitionCleanStat partitionCleanStat =
-          (partitionCleanStatsMap.containsKey(partitionPath)) ? partitionCleanStatsMap
-              .get(partitionPath) : new PartitionCleanStat(partitionPath);
-      return HoodieCleanStat.newBuilder().withPolicy(config.getCleanerPolicy())
-          .withPartitionPath(partitionPath)
+          (partitionCleanStatsMap.containsKey(partitionPath)) ? partitionCleanStatsMap.get(partitionPath)
+              : new PartitionCleanStat(partitionPath);
+      return HoodieCleanStat.newBuilder().withPolicy(config.getCleanerPolicy()).withPartitionPath(partitionPath)
          .withEarliestCommitRetained(cleaner.getEarliestCommitToRetain())
          .withDeletePathPattern(partitionCleanStat.deletePathPatterns)
          .withSuccessfulDeletes(partitionCleanStat.successDeleteFiles)
@@ -453,8 +433,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
     }
     @Override
-    protected void finish() {
-    }
+    protected void finish() {}
     @Override
     protected Void getResult() {
@@ -487,8 +466,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
     private PartitionCleanStat merge(PartitionCleanStat other) {
       if (!this.partitionPath.equals(other.partitionPath)) {
-        throw new RuntimeException(String
-            .format("partitionPath is not a match: (%s, %s)", partitionPath, other.partitionPath));
+        throw new RuntimeException(
+            String.format("partitionPath is not a match: (%s, %s)", partitionPath, other.partitionPath));
       }
       successDeleteFiles.addAll(other.successDeleteFiles);
       deletePathPatterns.addAll(other.deletePathPatterns);
@@ -516,8 +495,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
   }
   /**
-   * Helper class for an insert bucket along with the weight [0.0, 0.1] that defines the amount of
-   * incoming inserts that should be allocated to the bucket
+   * Helper class for an insert bucket along with the weight [0.0, 0.1] that defines the amount of incoming inserts that
+   * should be allocated to the bucket
    */
   class InsertBucket implements Serializable {
@@ -563,8 +542,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
      */
     List<SmallFile> smallFiles = new ArrayList<SmallFile>();
     /**
-     * Total number of RDD partitions, is determined by total buckets we want to pack the incoming
-     * workload into
+     * Total number of RDD partitions, is determined by total buckets we want to pack the incoming workload into
      */
     private int totalBuckets = 0;
     /**
@@ -599,8 +577,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
       assignUpdates(profile);
       assignInserts(profile);
-      logger.info(
-          "Total Buckets :" + totalBuckets + ", " + "buckets info => " + bucketInfoMap + ", \n"
+      logger.info("Total Buckets :" + totalBuckets + ", " + "buckets info => " + bucketInfoMap + ", \n"
           + "Partition to insert buckets => " + partitionPathToInsertBuckets + ", \n"
           + "UpdateLocations mapped to buckets =>" + updateLocationToBucket);
     }
@@ -608,8 +585,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
     private void assignUpdates(WorkloadProfile profile) {
       // each update location gets a partition
       WorkloadStat gStat = profile.getGlobalStat();
-      for (Map.Entry<String, Pair<String, Long>> updateLocEntry : gStat.getUpdateLocationToCount()
-          .entrySet()) {
+      for (Map.Entry<String, Pair<String, Long>> updateLocEntry : gStat.getUpdateLocationToCount().entrySet()) {
        addUpdateBucket(updateLocEntry.getKey());
      }
    }
@@ -628,8 +604,9 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
     private void assignInserts(WorkloadProfile profile) {
       // for new inserts, compute buckets depending on how many records we have for each partition
       Set<String> partitionPaths = profile.getPartitionPaths();
-      long averageRecordSize = averageBytesPerRecord(metaClient.getActiveTimeline().getCommitTimeline()
-          .filterCompletedInstants(), config.getCopyOnWriteRecordSizeEstimate());
+      long averageRecordSize =
+          averageBytesPerRecord(metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(),
+              config.getCopyOnWriteRecordSizeEstimate());
       logger.info("AvgRecordSize => " + averageRecordSize);
       for (String partitionPath : partitionPaths) {
         WorkloadStat pStat = profile.getWorkloadStat(partitionPath);
@@ -644,20 +621,17 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
         // first try packing this into one of the smallFiles
         for (SmallFile smallFile : smallFiles) {
-          long recordsToAppend = Math
-              .min((config.getParquetMaxFileSize() - smallFile.sizeBytes) / averageRecordSize,
+          long recordsToAppend = Math.min((config.getParquetMaxFileSize() - smallFile.sizeBytes) / averageRecordSize,
               totalUnassignedInserts);
           if (recordsToAppend > 0 && totalUnassignedInserts > 0) {
             // create a new bucket or re-use an existing bucket
             int bucket;
             if (updateLocationToBucket.containsKey(smallFile.location.getFileId())) {
               bucket = updateLocationToBucket.get(smallFile.location.getFileId());
-              logger.info("Assigning " + recordsToAppend + " inserts to existing update bucket "
-                  + bucket);
+              logger.info("Assigning " + recordsToAppend + " inserts to existing update bucket " + bucket);
             } else {
               bucket = addUpdateBucket(smallFile.location.getFileId());
-              logger.info(
-                  "Assigning " + recordsToAppend + " inserts to new update bucket " + bucket);
+              logger.info("Assigning " + recordsToAppend + " inserts to new update bucket " + bucket);
             }
             bucketNumbers.add(bucket);
             recordsPerBucket.add(recordsToAppend);
@@ -673,10 +647,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
         }
         int insertBuckets = (int) Math.ceil((1.0 * totalUnassignedInserts) / insertRecordsPerBucket);
-        logger.info(
-            "After small file assignment: unassignedInserts => " + totalUnassignedInserts
-                + ", totalInsertBuckets => " + insertBuckets + ", recordsPerBucket => "
-                + insertRecordsPerBucket);
+        logger.info("After small file assignment: unassignedInserts => " + totalUnassignedInserts
+            + ", totalInsertBuckets => " + insertBuckets + ", recordsPerBucket => " + insertRecordsPerBucket);
         for (int b = 0; b < insertBuckets; b++) {
           bucketNumbers.add(totalBuckets);
           recordsPerBucket.add(totalUnassignedInserts / insertBuckets);
@@ -696,8 +668,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
           bkt.weight = (1.0 * recordsPerBucket.get(i)) / pStat.getNumInserts();
           insertBuckets.add(bkt);
         }
-        logger.info(
-            "Total insert buckets for partition path " + partitionPath + " => " + insertBuckets);
+        logger.info("Total insert buckets for partition path " + partitionPath + " => " + insertBuckets);
         partitionPathToInsertBuckets.put(partitionPath, insertBuckets);
       }
     }
@@ -716,15 +687,13 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
       if (!commitTimeline.empty()) { // if we have some commits
         HoodieInstant latestCommitTime = commitTimeline.lastInstant().get();
         List<HoodieDataFile> allFiles = getROFileSystemView()
-            .getLatestDataFilesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp())
-            .collect(Collectors.toList());
+            .getLatestDataFilesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp()).collect(Collectors.toList());
         for (HoodieDataFile file : allFiles) {
           if (file.getFileSize() < config.getParquetSmallFileLimit()) {
             String filename = file.getFileName();
             SmallFile sf = new SmallFile();
-            sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename),
-                FSUtils.getFileId(filename));
+            sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename));
            sf.sizeBytes = file.getFileSize();
            smallFileLocations.add(sf);
            // Update the global small files list
@@ -751,19 +720,18 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
     @Override
     public int getPartition(Object key) {
-      Tuple2<HoodieKey, Option<HoodieRecordLocation>> keyLocation = (Tuple2<HoodieKey,
-          Option<HoodieRecordLocation>>) key;
+      Tuple2<HoodieKey, Option<HoodieRecordLocation>> keyLocation =
+          (Tuple2<HoodieKey, Option<HoodieRecordLocation>>) key;
       if (keyLocation._2().isPresent()) {
         HoodieRecordLocation location = keyLocation._2().get();
         return updateLocationToBucket.get(location.getFileId());
       } else {
-        List<InsertBucket> targetBuckets = partitionPathToInsertBuckets
-            .get(keyLocation._1().getPartitionPath());
+        List<InsertBucket> targetBuckets = partitionPathToInsertBuckets.get(keyLocation._1().getPartitionPath());
         // pick the target bucket to use based on the weights.
         double totalWeight = 0.0;
         final long totalInserts = Math.max(1, globalStat.getNumInserts());
-        final long hashOfKey = Hashing.md5()
-            .hashString(keyLocation._1().getRecordKey(), StandardCharsets.UTF_8).asLong();
+        final long hashOfKey =
+            Hashing.md5().hashString(keyLocation._1().getRecordKey(), StandardCharsets.UTF_8).asLong();
         final double r = 1.0 * Math.floorMod(hashOfKey, totalInserts) / totalInserts;
         for (InsertBucket insertBucket : targetBuckets) {
           totalWeight += insertBucket.weight;
@@ -782,8 +750,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
   }
   /**
-   * Obtains the average record size based on records written during previous commits. Used for
-   * estimating how many records pack into one file.
+   * Obtains the average record size based on records written during previous commits. Used for estimating how many
+   * records pack into one file.
    */
   protected static long averageBytesPerRecord(HoodieTimeline commitTimeline, int defaultRecordSizeEstimate) {
     long avgSize = defaultRecordSizeEstimate;
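The UpsertPartitioner logic reformatted above routes each new record to an insert bucket by hashing its record key into [0, 1) and walking the buckets' cumulative weights. A minimal standalone sketch of that selection step is shown below; it uses String.hashCode() instead of Guava's md5 hashing, and the bucket numbers, weights and total insert count are made-up example values, not taken from the commit.

import java.util.Arrays;
import java.util.List;

public class WeightedBucketPick {

  // Each bucket receives a share of incoming inserts proportional to its weight (weights sum to ~1.0).
  static class Bucket {
    final int bucketNumber;
    final double weight;
    Bucket(int bucketNumber, double weight) {
      this.bucketNumber = bucketNumber;
      this.weight = weight;
    }
  }

  static int pickBucket(List<Bucket> buckets, String recordKey, long totalInserts) {
    // Map the key to a deterministic value r in [0, 1), mirroring the hash-then-floorMod step in getPartition().
    long hashOfKey = recordKey.hashCode(); // the source uses Guava md5; hashCode is a stand-in here
    double r = 1.0 * Math.floorMod(hashOfKey, totalInserts) / totalInserts;
    double totalWeight = 0.0;
    for (Bucket b : buckets) {
      totalWeight += b.weight;
      if (totalWeight >= r) {
        return b.bucketNumber;
      }
    }
    return buckets.get(buckets.size() - 1).bucketNumber; // fall back to the last bucket
  }

  public static void main(String[] args) {
    List<Bucket> buckets = Arrays.asList(new Bucket(0, 0.25), new Bucket(1, 0.75));
    System.out.println(pickBucket(buckets, "record-key-42", 1000));
  }
}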
@@ -73,15 +73,21 @@ import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.function.Function;
/** /**
* Implementation of a more real-time read-optimized Hoodie Table where <p> INSERTS - Same as * Implementation of a more real-time read-optimized Hoodie Table where
* HoodieCopyOnWriteTable - Produce new files, block aligned to desired size (or) Merge with the * <p>
* smallest existing file, to expand it </p> <p> UPDATES - Appends the changes to a rolling log file * INSERTS - Same as HoodieCopyOnWriteTable - Produce new files, block aligned to desired size (or) Merge with the
* maintained per file Id. Compaction merges the log file into the base file. </p> <p> WARNING - MOR * smallest existing file, to expand it
* table type does not support nested rollbacks, every rollback must be followed by an attempted * </p>
* commit action </p> * <p>
* UPDATES - Appends the changes to a rolling log file maintained per file Id. Compaction merges the log file into the
* base file.
* </p>
* <p>
* WARNING - MOR table type does not support nested rollbacks, every rollback must be followed by an attempted commit
* action
* </p>
*/ */
public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends HoodieCopyOnWriteTable<T> {
HoodieCopyOnWriteTable<T> {
private static Logger logger = LogManager.getLogger(HoodieMergeOnReadTable.class); private static Logger logger = LogManager.getLogger(HoodieMergeOnReadTable.class);
@@ -102,27 +108,24 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
} }
@Override @Override
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileId, public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileId, Iterator<HoodieRecord<T>> recordItr)
Iterator<HoodieRecord<T>> recordItr) throws IOException { throws IOException {
logger.info("Merging updates for commit " + commitTime + " for file " + fileId); logger.info("Merging updates for commit " + commitTime + " for file " + fileId);
if (!index.canIndexLogFiles() && mergeOnReadUpsertPartitioner.getSmallFileIds().contains(fileId)) { if (!index.canIndexLogFiles() && mergeOnReadUpsertPartitioner.getSmallFileIds().contains(fileId)) {
logger.info( logger.info("Small file corrections for updates for commit " + commitTime + " for file " + fileId);
"Small file corrections for updates for commit " + commitTime + " for file " + fileId);
return super.handleUpdate(commitTime, fileId, recordItr); return super.handleUpdate(commitTime, fileId, recordItr);
} else { } else {
HoodieAppendHandle<T> appendHandle = new HoodieAppendHandle<>(config, commitTime, this, HoodieAppendHandle<T> appendHandle = new HoodieAppendHandle<>(config, commitTime, this, fileId, recordItr);
fileId, recordItr);
appendHandle.doAppend(); appendHandle.doAppend();
appendHandle.close(); appendHandle.close();
return Collections.singletonList(Collections.singletonList(appendHandle.getWriteStatus())) return Collections.singletonList(Collections.singletonList(appendHandle.getWriteStatus())).iterator();
.iterator();
} }
} }
@Override @Override
public Iterator<List<WriteStatus>> handleInsert(String commitTime, String idPfx, public Iterator<List<WriteStatus>> handleInsert(String commitTime, String idPfx, Iterator<HoodieRecord<T>> recordItr)
Iterator<HoodieRecord<T>> recordItr) throws Exception { throws Exception {
// If canIndexLogFiles, write inserts to log files else write inserts to parquet files // If canIndexLogFiles, write inserts to log files else write inserts to parquet files
if (index.canIndexLogFiles()) { if (index.canIndexLogFiles()) {
return new MergeOnReadLazyInsertIterable<>(recordItr, config, commitTime, this, idPfx); return new MergeOnReadLazyInsertIterable<>(recordItr, config, commitTime, this, idPfx);
@@ -134,8 +137,8 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
@Override @Override
public HoodieCompactionPlan scheduleCompaction(JavaSparkContext jsc, String instantTime) { public HoodieCompactionPlan scheduleCompaction(JavaSparkContext jsc, String instantTime) {
logger.info("Checking if compaction needs to be run on " + config.getBasePath()); logger.info("Checking if compaction needs to be run on " + config.getBasePath());
Option<HoodieInstant> lastCompaction = getActiveTimeline().getCommitTimeline() Option<HoodieInstant> lastCompaction =
.filterCompletedInstants().lastInstant(); getActiveTimeline().getCommitTimeline().filterCompletedInstants().lastInstant();
String deltaCommitsSinceTs = "0"; String deltaCommitsSinceTs = "0";
if (lastCompaction.isPresent()) { if (lastCompaction.isPresent()) {
deltaCommitsSinceTs = lastCompaction.get().getTimestamp(); deltaCommitsSinceTs = lastCompaction.get().getTimestamp();
@@ -145,8 +148,8 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
.findInstantsAfter(deltaCommitsSinceTs, Integer.MAX_VALUE).countInstants(); .findInstantsAfter(deltaCommitsSinceTs, Integer.MAX_VALUE).countInstants();
if (config.getInlineCompactDeltaCommitMax() > deltaCommitsSinceLastCompaction) { if (config.getInlineCompactDeltaCommitMax() > deltaCommitsSinceLastCompaction) {
logger.info("Not running compaction as only " + deltaCommitsSinceLastCompaction logger.info("Not running compaction as only " + deltaCommitsSinceLastCompaction
+ " delta commits was found since last compaction " + deltaCommitsSinceTs + " delta commits was found since last compaction " + deltaCommitsSinceTs + ". Waiting for "
+ ". Waiting for " + config.getInlineCompactDeltaCommitMax()); + config.getInlineCompactDeltaCommitMax());
return new HoodieCompactionPlan(); return new HoodieCompactionPlan();
} }
@@ -186,9 +189,8 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
// Atomically un-publish all non-inflight commits // Atomically un-publish all non-inflight commits
Option<HoodieInstant> commitOrCompactionOption = Option.fromJavaOptional(this.getActiveTimeline() Option<HoodieInstant> commitOrCompactionOption = Option.fromJavaOptional(this.getActiveTimeline()
.getTimelineOfActions(Sets.newHashSet(HoodieActiveTimeline.COMMIT_ACTION, .getTimelineOfActions(Sets.newHashSet(HoodieActiveTimeline.COMMIT_ACTION,
HoodieActiveTimeline.DELTA_COMMIT_ACTION, HoodieActiveTimeline.COMPACTION_ACTION)).getInstants() HoodieActiveTimeline.DELTA_COMMIT_ACTION, HoodieActiveTimeline.COMPACTION_ACTION))
.filter(i -> commit.equals(i.getTimestamp())) .getInstants().filter(i -> commit.equals(i.getTimestamp())).findFirst());
.findFirst());
HoodieInstant instantToRollback = commitOrCompactionOption.get(); HoodieInstant instantToRollback = commitOrCompactionOption.get();
// Atomically un-publish all non-inflight commits // Atomically un-publish all non-inflight commits
if (!instantToRollback.isInflight()) { if (!instantToRollback.isInflight()) {
@@ -196,10 +198,9 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
} }
logger.info("Unpublished " + commit); logger.info("Unpublished " + commit);
Long startTime = System.currentTimeMillis(); Long startTime = System.currentTimeMillis();
List<HoodieRollbackStat> allRollbackStats = jsc.parallelize(FSUtils List<HoodieRollbackStat> allRollbackStats =
.getAllPartitionPaths(this.metaClient.getFs(), this.getMetaClient().getBasePath(), jsc.parallelize(FSUtils.getAllPartitionPaths(this.metaClient.getFs(), this.getMetaClient().getBasePath(),
config.shouldAssumeDatePartitioning())) config.shouldAssumeDatePartitioning())).map((Function<String, HoodieRollbackStat>) partitionPath -> {
.map((Function<String, HoodieRollbackStat>) partitionPath -> {
HoodieActiveTimeline activeTimeline = this.getActiveTimeline().reload(); HoodieActiveTimeline activeTimeline = this.getActiveTimeline().reload();
HoodieRollbackStat hoodieRollbackStats = null; HoodieRollbackStat hoodieRollbackStats = null;
// Need to put the path filter here since Filter is not serializable // Need to put the path filter here since Filter is not serializable
@@ -222,10 +223,11 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
case HoodieTimeline.COMMIT_ACTION: case HoodieTimeline.COMMIT_ACTION:
try { try {
// Rollback of a commit should delete the newly created parquet files along with any log // Rollback of a commit should delete the newly created parquet files along with any log
// files created with this as baseCommit. This is required to support multi-rollbacks in a MOR table. // files created with this as baseCommit. This is required to support multi-rollbacks in a MOR
// table.
super.deleteCleanedFiles(filesToDeletedStatus, partitionPath, filter); super.deleteCleanedFiles(filesToDeletedStatus, partitionPath, filter);
hoodieRollbackStats = HoodieRollbackStat.newBuilder() hoodieRollbackStats = HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
.withPartitionPath(partitionPath).withDeletedFileResults(filesToDeletedStatus).build(); .withDeletedFileResults(filesToDeletedStatus).build();
break; break;
} catch (IOException io) { } catch (IOException io) {
throw new UncheckedIOException("Failed to rollback for commit " + commit, io); throw new UncheckedIOException("Failed to rollback for commit " + commit, io);
@@ -233,25 +235,28 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
case HoodieTimeline.COMPACTION_ACTION: case HoodieTimeline.COMPACTION_ACTION:
try { try {
// If there is no delta commit present after the current commit (if compaction), no action, else we // If there is no delta commit present after the current commit (if compaction), no action, else we
// need to make sure that a compaction commit rollback also deletes any log files written as part of the // need to make sure that a compaction commit rollback also deletes any log files written as part of
// the
// succeeding deltacommit. // succeeding deltacommit.
boolean higherDeltaCommits = !activeTimeline.getDeltaCommitTimeline() boolean higherDeltaCommits = !activeTimeline.getDeltaCommitTimeline().filterCompletedInstants()
.filterCompletedInstants().findInstantsAfter(commit, 1).empty(); .findInstantsAfter(commit, 1).empty();
if (higherDeltaCommits) { if (higherDeltaCommits) {
// Rollback of a compaction action with no higher deltacommit means that the compaction is scheduled // Rollback of a compaction action with no higher deltacommit means that the compaction is
// and has not yet finished. In this scenario we should delete only the newly created parquet files // scheduled
// and has not yet finished. In this scenario we should delete only the newly created parquet
// files
// and not corresponding base commit log files created with this as baseCommit since updates would // and not corresponding base commit log files created with this as baseCommit since updates would
// have been written to the log files. // have been written to the log files.
super.deleteCleanedFiles(filesToDeletedStatus, commit, partitionPath); super.deleteCleanedFiles(filesToDeletedStatus, commit, partitionPath);
hoodieRollbackStats = HoodieRollbackStat.newBuilder() hoodieRollbackStats = HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
.withPartitionPath(partitionPath).withDeletedFileResults(filesToDeletedStatus).build(); .withDeletedFileResults(filesToDeletedStatus).build();
} else { } else {
// No deltacommits present after this compaction commit (inflight or requested). In this case, we // No deltacommits present after this compaction commit (inflight or requested). In this case, we
// can also delete any log files that were created with this compaction commit as base // can also delete any log files that were created with this compaction commit as base
// commit. // commit.
super.deleteCleanedFiles(filesToDeletedStatus, partitionPath, filter); super.deleteCleanedFiles(filesToDeletedStatus, partitionPath, filter);
hoodieRollbackStats = HoodieRollbackStat.newBuilder() hoodieRollbackStats = HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
.withPartitionPath(partitionPath).withDeletedFileResults(filesToDeletedStatus).build(); .withDeletedFileResults(filesToDeletedStatus).build();
} }
break; break;
} catch (IOException io) { } catch (IOException io) {
@@ -261,12 +266,14 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
// -------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------
// (A) The following cases are possible if index.canIndexLogFiles and/or index.isGlobal // (A) The following cases are possible if index.canIndexLogFiles and/or index.isGlobal
// -------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------
// (A.1) Failed first commit - Inserts were written to log files and HoodieWriteStat has no entries. In // (A.1) Failed first commit - Inserts were written to log files and HoodieWriteStat has no entries.
// In
// this scenario we would want to delete these log files. // this scenario we would want to delete these log files.
// (A.2) Failed recurring commit - Inserts/Updates written to log files. In this scenario, // (A.2) Failed recurring commit - Inserts/Updates written to log files. In this scenario,
// HoodieWriteStat will have the baseCommitTime for the first log file written, add rollback blocks. // HoodieWriteStat will have the baseCommitTime for the first log file written, add rollback blocks.
// (A.3) Rollback triggered for first commit - Inserts were written to the log files but the commit is // (A.3) Rollback triggered for first commit - Inserts were written to the log files but the commit is
// being reverted. In this scenario, HoodieWriteStat will be `null` for the attribute prevCommitTime and // being reverted. In this scenario, HoodieWriteStat will be `null` for the attribute prevCommitTime
// and
// and hence will end up deleting these log files. This is done so there are no orphan log files // and hence will end up deleting these log files. This is done so there are no orphan log files
// lying around. // lying around.
// (A.4) Rollback triggered for recurring commits - Inserts/Updates are being rolled back, the actions // (A.4) Rollback triggered for recurring commits - Inserts/Updates are being rolled back, the actions
@@ -274,7 +281,8 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
// --------------------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------------------
// (B) The following cases are possible if !index.canIndexLogFiles and/or !index.isGlobal // (B) The following cases are possible if !index.canIndexLogFiles and/or !index.isGlobal
// --------------------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------------------
// (B.1) Failed first commit - Inserts were written to parquet files and HoodieWriteStat has no entries. // (B.1) Failed first commit - Inserts were written to parquet files and HoodieWriteStat has no
// entries.
// In this scenario, we delete all the parquet files written for the failed commit. // In this scenario, we delete all the parquet files written for the failed commit.
// (B.2) Failed recurring commits - Inserts were written to parquet files and updates to log files. In // (B.2) Failed recurring commits - Inserts were written to parquet files and updates to log files. In
// this scenario, perform (A.1) and for updates written to log files, write rollback blocks. // this scenario, perform (A.1) and for updates written to log files, write rollback blocks.
@@ -282,10 +290,11 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
// (B.4) Rollback triggered for recurring commits - Same as (B.2) plus we need to delete the log files // (B.4) Rollback triggered for recurring commits - Same as (B.2) plus we need to delete the log files
// as well if the base parquet file gets deleted. // as well if the base parquet file gets deleted.
try { try {
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes( HoodieCommitMetadata commitMetadata =
metaClient.getCommitTimeline().getInstantDetails( HoodieCommitMetadata.fromBytes(
new HoodieInstant(true, instantToRollback.getAction(), instantToRollback.getTimestamp())) metaClient.getCommitTimeline().getInstantDetails(new HoodieInstant(true,
.get(), HoodieCommitMetadata.class); instantToRollback.getAction(), instantToRollback.getTimestamp())).get(),
HoodieCommitMetadata.class);
// read commit file and (either append delete blocks or delete file) // read commit file and (either append delete blocks or delete file)
Map<FileStatus, Long> filesToNumBlocksRollback = new HashMap<>(); Map<FileStatus, Long> filesToNumBlocksRollback = new HashMap<>();
@@ -294,8 +303,7 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
// We do not know fileIds for inserts (first inserts are either log files or parquet files), // We do not know fileIds for inserts (first inserts are either log files or parquet files),
// delete all files for the corresponding failed commit, if present (same as COW) // delete all files for the corresponding failed commit, if present (same as COW)
super.deleteCleanedFiles(filesToDeletedStatus, partitionPath, filter); super.deleteCleanedFiles(filesToDeletedStatus, partitionPath, filter);
final Set<String> deletedFiles = filesToDeletedStatus.entrySet().stream() final Set<String> deletedFiles = filesToDeletedStatus.entrySet().stream().map(entry -> {
.map(entry -> {
Path filePath = entry.getKey().getPath(); Path filePath = entry.getKey().getPath();
return FSUtils.getFileIdFromFilePath(filePath); return FSUtils.getFileIdFromFilePath(filePath);
}).collect(Collectors.toSet()); }).collect(Collectors.toSet());
@@ -316,8 +324,8 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
}).filter(Objects::nonNull).collect(); }).filter(Objects::nonNull).collect();
// Delete Inflight instants if enabled // Delete Inflight instants if enabled
deleteInflightInstant(deleteInstants, this.getActiveTimeline(), new HoodieInstant(true, instantToRollback deleteInflightInstant(deleteInstants, this.getActiveTimeline(),
.getAction(), instantToRollback.getTimestamp())); new HoodieInstant(true, instantToRollback.getAction(), instantToRollback.getTimestamp()));
logger.debug("Time(in ms) taken to finish rollback " + (System.currentTimeMillis() - startTime)); logger.debug("Time(in ms) taken to finish rollback " + (System.currentTimeMillis() - startTime));
@@ -332,8 +340,8 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
} }
/** /**
* UpsertPartitioner for MergeOnRead table type, this allows auto correction of small parquet * UpsertPartitioner for MergeOnRead table type, this allows auto correction of small parquet files to larger ones
* files to larger ones without the need for an index in the logFile. * without the need for an index in the logFile.
*/ */
class MergeOnReadUpsertPartitioner extends HoodieCopyOnWriteTable.UpsertPartitioner { class MergeOnReadUpsertPartitioner extends HoodieCopyOnWriteTable.UpsertPartitioner {
@@ -361,20 +369,22 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
// TODO : choose last N small files since there can be multiple small files written to a single partition // TODO : choose last N small files since there can be multiple small files written to a single partition
// by different spark partitions in a single batch // by different spark partitions in a single batch
Option<FileSlice> smallFileSlice = Option.fromJavaOptional(getRTFileSystemView() Option<FileSlice> smallFileSlice = Option.fromJavaOptional(getRTFileSystemView()
.getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp(), false).filter( .getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp(), false)
fileSlice -> fileSlice.getLogFiles().count() < 1 .filter(fileSlice -> fileSlice.getLogFiles().count() < 1
&& fileSlice.getDataFile().get().getFileSize() < config && fileSlice.getDataFile().get().getFileSize() < config.getParquetSmallFileLimit())
.getParquetSmallFileLimit()).sorted((FileSlice left, FileSlice right) -> .sorted((FileSlice left,
left.getDataFile().get().getFileSize() < right.getDataFile().get().getFileSize() FileSlice right) -> left.getDataFile().get().getFileSize() < right.getDataFile().get().getFileSize()
? -1 : 1).findFirst()); ? -1
: 1)
.findFirst());
if (smallFileSlice.isPresent()) { if (smallFileSlice.isPresent()) {
allSmallFileSlices.add(smallFileSlice.get()); allSmallFileSlices.add(smallFileSlice.get());
} }
} else { } else {
// If we can index log files, we can add more inserts to log files for fileIds including those under // If we can index log files, we can add more inserts to log files for fileIds including those under
// pending compaction. // pending compaction.
List<FileSlice> allFileSlices = getRTFileSystemView() List<FileSlice> allFileSlices =
.getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp(), true) getRTFileSystemView().getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp(), true)
.collect(Collectors.toList()); .collect(Collectors.toList());
for (FileSlice fileSlice : allFileSlices) { for (FileSlice fileSlice : allFileSlices) {
if (isSmallFile(partitionPath, fileSlice)) { if (isSmallFile(partitionPath, fileSlice)) {
@@ -408,8 +418,7 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
} }
public List<String> getSmallFileIds() { public List<String> getSmallFileIds() {
return (List<String>) smallFiles.stream() return (List<String>) smallFiles.stream().map(smallFile -> ((SmallFile) smallFile).location.getFileId())
.map(smallFile -> ((SmallFile) smallFile).location.getFileId())
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
@@ -417,8 +426,8 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
if (!fileSlice.getDataFile().isPresent()) { if (!fileSlice.getDataFile().isPresent()) {
return convertLogFilesSizeToExpectedParquetSize(fileSlice.getLogFiles().collect(Collectors.toList())); return convertLogFilesSizeToExpectedParquetSize(fileSlice.getLogFiles().collect(Collectors.toList()));
} else { } else {
return fileSlice.getDataFile().get().getFileSize() + convertLogFilesSizeToExpectedParquetSize(fileSlice return fileSlice.getDataFile().get().getFileSize()
.getLogFiles().collect(Collectors.toList())); + convertLogFilesSizeToExpectedParquetSize(fileSlice.getLogFiles().collect(Collectors.toList()));
} }
} }
@@ -431,13 +440,12 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
@VisibleForTesting @VisibleForTesting
public long convertLogFilesSizeToExpectedParquetSize(List<HoodieLogFile> hoodieLogFiles) { public long convertLogFilesSizeToExpectedParquetSize(List<HoodieLogFile> hoodieLogFiles) {
long totalSizeOfLogFiles = hoodieLogFiles.stream().map(hoodieLogFile -> hoodieLogFile.getFileSize()) long totalSizeOfLogFiles = hoodieLogFiles.stream().map(hoodieLogFile -> hoodieLogFile.getFileSize())
.filter(size -> size > 0) .filter(size -> size > 0).reduce((a, b) -> (a + b)).orElse(0L);
.reduce((a, b) -> (a + b)).orElse(0L);
// Here we assume that if there is no base parquet file, all log files contain only inserts. // Here we assume that if there is no base parquet file, all log files contain only inserts.
// We can then just get the parquet equivalent size of these log files, compare that with // We can then just get the parquet equivalent size of these log files, compare that with
// {@link config.getParquetMaxFileSize()} and decide if there is scope to insert more rows // {@link config.getParquetMaxFileSize()} and decide if there is scope to insert more rows
long logFilesEquivalentParquetFileSize = (long) (totalSizeOfLogFiles * config long logFilesEquivalentParquetFileSize =
.getLogFileToParquetCompressionRatio()); (long) (totalSizeOfLogFiles * config.getLogFileToParquetCompressionRatio());
return logFilesEquivalentParquetFileSize; return logFilesEquivalentParquetFileSize;
} }
} }
@@ -447,8 +455,8 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
Map<HeaderMetadataType, String> header = Maps.newHashMap(); Map<HeaderMetadataType, String> header = Maps.newHashMap();
header.put(HeaderMetadataType.INSTANT_TIME, metaClient.getActiveTimeline().lastInstant().get().getTimestamp()); header.put(HeaderMetadataType.INSTANT_TIME, metaClient.getActiveTimeline().lastInstant().get().getTimestamp());
header.put(HeaderMetadataType.TARGET_INSTANT_TIME, commit); header.put(HeaderMetadataType.TARGET_INSTANT_TIME, commit);
header.put(HeaderMetadataType.COMMAND_BLOCK_TYPE, String.valueOf(HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK header.put(HeaderMetadataType.COMMAND_BLOCK_TYPE,
.ordinal())); String.valueOf(HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal()));
return header; return header;
} }
@@ -462,8 +470,7 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
// baseCommit always by listing the file slice // baseCommit always by listing the file slice
Map<String, String> fileIdToBaseCommitTimeForLogMap = this.getRTFileSystemView().getLatestFileSlices(partitionPath) Map<String, String> fileIdToBaseCommitTimeForLogMap = this.getRTFileSystemView().getLatestFileSlices(partitionPath)
.collect(Collectors.toMap(FileSlice::getFileId, FileSlice::getBaseInstantTime)); .collect(Collectors.toMap(FileSlice::getFileId, FileSlice::getBaseInstantTime));
commitMetadata.getPartitionToWriteStats().get(partitionPath).stream() commitMetadata.getPartitionToWriteStats().get(partitionPath).stream().filter(wStat -> {
.filter(wStat -> {
// Filter out stats without prevCommit since they are all inserts // Filter out stats without prevCommit since they are all inserts
return wStat != null && wStat.getPrevCommit() != HoodieWriteStat.NULL_COMMIT && wStat.getPrevCommit() != null return wStat != null && wStat.getPrevCommit() != HoodieWriteStat.NULL_COMMIT && wStat.getPrevCommit() != null
&& !deletedFiles.contains(wStat.getFileId()); && !deletedFiles.contains(wStat.getFileId());
@@ -473,10 +480,9 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
if (null != baseCommitTime) {
boolean success = false;
try {
- writer = HoodieLogFormat.newWriterBuilder().onParentPath(
- FSUtils.getPartitionPath(this.getMetaClient().getBasePath(), partitionPath))
- .withFileId(wStat.getFileId()).overBaseCommit(baseCommitTime)
- .withFs(this.metaClient.getFs())
+ writer = HoodieLogFormat.newWriterBuilder()
+ .onParentPath(FSUtils.getPartitionPath(this.getMetaClient().getBasePath(), partitionPath))
+ .withFileId(wStat.getFileId()).overBaseCommit(baseCommitTime).withFs(this.metaClient.getFs())
.withFileExtension(HoodieLogFile.DELTA_EXTENSION).build();
// generate metadata
Map<HeaderMetadataType, String> header = generateHeader(commit);
@@ -484,8 +490,7 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
writer = writer.appendBlock(new HoodieCommandBlock(header));
success = true;
} catch (IOException | InterruptedException io) {
- throw new HoodieRollbackException(
- "Failed to rollback for commit " + commit, io);
+ throw new HoodieRollbackException("Failed to rollback for commit " + commit, io);
} finally {
try {
if (writer != null) {
@@ -495,8 +500,8 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
// This step is intentionally done after writer is closed. Guarantees that
// getFileStatus would reflect correct stats and FileNotFoundException is not thrown in
// cloud-storage : HUDI-168
- filesToNumBlocksRollback.put(this.getMetaClient().getFs()
- .getFileStatus(writer.getLogFile().getPath()), 1L);
+ filesToNumBlocksRollback.put(this.getMetaClient().getFs().getFileStatus(writer.getLogFile().getPath()),
+ 1L);
}
} catch (IOException io) {
throw new UncheckedIOException(io);
@@ -504,9 +509,7 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
}
}
});
- return HoodieRollbackStat.newBuilder()
- .withPartitionPath(partitionPath)
- .withDeletedFileResults(filesToDeletedStatus)
+ return HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath).withDeletedFileResults(filesToDeletedStatus)
.withRollbackBlockAppendResults(filesToNumBlocksRollback).build();
}

View File

@@ -82,22 +82,21 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
protected HoodieTable(HoodieWriteConfig config, JavaSparkContext jsc) {
this.config = config;
this.hadoopConfiguration = new SerializableConfiguration(jsc.hadoopConfiguration());
- this.viewManager = FileSystemViewManager.createViewManager(
- new SerializableConfiguration(jsc.hadoopConfiguration()), config.getViewStorageConfig());
+ this.viewManager = FileSystemViewManager.createViewManager(new SerializableConfiguration(jsc.hadoopConfiguration()),
+ config.getViewStorageConfig());
this.metaClient = ClientUtils.createMetaClient(jsc, config, true);
this.index = HoodieIndex.createIndex(config, jsc);
}
private synchronized FileSystemViewManager getViewManager() {
if (null == viewManager) {
- viewManager = FileSystemViewManager.createViewManager(hadoopConfiguration,
- config.getViewStorageConfig());
+ viewManager = FileSystemViewManager.createViewManager(hadoopConfiguration, config.getViewStorageConfig());
}
return viewManager;
}
- public static <T extends HoodieRecordPayload> HoodieTable<T> getHoodieTable(
- HoodieTableMetaClient metaClient, HoodieWriteConfig config, JavaSparkContext jsc) {
+ public static <T extends HoodieRecordPayload> HoodieTable<T> getHoodieTable(HoodieTableMetaClient metaClient,
+ HoodieWriteConfig config, JavaSparkContext jsc) {
switch (metaClient.getTableType()) {
case COPY_ON_WRITE:
return new HoodieCopyOnWriteTable<>(config, jsc);
@@ -202,8 +201,7 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
* Get the list of savepoints in this table
*/
public List<String> getSavepoints() {
- return getCompletedSavepointTimeline().getInstants().map(HoodieInstant::getTimestamp)
- .collect(Collectors.toList());
+ return getCompletedSavepointTimeline().getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList());
}
/**
@@ -214,18 +212,14 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
throw new HoodieSavepointException(
"Could not get data files for savepoint " + savepointTime + ". No such savepoint.");
}
- HoodieInstant instant = new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION,
- savepointTime);
+ HoodieInstant instant = new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION, savepointTime);
HoodieSavepointMetadata metadata = null;
try {
- metadata = AvroUtils
- .deserializeHoodieSavepointMetadata(getActiveTimeline().getInstantDetails(instant).get());
+ metadata = AvroUtils.deserializeHoodieSavepointMetadata(getActiveTimeline().getInstantDetails(instant).get());
} catch (IOException e) {
- throw new HoodieSavepointException(
- "Could not get savepointed data files for savepoint " + savepointTime, e);
+ throw new HoodieSavepointException("Could not get savepointed data files for savepoint " + savepointTime, e);
}
- return metadata.getPartitionMetadata().values().stream()
- .flatMap(s -> s.getSavepointDataFile().stream());
+ return metadata.getPartitionMetadata().values().stream().flatMap(s -> s.getSavepointDataFile().stream());
}
public HoodieActiveTimeline getActiveTimeline() {
@@ -242,17 +236,18 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
/**
* Perform the ultimate IO for a given upserted (RDD) partition
*/
- public abstract Iterator<List<WriteStatus>> handleUpsertPartition(String commitTime,
- Integer partition, Iterator<HoodieRecord<T>> recordIterator, Partitioner partitioner);
+ public abstract Iterator<List<WriteStatus>> handleUpsertPartition(String commitTime, Integer partition,
+ Iterator<HoodieRecord<T>> recordIterator, Partitioner partitioner);
/**
* Perform the ultimate IO for a given inserted (RDD) partition
*/
- public abstract Iterator<List<WriteStatus>> handleInsertPartition(String commitTime,
- Integer partition, Iterator<HoodieRecord<T>> recordIterator, Partitioner partitioner);
+ public abstract Iterator<List<WriteStatus>> handleInsertPartition(String commitTime, Integer partition,
+ Iterator<HoodieRecord<T>> recordIterator, Partitioner partitioner);
/**
* Schedule compaction for the instant time
+ *
* @param jsc Spark Context
* @param instantTime Instant Time for scheduling compaction
* @return
@@ -260,8 +255,7 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
public abstract HoodieCompactionPlan scheduleCompaction(JavaSparkContext jsc, String instantTime);
/**
- * Run Compaction on the table. Compaction arranges the data so that it is optimized for data
- * access
+ * Run Compaction on the table. Compaction arranges the data so that it is optimized for data access
*
* @param jsc Spark Context
* @param compactionInstantTime Instant Time
@@ -276,9 +270,9 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
public abstract List<HoodieCleanStat> clean(JavaSparkContext jsc);
/**
- * Rollback the (inflight/committed) record changes with the given commit time. Four steps: (1)
- * Atomically unpublish this commit (2) clean indexing data (3) clean new generated parquet files
- * / log blocks (4) Finally, delete .<action>.commit or .<action>.inflight file if deleteInstants = true
+ * Rollback the (inflight/committed) record changes with the given commit time. Four steps: (1) Atomically unpublish
+ * this commit (2) clean indexing data (3) clean new generated parquet files / log blocks (4) Finally, delete
+ * .<action>.commit or .<action>.inflight file if deleteInstants = true
*/
public abstract List<HoodieRollbackStat> rollback(JavaSparkContext jsc, String commit, boolean deleteInstants)
throws IOException;
@@ -297,6 +291,7 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
/**
* Delete Marker directory corresponding to an instant
+ *
* @param instantTs Instant Time
*/
protected void deleteMarkerDir(String instantTs) {
@@ -343,13 +338,12 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
// Contains list of partially created files. These needs to be cleaned up.
invalidDataPaths.removeAll(validDataPaths);
if (!invalidDataPaths.isEmpty()) {
- logger.info("Removing duplicate data files created due to spark retries before committing. Paths="
- + invalidDataPaths);
+ logger.info(
+ "Removing duplicate data files created due to spark retries before committing. Paths=" + invalidDataPaths);
}
Map<String, List<Pair<String, String>>> groupByPartition = invalidDataPaths.stream()
- .map(dp -> Pair.of(new Path(dp).getParent().toString(), dp))
- .collect(Collectors.groupingBy(Pair::getKey));
+ .map(dp -> Pair.of(new Path(dp).getParent().toString(), dp)).collect(Collectors.groupingBy(Pair::getKey));
if (!groupByPartition.isEmpty()) {
// Ensure all files in delete list is actually present. This is mandatory for an eventually consistent FS.
@@ -394,6 +388,7 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
/**
* Ensures all files passed either appear or disappear
+ *
* @param jsc JavaSparkContext
* @param groupByPartition Files grouped by partition
* @param visibility Appear/Disappear

View File

@@ -23,13 +23,11 @@ import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.spark.api.java.JavaRDD;
/**
- * Repartition input records into at least expected number of output spark partitions. It should
- * give below guarantees - Output spark partition will have records from only one hoodie partition.
- * - Average records per output spark partitions should be almost equal to (#inputRecords /
- * #outputSparkPartitions) to avoid possible skews.
+ * Repartition input records into at least expected number of output spark partitions. It should give below guarantees -
+ * Output spark partition will have records from only one hoodie partition. - Average records per output spark
+ * partitions should be almost equal to (#inputRecords / #outputSparkPartitions) to avoid possible skews.
*/
public interface UserDefinedBulkInsertPartitioner<T extends HoodieRecordPayload> {
- JavaRDD<HoodieRecord<T>> repartitionRecords(JavaRDD<HoodieRecord<T>> records,
- int outputSparkPartitions);
+ JavaRDD<HoodieRecord<T>> repartitionRecords(JavaRDD<HoodieRecord<T>> records, int outputSparkPartitions);
}
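The interface above only states the contract; a minimal sketch of one possible implementor follows. The class name is hypothetical and not part of this commit, and the imports assume the Hudi and Spark classes shown in this diff are on the classpath. Keying by partition path keeps each hoodie partition on a single Spark partition, though several hoodie partitions may still hash to the same Spark partition, so this only approximates the first guarantee in the javadoc.

import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.spark.HashPartitioner;
import org.apache.spark.api.java.JavaRDD;
import scala.Tuple2;

// Hypothetical example implementation, not part of this commit.
public class PartitionPathBulkInsertPartitioner<T extends HoodieRecordPayload>
    implements UserDefinedBulkInsertPartitioner<T> {

  @Override
  public JavaRDD<HoodieRecord<T>> repartitionRecords(JavaRDD<HoodieRecord<T>> records, int outputSparkPartitions) {
    return records
        // key every record by its hoodie partition path
        .mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record))
        // hash-partition on the path and sort within each Spark partition so records stay grouped
        .repartitionAndSortWithinPartitions(new HashPartitioner(outputSparkPartitions))
        // drop the key again and hand the repartitioned records back to the bulk-insert path
        .map(Tuple2::_2);
  }
}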

View File

@@ -30,8 +30,7 @@ import org.apache.spark.api.java.JavaRDD;
import scala.Tuple2;
/**
- * Information about incoming records for upsert/insert obtained either via sampling or
- * introspecting the data fully
+ * Information about incoming records for upsert/insert obtained either via sampling or introspecting the data fully
* <p>
* TODO(vc): Think about obtaining this directly from index.tagLocation
*/
@@ -62,11 +61,10 @@ public class WorkloadProfile<T extends HoodieRecordPayload> implements Serializa
Map<Tuple2<String, Option<HoodieRecordLocation>>, Long> partitionLocationCounts = taggedRecords
.mapToPair(record -> new Tuple2<>(
- new Tuple2<>(record.getPartitionPath(), Option.ofNullable(record.getCurrentLocation())),
- record)).countByKey();
- for (Map.Entry<Tuple2<String, Option<HoodieRecordLocation>>, Long> e : partitionLocationCounts
- .entrySet()) {
+ new Tuple2<>(record.getPartitionPath(), Option.ofNullable(record.getCurrentLocation())), record))
+ .countByKey();
+ for (Map.Entry<Tuple2<String, Option<HoodieRecordLocation>>, Long> e : partitionLocationCounts.entrySet()) {
String partitionPath = e.getKey()._1();
Long count = e.getValue();
Option<HoodieRecordLocation> locOption = e.getKey()._2();
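The hunk above is where the workload profile tallies incoming records per (partition path, current location) key; records with no current location are the inserts. A toy, Spark-free sketch of the same tally follows, using plain java.util.stream; the class name and sample values are hypothetical.

import java.util.AbstractMap.SimpleEntry;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class WorkloadProfileSketch {
  public static void main(String[] args) {
    // (partitionPath, currentFileId) pairs; a null file id plays the role of Option.empty(), i.e. an insert.
    List<SimpleEntry<String, String>> taggedRecords = Arrays.asList(
        new SimpleEntry<>("2019/10/01", "file-1"),
        new SimpleEntry<>("2019/10/01", "file-1"),
        new SimpleEntry<>("2019/10/01", null),
        new SimpleEntry<>("2019/10/02", null));

    // Equivalent of the mapToPair(...).countByKey() call above, expressed with streams.
    Map<List<String>, Long> partitionLocationCounts = taggedRecords.stream()
        .collect(Collectors.groupingBy(
            e -> Arrays.asList(e.getKey(), String.valueOf(e.getValue())),
            Collectors.counting()));

    partitionLocationCounts.forEach((key, count) -> System.out.println(key + " -> " + count));
  }
}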

View File

@@ -41,7 +41,8 @@ import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
/**
- * Driver program that uses the Hoodie client with synthetic workload, and performs basic operations. <p>
+ * Driver program that uses the Hoodie client with synthetic workload, and performs basic operations.
+ * <p>
*/
public class HoodieClientExample {
@@ -82,18 +83,15 @@ public class HoodieClientExample {
Path path = new Path(tablePath);
FileSystem fs = FSUtils.getFs(tablePath, jsc.hadoopConfiguration());
if (!fs.exists(path)) {
- HoodieTableMetaClient
- .initTableType(jsc.hadoopConfiguration(), tablePath, HoodieTableType.valueOf(tableType), tableName,
- HoodieAvroPayload.class.getName());
+ HoodieTableMetaClient.initTableType(jsc.hadoopConfiguration(), tablePath, HoodieTableType.valueOf(tableType),
+ tableName, HoodieAvroPayload.class.getName());
}
// Create the write client to write some records in
HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(tablePath)
- .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
- .forTable(tableName)
+ .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2).forTable(tableName)
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(IndexType.BLOOM).build())
- .withCompactionConfig(
- HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 3).build()).build();
+ .withCompactionConfig(HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 3).build()).build();
HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
/**

View File

@@ -74,6 +74,7 @@ public abstract class HoodieClientTestHarness extends HoodieCommonTestHarness im
/**
* Cleanups resource group for the subclasses of {@link TestHoodieClientBase}.
+ *
* @throws IOException
*/
public void cleanupResources() throws IOException {
@@ -84,8 +85,7 @@ public abstract class HoodieClientTestHarness extends HoodieCommonTestHarness im
}
/**
- * Initializes the Spark contexts ({@link JavaSparkContext} and {@link SQLContext})
- * with the given application name.
+ * Initializes the Spark contexts ({@link JavaSparkContext} and {@link SQLContext}) with the given application name.
*
* @param appName The specified application name.
*/
@@ -99,8 +99,8 @@ public abstract class HoodieClientTestHarness extends HoodieCommonTestHarness im
}
/**
- * Initializes the Spark contexts ({@link JavaSparkContext} and {@link SQLContext})
- * with a default name <b>TestHoodieClient</b>.
+ * Initializes the Spark contexts ({@link JavaSparkContext} and {@link SQLContext}) with a default name
+ * <b>TestHoodieClient</b>.
*/
protected void initSparkContexts() {
initSparkContexts("TestHoodieClient");
@@ -155,8 +155,8 @@ public abstract class HoodieClientTestHarness extends HoodieCommonTestHarness im
}
/**
- * Initializes an instance of {@link HoodieTableMetaClient} with a special table type
- * specified by {@code getTableType()}.
+ * Initializes an instance of {@link HoodieTableMetaClient} with a special table type specified by
+ * {@code getTableType()}.
*
* @throws IOException
*/

View File

@@ -73,15 +73,14 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
private HoodieWriteConfig.Builder getConfigBuilder(Boolean autoCommit) {
return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
- .withAutoCommit(autoCommit).withAssumeDatePartitioning(true).withCompactionConfig(
- HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024 * 1024).withInlineCompaction(false)
- .withMaxNumDeltaCommitsBeforeCompaction(1).build())
+ .withAutoCommit(autoCommit).withAssumeDatePartitioning(true)
+ .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024 * 1024)
+ .withInlineCompaction(false).withMaxNumDeltaCommitsBeforeCompaction(1).build())
.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024 * 1024).build())
.forTable("test-trip-table")
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
- .withEmbeddedTimelineServerEnabled(true).withFileSystemViewConfig(
- FileSystemViewStorageConfig.newBuilder().withStorageType(FileSystemViewStorageType.EMBEDDED_KV_STORE)
- .build());
+ .withEmbeddedTimelineServerEnabled(true).withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder()
+ .withStorageType(FileSystemViewStorageType.EMBEDDED_KV_STORE).build());
}
@Test
@@ -97,8 +96,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
int numRecs = 2000;
List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
- runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime),
- records, cfg, true, new ArrayList<>());
+ runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true,
+ new ArrayList<>());
// Schedule compaction but do not run them
scheduleCompaction(compactionInstantTime, client, cfg);
@@ -158,8 +157,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
try (HoodieWriteClient client = getHoodieWriteClient(cfg, true);) {
List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
- records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime),
- records, cfg, true, new ArrayList<>());
+ records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true,
+ new ArrayList<>());
// Schedule compaction but do not run them
scheduleCompaction(compactionInstantTime, client, cfg);
@@ -182,15 +181,13 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
// Validate
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
- inflightInstant =
- metaClient.getActiveTimeline().filterInflightsExcludingCompaction().firstInstant().get();
+ inflightInstant = metaClient.getActiveTimeline().filterInflightsExcludingCompaction().firstInstant().get();
assertTrue("inflight instant has expected instant time",
inflightInstant.getTimestamp().equals(nextInflightInstantTime));
assertTrue("Expect only one inflight instant",
metaClient.getActiveTimeline().filterInflightsExcludingCompaction().getInstants().count() == 1);
// Expect pending Compaction to be present
- pendingCompactionInstant =
- metaClient.getActiveTimeline().filterPendingCompactionTimeline().firstInstant().get();
+ pendingCompactionInstant = metaClient.getActiveTimeline().filterPendingCompactionTimeline().firstInstant().get();
assertTrue("Pending Compaction instant has expected instant time",
pendingCompactionInstant.getTimestamp().equals(compactionInstantTime));
}
@@ -211,8 +208,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
int numRecs = 2000;
List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
- records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime),
- records, cfg, true, new ArrayList<>());
+ records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true,
+ new ArrayList<>());
// Schedule and mark compaction instant as inflight
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
@@ -221,8 +218,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
moveCompactionFromRequestedToInflight(compactionInstantTime, client, cfg);
// Complete ingestions
- runNextDeltaCommits(client, Arrays.asList(thirdInstantTime, fourthInstantTime),
- records, cfg, false, Arrays.asList(compactionInstantTime));
+ runNextDeltaCommits(client, Arrays.asList(thirdInstantTime, fourthInstantTime), records, cfg, false,
+ Arrays.asList(compactionInstantTime));
// execute inflight compaction
executeCompaction(compactionInstantTime, client, hoodieTable, cfg, numRecs, true);
@@ -242,8 +239,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
int numRecs = 2000;
List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
- records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime),
- records, cfg, true, new ArrayList<>());
+ records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true,
+ new ArrayList<>());
// Schedule compaction but do not run them
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
@@ -256,8 +253,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
boolean gotException = false;
try {
- runNextDeltaCommits(client, Arrays.asList(failedInstantTime),
- records, cfg, false, Arrays.asList(compactionInstantTime));
+ runNextDeltaCommits(client, Arrays.asList(failedInstantTime), records, cfg, false,
+ Arrays.asList(compactionInstantTime));
} catch (IllegalArgumentException iex) {
// Latest pending compaction instant time must be earlier than this instant time. Should fail here
gotException = true;
@@ -279,8 +276,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
int numRecs = 2000;
List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
- records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime),
- records, cfg, true, new ArrayList<>());
+ records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true,
+ new ArrayList<>());
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
createNextDeltaCommit(inflightInstantTime, records, client, metaClient, cfg, true);
@@ -315,8 +312,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
int numRecs = 2000;
List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
- records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime),
- records, cfg, true, new ArrayList<>());
+ records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true,
+ new ArrayList<>());
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
boolean gotException = false;
@@ -337,8 +334,7 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
} catch (IllegalArgumentException iex) {
gotException = true;
}
- assertTrue("Compaction Instant to be scheduled cannot have same timestamp as committed instant",
- gotException);
+ assertTrue("Compaction Instant to be scheduled cannot have same timestamp as committed instant", gotException);
compactionInstantTime = "006";
scheduleCompaction(compactionInstantTime, client, cfg);
@@ -349,8 +345,7 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
} catch (IllegalArgumentException iex) {
gotException = true;
}
- assertTrue("Compaction Instant to be scheduled cannot have same timestamp as a pending compaction",
- gotException);
+ assertTrue("Compaction Instant to be scheduled cannot have same timestamp as a pending compaction", gotException);
}
@Test
@@ -365,8 +360,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
int numRecs = 2000;
List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
- records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime),
- records, cfg, true, new ArrayList<>());
+ records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true,
+ new ArrayList<>());
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
HoodieTable hoodieTable = getHoodieTable(metaClient, cfg);
@@ -389,15 +384,15 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
int numRecs = 2000;
List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
- records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime),
- records, cfg, true, new ArrayList<>());
+ records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true,
+ new ArrayList<>());
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
HoodieTable hoodieTable = getHoodieTable(metaClient, cfg);
scheduleCompaction(compactionInstantTime, client, cfg);
- runNextDeltaCommits(client, Arrays.asList(thirdInstantTime, fourthInstantTime),
- records, cfg, false, Arrays.asList(compactionInstantTime));
+ runNextDeltaCommits(client, Arrays.asList(thirdInstantTime, fourthInstantTime), records, cfg, false,
+ Arrays.asList(compactionInstantTime));
executeCompaction(compactionInstantTime, client, hoodieTable, cfg, numRecs, true);
}
}
@@ -428,8 +423,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
}
private List<HoodieRecord> runNextDeltaCommits(HoodieWriteClient client, List<String> deltaInstants,
- List<HoodieRecord> records, HoodieWriteConfig cfg, boolean insertFirst,
- List<String> expPendingCompactionInstants) throws Exception {
+ List<HoodieRecord> records, HoodieWriteConfig cfg, boolean insertFirst, List<String> expPendingCompactionInstants)
+ throws Exception {
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
List<Pair<HoodieInstant, HoodieCompactionPlan>> pendingCompactions =
@@ -476,8 +471,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
HoodieWriteConfig cfg) throws IOException {
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
HoodieInstant compactionInstant = HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime);
- HoodieCompactionPlan workload = AvroUtils.deserializeCompactionPlan(
- metaClient.getActiveTimeline().getInstantAuxiliaryDetails(compactionInstant).get());
+ HoodieCompactionPlan workload = AvroUtils
+ .deserializeCompactionPlan(metaClient.getActiveTimeline().getInstantAuxiliaryDetails(compactionInstant).get());
metaClient.getActiveTimeline().transitionCompactionRequestedToInflight(compactionInstant);
HoodieInstant instant = metaClient.getActiveTimeline().reload().filterPendingCompactionTimeline().getInstants()
.filter(in -> in.getTimestamp().equals(compactionInstantTime)).findAny().get();
@@ -489,27 +484,23 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
client.scheduleCompactionAtInstant(compactionInstantTime, Option.empty());
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
HoodieInstant instant = metaClient.getActiveTimeline().filterPendingCompactionTimeline().lastInstant().get();
- assertEquals("Last compaction instant must be the one set",
- instant.getTimestamp(), compactionInstantTime);
+ assertEquals("Last compaction instant must be the one set", instant.getTimestamp(), compactionInstantTime);
}
- private void scheduleAndExecuteCompaction(String compactionInstantTime,
- HoodieWriteClient client, HoodieTable table, HoodieWriteConfig cfg, int expectedNumRecs,
- boolean hasDeltaCommitAfterPendingCompaction) throws IOException {
+ private void scheduleAndExecuteCompaction(String compactionInstantTime, HoodieWriteClient client, HoodieTable table,
+ HoodieWriteConfig cfg, int expectedNumRecs, boolean hasDeltaCommitAfterPendingCompaction) throws IOException {
scheduleCompaction(compactionInstantTime, client, cfg);
executeCompaction(compactionInstantTime, client, table, cfg, expectedNumRecs, hasDeltaCommitAfterPendingCompaction);
}
- private void executeCompaction(String compactionInstantTime,
- HoodieWriteClient client, HoodieTable table, HoodieWriteConfig cfg, int expectedNumRecs,
- boolean hasDeltaCommitAfterPendingCompaction) throws IOException {
+ private void executeCompaction(String compactionInstantTime, HoodieWriteClient client, HoodieTable table,
+ HoodieWriteConfig cfg, int expectedNumRecs, boolean hasDeltaCommitAfterPendingCompaction) throws IOException {
client.compact(compactionInstantTime);
List<FileSlice> fileSliceList = getCurrentLatestFileSlices(table, cfg);
assertTrue("Ensure latest file-slices are not empty", fileSliceList.stream().findAny().isPresent());
- assertFalse("Verify all file-slices have base-instant same as compaction instant",
- fileSliceList.stream().filter(fs -> !fs.getBaseInstantTime().equals(compactionInstantTime))
- .findAny().isPresent());
+ assertFalse("Verify all file-slices have base-instant same as compaction instant", fileSliceList.stream()
+ .filter(fs -> !fs.getBaseInstantTime().equals(compactionInstantTime)).findAny().isPresent());
assertFalse("Verify all file-slices have data-files",
fileSliceList.stream().filter(fs -> !fs.getDataFile().isPresent()).findAny().isPresent());
@@ -522,12 +513,11 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
}
// verify that there is a commit
- table = getHoodieTable(
- new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath(), true), cfg);
+ table = getHoodieTable(new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath(), true), cfg);
HoodieTimeline timeline = table.getMetaClient().getCommitTimeline().filterCompletedInstants();
String latestCompactionCommitTime = timeline.lastInstant().get().getTimestamp();
- assertEquals("Expect compaction instant time to be the latest commit time",
- latestCompactionCommitTime, compactionInstantTime);
+ assertEquals("Expect compaction instant time to be the latest commit time", latestCompactionCommitTime,
+ compactionInstantTime);
Assert.assertEquals("Must contain expected records", expectedNumRecs,
HoodieClientTestUtils.readSince(basePath, sqlContext, timeline, "000").count());
@@ -546,8 +536,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
client.commit(instantTime, statuses);
}
- Option<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().reload().getDeltaCommitTimeline()
- .filterCompletedInstants().lastInstant();
+ Option<HoodieInstant> deltaCommit =
+ metaClient.getActiveTimeline().reload().getDeltaCommitTimeline().filterCompletedInstants().lastInstant();
if (skipCommit && !cfg.shouldAutoCommit()) {
assertTrue("Delta commit should not be latest instant",
deltaCommit.get().getTimestamp().compareTo(instantTime) < 0);
@@ -560,8 +550,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
private List<HoodieDataFile> getCurrentLatestDataFiles(HoodieTable table, HoodieWriteConfig cfg) throws IOException {
FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(table.getMetaClient().getFs(), cfg.getBasePath());
- HoodieTableFileSystemView
- view = new HoodieTableFileSystemView(table.getMetaClient(), table.getCompletedCommitsTimeline(), allFiles);
+ HoodieTableFileSystemView view =
+ new HoodieTableFileSystemView(table.getMetaClient(), table.getCompletedCommitsTimeline(), allFiles);
List<HoodieDataFile> dataFilesToRead = view.getLatestDataFiles().collect(Collectors.toList());
return dataFilesToRead;
}
@@ -569,9 +559,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
private List<FileSlice> getCurrentLatestFileSlices(HoodieTable table, HoodieWriteConfig cfg) throws IOException {
HoodieTableFileSystemView view = new HoodieTableFileSystemView(table.getMetaClient(),
table.getMetaClient().getActiveTimeline().reload().getCommitsAndCompactionTimeline());
- List<FileSlice> fileSliceList =
- Arrays.asList(HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS).stream().flatMap(partition ->
- view.getLatestFileSlices(partition)).collect(Collectors.toList());
+ List<FileSlice> fileSliceList = Arrays.asList(HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS).stream()
+ .flatMap(partition -> view.getLatestFileSlices(partition)).collect(Collectors.toList());
return fileSliceList;
}

View File

@@ -93,16 +93,13 @@ public class TestCleaner extends TestHoodieClientBase {
* @param insertFn Insertion API for testing
* @throws Exception in case of error
*/
- private String insertFirstBigBatchForClientCleanerTest(
- HoodieWriteConfig cfg,
- HoodieWriteClient client,
+ private String insertFirstBigBatchForClientCleanerTest(HoodieWriteConfig cfg, HoodieWriteClient client,
Function2<List<HoodieRecord>, String, Integer> recordGenFunction,
Function3<JavaRDD<WriteStatus>, HoodieWriteClient, JavaRDD<HoodieRecord>, String> insertFn) throws Exception {
/**
- * do a big insert
- * (this is basically same as insert part of upsert, just adding it here so we can
- * catch breakages in insert(), if the implementation diverges.)
+ * do a big insert (this is basically same as insert part of upsert, just adding it here so we can catch breakages
+ * in insert(), if the implementation diverges.)
*/
String newCommitTime = client.startCommit();
@@ -145,8 +142,8 @@ public class TestCleaner extends TestHoodieClientBase {
*/
@Test
public void testInsertPreppedAndCleanByVersions() throws Exception {
- testInsertAndCleanByVersions(HoodieWriteClient::insertPreppedRecords,
- HoodieWriteClient::upsertPreppedRecords, true);
+ testInsertAndCleanByVersions(HoodieWriteClient::insertPreppedRecords, HoodieWriteClient::upsertPreppedRecords,
+ true);
}
/**
@@ -178,15 +175,13 @@ public class TestCleaner extends TestHoodieClientBase {
*/
private void testInsertAndCleanByVersions(
Function3<JavaRDD<WriteStatus>, HoodieWriteClient, JavaRDD<HoodieRecord>, String> insertFn,
- Function3<JavaRDD<WriteStatus>, HoodieWriteClient, JavaRDD<HoodieRecord>, String> upsertFn,
- boolean isPreppedAPI
- ) throws Exception {
+ Function3<JavaRDD<WriteStatus>, HoodieWriteClient, JavaRDD<HoodieRecord>, String> upsertFn, boolean isPreppedAPI)
+ throws Exception {
int maxVersions = 2; // keep upto 2 versions for each file
- HoodieWriteConfig cfg = getConfigBuilder().withCompactionConfig(
- HoodieCompactionConfig.newBuilder().withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS)
- .retainFileVersions(maxVersions).build())
- .withParallelism(1, 1).withBulkInsertParallelism(1)
- .withFinalizeWriteParallelism(1)
+ HoodieWriteConfig cfg = getConfigBuilder()
+ .withCompactionConfig(HoodieCompactionConfig.newBuilder()
+ .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(maxVersions).build())
+ .withParallelism(1, 1).withBulkInsertParallelism(1).withFinalizeWriteParallelism(1)
.withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build())
.build();
try (HoodieWriteClient client = getHoodieWriteClient(cfg);) {
@@ -204,8 +199,7 @@ public class TestCleaner extends TestHoodieClientBase {
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig(), jsc);
for (String partitionPath : dataGen.getPartitionPaths()) {
TableFileSystemView fsView = table.getFileSystemView();
- Option<Boolean> added = Option.fromJavaOptional(fsView.getAllFileGroups(partitionPath).findFirst()
- .map(fg -> {
+ Option<Boolean> added = Option.fromJavaOptional(fsView.getAllFileGroups(partitionPath).findFirst().map(fg -> {
fg.getLatestFileSlice().map(fs -> compactionFileIdToLatestFileSlice.put(fg.getFileGroupId(), fs));
return true;
}));
@@ -234,8 +228,7 @@ public class TestCleaner extends TestHoodieClientBase {
client.startCommitWithTime(newInstantTime);
List<HoodieRecord> records = recordUpsertGenWrappedFunction.apply(newInstantTime, 100);
- List<WriteStatus> statuses =
- upsertFn.apply(client, jsc.parallelize(records, 1), newInstantTime).collect();
+ List<WriteStatus> statuses = upsertFn.apply(client, jsc.parallelize(records, 1), newInstantTime).collect();
// Verify there are no errors
assertNoWriteErrors(statuses);
@@ -249,8 +242,8 @@ public class TestCleaner extends TestHoodieClientBase {
// compute all the versions of all files, from time 0
HashMap<String, TreeSet<String>> fileIdToVersions = new HashMap<>();
for (HoodieInstant entry : timeline.getInstants().collect(Collectors.toList())) {
- HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
- .fromBytes(timeline.getInstantDetails(entry).get(), HoodieCommitMetadata.class);
+ HoodieCommitMetadata commitMetadata =
+ HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(entry).get(), HoodieCommitMetadata.class);
for (HoodieWriteStat wstat : commitMetadata.getWriteStats(partitionPath)) {
if (!fileIdToVersions.containsKey(wstat.getFileId())) {
@@ -267,8 +260,8 @@ public class TestCleaner extends TestHoodieClientBase {
// Ensure latest file-slice selected for compaction is retained
Option<HoodieDataFile> dataFileForCompactionPresent =
Option.fromJavaOptional(fileGroup.getAllDataFiles().filter(df -> {
- return compactionFileIdToLatestFileSlice.get(fileGroup.getFileGroupId())
- .getBaseInstantTime().equals(df.getCommitTime());
+ return compactionFileIdToLatestFileSlice.get(fileGroup.getFileGroupId()).getBaseInstantTime()
+ .equals(df.getCommitTime());
}).findAny());
Assert.assertTrue("Data File selected for compaction is retained",
dataFileForCompactionPresent.isPresent());
@@ -310,8 +303,7 @@ public class TestCleaner extends TestHoodieClientBase {
*/
@Test
public void testInsertPreppedAndCleanByCommits() throws Exception {
- testInsertAndCleanByCommits(HoodieWriteClient::insertPreppedRecords,
- HoodieWriteClient::upsertPreppedRecords, true);
+ testInsertAndCleanByCommits(HoodieWriteClient::insertPreppedRecords, HoodieWriteClient::upsertPreppedRecords, true);
}
/**
@@ -343,15 +335,13 @@ public class TestCleaner extends TestHoodieClientBase {
*/
private void testInsertAndCleanByCommits(
Function3<JavaRDD<WriteStatus>, HoodieWriteClient, JavaRDD<HoodieRecord>, String> insertFn,
- Function3<JavaRDD<WriteStatus>, HoodieWriteClient, JavaRDD<HoodieRecord>, String> upsertFn,
- boolean isPreppedAPI
- ) throws Exception {
+ Function3<JavaRDD<WriteStatus>, HoodieWriteClient, JavaRDD<HoodieRecord>, String> upsertFn, boolean isPreppedAPI)
+ throws Exception {
int maxCommits = 3; // keep upto 3 commits from the past
- HoodieWriteConfig cfg = getConfigBuilder().withCompactionConfig(
- HoodieCompactionConfig.newBuilder()
+ HoodieWriteConfig cfg = getConfigBuilder()
+ .withCompactionConfig(HoodieCompactionConfig.newBuilder()
.withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainCommits(maxCommits).build())
- .withParallelism(1, 1).withBulkInsertParallelism(1)
- .withFinalizeWriteParallelism(1)
+ .withParallelism(1, 1).withBulkInsertParallelism(1).withFinalizeWriteParallelism(1)
.withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build())
.build();
HoodieWriteClient client = getHoodieWriteClient(cfg);
@@ -370,8 +360,7 @@ public class TestCleaner extends TestHoodieClientBase {
client.startCommitWithTime(newCommitTime);
List<HoodieRecord> records = recordUpsertGenWrappedFunction.apply(newCommitTime, 100);
- List<WriteStatus> statuses =
- upsertFn.apply(client, jsc.parallelize(records, 1), newCommitTime).collect();
+ List<WriteStatus> statuses = upsertFn.apply(client, jsc.parallelize(records, 1), newCommitTime).collect();
// Verify there are no errors
assertNoWriteErrors(statuses);
@@ -381,9 +370,9 @@ public class TestCleaner extends TestHoodieClientBase {
Option<HoodieInstant> earliestRetainedCommit = activeTimeline.nthFromLastInstant(maxCommits - 1);
Set<HoodieInstant> acceptableCommits = activeTimeline.getInstants().collect(Collectors.toSet());
if (earliestRetainedCommit.isPresent()) {
- acceptableCommits.removeAll(
- activeTimeline.findInstantsInRange("000", earliestRetainedCommit.get().getTimestamp()).getInstants()
- .collect(Collectors.toSet()));
+ acceptableCommits
+ .removeAll(activeTimeline.findInstantsInRange("000", earliestRetainedCommit.get().getTimestamp())
+ .getInstants().collect(Collectors.toSet()));
acceptableCommits.add(earliestRetainedCommit.get());
}
@@ -412,18 +401,19 @@ public class TestCleaner extends TestHoodieClientBase {
*/
@Test
public void testKeepLatestFileVersions() throws IOException {
- HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true)
- .withCompactionConfig(HoodieCompactionConfig.newBuilder().withCleanerPolicy(
- HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(1).build())
+ HoodieWriteConfig config =
+ HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true)
+ .withCompactionConfig(HoodieCompactionConfig.newBuilder()
+ .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(1).build())
.build();
// make 1 commit, with 1 file per partition
HoodieTestUtils.createCommitFiles(basePath, "000");
- String file1P0C0 = HoodieTestUtils
- .createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000");
- String file1P1C0 = HoodieTestUtils
- .createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000");
+ String file1P0C0 =
+ HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000");
+ String file1P1C0 =
+ HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000");
metaClient = HoodieTableMetaClient.reload(metaClient);
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config, jsc);
@@ -434,24 +424,22 @@ public class TestCleaner extends TestHoodieClientBase {
assertEquals("Must not clean any files", 0, assertEquals("Must not clean any files", 0,
getCleanStat(hoodieCleanStatsOne, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles() getCleanStat(hoodieCleanStatsOne, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles()
.size()); .size());
assertTrue(HoodieTestUtils assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000",
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0C0)); file1P0C0));
assertTrue(HoodieTestUtils assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000",
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000", file1P1C0)); file1P1C0));
// make next commit, with 1 insert & 1 update per partition // make next commit, with 1 insert & 1 update per partition
HoodieTestUtils.createCommitFiles(basePath, "001"); HoodieTestUtils.createCommitFiles(basePath, "001");
metaClient = HoodieTableMetaClient.reload(metaClient); metaClient = HoodieTableMetaClient.reload(metaClient);
table = HoodieTable.getHoodieTable(metaClient, config, jsc); table = HoodieTable.getHoodieTable(metaClient, config, jsc);
String file2P0C1 = HoodieTestUtils String file2P0C1 =
.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001"); // insert HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001"); // insert
String file2P1C1 = HoodieTestUtils String file2P1C1 =
.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001"); // insert HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001"); // insert
HoodieTestUtils HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0C0); // update
.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0C0); // update HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001", file1P1C0); // update
HoodieTestUtils
.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001", file1P1C0); // update
List<HoodieCleanStat> hoodieCleanStatsTwo = table.clean(jsc); List<HoodieCleanStat> hoodieCleanStatsTwo = table.clean(jsc);
assertEquals("Must clean 1 file", 1, assertEquals("Must clean 1 file", 1,
@@ -460,47 +448,44 @@ public class TestCleaner extends TestHoodieClientBase {
assertEquals("Must clean 1 file", 1,
getCleanStat(hoodieCleanStatsTwo, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles()
.size());
- assertTrue(HoodieTestUtils
- .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file2P0C1));
- assertTrue(HoodieTestUtils
- .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001", file2P1C1));
- assertFalse(HoodieTestUtils
- .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0C0));
- assertFalse(HoodieTestUtils
- .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000", file1P1C0));
+ assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001",
+ file2P0C1));
+ assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001",
+ file2P1C1));
+ assertFalse(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000",
+ file1P0C0));
+ assertFalse(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH,
+ "000", file1P1C0));
// make next commit, with 2 updates to existing files, and 1 insert
HoodieTestUtils.createCommitFiles(basePath, "002");
metaClient = HoodieTableMetaClient.reload(metaClient);
table = HoodieTable.getHoodieTable(metaClient, config, jsc);
- HoodieTestUtils
- .createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file1P0C0); // update
- HoodieTestUtils
- .createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file2P0C1); // update
- String file3P0C2 = HoodieTestUtils
- .createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002");
+ HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file1P0C0); // update
+ HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file2P0C1); // update
+ String file3P0C2 =
+ HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002");
List<HoodieCleanStat> hoodieCleanStatsThree = table.clean(jsc);
assertEquals("Must clean two files", 2,
getCleanStat(hoodieCleanStatsThree, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH)
.getSuccessDeleteFiles().size());
- assertFalse(HoodieTestUtils
- .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0C0));
- assertFalse(HoodieTestUtils
- .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file2P0C1));
- assertTrue(HoodieTestUtils
- .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file3P0C2));
+ assertFalse(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001",
+ file1P0C0));
+ assertFalse(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001",
+ file2P0C1));
+ assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002",
+ file3P0C2));
// No cleaning on partially written file, with no commit.
- HoodieTestUtils
- .createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003", file3P0C2); // update
+ HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003", file3P0C2); // update
List<HoodieCleanStat> hoodieCleanStatsFour = table.clean(jsc);
assertEquals("Must not clean any files", 0,
getCleanStat(hoodieCleanStatsFour, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH).getSuccessDeleteFiles()
.size());
- assertTrue(HoodieTestUtils
- .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file3P0C2));
+ assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002",
+ file3P0C2));
}
/**
@@ -509,37 +494,33 @@ public class TestCleaner extends TestHoodieClientBase {
@Test
public void testKeepLatestFileVersionsMOR() throws IOException {
- HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true)
- .withCompactionConfig(HoodieCompactionConfig.newBuilder().withCleanerPolicy(
- HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(1).build())
+ HoodieWriteConfig config =
+ HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true)
+ .withCompactionConfig(HoodieCompactionConfig.newBuilder()
+ .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(1).build())
.build();
- HoodieTableMetaClient metaClient = HoodieTestUtils.init(jsc.hadoopConfiguration(), basePath,
- HoodieTableType.MERGE_ON_READ);
+ HoodieTableMetaClient metaClient =
+ HoodieTestUtils.init(jsc.hadoopConfiguration(), basePath, HoodieTableType.MERGE_ON_READ);
// Make 3 files, one base file and 2 log files associated with base file
- String file1P0 = HoodieTestUtils
- .createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000");
- String file2P0L0 = HoodieTestUtils
- .createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0,
- Option.empty());
- String file2P0L1 = HoodieTestUtils
- .createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0,
- Option.of(2));
+ String file1P0 =
+ HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000");
+ String file2P0L0 = HoodieTestUtils.createNewLogFile(fs, basePath,
+ HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0, Option.empty());
+ String file2P0L1 = HoodieTestUtils.createNewLogFile(fs, basePath,
+ HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0, Option.of(2));
// make 1 compaction commit
HoodieTestUtils.createCompactionCommitFiles(fs, basePath, "000");
// Make 4 files, one base file and 3 log files associated with base file
HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0);
- file2P0L0 = HoodieTestUtils
- .createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0,
- Option.empty());
- file2P0L0 = HoodieTestUtils
- .createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0,
- Option.of(2));
- file2P0L0 = HoodieTestUtils
- .createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0,
- Option.of(3));
+ file2P0L0 = HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH,
+ "001", file1P0, Option.empty());
+ file2P0L0 = HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH,
+ "001", file1P0, Option.of(2));
+ file2P0L0 = HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH,
+ "001", file1P0, Option.of(3));
// make 1 compaction commit
HoodieTestUtils.createCompactionCommitFiles(fs, basePath, "001");
@@ -548,16 +529,12 @@ public class TestCleaner extends TestHoodieClientBase {
assertEquals("Must clean three files, one parquet and 2 log files", 3,
getCleanStat(hoodieCleanStats, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH).getSuccessDeleteFiles()
.size());
- assertFalse(HoodieTestUtils
- .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0));
- assertFalse(
- HoodieTestUtils
- .doesLogFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file2P0L0,
- Option.empty()));
- assertFalse(
- HoodieTestUtils
- .doesLogFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file2P0L0,
- Option.of(2)));
+ assertFalse(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000",
+ file1P0));
+ assertFalse(HoodieTestUtils.doesLogFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000",
+ file2P0L0, Option.empty()));
+ assertFalse(HoodieTestUtils.doesLogFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000",
+ file2P0L0, Option.of(2)));
}
/**
@@ -566,16 +543,17 @@ public class TestCleaner extends TestHoodieClientBase {
@Test
public void testKeepLatestCommits() throws IOException {
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true)
- .withCompactionConfig(HoodieCompactionConfig.newBuilder().withCleanerPolicy(
- HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build()).build();
+ .withCompactionConfig(HoodieCompactionConfig.newBuilder()
+ .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build())
+ .build();
// make 1 commit, with 1 file per partition
HoodieTestUtils.createCommitFiles(basePath, "000");
- String file1P0C0 = HoodieTestUtils
- .createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000");
- String file1P1C0 = HoodieTestUtils
- .createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000");
+ String file1P0C0 =
+ HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000");
+ String file1P1C0 =
+ HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000");
metaClient = HoodieTableMetaClient.reload(metaClient);
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config, jsc);
@@ -587,24 +565,22 @@ public class TestCleaner extends TestHoodieClientBase {
assertEquals("Must not clean any files", 0,
getCleanStat(hoodieCleanStatsOne, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles()
.size());
- assertTrue(HoodieTestUtils
- .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0C0));
- assertTrue(HoodieTestUtils
- .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000", file1P1C0));
+ assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000",
+ file1P0C0));
+ assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000",
+ file1P1C0));
// make next commit, with 1 insert & 1 update per partition
HoodieTestUtils.createCommitFiles(basePath, "001");
metaClient = HoodieTableMetaClient.reload(metaClient);
table = HoodieTable.getHoodieTable(metaClient, config, jsc);
- String file2P0C1 = HoodieTestUtils
- .createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001"); // insert
- String file2P1C1 = HoodieTestUtils
- .createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001"); // insert
- HoodieTestUtils
- .createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0C0); // update
- HoodieTestUtils
- .createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001", file1P1C0); // update
+ String file2P0C1 =
+ HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001"); // insert
+ String file2P1C1 =
+ HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001"); // insert
+ HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0C0); // update
+ HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001", file1P1C0); // update
List<HoodieCleanStat> hoodieCleanStatsTwo = table.clean(jsc);
assertEquals("Must not clean any files", 0,
@@ -613,78 +589,73 @@ public class TestCleaner extends TestHoodieClientBase {
assertEquals("Must not clean any files", 0,
getCleanStat(hoodieCleanStatsTwo, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles()
.size());
- assertTrue(HoodieTestUtils
- .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file2P0C1));
- assertTrue(HoodieTestUtils
- .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001", file2P1C1));
- assertTrue(HoodieTestUtils
- .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0C0));
- assertTrue(HoodieTestUtils
- .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000", file1P1C0));
+ assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001",
+ file2P0C1));
+ assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001",
+ file2P1C1));
+ assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000",
+ file1P0C0));
+ assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000",
+ file1P1C0));
// make next commit, with 2 updates to existing files, and 1 insert
HoodieTestUtils.createCommitFiles(basePath, "002");
metaClient = HoodieTableMetaClient.reload(metaClient);
table = HoodieTable.getHoodieTable(metaClient, config, jsc);
- HoodieTestUtils
- .createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file1P0C0); // update
- HoodieTestUtils
- .createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file2P0C1); // update
- String file3P0C2 = HoodieTestUtils
- .createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002");
+ HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file1P0C0); // update
+ HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file2P0C1); // update
+ String file3P0C2 =
+ HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002");
List<HoodieCleanStat> hoodieCleanStatsThree = table.clean(jsc);
assertEquals("Must not clean any file. We have to keep 1 version before the latest commit time to keep", 0,
getCleanStat(hoodieCleanStatsThree, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH)
.getSuccessDeleteFiles().size());
- assertTrue(HoodieTestUtils
- .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0C0));
+ assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000",
+ file1P0C0));
// make next commit, with 2 updates to existing files, and 1 insert
HoodieTestUtils.createCommitFiles(basePath, "003");
metaClient = HoodieTableMetaClient.reload(metaClient);
table = HoodieTable.getHoodieTable(metaClient, config, jsc);
- HoodieTestUtils
- .createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003", file1P0C0); // update
- HoodieTestUtils
- .createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003", file2P0C1); // update
- String file4P0C3 = HoodieTestUtils
- .createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003");
+ HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003", file1P0C0); // update
+ HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003", file2P0C1); // update
+ String file4P0C3 =
+ HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003");
List<HoodieCleanStat> hoodieCleanStatsFour = table.clean(jsc);
assertEquals("Must not clean one old file", 1,
getCleanStat(hoodieCleanStatsFour, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH).getSuccessDeleteFiles()
.size());
- assertFalse(HoodieTestUtils
- .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0C0));
- assertTrue(HoodieTestUtils
- .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0C0));
- assertTrue(HoodieTestUtils
- .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file1P0C0));
- assertTrue(HoodieTestUtils
- .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file2P0C1));
- assertTrue(HoodieTestUtils
- .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file2P0C1));
- assertTrue(HoodieTestUtils
- .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file3P0C2));
- assertTrue(HoodieTestUtils
- .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003", file4P0C3));
+ assertFalse(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000",
+ file1P0C0));
+ assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001",
+ file1P0C0));
+ assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002",
+ file1P0C0));
+ assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001",
+ file2P0C1));
+ assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002",
+ file2P0C1));
+ assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002",
+ file3P0C2));
+ assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003",
+ file4P0C3));
// No cleaning on partially written file, with no commit.
- HoodieTestUtils
- .createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "004", file3P0C2); // update
+ HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "004", file3P0C2); // update
List<HoodieCleanStat> hoodieCleanStatsFive = table.clean(jsc);
assertEquals("Must not clean any files", 0,
getCleanStat(hoodieCleanStatsFive, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH).getSuccessDeleteFiles()
.size());
- assertTrue(HoodieTestUtils
- .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0C0));
- assertTrue(HoodieTestUtils
- .doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file2P0C1));
+ assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001",
+ file1P0C0));
+ assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001",
+ file2P0C1));
}
/**
@@ -711,8 +682,9 @@ public class TestCleaner extends TestHoodieClientBase {
@Test
public void testCleaningWithZeroPartitonPaths() throws IOException {
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true)
- .withCompactionConfig(HoodieCompactionConfig.newBuilder().withCleanerPolicy(
- HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build()).build();
+ .withCompactionConfig(HoodieCompactionConfig.newBuilder()
+ .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build())
+ .build();
// Make a commit, although there are no partitionPaths.
// Example use-case of this is when a client wants to create a table
@@ -732,8 +704,9 @@ public class TestCleaner extends TestHoodieClientBase {
@Test
public void testCleaningSkewedPartitons() throws IOException {
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true)
- .withCompactionConfig(HoodieCompactionConfig.newBuilder().withCleanerPolicy(
- HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build()).build();
+ .withCompactionConfig(HoodieCompactionConfig.newBuilder()
+ .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build())
+ .build();
Map<Long, Long> stageOneShuffleReadTaskRecordsCountMap = new HashMap<>();
// Since clean involves repartition in order to uniformly distribute data,
@@ -783,22 +756,20 @@ public class TestCleaner extends TestHoodieClientBase {
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config, jsc);
List<HoodieCleanStat> hoodieCleanStats = table.clean(jsc);
- assertEquals(100,
- getCleanStat(hoodieCleanStats, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH).getSuccessDeleteFiles()
- .size());
- assertEquals(10,
- getCleanStat(hoodieCleanStats, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles()
- .size());
- assertEquals(10,
- getCleanStat(hoodieCleanStats, HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH).getSuccessDeleteFiles()
- .size());
+ assertEquals(100, getCleanStat(hoodieCleanStats, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH)
+ .getSuccessDeleteFiles().size());
+ assertEquals(10, getCleanStat(hoodieCleanStats, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH)
+ .getSuccessDeleteFiles().size());
+ assertEquals(10, getCleanStat(hoodieCleanStats, HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH)
+ .getSuccessDeleteFiles().size());
// 3 tasks are expected since the number of partitions is 3
assertEquals(3, stageOneShuffleReadTaskRecordsCountMap.keySet().size());
// Sum of all records processed = total number of files to clean
assertEquals(120,
stageOneShuffleReadTaskRecordsCountMap.values().stream().reduce((a, b) -> a + b).get().intValue());
- assertTrue("The skew in handling files to clean is not removed. "
+ assertTrue(
+ "The skew in handling files to clean is not removed. "
+ "Each task should handle more records than the partitionPath with least files "
+ "and less records than the partitionPath with most files.",
stageOneShuffleReadTaskRecordsCountMap.values().stream().filter(a -> a > 10 && a < 100).count() == 3);
@@ -811,8 +782,9 @@ public class TestCleaner extends TestHoodieClientBase {
@Test
public void testKeepLatestCommitsWithPendingCompactions() throws IOException {
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true)
- .withCompactionConfig(HoodieCompactionConfig.newBuilder().withCleanerPolicy(
- HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build()).build();
+ .withCompactionConfig(HoodieCompactionConfig.newBuilder()
+ .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build())
+ .build();
// Deletions:
// . FileId Parquet Logs Total Retained Commits
// FileId7 5 10 15 009, 011
@@ -830,9 +802,11 @@ public class TestCleaner extends TestHoodieClientBase {
*/
@Test
public void testKeepLatestVersionsWithPendingCompactions() throws IOException {
- HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true)
- .withCompactionConfig(HoodieCompactionConfig.newBuilder().withCleanerPolicy(
- HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(2).build()).build();
+ HoodieWriteConfig config =
+ HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true)
+ .withCompactionConfig(HoodieCompactionConfig.newBuilder()
+ .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(2).build())
+ .build();
// Deletions:
// . FileId Parquet Logs Total Retained Commits
// FileId7 5 10 15 009, 011
@@ -853,8 +827,8 @@ public class TestCleaner extends TestHoodieClientBase {
*/
public void testPendingCompactions(HoodieWriteConfig config, int expNumFilesDeleted,
int expNumFilesUnderCompactionDeleted) throws IOException {
- HoodieTableMetaClient metaClient = HoodieTestUtils.init(jsc.hadoopConfiguration(), basePath,
- HoodieTableType.MERGE_ON_READ);
+ HoodieTableMetaClient metaClient =
+ HoodieTestUtils.init(jsc.hadoopConfiguration(), basePath, HoodieTableType.MERGE_ON_READ);
String[] instants = new String[] {"000", "001", "003", "005", "007", "009", "011", "013"};
String[] compactionInstants = new String[] {"002", "004", "006", "008", "010"};
Map<String, String> expFileIdToPendingCompaction = new HashMap<>();
@@ -870,13 +844,11 @@ public class TestCleaner extends TestHoodieClientBase {
// compactions
// FileIds 2-5 will be under compaction
int maxNumFileIds = 7;
- String[] fileIds = new String[]
- {"fileId1", "fileId2", "fileId3", "fileId4", "fileId5", "fileId6", "fileId7"};
+ String[] fileIds = new String[] {"fileId1", "fileId2", "fileId3", "fileId4", "fileId5", "fileId6", "fileId7"};
int maxNumFileIdsForCompaction = 4;
for (int i = 0; i < maxNumFileIds; i++) {
- final String fileId = HoodieTestUtils
- .createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, instants[0],
- fileIds[i]);
+ final String fileId = HoodieTestUtils.createDataFile(basePath,
+ HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, instants[0], fileIds[i]);
HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, instants[0],
fileId, Option.empty());
HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, instants[0],
@@ -887,8 +859,8 @@ public class TestCleaner extends TestHoodieClientBase {
expFileIdToPendingCompaction.put(fileId, compactionInstants[j]);
metaClient = HoodieTableMetaClient.reload(metaClient);
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config, jsc);
- FileSlice slice = table.getRTFileSystemView().getLatestFileSlices(
- HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH)
+ FileSlice slice =
+ table.getRTFileSystemView().getLatestFileSlices(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH)
.filter(fs -> fs.getFileId().equals(fileId)).findFirst().get();
List<FileSlice> slices = new ArrayList<>();
if (compactionInstantsToFileSlices.containsKey(compactionInstants[j])) {
@@ -898,20 +870,16 @@ public class TestCleaner extends TestHoodieClientBase {
compactionInstantsToFileSlices.put(compactionInstants[j], slices);
// Add log-files to simulate delta-commits after pending compaction
HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH,
- compactionInstants[j],
- fileId, Option.empty());
+ compactionInstants[j], fileId, Option.empty());
HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH,
- compactionInstants[j],
- fileId, Option.of(2));
+ compactionInstants[j], fileId, Option.of(2));
} else {
- HoodieTestUtils
- .createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, instants[j], fileId);
- HoodieTestUtils
- .createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, instants[j], fileId,
- Option.empty());
- HoodieTestUtils
- .createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, instants[j], fileId,
- Option.of(2));
+ HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, instants[j],
+ fileId);
+ HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH,
+ instants[j], fileId, Option.empty());
+ HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH,
+ instants[j], fileId, Option.of(2));
fileIdToLatestInstantBeforeCompaction.put(fileId, instants[j]);
}
}
@@ -921,9 +889,8 @@ public class TestCleaner extends TestHoodieClientBase {
for (String instant : compactionInstants) {
List<FileSlice> fileSliceList = compactionInstantsToFileSlices.get(instant);
if (null != fileSliceList) {
- HoodieTestUtils.createCompactionRequest(metaClient, instant,
- fileSliceList.stream().map(fs -> Pair.of(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, fs))
- .collect(Collectors.toList()));
+ HoodieTestUtils.createCompactionRequest(metaClient, instant, fileSliceList.stream()
+ .map(fs -> Pair.of(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, fs)).collect(Collectors.toList()));
}
}
@@ -939,22 +906,19 @@ public class TestCleaner extends TestHoodieClientBase {
expFileIdToPendingCompaction.entrySet().stream().forEach(entry -> {
String fileId = entry.getKey();
String baseInstantForCompaction = fileIdToLatestInstantBeforeCompaction.get(fileId);
- Option<FileSlice> fileSliceForCompaction =
- Option.fromJavaOptional(
- hoodieTable.getRTFileSystemView().getLatestFileSlicesBeforeOrOn(
- HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH,
- baseInstantForCompaction, true).filter(fs -> fs.getFileId().equals(fileId)).findFirst());
+ Option<FileSlice> fileSliceForCompaction = Option.fromJavaOptional(hoodieTable.getRTFileSystemView()
+ .getLatestFileSlicesBeforeOrOn(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, baseInstantForCompaction,
+ true)
+ .filter(fs -> fs.getFileId().equals(fileId)).findFirst());
Assert.assertTrue("Base Instant for Compaction must be preserved", fileSliceForCompaction.isPresent());
Assert.assertTrue("FileSlice has data-file", fileSliceForCompaction.get().getDataFile().isPresent());
- Assert.assertEquals("FileSlice has log-files", 2,
- fileSliceForCompaction.get().getLogFiles().count());
+ Assert.assertEquals("FileSlice has log-files", 2, fileSliceForCompaction.get().getLogFiles().count());
});
// Test for progress (Did we clean some files ?)
- long numFilesUnderCompactionDeleted =
- hoodieCleanStats.stream().flatMap(cleanStat -> {
- return convertPathToFileIdWithCommitTime(newMetaClient, cleanStat.getDeletePathPatterns()).map(
- fileIdWithCommitTime -> {
+ long numFilesUnderCompactionDeleted = hoodieCleanStats.stream().flatMap(cleanStat -> {
+ return convertPathToFileIdWithCommitTime(newMetaClient, cleanStat.getDeletePathPatterns())
+ .map(fileIdWithCommitTime -> {
if (expFileIdToPendingCompaction.containsKey(fileIdWithCommitTime.getKey())) {
Assert.assertTrue("Deleted instant time must be less than pending compaction",
HoodieTimeline.compareTimestamps(
@@ -965,12 +929,12 @@ public class TestCleaner extends TestHoodieClientBase {
return false;
});
}).filter(x -> x).count();
- long numDeleted = hoodieCleanStats.stream()
- .flatMap(cleanStat -> cleanStat.getDeletePathPatterns().stream()).count();
+ long numDeleted =
+ hoodieCleanStats.stream().flatMap(cleanStat -> cleanStat.getDeletePathPatterns().stream()).count();
// Tighter check for regression
Assert.assertEquals("Correct number of files deleted", expNumFilesDeleted, numDeleted);
- Assert.assertEquals("Correct number of files under compaction deleted",
- expNumFilesUnderCompactionDeleted, numFilesUnderCompactionDeleted);
+ Assert.assertEquals("Correct number of files under compaction deleted", expNumFilesUnderCompactionDeleted,
+ numFilesUnderCompactionDeleted);
}
/**
@@ -991,6 +955,7 @@ public class TestCleaner extends TestHoodieClientBase {
/***
* Helper method to return temporary files count
+ *
* @return Number of temporary files found
* @throws IOException in case of error
*/
@@ -1004,19 +969,17 @@ public class TestCleaner extends TestHoodieClientBase {
return count;
}
- private Stream<Pair<String, String>> convertPathToFileIdWithCommitTime(
- final HoodieTableMetaClient metaClient, List<String> paths) {
- Predicate<String> roFilePredicate = path ->
- path.contains(metaClient.getTableConfig().getROFileFormat().getFileExtension());
- Predicate<String> rtFilePredicate = path ->
- path.contains(metaClient.getTableConfig().getRTFileFormat().getFileExtension());
- Stream<Pair<String, String>> stream1 = paths.stream().filter(roFilePredicate)
- .map(fullPath -> {
+ private Stream<Pair<String, String>> convertPathToFileIdWithCommitTime(final HoodieTableMetaClient metaClient,
+ List<String> paths) {
+ Predicate<String> roFilePredicate =
+ path -> path.contains(metaClient.getTableConfig().getROFileFormat().getFileExtension());
+ Predicate<String> rtFilePredicate =
+ path -> path.contains(metaClient.getTableConfig().getRTFileFormat().getFileExtension());
+ Stream<Pair<String, String>> stream1 = paths.stream().filter(roFilePredicate).map(fullPath -> {
String fileName = Paths.get(fullPath).getFileName().toString();
return Pair.of(FSUtils.getFileId(fileName), FSUtils.getCommitTime(fileName));
});
- Stream<Pair<String, String>> stream2 = paths.stream().filter(rtFilePredicate)
- .map(path -> {
+ Stream<Pair<String, String>> stream2 = paths.stream().filter(rtFilePredicate).map(path -> {
return Pair.of(FSUtils.getFileIdFromLogPath(new Path(path)),
FSUtils.getBaseCommitTimeFromLogPath(new Path(path)));
});

Some files were not shown because too many files have changed in this diff