
[HUDI-296] Explore use of spotless to auto fix formatting errors (#945)

- Add Spotless format checking to the project
- One-time reformatting of the codebase for conformity
- The build now fails on formatting violations; mvn spotless:apply auto-fixes them (see the configuration sketch below)
Authored by leesf on 2019-10-10 20:19:40 +08:00; committed by vinoth chandar
parent 834c591955
commit b19bed442d
381 changed files with 7350 additions and 9064 deletions
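The build-side wiring these notes describe would look roughly like the following sketch in the parent pom.xml. The plugin version, the formatter file name, and the style/ directory are illustrative assumptions, not coordinates taken from this commit; what the commit itself visibly adds is the main.basedir property in each module's diff below, which is the usual way to let child modules resolve a shared formatter config sitting at the repository root.

<!-- Sketch only: version and formatter path are assumed for illustration -->
<plugin>
  <groupId>com.diffplug.spotless</groupId>
  <artifactId>spotless-maven-plugin</artifactId>
  <version>1.24.3</version>
  <configuration>
    <java>
      <eclipse>
        <!-- resolved relative to the repo root via the main.basedir property -->
        <file>${main.basedir}/style/formatter.xml</file>
      </eclipse>
    </java>
  </configuration>
  <executions>
    <execution>
      <!-- the check goal runs during the build and fails it on violations -->
      <goals>
        <goal>check</goal>
      </goals>
    </execution>
  </executions>
</plugin>

With a binding like this, the build fails on badly formatted sources, while running mvn spotless:apply rewrites them in place; applying it once across the codebase is what produced the 381-file reformatting diff that follows.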


@@ -30,6 +30,7 @@
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <checkstyle.skip>true</checkstyle.skip>
+    <main.basedir>${project.parent.parent.basedir}</main.basedir>
   </properties>


@@ -30,6 +30,7 @@
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <checkstyle.skip>true</checkstyle.skip>
+    <main.basedir>${project.parent.parent.basedir}</main.basedir>
   </properties>
   <dependencyManagement>


@@ -30,6 +30,7 @@
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <checkstyle.skip>true</checkstyle.skip>
+    <main.basedir>${project.parent.parent.basedir}</main.basedir>
   </properties>
   <dependencyManagement>


@@ -30,6 +30,7 @@
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <checkstyle.skip>true</checkstyle.skip>
+    <main.basedir>${project.parent.parent.basedir}</main.basedir>
   </properties>
   <dependencyManagement>


@@ -30,6 +30,7 @@
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <checkstyle.skip>true</checkstyle.skip>
+    <main.basedir>${project.parent.parent.basedir}</main.basedir>
   </properties>
   <dependencyManagement>


@@ -56,6 +56,7 @@
     <docker.presto.version>0.217</docker.presto.version>
     <dockerfile.maven.version>1.4.3</dockerfile.maven.version>
     <checkstyle.skip>true</checkstyle.skip>
+    <main.basedir>${project.parent.basedir}</main.basedir>
   </properties>
   <build>


@@ -32,6 +32,7 @@
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <checkstyle.skip>true</checkstyle.skip>
+    <main.basedir>${project.parent.parent.basedir}</main.basedir>
   </properties>
   <dependencyManagement>


@@ -30,6 +30,7 @@
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <checkstyle.skip>true</checkstyle.skip>
+    <main.basedir>${project.parent.parent.basedir}</main.basedir>
   </properties>
   <dependencyManagement>


@@ -30,6 +30,7 @@
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <checkstyle.skip>true</checkstyle.skip>
+    <main.basedir>${project.parent.parent.basedir}</main.basedir>
   </properties>
   <dependencyManagement>


@@ -30,6 +30,7 @@
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <checkstyle.skip>true</checkstyle.skip>
+    <main.basedir>${project.parent.parent.basedir}</main.basedir>
   </properties>
   <dependencyManagement>


@@ -30,6 +30,7 @@
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <checkstyle.skip>true</checkstyle.skip>
+    <main.basedir>${project.parent.parent.basedir}</main.basedir>
   </properties>
   <dependencyManagement>


@@ -29,6 +29,7 @@
   <properties>
     <spring.shell.version>1.2.0.RELEASE</spring.shell.version>
     <jar.mainclass>org.springframework.shell.Bootstrap</jar.mainclass>
+    <main.basedir>${project.parent.basedir}</main.basedir>
   </properties>
   <repositories>


@@ -52,19 +52,16 @@ public class HoodiePrintHelper {
    * @param rows List of rows
    * @return Serialized form for printing
    */
-  public static String print(TableHeader rowHeader,
-      Map<String, Function<Object, String>> fieldNameToConverterMap,
-      String sortByField, boolean isDescending, Integer limit, boolean headerOnly,
-      List<Comparable[]> rows) {
+  public static String print(TableHeader rowHeader, Map<String, Function<Object, String>> fieldNameToConverterMap,
+      String sortByField, boolean isDescending, Integer limit, boolean headerOnly, List<Comparable[]> rows) {
     if (headerOnly) {
       return HoodiePrintHelper.print(rowHeader);
     }
-    Table table = new Table(rowHeader, fieldNameToConverterMap,
-        Option.ofNullable(sortByField.isEmpty() ? null : sortByField),
-        Option.ofNullable(isDescending),
-        Option.ofNullable(limit <= 0 ? null : limit)).addAllRows(rows).flip();
+    Table table =
+        new Table(rowHeader, fieldNameToConverterMap, Option.ofNullable(sortByField.isEmpty() ? null : sortByField),
+            Option.ofNullable(isDescending), Option.ofNullable(limit <= 0 ? null : limit)).addAllRows(rows).flip();
     return HoodiePrintHelper.print(table);
   }
@@ -79,9 +76,8 @@ public class HoodiePrintHelper {
     String[] header = new String[buffer.getFieldNames().size()];
     buffer.getFieldNames().toArray(header);
-    String[][] rows = buffer.getRenderRows().stream()
-        .map(l -> l.stream().toArray(String[]::new))
-        .toArray(String[][]::new);
+    String[][] rows =
+        buffer.getRenderRows().stream().map(l -> l.stream().toArray(String[]::new)).toArray(String[][]::new);
     return printTextTable(header, rows);
   }


@@ -31,8 +31,7 @@ import java.util.stream.IntStream;
 import org.apache.hudi.common.util.Option;
 /**
- * Table to be rendered. This class takes care of ordering
- * rows and limiting before renderer renders it.
+ * Table to be rendered. This class takes care of ordering rows and limiting before renderer renders it.
  */
 public class Table implements Iterable<List<String>> {
@@ -53,11 +52,8 @@ public class Table implements Iterable<List<String>> {
   // Rows ready for Rendering
   private List<List<String>> renderRows;
-  public Table(TableHeader rowHeader,
-      Map<String, Function<Object, String>> fieldNameToConverterMap,
-      Option<String> orderingFieldNameOptional,
-      Option<Boolean> isDescendingOptional,
-      Option<Integer> limitOptional) {
+  public Table(TableHeader rowHeader, Map<String, Function<Object, String>> fieldNameToConverterMap,
+      Option<String> orderingFieldNameOptional, Option<Boolean> isDescendingOptional, Option<Integer> limitOptional) {
     this.rowHeader = rowHeader;
     this.fieldNameToConverterMap = fieldNameToConverterMap;
     this.orderingFieldNameOptional = orderingFieldNameOptional;
@@ -68,6 +64,7 @@ public class Table implements Iterable<List<String>> {
   /**
    * Main API to add row to the table
+   *
    * @param row Row
    */
   public Table add(List<Comparable> row) {
@@ -86,6 +83,7 @@ public class Table implements Iterable<List<String>> {
   /**
    * Add all rows
+   *
    * @param rows Rows to be aded
    * @return
    */
@@ -96,6 +94,7 @@ public class Table implements Iterable<List<String>> {
   /**
    * Add all rows
+   *
    * @param rows Rows to be added
    * @return
    */
@@ -115,6 +114,7 @@ public class Table implements Iterable<List<String>> {
   /**
    * Sorting of rows by a specified field
+   *
    * @return
    */
   private List<List<Comparable>> orderRows() {


@@ -59,8 +59,8 @@ public class ArchivedCommitsCommand implements CommandMarker {
       @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
       @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
       @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
-      @CliOption(key = {
-          "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly)
+      @CliOption(key = {"headeronly"}, help = "Print Header Only",
+          unspecifiedDefaultValue = "false") final boolean headerOnly)
       throws IOException {
     System.out.println("===============> Showing only " + limit + " archived commits <===============");
     String basePath = HoodieCLI.tableMetadata.getBasePath();
@@ -86,9 +86,8 @@ public class ArchivedCommitsCommand implements CommandMarker {
         .filter(r -> r.get("actionType").toString().equals(HoodieTimeline.COMMIT_ACTION)
             || r.get("actionType").toString().equals(HoodieTimeline.DELTA_COMMIT_ACTION))
         .flatMap(r -> {
-          HoodieCommitMetadata metadata =
-              (HoodieCommitMetadata) SpecificData.get().deepCopy(HoodieCommitMetadata.SCHEMA$,
-                  r.get("hoodieCommitMetadata"));
+          HoodieCommitMetadata metadata = (HoodieCommitMetadata) SpecificData.get()
+              .deepCopy(HoodieCommitMetadata.SCHEMA$, r.get("hoodieCommitMetadata"));
           final String instantTime = r.get("commitTime").toString();
           final String action = r.get("actionType").toString();
           return metadata.getPartitionToWriteStats().values().stream().flatMap(hoodieWriteStats -> {
@@ -118,22 +117,13 @@ public class ArchivedCommitsCommand implements CommandMarker {
       allStats.addAll(readCommits);
       reader.close();
     }
-    TableHeader header = new TableHeader().addTableHeaderField("action")
-        .addTableHeaderField("instant")
-        .addTableHeaderField("partition")
-        .addTableHeaderField("file_id")
-        .addTableHeaderField("prev_instant")
-        .addTableHeaderField("num_writes")
-        .addTableHeaderField("num_inserts")
-        .addTableHeaderField("num_deletes")
-        .addTableHeaderField("num_update_writes")
-        .addTableHeaderField("total_log_files")
-        .addTableHeaderField("total_log_blocks")
-        .addTableHeaderField("total_corrupt_log_blocks")
-        .addTableHeaderField("total_rollback_blocks")
-        .addTableHeaderField("total_log_records")
-        .addTableHeaderField("total_updated_records_compacted")
-        .addTableHeaderField("total_write_bytes")
+    TableHeader header = new TableHeader().addTableHeaderField("action").addTableHeaderField("instant")
+        .addTableHeaderField("partition").addTableHeaderField("file_id").addTableHeaderField("prev_instant")
+        .addTableHeaderField("num_writes").addTableHeaderField("num_inserts").addTableHeaderField("num_deletes")
+        .addTableHeaderField("num_update_writes").addTableHeaderField("total_log_files")
+        .addTableHeaderField("total_log_blocks").addTableHeaderField("total_corrupt_log_blocks")
+        .addTableHeaderField("total_rollback_blocks").addTableHeaderField("total_log_records")
+        .addTableHeaderField("total_updated_records_compacted").addTableHeaderField("total_write_bytes")
         .addTableHeaderField("total_write_errors");
     return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, allStats);
@@ -141,19 +131,19 @@ public class ArchivedCommitsCommand implements CommandMarker {
   @CliCommand(value = "show archived commits", help = "Read commits from archived files and show details")
   public String showCommits(
-      @CliOption(key = {"skipMetadata"}, help = "Skip displaying commit metadata", unspecifiedDefaultValue = "true")
-          boolean skipMetadata,
+      @CliOption(key = {"skipMetadata"}, help = "Skip displaying commit metadata",
+          unspecifiedDefaultValue = "true") boolean skipMetadata,
       @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "10") final Integer limit,
       @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
       @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
-      @CliOption(key = {
-          "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly)
+      @CliOption(key = {"headeronly"}, help = "Print Header Only",
+          unspecifiedDefaultValue = "false") final boolean headerOnly)
       throws IOException {
     System.out.println("===============> Showing only " + limit + " archived commits <===============");
     String basePath = HoodieCLI.tableMetadata.getBasePath();
-    FileStatus[] fsStatuses = FSUtils.getFs(basePath, HoodieCLI.conf)
-        .globStatus(new Path(basePath + "/.hoodie/.commits_.archive*"));
+    FileStatus[] fsStatuses =
+        FSUtils.getFs(basePath, HoodieCLI.conf).globStatus(new Path(basePath + "/.hoodie/.commits_.archive*"));
     List<Comparable[]> allCommits = new ArrayList<>();
     for (FileStatus fs : fsStatuses) {
       // read the archived file
@@ -167,15 +157,13 @@ public class ArchivedCommitsCommand implements CommandMarker {
         List<IndexedRecord> records = blk.getRecords();
         readRecords.addAll(records);
       }
-      List<Comparable[]> readCommits = readRecords.stream().map(r -> (GenericRecord) r).map(r ->
-          readCommit(r, skipMetadata))
-          .collect(Collectors.toList());
+      List<Comparable[]> readCommits = readRecords.stream().map(r -> (GenericRecord) r)
+          .map(r -> readCommit(r, skipMetadata)).collect(Collectors.toList());
       allCommits.addAll(readCommits);
       reader.close();
     }
-    TableHeader header = new TableHeader().addTableHeaderField("CommitTime")
-        .addTableHeaderField("CommitType");
+    TableHeader header = new TableHeader().addTableHeaderField("CommitTime").addTableHeaderField("CommitType");
     if (!skipMetadata) {
       header = header.addTableHeaderField("CommitDetails");


@@ -63,8 +63,8 @@ public class CleansCommand implements CommandMarker {
       @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
       @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
       @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
-      @CliOption(key = {
-          "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly)
+      @CliOption(key = {"headeronly"}, help = "Print Header Only",
+          unspecifiedDefaultValue = "false") final boolean headerOnly)
       throws IOException {
     HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
@@ -74,17 +74,15 @@ public class CleansCommand implements CommandMarker {
     Collections.reverse(cleans);
     for (int i = 0; i < cleans.size(); i++) {
       HoodieInstant clean = cleans.get(i);
-      HoodieCleanMetadata cleanMetadata = AvroUtils
-          .deserializeHoodieCleanMetadata(timeline.getInstantDetails(clean).get());
+      HoodieCleanMetadata cleanMetadata =
+          AvroUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(clean).get());
       rows.add(new Comparable[] {clean.getTimestamp(), cleanMetadata.getEarliestCommitToRetain(),
           cleanMetadata.getTotalFilesDeleted(), cleanMetadata.getTimeTakenInMillis()});
     }
-    TableHeader header = new TableHeader()
-        .addTableHeaderField("CleanTime")
-        .addTableHeaderField("EarliestCommandRetained")
-        .addTableHeaderField("Total Files Deleted")
-        .addTableHeaderField("Total Time Taken");
+    TableHeader header =
+        new TableHeader().addTableHeaderField("CleanTime").addTableHeaderField("EarliestCommandRetained")
+            .addTableHeaderField("Total Files Deleted").addTableHeaderField("Total Time Taken");
     return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows);
   }
@@ -95,13 +93,12 @@ public class CleansCommand implements CommandMarker {
   }
   @CliCommand(value = "clean showpartitions", help = "Show partition level details of a clean")
-  public String showCleanPartitions(
-      @CliOption(key = {"clean"}, help = "clean to show") final String commitTime,
+  public String showCleanPartitions(@CliOption(key = {"clean"}, help = "clean to show") final String commitTime,
       @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
       @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
       @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
-      @CliOption(key = {
-          "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly)
+      @CliOption(key = {"headeronly"}, help = "Print Header Only",
+          unspecifiedDefaultValue = "false") final boolean headerOnly)
       throws Exception {
     HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
@@ -112,8 +109,8 @@ public class CleansCommand implements CommandMarker {
       return "Clean " + commitTime + " not found in metadata " + timeline;
     }
-    HoodieCleanMetadata cleanMetadata = AvroUtils.deserializeHoodieCleanMetadata(
-        timeline.getInstantDetails(cleanInstant).get());
+    HoodieCleanMetadata cleanMetadata =
+        AvroUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(cleanInstant).get());
     List<Comparable[]> rows = new ArrayList<>();
     for (Map.Entry<String, HoodieCleanPartitionMetadata> entry : cleanMetadata.getPartitionMetadata().entrySet()) {
       String path = entry.getKey();
@@ -124,11 +121,8 @@ public class CleansCommand implements CommandMarker {
       rows.add(new Comparable[] {path, policy, totalSuccessDeletedFiles, totalFailedDeletedFiles});
     }
-    TableHeader header = new TableHeader()
-        .addTableHeaderField("Partition Path")
-        .addTableHeaderField("Cleaning policy")
-        .addTableHeaderField("Total Files Successfully Deleted")
-        .addTableHeaderField("Total Failed Deletions");
+    TableHeader header = new TableHeader().addTableHeaderField("Partition Path").addTableHeaderField("Cleaning policy")
+        .addTableHeaderField("Total Files Successfully Deleted").addTableHeaderField("Total Failed Deletions");
     return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows);
   }


@@ -69,12 +69,13 @@ public class CommitsCommand implements CommandMarker {
   }
   @CliCommand(value = "commits show", help = "Show the commits")
-  public String showCommits(@CliOption(key = {
-      "limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
+  public String showCommits(
+      @CliOption(key = {"limit"}, mandatory = false, help = "Limit commits",
+          unspecifiedDefaultValue = "-1") final Integer limit,
       @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
       @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
-      @CliOption(key = {
-          "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly)
+      @CliOption(key = {"headeronly"}, help = "Print Header Only",
+          unspecifiedDefaultValue = "false") final boolean headerOnly)
       throws IOException {
     HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
@@ -84,16 +85,12 @@ public class CommitsCommand implements CommandMarker {
     Collections.reverse(commits);
     for (int i = 0; i < commits.size(); i++) {
       HoodieInstant commit = commits.get(i);
-      HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(commit).get(),
-          HoodieCommitMetadata.class);
-      rows.add(new Comparable[]{commit.getTimestamp(),
-          commitMetadata.fetchTotalBytesWritten(),
-          commitMetadata.fetchTotalFilesInsert(),
-          commitMetadata.fetchTotalFilesUpdated(),
-          commitMetadata.fetchTotalPartitionsWritten(),
-          commitMetadata.fetchTotalRecordsWritten(),
-          commitMetadata.fetchTotalUpdateRecordsWritten(),
-          commitMetadata.fetchTotalWriteErrors()});
+      HoodieCommitMetadata commitMetadata =
+          HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(commit).get(), HoodieCommitMetadata.class);
+      rows.add(new Comparable[] {commit.getTimestamp(), commitMetadata.fetchTotalBytesWritten(),
+          commitMetadata.fetchTotalFilesInsert(), commitMetadata.fetchTotalFilesUpdated(),
+          commitMetadata.fetchTotalPartitionsWritten(), commitMetadata.fetchTotalRecordsWritten(),
+          commitMetadata.fetchTotalUpdateRecordsWritten(), commitMetadata.fetchTotalWriteErrors()});
     }
     Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();
@@ -101,15 +98,10 @@ public class CommitsCommand implements CommandMarker {
       return NumericUtils.humanReadableByteCount((Double.valueOf(entry.toString())));
     });
-    TableHeader header = new TableHeader()
-        .addTableHeaderField("CommitTime")
-        .addTableHeaderField("Total Bytes Written")
-        .addTableHeaderField("Total Files Added")
-        .addTableHeaderField("Total Files Updated")
-        .addTableHeaderField("Total Partitions Written")
-        .addTableHeaderField("Total Records Written")
-        .addTableHeaderField("Total Update Records Written")
-        .addTableHeaderField("Total Errors");
+    TableHeader header = new TableHeader().addTableHeaderField("CommitTime").addTableHeaderField("Total Bytes Written")
+        .addTableHeaderField("Total Files Added").addTableHeaderField("Total Files Updated")
+        .addTableHeaderField("Total Partitions Written").addTableHeaderField("Total Records Written")
+        .addTableHeaderField("Total Update Records Written").addTableHeaderField("Total Errors");
     return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows);
   }
@@ -132,8 +124,8 @@ public class CommitsCommand implements CommandMarker {
     }
     SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
-    sparkLauncher
-        .addAppArgs(SparkMain.SparkCommand.ROLLBACK.toString(), commitTime, HoodieCLI.tableMetadata.getBasePath());
+    sparkLauncher.addAppArgs(SparkMain.SparkCommand.ROLLBACK.toString(), commitTime,
+        HoodieCLI.tableMetadata.getBasePath());
     Process process = sparkLauncher.launch();
     InputStreamConsumer.captureOutput(process);
     int exitCode = process.waitFor();
@@ -146,13 +138,12 @@ public class CommitsCommand implements CommandMarker {
   }
   @CliCommand(value = "commit showpartitions", help = "Show partition level details of a commit")
-  public String showCommitPartitions(
-      @CliOption(key = {"commit"}, help = "Commit to show") final String commitTime,
+  public String showCommitPartitions(@CliOption(key = {"commit"}, help = "Commit to show") final String commitTime,
       @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
       @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
       @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
-      @CliOption(key = {
-          "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly)
+      @CliOption(key = {"headeronly"}, help = "Print Header Only",
+          unspecifiedDefaultValue = "false") final boolean headerOnly)
       throws Exception {
     HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
@@ -185,8 +176,7 @@ public class CommitsCommand implements CommandMarker {
         totalBytesWritten += stat.getTotalWriteBytes();
         totalWriteErrors += stat.getTotalWriteErrors();
       }
-      rows.add(new Comparable[]{path, totalFilesAdded, totalFilesUpdated,
-          totalRecordsInserted, totalRecordsUpdated,
+      rows.add(new Comparable[] {path, totalFilesAdded, totalFilesUpdated, totalRecordsInserted, totalRecordsUpdated,
          totalBytesWritten, totalWriteErrors});
     }
@@ -195,26 +185,21 @@ public class CommitsCommand implements CommandMarker {
       return NumericUtils.humanReadableByteCount((Long.valueOf(entry.toString())));
     });
-    TableHeader header = new TableHeader()
-        .addTableHeaderField("Partition Path")
-        .addTableHeaderField("Total Files Added")
-        .addTableHeaderField("Total Files Updated")
-        .addTableHeaderField("Total Records Inserted")
-        .addTableHeaderField("Total Records Updated")
-        .addTableHeaderField("Total Bytes Written")
-        .addTableHeaderField("Total Errors");
+    TableHeader header = new TableHeader().addTableHeaderField("Partition Path")
+        .addTableHeaderField("Total Files Added").addTableHeaderField("Total Files Updated")
+        .addTableHeaderField("Total Records Inserted").addTableHeaderField("Total Records Updated")
+        .addTableHeaderField("Total Bytes Written").addTableHeaderField("Total Errors");
     return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows);
   }
   @CliCommand(value = "commit showfiles", help = "Show file level details of a commit")
-  public String showCommitFiles(
-      @CliOption(key = {"commit"}, help = "Commit to show") final String commitTime,
+  public String showCommitFiles(@CliOption(key = {"commit"}, help = "Commit to show") final String commitTime,
       @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
       @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
       @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
-      @CliOption(key = {
-          "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly)
+      @CliOption(key = {"headeronly"}, help = "Print Header Only",
+          unspecifiedDefaultValue = "false") final boolean headerOnly)
       throws Exception {
     HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
@@ -232,22 +217,14 @@ public class CommitsCommand implements CommandMarker {
       List<HoodieWriteStat> stats = entry.getValue();
       for (HoodieWriteStat stat : stats) {
         rows.add(new Comparable[] {path, stat.getFileId(), stat.getPrevCommit(), stat.getNumUpdateWrites(),
-            stat.getNumWrites(), stat.getTotalWriteBytes(),
-            stat.getTotalWriteErrors(),
-            stat.getFileSizeInBytes()
-        });
+            stat.getNumWrites(), stat.getTotalWriteBytes(), stat.getTotalWriteErrors(), stat.getFileSizeInBytes()});
       }
     }
-    TableHeader header = new TableHeader()
-        .addTableHeaderField("Partition Path")
-        .addTableHeaderField("File ID")
-        .addTableHeaderField("Previous Commit")
-        .addTableHeaderField("Total Records Updated")
-        .addTableHeaderField("Total Records Written")
-        .addTableHeaderField("Total Bytes Written")
-        .addTableHeaderField("Total Errors")
-        .addTableHeaderField("File Size");
+    TableHeader header = new TableHeader().addTableHeaderField("Partition Path").addTableHeaderField("File ID")
+        .addTableHeaderField("Previous Commit").addTableHeaderField("Total Records Updated")
+        .addTableHeaderField("Total Records Written").addTableHeaderField("Total Bytes Written")
+        .addTableHeaderField("Total Errors").addTableHeaderField("File Size");
     return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows);
   }
@@ -270,8 +247,8 @@ public class CommitsCommand implements CommandMarker {
     String sourceLatestCommit =
         sourceTimeline.getInstants().iterator().hasNext() ? "0" : sourceTimeline.lastInstant().get().getTimestamp();
-    if (sourceLatestCommit != null && HoodieTimeline.compareTimestamps(targetLatestCommit, sourceLatestCommit,
-        HoodieTimeline.GREATER)) {
+    if (sourceLatestCommit != null
+        && HoodieTimeline.compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) {
       // source is behind the target
       List<String> commitsToCatchup = targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE)
           .getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList());


@@ -75,16 +75,15 @@ public class CompactionCommand implements CommandMarker {
   @CliCommand(value = "compactions show all", help = "Shows all compactions that are in active timeline")
   public String compactionsAll(
-      @CliOption(key = {
-          "includeExtraMetadata"}, help = "Include extra metadata", unspecifiedDefaultValue = "false") final
-          boolean includeExtraMetadata,
-      @CliOption(key = {
-          "limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
+      @CliOption(key = {"includeExtraMetadata"}, help = "Include extra metadata",
+          unspecifiedDefaultValue = "false") final boolean includeExtraMetadata,
+      @CliOption(key = {"limit"}, mandatory = false, help = "Limit commits",
+          unspecifiedDefaultValue = "-1") final Integer limit,
       @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
       @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
-      @CliOption(key = {
-          "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final
-          boolean headerOnly) throws IOException {
+      @CliOption(key = {"headeronly"}, help = "Print Header Only",
+          unspecifiedDefaultValue = "false") final boolean headerOnly)
+      throws IOException {
     HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
     HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionTimeline();
     HoodieTimeline commitTimeline = activeTimeline.getCommitTimeline().filterCompletedInstants();
@@ -99,15 +98,14 @@ public class CompactionCommand implements CommandMarker {
       if (!instant.getAction().equals(HoodieTimeline.COMPACTION_ACTION)) {
         try {
           // This could be a completed compaction. Assume a compaction request file is present but skip if fails
-          workload = AvroUtils.deserializeCompactionPlan(
-              activeTimeline.getInstantAuxiliaryDetails(
-                  HoodieTimeline.getCompactionRequestedInstant(instant.getTimestamp())).get());
+          workload = AvroUtils.deserializeCompactionPlan(activeTimeline
+              .getInstantAuxiliaryDetails(HoodieTimeline.getCompactionRequestedInstant(instant.getTimestamp())).get());
         } catch (HoodieIOException ioe) {
           // SKIP
         }
       } else {
-        workload = AvroUtils.deserializeCompactionPlan(activeTimeline.getInstantAuxiliaryDetails(
-            HoodieTimeline.getCompactionRequestedInstant(instant.getTimestamp())).get());
+        workload = AvroUtils.deserializeCompactionPlan(activeTimeline
+            .getInstantAuxiliaryDetails(HoodieTimeline.getCompactionRequestedInstant(instant.getTimestamp())).get());
       }
       if (null != workload) {
@@ -116,22 +114,18 @@ public class CompactionCommand implements CommandMarker {
           state = State.COMPLETED;
         }
         if (includeExtraMetadata) {
-          rows.add(new Comparable[]{instant.getTimestamp(),
-              state.toString(),
+          rows.add(new Comparable[] {instant.getTimestamp(), state.toString(),
              workload.getOperations() == null ? 0 : workload.getOperations().size(),
              workload.getExtraMetadata().toString()});
         } else {
-          rows.add(new Comparable[]{instant.getTimestamp(),
-              state.toString(),
+          rows.add(new Comparable[] {instant.getTimestamp(), state.toString(),
              workload.getOperations() == null ? 0 : workload.getOperations().size()});
         }
       }
     }
     Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();
-    TableHeader header = new TableHeader()
-        .addTableHeaderField("Compaction Instant Time")
-        .addTableHeaderField("State")
+    TableHeader header = new TableHeader().addTableHeaderField("Compaction Instant Time").addTableHeaderField("State")
         .addTableHeaderField("Total FileIds to be Compacted");
     if (includeExtraMetadata) {
       header = header.addTableHeaderField("Extra Metadata");
@@ -141,48 +135,37 @@ public class CompactionCommand implements CommandMarker {
   @CliCommand(value = "compaction show", help = "Shows compaction details for a specific compaction instant")
   public String compactionShow(
-      @CliOption(key = "instant", mandatory = true, help = "Base path for the target hoodie dataset") final
-          String compactionInstantTime,
-      @CliOption(key = {
-          "limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
+      @CliOption(key = "instant", mandatory = true,
+          help = "Base path for the target hoodie dataset") final String compactionInstantTime,
+      @CliOption(key = {"limit"}, mandatory = false, help = "Limit commits",
+          unspecifiedDefaultValue = "-1") final Integer limit,
       @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
       @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
-      @CliOption(key = {
-          "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly)
+      @CliOption(key = {"headeronly"}, help = "Print Header Only",
+          unspecifiedDefaultValue = "false") final boolean headerOnly)
       throws Exception {
     HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
-    HoodieCompactionPlan workload = AvroUtils.deserializeCompactionPlan(
-        activeTimeline.getInstantAuxiliaryDetails(
-            HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime)).get());
+    HoodieCompactionPlan workload = AvroUtils.deserializeCompactionPlan(activeTimeline
+        .getInstantAuxiliaryDetails(HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime)).get());
     List<Comparable[]> rows = new ArrayList<>();
     if ((null != workload) && (null != workload.getOperations())) {
       for (HoodieCompactionOperation op : workload.getOperations()) {
-        rows.add(new Comparable[]{op.getPartitionPath(),
-            op.getFileId(),
-            op.getBaseInstantTime(),
-            op.getDataFilePath(),
-            op.getDeltaFilePaths().size(),
-            op.getMetrics() == null ? "" : op.getMetrics().toString()
-        });
+        rows.add(new Comparable[] {op.getPartitionPath(), op.getFileId(), op.getBaseInstantTime(), op.getDataFilePath(),
+            op.getDeltaFilePaths().size(), op.getMetrics() == null ? "" : op.getMetrics().toString()});
       }
     }
     Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();
-    TableHeader header = new TableHeader()
-        .addTableHeaderField("Partition Path")
-        .addTableHeaderField("File Id")
-        .addTableHeaderField("Base Instant")
-        .addTableHeaderField("Data File Path")
-        .addTableHeaderField("Total Delta Files")
-        .addTableHeaderField("getMetrics");
+    TableHeader header = new TableHeader().addTableHeaderField("Partition Path").addTableHeaderField("File Id")
+        .addTableHeaderField("Base Instant").addTableHeaderField("Data File Path")
+        .addTableHeaderField("Total Delta Files").addTableHeaderField("getMetrics");
     return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows);
   }
   @CliCommand(value = "compaction schedule", help = "Schedule Compaction")
-  public String scheduleCompact(
-      @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "1G", help = "Spark executor memory")
-          final String sparkMemory) throws Exception {
+  public String scheduleCompact(@CliOption(key = "sparkMemory", unspecifiedDefaultValue = "1G",
      help = "Spark executor memory") final String sparkMemory) throws Exception {
     boolean initialized = HoodieCLI.initConf();
     HoodieCLI.initFS(initialized);
@@ -190,8 +173,8 @@ public class CompactionCommand implements CommandMarker {
     String compactionInstantTime = HoodieActiveTimeline.createNewCommitTime();
     if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
-      String sparkPropertiesPath = Utils.getDefaultPropertiesFile(
-          scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
+      String sparkPropertiesPath =
+          Utils.getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
       SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
       sparkLauncher.addAppArgs(SparkCommand.COMPACT_SCHEDULE.toString(), HoodieCLI.tableMetadata.getBasePath(),
           HoodieCLI.tableMetadata.getTableConfig().getTableName(), compactionInstantTime, sparkMemory);
@@ -209,33 +192,34 @@ public class CompactionCommand implements CommandMarker {
   @CliCommand(value = "compaction run", help = "Run Compaction for given instant time")
   public String compact(
-      @CliOption(key = {"parallelism"}, mandatory = true, help = "Parallelism for hoodie compaction")
-          final String parallelism,
-      @CliOption(key = "schemaFilePath", mandatory = true, help = "Path for Avro schema file")
-          final String schemaFilePath,
-      @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G", help = "Spark executor memory")
-          final String sparkMemory,
-      @CliOption(key = "retry", unspecifiedDefaultValue = "1", help = "Number of retries")
-          final String retry,
-      @CliOption(key = "compactionInstant", mandatory = false, help = "Base path for the target hoodie dataset")
-          String compactionInstantTime) throws Exception {
+      @CliOption(key = {"parallelism"}, mandatory = true,
+          help = "Parallelism for hoodie compaction") final String parallelism,
+      @CliOption(key = "schemaFilePath", mandatory = true,
+          help = "Path for Avro schema file") final String schemaFilePath,
+      @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G",
+          help = "Spark executor memory") final String sparkMemory,
+      @CliOption(key = "retry", unspecifiedDefaultValue = "1", help = "Number of retries") final String retry,
+      @CliOption(key = "compactionInstant", mandatory = false,
+          help = "Base path for the target hoodie dataset") String compactionInstantTime)
+      throws Exception {
     boolean initialized = HoodieCLI.initConf();
     HoodieCLI.initFS(initialized);
     if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
       if (null == compactionInstantTime) {
         // pick outstanding one with lowest timestamp
-        Option<String> firstPendingInstant = HoodieCLI.tableMetadata.reloadActiveTimeline()
-            .filterCompletedAndCompactionInstants().filter(instant -> instant.getAction()
-                .equals(HoodieTimeline.COMPACTION_ACTION)).firstInstant().map(HoodieInstant::getTimestamp);
+        Option<String> firstPendingInstant =
+            HoodieCLI.tableMetadata.reloadActiveTimeline().filterCompletedAndCompactionInstants()
+                .filter(instant -> instant.getAction().equals(HoodieTimeline.COMPACTION_ACTION)).firstInstant()
+                .map(HoodieInstant::getTimestamp);
         if (!firstPendingInstant.isPresent()) {
           return "NO PENDING COMPACTION TO RUN";
         }
         compactionInstantTime = firstPendingInstant.get();
       }
-      String sparkPropertiesPath = Utils.getDefaultPropertiesFile(
-          scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
+      String sparkPropertiesPath =
+          Utils.getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
       SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
       sparkLauncher.addAppArgs(SparkCommand.COMPACT_RUN.toString(), HoodieCLI.tableMetadata.getBasePath(),
           HoodieCLI.tableMetadata.getTableConfig().getTableName(), compactionInstantTime, parallelism, schemaFilePath,
@@ -279,8 +263,8 @@ public class CompactionCommand implements CommandMarker {
       @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit,
       @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField,
       @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending,
-      @CliOption(key = {
-          "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") boolean headerOnly)
+      @CliOption(key = {"headeronly"}, help = "Print Header Only",
+          unspecifiedDefaultValue = "false") boolean headerOnly)
       throws Exception {
     boolean initialized = HoodieCLI.initConf();
     HoodieCLI.initFS(initialized);
@@ -290,12 +274,11 @@ public class CompactionCommand implements CommandMarker {
     String output = null;
     if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
       try {
-        String sparkPropertiesPath = Utils.getDefaultPropertiesFile(
-            scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
+        String sparkPropertiesPath = Utils
+            .getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
         SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
-        sparkLauncher.addAppArgs(SparkCommand.COMPACT_VALIDATE.toString(),
-            HoodieCLI.tableMetadata.getBasePath(), compactionInstant, outputPathStr, parallelism, master,
-            sparkMemory);
+        sparkLauncher.addAppArgs(SparkCommand.COMPACT_VALIDATE.toString(), HoodieCLI.tableMetadata.getBasePath(),
+            compactionInstant, outputPathStr, parallelism, master, sparkMemory);
         Process process = sparkLauncher.launch();
         InputStreamConsumer.captureOutput(process);
         int exitCode = process.waitFor();
@@ -307,8 +290,7 @@ public class CompactionCommand implements CommandMarker {
       String message = "\n\n\t COMPACTION PLAN " + (valid ? "VALID" : "INVALID") + "\n\n";
       List<Comparable[]> rows = new ArrayList<>();
       res.stream().forEach(r -> {
-        Comparable[] row = new Comparable[]{r.getOperation().getFileId(),
-            r.getOperation().getBaseInstantTime(),
+        Comparable[] row = new Comparable[] {r.getOperation().getFileId(), r.getOperation().getBaseInstantTime(),
            r.getOperation().getDataFilePath().isPresent() ? r.getOperation().getDataFilePath().get() : "",
            r.getOperation().getDeltaFilePaths().size(), r.isSuccess(),
            r.getException().isPresent() ? r.getException().get().getMessage() : ""};
@@ -316,12 +298,8 @@ public class CompactionCommand implements CommandMarker {
       });
       Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();
-      TableHeader header = new TableHeader()
-          .addTableHeaderField("File Id")
-          .addTableHeaderField("Base Instant Time")
-          .addTableHeaderField("Base Data File")
-          .addTableHeaderField("Num Delta Files")
-          .addTableHeaderField("Valid")
+      TableHeader header = new TableHeader().addTableHeaderField("File Id").addTableHeaderField("Base Instant Time")
+          .addTableHeaderField("Base Data File").addTableHeaderField("Num Delta Files").addTableHeaderField("Valid")
          .addTableHeaderField("Error");
       output = message + HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit,
@@ -349,8 +327,8 @@ public class CompactionCommand implements CommandMarker {
       @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit,
       @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField,
       @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending,
-      @CliOption(key = {
-          "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") boolean headerOnly)
+      @CliOption(key = {"headeronly"}, help = "Print Header Only",
+          unspecifiedDefaultValue = "false") boolean headerOnly)
       throws Exception {
     boolean initialized = HoodieCLI.initConf();
     HoodieCLI.initFS(initialized);
@@ -360,12 +338,12 @@ public class CompactionCommand implements CommandMarker {
     String output = "";
     if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
       try {
-        String sparkPropertiesPath = Utils.getDefaultPropertiesFile(
-            scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
+        String sparkPropertiesPath = Utils
+            .getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
         SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
-        sparkLauncher.addAppArgs(SparkCommand.COMPACT_UNSCHEDULE_PLAN.toString(),
-            HoodieCLI.tableMetadata.getBasePath(), compactionInstant, outputPathStr, parallelism, master,
-            sparkMemory, Boolean.valueOf(skipV).toString(), Boolean.valueOf(dryRun).toString());
+        sparkLauncher.addAppArgs(SparkCommand.COMPACT_UNSCHEDULE_PLAN.toString(), HoodieCLI.tableMetadata.getBasePath(),
+            compactionInstant, outputPathStr, parallelism, master, sparkMemory, Boolean.valueOf(skipV).toString(),
+            Boolean.valueOf(dryRun).toString());
         Process process = sparkLauncher.launch();
         InputStreamConsumer.captureOutput(process);
         int exitCode = process.waitFor();
@@ -373,8 +351,8 @@ public class CompactionCommand implements CommandMarker {
          return "Failed to unschedule compaction for " + compactionInstant;
        }
        List<RenameOpResult> res = deSerializeOperationResult(outputPathStr, HoodieCLI.fs);
-       output = getRenamesToBePrinted(res, limit, sortByField, descending, headerOnly,
-           "unschedule pending compaction");
+       output =
+           getRenamesToBePrinted(res, limit, sortByField, descending, headerOnly, "unschedule pending compaction");
      } finally {
        // Delete tmp file used to serialize result
        if (HoodieCLI.fs.exists(outputPath)) {
@@ -407,12 +385,12 @@ public class CompactionCommand implements CommandMarker {
     String output = "";
     if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
       try {
-        String sparkPropertiesPath = Utils.getDefaultPropertiesFile(
-            scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
+        String sparkPropertiesPath = Utils
+            .getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
         SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
-        sparkLauncher.addAppArgs(SparkCommand.COMPACT_UNSCHEDULE_FILE.toString(),
-            HoodieCLI.tableMetadata.getBasePath(), fileId, outputPathStr, "1", master,
-            sparkMemory, Boolean.valueOf(skipV).toString(), Boolean.valueOf(dryRun).toString());
+        sparkLauncher.addAppArgs(SparkCommand.COMPACT_UNSCHEDULE_FILE.toString(), HoodieCLI.tableMetadata.getBasePath(),
+            fileId, outputPathStr, "1", master, sparkMemory, Boolean.valueOf(skipV).toString(),
+            Boolean.valueOf(dryRun).toString());
         Process process = sparkLauncher.launch();
         InputStreamConsumer.captureOutput(process);
         int exitCode = process.waitFor();
@@ -445,8 +423,8 @@ public class CompactionCommand implements CommandMarker {
       @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit,
       @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField,
       @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending,
-      @CliOption(key = {
-          "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") boolean headerOnly)
+      @CliOption(key = {"headeronly"}, help = "Print Header Only",
+          unspecifiedDefaultValue = "false") boolean headerOnly)
       throws Exception {
     boolean initialized = HoodieCLI.initConf();
     HoodieCLI.initFS(initialized);
@@ -455,12 +433,11 @@ public class CompactionCommand implements CommandMarker {
     String output = "";
     if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
       try {
-        String sparkPropertiesPath = Utils.getDefaultPropertiesFile(
-            scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
+        String sparkPropertiesPath = Utils
+            .getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
         SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
-        sparkLauncher.addAppArgs(SparkCommand.COMPACT_REPAIR.toString(),
-            HoodieCLI.tableMetadata.getBasePath(), compactionInstant, outputPathStr, parallelism, master,
-            sparkMemory, Boolean.valueOf(dryRun).toString());
+        sparkLauncher.addAppArgs(SparkCommand.COMPACT_REPAIR.toString(), HoodieCLI.tableMetadata.getBasePath(),
+            compactionInstant, outputPathStr, parallelism, master, sparkMemory, Boolean.valueOf(dryRun).toString());
         Process process = sparkLauncher.launch();
         InputStreamConsumer.captureOutput(process);
         int exitCode = process.waitFor();
@@ -481,41 +458,35 @@ public class CompactionCommand implements CommandMarker {
     }
   }
-  private String getRenamesToBePrinted(List<RenameOpResult> res, Integer limit,
-      String sortByField, boolean descending, boolean headerOnly, String operation) {
+  private String getRenamesToBePrinted(List<RenameOpResult> res, Integer limit, String sortByField, boolean descending,
+      boolean headerOnly, String operation) {
-    Option<Boolean> result = Option.fromJavaOptional(
-        res.stream().map(r -> r.isExecuted() && r.isSuccess()).reduce(Boolean::logicalAnd));
+    Option<Boolean> result =
+        Option.fromJavaOptional(res.stream().map(r -> r.isExecuted() && r.isSuccess()).reduce(Boolean::logicalAnd));
     if (result.isPresent()) {
       System.out.println("There were some file renames that needed to be done to " + operation);
       if (result.get()) {
         System.out.println("All renames successfully completed to " + operation + " done !!");
       } else {
-        System.out.println("Some renames failed. DataSet could be in inconsistent-state. "
-            + "Try running compaction repair");
+        System.out
+            .println("Some renames failed. DataSet could be in inconsistent-state. " + "Try running compaction repair");
      }
      List<Comparable[]> rows = new ArrayList<>();
      res.stream().forEach(r -> {
-        Comparable[] row = new Comparable[] {
-            r.getOperation().fileId, r.getOperation().srcPath, r.getOperation().destPath,
-            r.isExecuted(), r.isSuccess(), r.getException().isPresent() ? r.getException().get().getMessage() : ""
-        };
+        Comparable[] row =
+            new Comparable[] {r.getOperation().fileId, r.getOperation().srcPath, r.getOperation().destPath,
+                r.isExecuted(), r.isSuccess(), r.getException().isPresent() ? r.getException().get().getMessage() : ""};
        rows.add(row);
      });
      Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();
-      TableHeader header = new TableHeader()
-          .addTableHeaderField("File Id")
-          .addTableHeaderField("Source File Path")
-          .addTableHeaderField("Destination File Path")
-          .addTableHeaderField("Rename Executed?")
-          .addTableHeaderField("Rename Succeeded?")
-          .addTableHeaderField("Error");
+      TableHeader header = new TableHeader().addTableHeaderField("File Id").addTableHeaderField("Source File Path")
+          .addTableHeaderField("Destination File Path").addTableHeaderField("Rename Executed?")
+          .addTableHeaderField("Rename Succeeded?").addTableHeaderField("Error");
-      return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending,
-          limit, headerOnly, rows);
+      return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows);
    } else {
      return "No File renames needed to " + operation + ". Operation successful.";
    }


@@ -52,13 +52,12 @@ public class DatasetsCommand implements CommandMarker {
       @CliOption(key = {"maxCheckIntervalMs"}, mandatory = false, unspecifiedDefaultValue = "300000",
           help = "Max wait time for eventual consistency") final Integer maxConsistencyIntervalMs,
       @CliOption(key = {"maxCheckIntervalMs"}, mandatory = false, unspecifiedDefaultValue = "7",
-          help = "Max checks for eventual consistency") final Integer maxConsistencyChecks) throws IOException {
-    HoodieCLI.setConsistencyGuardConfig(
-        ConsistencyGuardConfig.newBuilder()
-            .withConsistencyCheckEnabled(eventuallyConsistent)
+          help = "Max checks for eventual consistency") final Integer maxConsistencyChecks)
+      throws IOException {
+    HoodieCLI
+        .setConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(eventuallyConsistent)
             .withInitialConsistencyCheckIntervalMs(initialConsistencyIntervalMs)
-            .withMaxConsistencyCheckIntervalMs(maxConsistencyIntervalMs)
-            .withMaxConsistencyChecks(maxConsistencyChecks)
+            .withMaxConsistencyCheckIntervalMs(maxConsistencyIntervalMs).withMaxConsistencyChecks(maxConsistencyChecks)
            .build());
     HoodieCLI.initConf();
     HoodieCLI.connectTo(path);
@@ -82,7 +81,8 @@ public class DatasetsCommand implements CommandMarker {
       @CliOption(key = {"tableType"}, unspecifiedDefaultValue = "COPY_ON_WRITE",
           help = "Hoodie Table Type. Must be one of : COPY_ON_WRITE or MERGE_ON_READ") final String tableTypeStr,
       @CliOption(key = {"payloadClass"}, unspecifiedDefaultValue = "org.apache.hudi.common.model.HoodieAvroPayload",
-          help = "Payload Class") final String payloadClass) throws IOException {
+          help = "Payload Class") final String payloadClass)
+      throws IOException {
     boolean initialized = HoodieCLI.initConf();
     HoodieCLI.initFS(initialized);
@@ -117,9 +117,7 @@ public class DatasetsCommand implements CommandMarker {
    */
   @CliCommand(value = "desc", help = "Describle Hoodie Table properties")
   public String descTable() {
-    TableHeader header = new TableHeader()
-        .addTableHeaderField("Property")
-        .addTableHeaderField("Value");
+    TableHeader header = new TableHeader().addTableHeaderField("Property").addTableHeaderField("Value");
     List<Comparable[]> rows = new ArrayList<>();
     rows.add(new Comparable[] {"basePath", HoodieCLI.tableMetadata.getBasePath()});
     rows.add(new Comparable[] {"metaPath", HoodieCLI.tableMetadata.getMetaPath()});


@@ -52,24 +52,23 @@ public class FileSystemViewCommand implements CommandMarker {
@CliCommand(value = "show fsview all", help = "Show entire file-system view")
public String showAllFileSlices(
@CliOption(key = {"pathRegex"},
help = "regex to select files, eg: 2016/08/02", unspecifiedDefaultValue = "*/*/*") String globRegex,
@CliOption(key = {"pathRegex"}, help = "regex to select files, eg: 2016/08/02",
unspecifiedDefaultValue = "*/*/*") String globRegex,
@CliOption(key = {"readOptimizedOnly"}, help = "Only display read-optimized view",
unspecifiedDefaultValue = "false") boolean readOptimizedOnly,
@CliOption(key = {"maxInstant"}, help = "File-Slices upto this instant are displayed",
unspecifiedDefaultValue = "") String maxInstant,
@CliOption(key = {
"includeMax"}, help = "Include Max Instant", unspecifiedDefaultValue = "false") boolean includeMaxInstant,
@CliOption(key = {
"includeInflight"}, help = "Include Inflight Instants", unspecifiedDefaultValue = "false")
boolean includeInflight,
@CliOption(key = {"excludeCompaction"}, help = "Exclude compaction Instants", unspecifiedDefaultValue = "false")
boolean excludeCompaction,
@CliOption(key = {"includeMax"}, help = "Include Max Instant",
unspecifiedDefaultValue = "false") boolean includeMaxInstant,
@CliOption(key = {"includeInflight"}, help = "Include Inflight Instants",
unspecifiedDefaultValue = "false") boolean includeInflight,
@CliOption(key = {"excludeCompaction"}, help = "Exclude compaction Instants",
unspecifiedDefaultValue = "false") boolean excludeCompaction,
@CliOption(key = {"limit"}, help = "Limit rows to be displayed", unspecifiedDefaultValue = "-1") Integer limit,
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
@CliOption(key = {
"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly)
@CliOption(key = {"headeronly"}, help = "Print Header Only",
unspecifiedDefaultValue = "false") final boolean headerOnly)
throws IOException {
HoodieTableFileSystemView fsView = buildFileSystemView(globRegex, maxInstant, readOptimizedOnly, includeMaxInstant,
@@ -97,15 +96,10 @@ public class FileSystemViewCommand implements CommandMarker {
fieldNameToConverterMap.put("Total Delta File Size", converterFunction);
fieldNameToConverterMap.put("Data-File Size", converterFunction);
TableHeader header = new TableHeader()
.addTableHeaderField("Partition")
.addTableHeaderField("FileId")
.addTableHeaderField("Base-Instant")
.addTableHeaderField("Data-File")
.addTableHeaderField("Data-File Size");
TableHeader header = new TableHeader().addTableHeaderField("Partition").addTableHeaderField("FileId")
.addTableHeaderField("Base-Instant").addTableHeaderField("Data-File").addTableHeaderField("Data-File Size");
if (!readOptimizedOnly) {
header = header.addTableHeaderField("Num Delta Files")
.addTableHeaderField("Total Delta File Size")
header = header.addTableHeaderField("Num Delta Files").addTableHeaderField("Total Delta File Size")
.addTableHeaderField("Delta Files");
}
return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows);
@@ -113,25 +107,24 @@ public class FileSystemViewCommand implements CommandMarker {
@CliCommand(value = "show fsview latest", help = "Show latest file-system view")
public String showLatestFileSlices(
@CliOption(key = {"partitionPath"},
help = "A valid paritition path", mandatory = true) String partition,
@CliOption(key = {"partitionPath"}, help = "A valid paritition path", mandatory = true) String partition,
@CliOption(key = {"readOptimizedOnly"}, help = "Only display read-optimized view",
unspecifiedDefaultValue = "false") boolean readOptimizedOnly,
@CliOption(key = {"maxInstant"}, help = "File-Slices upto this instant are displayed",
unspecifiedDefaultValue = "") String maxInstant,
@CliOption(key = {"merge"}, help = "Merge File Slices due to pending compaction",
unspecifiedDefaultValue = "true") final boolean merge,
@CliOption(key = {"includeMax"}, help = "Include Max Instant", unspecifiedDefaultValue = "false")
boolean includeMaxInstant,
@CliOption(key = {"includeInflight"}, help = "Include Inflight Instants", unspecifiedDefaultValue = "false")
boolean includeInflight,
@CliOption(key = {"excludeCompaction"}, help = "Exclude compaction Instants", unspecifiedDefaultValue = "false")
boolean excludeCompaction,
@CliOption(key = {"includeMax"}, help = "Include Max Instant",
unspecifiedDefaultValue = "false") boolean includeMaxInstant,
@CliOption(key = {"includeInflight"}, help = "Include Inflight Instants",
unspecifiedDefaultValue = "false") boolean includeInflight,
@CliOption(key = {"excludeCompaction"}, help = "Exclude compaction Instants",
unspecifiedDefaultValue = "false") boolean excludeCompaction,
@CliOption(key = {"limit"}, help = "Limit rows to be displayed", unspecifiedDefaultValue = "-1") Integer limit,
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
@CliOption(key = {
"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly)
@CliOption(key = {"headeronly"}, help = "Print Header Only",
unspecifiedDefaultValue = "false") final boolean headerOnly)
throws IOException {
HoodieTableFileSystemView fsView = buildFileSystemView(partition, maxInstant, readOptimizedOnly, includeMaxInstant,
@@ -163,28 +156,25 @@ public class FileSystemViewCommand implements CommandMarker {
if (!readOptimizedOnly) {
row[idx++] = fs.getLogFiles().count();
row[idx++] = fs.getLogFiles().mapToLong(lf -> lf.getFileSize()).sum();
long logFilesScheduledForCompactionTotalSize = fs.getLogFiles()
.filter(lf -> lf.getBaseCommitTime().equals(fs.getBaseInstantTime()))
long logFilesScheduledForCompactionTotalSize =
fs.getLogFiles().filter(lf -> lf.getBaseCommitTime().equals(fs.getBaseInstantTime()))
.mapToLong(lf -> lf.getFileSize()).sum();
row[idx++] = logFilesScheduledForCompactionTotalSize;
long logFilesUnscheduledTotalSize = fs.getLogFiles()
.filter(lf -> !lf.getBaseCommitTime().equals(fs.getBaseInstantTime()))
long logFilesUnscheduledTotalSize =
fs.getLogFiles().filter(lf -> !lf.getBaseCommitTime().equals(fs.getBaseInstantTime()))
.mapToLong(lf -> lf.getFileSize()).sum();
row[idx++] = logFilesUnscheduledTotalSize;
double logSelectedForCompactionToBaseRatio =
dataFileSize > 0 ? logFilesScheduledForCompactionTotalSize / (dataFileSize * 1.0) : -1;
row[idx++] = logSelectedForCompactionToBaseRatio;
double logUnscheduledToBaseRatio =
dataFileSize > 0 ? logFilesUnscheduledTotalSize / (dataFileSize * 1.0) : -1;
double logUnscheduledToBaseRatio = dataFileSize > 0 ? logFilesUnscheduledTotalSize / (dataFileSize * 1.0) : -1;
row[idx++] = logUnscheduledToBaseRatio;
row[idx++] = fs.getLogFiles()
.filter(lf -> lf.getBaseCommitTime().equals(fs.getBaseInstantTime()))
row[idx++] = fs.getLogFiles().filter(lf -> lf.getBaseCommitTime().equals(fs.getBaseInstantTime()))
.collect(Collectors.toList()).toString();
row[idx++] = fs.getLogFiles()
.filter(lf -> !lf.getBaseCommitTime().equals(fs.getBaseInstantTime()))
row[idx++] = fs.getLogFiles().filter(lf -> !lf.getBaseCommitTime().equals(fs.getBaseInstantTime()))
.collect(Collectors.toList()).toString();
}
rows.add(row);
@@ -200,16 +190,11 @@ public class FileSystemViewCommand implements CommandMarker {
fieldNameToConverterMap.put("Delta Size - compaction unscheduled", converterFunction);
}
TableHeader header = new TableHeader()
.addTableHeaderField("Partition")
.addTableHeaderField("FileId")
.addTableHeaderField("Base-Instant")
.addTableHeaderField("Data-File")
.addTableHeaderField("Data-File Size");
TableHeader header = new TableHeader().addTableHeaderField("Partition").addTableHeaderField("FileId")
.addTableHeaderField("Base-Instant").addTableHeaderField("Data-File").addTableHeaderField("Data-File Size");
if (!readOptimizedOnly) {
header = header.addTableHeaderField("Num Delta Files")
.addTableHeaderField("Total Delta Size")
header = header.addTableHeaderField("Num Delta Files").addTableHeaderField("Total Delta Size")
.addTableHeaderField("Delta Size - compaction scheduled")
.addTableHeaderField("Delta Size - compaction unscheduled")
.addTableHeaderField("Delta To Base Ratio - compaction scheduled")
@@ -222,6 +207,7 @@ public class FileSystemViewCommand implements CommandMarker {
/**
* Build File System View
*
* @param globRegex Path Regex
* @param maxInstant Max Instant to be used for displaying file-instants
* @param readOptimizedOnly Include only read optimized view
@@ -233,8 +219,8 @@ public class FileSystemViewCommand implements CommandMarker {
*/
private HoodieTableFileSystemView buildFileSystemView(String globRegex, String maxInstant, boolean readOptimizedOnly,
boolean includeMaxInstant, boolean includeInflight, boolean excludeCompaction) throws IOException {
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(HoodieCLI.tableMetadata.getHadoopConf(),
HoodieCLI.tableMetadata.getBasePath(), true);
HoodieTableMetaClient metaClient =
new HoodieTableMetaClient(HoodieCLI.tableMetadata.getHadoopConf(), HoodieCLI.tableMetadata.getBasePath(), true);
FileSystem fs = HoodieCLI.fs;
String globPath = String.format("%s/%s/*", HoodieCLI.tableMetadata.getBasePath(), globRegex);
FileStatus[] statuses = fs.globStatus(new Path(globPath));
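The ratio columns computed above divide delta (log) bytes by base-file bytes, returning -1 when there is no base file to compare against. A minimal, self-contained sketch of that arithmetic (class name and sample sizes are hypothetical, not from this commit):

public class RatioSketch {
  // -1 signals "no base file yet", matching the guard in FileSystemViewCommand.
  static double deltaToBaseRatio(long logBytes, long dataFileSize) {
    return dataFileSize > 0 ? logBytes / (dataFileSize * 1.0) : -1;
  }

  public static void main(String[] args) {
    System.out.println(deltaToBaseRatio(25_000_000L, 100_000_000L)); // 0.25
    System.out.println(deltaToBaseRatio(25_000_000L, 0L));           // -1.0
  }
}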
View File
@@ -43,17 +43,17 @@ public class HDFSParquetImportCommand implements CommandMarker {
@CliOption(key = "upsert", mandatory = false, unspecifiedDefaultValue = "false",
help = "Uses upsert API instead of the default insert API of WriteClient") boolean useUpsert,
@CliOption(key = "srcPath", mandatory = true, help = "Base path for the input dataset") final String srcPath,
@CliOption(key = "targetPath", mandatory = true, help = "Base path for the target hoodie dataset") final String
targetPath,
@CliOption(key = "targetPath", mandatory = true,
help = "Base path for the target hoodie dataset") final String targetPath,
@CliOption(key = "tableName", mandatory = true, help = "Table name") final String tableName,
@CliOption(key = "tableType", mandatory = true, help = "Table type") final String tableType,
@CliOption(key = "rowKeyField", mandatory = true, help = "Row key field name") final String rowKeyField,
@CliOption(key = "partitionPathField", mandatory = true, help = "Partition path field name") final String
partitionPathField,
@CliOption(key = {
"parallelism"}, mandatory = true, help = "Parallelism for hoodie insert") final String parallelism,
@CliOption(key = "schemaFilePath", mandatory = true, help = "Path for Avro schema file") final String
schemaFilePath,
@CliOption(key = "partitionPathField", mandatory = true,
help = "Partition path field name") final String partitionPathField,
@CliOption(key = {"parallelism"}, mandatory = true,
help = "Parallelism for hoodie insert") final String parallelism,
@CliOption(key = "schemaFilePath", mandatory = true,
help = "Path for Avro schema file") final String schemaFilePath,
@CliOption(key = "format", mandatory = true, help = "Format for the input data") final String format,
@CliOption(key = "sparkMemory", mandatory = true, help = "Spark executor memory") final String sparkMemory,
@CliOption(key = "retry", mandatory = true, help = "Number of retries") final String retry) throws Exception {
@@ -62,8 +62,8 @@ public class HDFSParquetImportCommand implements CommandMarker {
boolean initialized = HoodieCLI.initConf();
HoodieCLI.initFS(initialized);
String sparkPropertiesPath = Utils.getDefaultPropertiesFile(
JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala());
String sparkPropertiesPath =
Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala());
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
@@ -72,8 +72,8 @@ public class HDFSParquetImportCommand implements CommandMarker {
cmd = SparkCommand.UPSERT.toString();
}
sparkLauncher.addAppArgs(cmd, srcPath, targetPath, tableName, tableType, rowKeyField,
partitionPathField, parallelism, schemaFilePath, sparkMemory, retry);
sparkLauncher.addAppArgs(cmd, srcPath, targetPath, tableName, tableType, rowKeyField, partitionPathField,
parallelism, schemaFilePath, sparkMemory, retry);
Process process = sparkLauncher.launch();
InputStreamConsumer.captureOutput(process);
int exitCode = process.waitFor();
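The import command drives Spark through SparkLauncher rather than an in-process context: build the launcher, append positional arguments, launch, and block on the child process. A stripped-down sketch of that pattern (jar path, main class, and arguments are placeholders, not values from this commit):

import org.apache.spark.launcher.SparkLauncher;

public class LaunchSketch {
  public static void main(String[] args) throws Exception {
    SparkLauncher launcher = new SparkLauncher()
        .setAppResource("/path/to/app.jar")       // placeholder application jar
        .setMainClass("com.example.SparkEntry");  // placeholder main class
    launcher.addAppArgs("IMPORT", "/src/path", "/target/path"); // positional args, like the CLI
    Process process = launcher.launch();
    int exitCode = process.waitFor(); // block until the Spark job finishes
    System.out.println("Spark exited with code " + exitCode);
  }
}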
View File
@@ -69,30 +69,29 @@ public class HoodieLogFileCommand implements CommandMarker {
@CliCommand(value = "show logfile metadata", help = "Read commit metadata from log files")
public String showLogFileCommits(
@CliOption(key = "logFilePathPattern", mandatory = true, help = "Fully qualified path for the log file") final
String logFilePathPattern,
@CliOption(key = "logFilePathPattern", mandatory = true,
help = "Fully qualified path for the log file") final String logFilePathPattern,
@CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
@CliOption(key = {"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false")
final boolean headerOnly) throws IOException {
@CliOption(key = {"headeronly"}, help = "Print Header Only",
unspecifiedDefaultValue = "false") final boolean headerOnly)
throws IOException {
FileSystem fs = HoodieCLI.tableMetadata.getFs();
List<String> logFilePaths = Arrays.stream(fs.globStatus(new Path(logFilePathPattern)))
.map(status -> status.getPath().toString()).collect(Collectors.toList());
Map<String, List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType,
String>>, Integer>>>
commitCountAndMetadata = Maps.newHashMap();
Map<String, List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer>>> commitCountAndMetadata =
Maps.newHashMap();
int totalEntries = 0;
int numCorruptBlocks = 0;
int dummyInstantTimeCount = 0;
for (String logFilePath : logFilePaths) {
FileStatus[] fsStatus = fs.listStatus(new Path(logFilePath));
Schema writerSchema = new AvroSchemaConverter().convert(
SchemaUtil.readSchemaFromLogFile(HoodieCLI.tableMetadata.getFs(), new Path(logFilePath)));
Reader reader = HoodieLogFormat
.newReader(fs, new HoodieLogFile(fsStatus[0].getPath()), writerSchema);
Schema writerSchema = new AvroSchemaConverter()
.convert(SchemaUtil.readSchemaFromLogFile(HoodieCLI.tableMetadata.getFs(), new Path(logFilePath)));
Reader reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(fsStatus[0].getPath()), writerSchema);
// read the avro blocks
while (reader.hasNext()) {
@@ -126,8 +125,8 @@ public class HoodieLogFileCommand implements CommandMarker {
new Tuple3<>(n.getBlockType(), new Tuple2<>(n.getLogBlockHeader(), n.getLogBlockFooter()), recordCount));
totalEntries++;
} else {
List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>,
Integer>> list = new ArrayList<>();
List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer>> list =
new ArrayList<>();
list.add(
new Tuple3<>(n.getBlockType(), new Tuple2<>(n.getLogBlockHeader(), n.getLogBlockFooter()), recordCount));
commitCountAndMetadata.put(instantTime, list);
@@ -139,12 +138,11 @@ public class HoodieLogFileCommand implements CommandMarker {
List<Comparable[]> rows = new ArrayList<>();
int i = 0;
ObjectMapper objectMapper = new ObjectMapper();
for (Map.Entry<String, List<Tuple3<HoodieLogBlockType,
Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer>>> entry
: commitCountAndMetadata.entrySet()) {
for (Map.Entry<String, List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer>>> entry : commitCountAndMetadata
.entrySet()) {
String instantTime = entry.getKey().toString();
for (Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>,
Map<HeaderMetadataType, String>>, Integer> tuple3 : entry.getValue()) {
for (Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer> tuple3 : entry
.getValue()) {
Comparable[] output = new Comparable[5];
output[0] = instantTime;
output[1] = tuple3._3();
@@ -156,21 +154,18 @@ public class HoodieLogFileCommand implements CommandMarker {
}
}
TableHeader header = new TableHeader()
.addTableHeaderField("InstantTime")
.addTableHeaderField("RecordCount")
.addTableHeaderField("BlockType")
.addTableHeaderField("HeaderMetadata")
.addTableHeaderField("FooterMetadata");
TableHeader header = new TableHeader().addTableHeaderField("InstantTime").addTableHeaderField("RecordCount")
.addTableHeaderField("BlockType").addTableHeaderField("HeaderMetadata").addTableHeaderField("FooterMetadata");
return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows);
}
@CliCommand(value = "show logfile records", help = "Read records from log files")
public String showLogFileRecords(@CliOption(key = {
"limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10") final Integer limit,
@CliOption(key = "logFilePathPattern", mandatory = true, help = "Fully qualified paths for the log files")
final String logFilePathPattern,
public String showLogFileRecords(
@CliOption(key = {"limit"}, mandatory = false, help = "Limit commits",
unspecifiedDefaultValue = "10") final Integer limit,
@CliOption(key = "logFilePathPattern", mandatory = true,
help = "Fully qualified paths for the log files") final String logFilePathPattern,
@CliOption(key = "mergeRecords", mandatory = false, help = "If the records in the log files should be merged",
unspecifiedDefaultValue = "false") final Boolean shouldMerge)
throws IOException {
@@ -184,17 +179,16 @@ public class HoodieLogFileCommand implements CommandMarker {
// TODO : readerSchema can change across blocks/log files, fix this inside Scanner
AvroSchemaConverter converter = new AvroSchemaConverter();
// get schema from last log file
Schema readerSchema = converter.convert(
SchemaUtil.readSchemaFromLogFile(fs, new Path(logFilePaths.get(logFilePaths.size() - 1))));
Schema readerSchema =
converter.convert(SchemaUtil.readSchemaFromLogFile(fs, new Path(logFilePaths.get(logFilePaths.size() - 1))));
List<IndexedRecord> allRecords = new ArrayList<>();
if (shouldMerge) {
System.out.println("===========================> MERGING RECORDS <===================");
HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs,
HoodieCLI.tableMetadata.getBasePath(), logFilePaths, readerSchema,
HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline().lastInstant().get()
.getTimestamp(),
HoodieMergedLogRecordScanner scanner =
new HoodieMergedLogRecordScanner(fs, HoodieCLI.tableMetadata.getBasePath(), logFilePaths, readerSchema,
HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline().lastInstant().get().getTimestamp(),
Long.valueOf(HoodieMemoryConfig.DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES),
Boolean.valueOf(HoodieCompactionConfig.DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED),
Boolean.valueOf(HoodieCompactionConfig.DEFAULT_COMPACTION_REVERSE_LOG_READ_ENABLED),
@@ -209,10 +203,10 @@ public class HoodieLogFileCommand implements CommandMarker {
}
} else {
for (String logFile : logFilePaths) {
Schema writerSchema = new AvroSchemaConverter().convert(
SchemaUtil.readSchemaFromLogFile(HoodieCLI.tableMetadata.getFs(), new Path(logFile)));
HoodieLogFormat.Reader reader = HoodieLogFormat
.newReader(fs, new HoodieLogFile(new Path(logFile)), writerSchema);
Schema writerSchema = new AvroSchemaConverter()
.convert(SchemaUtil.readSchemaFromLogFile(HoodieCLI.tableMetadata.getFs(), new Path(logFile)));
HoodieLogFormat.Reader reader =
HoodieLogFormat.newReader(fs, new HoodieLogFile(new Path(logFile)), writerSchema);
// read the avro blocks
while (reader.hasNext()) {
HoodieLogBlock n = reader.next();
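showLogFileCommits accumulates one list of (block type, header/footer metadata, record count) entries per commit instant; the deeply nested Tuple3 generics above are what spotless re-wrapped onto long single lines. The same grouping in plain JDK types (BlockInfo is an invented stand-in for the tuple):

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class GroupingSketch {
  // Hypothetical stand-in for Tuple3<blockType, (header, footer), recordCount>.
  static class BlockInfo {
    final String blockType;
    final int recordCount;
    BlockInfo(String blockType, int recordCount) {
      this.blockType = blockType;
      this.recordCount = recordCount;
    }
  }

  public static void main(String[] args) {
    Map<String, List<BlockInfo>> commitCountAndMetadata = new HashMap<>();
    // computeIfAbsent collapses the put-if-missing branch the command spells out.
    commitCountAndMetadata.computeIfAbsent("20191010101010", k -> new ArrayList<>())
        .add(new BlockInfo("AVRO_DATA_BLOCK", 100));
    commitCountAndMetadata.computeIfAbsent("20191010101010", k -> new ArrayList<>())
        .add(new BlockInfo("DELETE_BLOCK", 5));
    commitCountAndMetadata.forEach((instant, blocks) ->
        System.out.println(instant + " -> " + blocks.size() + " blocks"));
  }
}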
View File
@@ -44,19 +44,16 @@ public class HoodieSyncCommand implements CommandMarker {
public String validateSync(
@CliOption(key = {"mode"}, unspecifiedDefaultValue = "complete", help = "Check mode") final String mode,
@CliOption(key = {"sourceDb"}, unspecifiedDefaultValue = "rawdata", help = "source database") final String srcDb,
@CliOption(key = {
"targetDb"}, unspecifiedDefaultValue = "dwh_hoodie", help = "target database") final String tgtDb,
@CliOption(key = {
"partitionCount"}, unspecifiedDefaultValue = "5", help = "total number of recent partitions to validate")
final int partitionCount,
@CliOption(key = {
"hiveServerUrl"}, mandatory = true, help = "hiveServerURL to connect to") final String hiveServerUrl,
@CliOption(key = {
"hiveUser"}, mandatory = false, unspecifiedDefaultValue = "", help = "hive username to connect to") final
String hiveUser,
@CliOption(key = {
"hivePass"}, mandatory = true, unspecifiedDefaultValue = "", help = "hive password to connect to") final
String hivePass)
@CliOption(key = {"targetDb"}, unspecifiedDefaultValue = "dwh_hoodie",
help = "target database") final String tgtDb,
@CliOption(key = {"partitionCount"}, unspecifiedDefaultValue = "5",
help = "total number of recent partitions to validate") final int partitionCount,
@CliOption(key = {"hiveServerUrl"}, mandatory = true,
help = "hiveServerURL to connect to") final String hiveServerUrl,
@CliOption(key = {"hiveUser"}, mandatory = false, unspecifiedDefaultValue = "",
help = "hive username to connect to") final String hiveUser,
@CliOption(key = {"hivePass"}, mandatory = true, unspecifiedDefaultValue = "",
help = "hive password to connect to") final String hivePass)
throws Exception {
HoodieTableMetaClient target = HoodieCLI.syncTableMetadata;
HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsTimeline();
@@ -77,8 +74,8 @@ public class HoodieSyncCommand implements CommandMarker {
String sourceLatestCommit =
sourceTimeline.getInstants().iterator().hasNext() ? "0" : sourceTimeline.lastInstant().get().getTimestamp();
if (sourceLatestCommit != null && HoodieTimeline.compareTimestamps(targetLatestCommit, sourceLatestCommit,
HoodieTimeline.GREATER)) {
if (sourceLatestCommit != null
&& HoodieTimeline.compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) {
// source is behind the target
List<HoodieInstant> commitsToCatchup = targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE)
.getInstants().collect(Collectors.toList());
@@ -89,8 +86,8 @@ public class HoodieSyncCommand implements CommandMarker {
long newInserts = CommitUtil.countNewRecords(target,
commitsToCatchup.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList()));
return "Count difference now is (count(" + target.getTableConfig().getTableName() + ") - count("
+ source.getTableConfig().getTableName()
+ ") == " + (targetCount - sourceCount) + ". Catch up count is " + newInserts;
+ source.getTableConfig().getTableName() + ") == " + (targetCount - sourceCount) + ". Catch up count is "
+ newInserts;
}
} else {
List<HoodieInstant> commitsToCatchup = sourceTimeline.findInstantsAfter(targetLatestCommit, Integer.MAX_VALUE)
@@ -102,8 +99,8 @@ public class HoodieSyncCommand implements CommandMarker {
long newInserts = CommitUtil.countNewRecords(source,
commitsToCatchup.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList()));
return "Count difference now is (count(" + source.getTableConfig().getTableName() + ") - count("
+ target.getTableConfig().getTableName()
+ ") == " + (sourceCount - targetCount) + ". Catch up count is " + newInserts;
+ target.getTableConfig().getTableName() + ") == " + (sourceCount - targetCount) + ". Catch up count is "
+ newInserts;
}
}
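validateSync decides which table is behind by comparing latest commit timestamps (HoodieTimeline.compareTimestamps is, in effect, a string comparison of the fixed-width instant timestamps) and then formats the count-difference message. A toy version of the reporting step, with made-up table names and counts:

public class SyncReportSketch {
  static String report(String ahead, String behind, long aheadCount, long behindCount, long catchUp) {
    return "Count difference now is (count(" + ahead + ") - count(" + behind + ") == "
        + (aheadCount - behindCount) + ". Catch up count is " + catchUp;
  }

  public static void main(String[] args) {
    String targetLatest = "20191010120000";
    String sourceLatest = "20191010110000";
    boolean sourceBehind = targetLatest.compareTo(sourceLatest) > 0;
    System.out.println(sourceBehind
        ? report("dwh_table", "raw_table", 1_050L, 1_000L, 50L)
        : report("raw_table", "dwh_table", 1_000L, 1_050L, 50L));
  }
}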
View File
@@ -47,16 +47,15 @@ public class RepairsCommand implements CommandMarker {
return HoodieCLI.tableMetadata != null;
}
@CliCommand(value = "repair deduplicate", help = "De-duplicate a partition path contains duplicates & produce "
+ "repaired files to replace with")
public String deduplicate(@CliOption(key = {
"duplicatedPartitionPath"}, help = "Partition Path containing the duplicates", mandatory = true) final String
duplicatedPartitionPath,
@CliOption(key = {
"repairedOutputPath"}, help = "Location to place the repaired files", mandatory = true) final String
repairedOutputPath,
@CliOption(key = {
"sparkProperties"}, help = "Spark Properites File Path", mandatory = true) final String sparkPropertiesPath)
@CliCommand(value = "repair deduplicate",
help = "De-duplicate a partition path contains duplicates & produce " + "repaired files to replace with")
public String deduplicate(
@CliOption(key = {"duplicatedPartitionPath"}, help = "Partition Path containing the duplicates",
mandatory = true) final String duplicatedPartitionPath,
@CliOption(key = {"repairedOutputPath"}, help = "Location to place the repaired files",
mandatory = true) final String repairedOutputPath,
@CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path",
mandatory = true) final String sparkPropertiesPath)
throws Exception {
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
sparkLauncher.addAppArgs(SparkMain.SparkCommand.DEDUPLICATE.toString(), duplicatedPartitionPath, repairedOutputPath,
@@ -73,14 +72,15 @@ public class RepairsCommand implements CommandMarker {
@CliCommand(value = "repair addpartitionmeta", help = "Add partition metadata to a dataset, if not present")
public String addPartitionMeta(@CliOption(key = {
"dryrun"}, help = "Should we actually add or just print what would be done", unspecifiedDefaultValue = "true")
final boolean dryRun) throws IOException {
public String addPartitionMeta(
@CliOption(key = {"dryrun"}, help = "Should we actually add or just print what would be done",
unspecifiedDefaultValue = "true") final boolean dryRun)
throws IOException {
String latestCommit = HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline().lastInstant().get()
.getTimestamp();
List<String> partitionPaths = FSUtils.getAllPartitionFoldersThreeLevelsDown(HoodieCLI.fs,
HoodieCLI.tableMetadata.getBasePath());
String latestCommit =
HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline().lastInstant().get().getTimestamp();
List<String> partitionPaths =
FSUtils.getAllPartitionFoldersThreeLevelsDown(HoodieCLI.fs, HoodieCLI.tableMetadata.getBasePath());
Path basePath = new Path(HoodieCLI.tableMetadata.getBasePath());
String[][] rows = new String[partitionPaths.size() + 1][];
@@ -94,8 +94,8 @@ public class RepairsCommand implements CommandMarker {
if (!HoodiePartitionMetadata.hasPartitionMetadata(HoodieCLI.fs, partitionPath)) {
row[1] = "No";
if (!dryRun) {
HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(HoodieCLI.fs, latestCommit, basePath,
partitionPath);
HoodiePartitionMetadata partitionMetadata =
new HoodiePartitionMetadata(HoodieCLI.fs, latestCommit, basePath, partitionPath);
partitionMetadata.trySave(0);
}
}
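addPartitionMeta is a classic dry-run loop: inspect every partition, report whether metadata is missing, and only mutate when dry-run is off. The shape of that loop, with hypothetical stand-ins for the metadata check and the write:

import java.util.Arrays;
import java.util.List;

public class DryRunSketch {
  static boolean hasMetadata(String partition) { return partition.endsWith("/01"); } // fake check
  static void writeMetadata(String partition) { System.out.println("wrote metadata: " + partition); }

  static void addPartitionMeta(List<String> partitions, boolean dryRun) {
    for (String partition : partitions) {
      if (!hasMetadata(partition)) {
        System.out.println(partition + ": No");
        if (!dryRun) {
          writeMetadata(partition); // only reached outside dry-run, as in the command
        }
      } else {
        System.out.println(partition + ": Yes");
      }
    }
  }

  public static void main(String[] args) {
    addPartitionMeta(Arrays.asList("2016/08/01", "2016/08/02"), true);
  }
}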
View File
@@ -50,8 +50,8 @@ public class RollbacksCommand implements CommandMarker {
@CliOption(key = {"limit"}, help = "Limit #rows to be displayed", unspecifiedDefaultValue = "10") Integer limit,
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
@CliOption(key = {
"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly)
@CliOption(key = {"headeronly"}, help = "Print Header Only",
unspecifiedDefaultValue = "false") final boolean headerOnly)
throws IOException {
HoodieActiveTimeline activeTimeline = new RollbackTimeline(HoodieCLI.tableMetadata);
HoodieTimeline rollback = activeTimeline.getRollbackTimeline().filterCompletedInstants();
@@ -59,8 +59,8 @@ public class RollbacksCommand implements CommandMarker {
final List<Comparable[]> rows = new ArrayList<>();
rollback.getInstants().forEach(instant -> {
try {
HoodieRollbackMetadata metadata = AvroUtils.deserializeAvroMetadata(
activeTimeline.getInstantDetails(instant).get(), HoodieRollbackMetadata.class);
HoodieRollbackMetadata metadata = AvroUtils
.deserializeAvroMetadata(activeTimeline.getInstantDetails(instant).get(), HoodieRollbackMetadata.class);
metadata.getCommitsRollback().forEach(c -> {
Comparable[] row = new Comparable[5];
row[0] = metadata.getStartRollbackTime();
@@ -74,11 +74,8 @@ public class RollbacksCommand implements CommandMarker {
e.printStackTrace();
}
});
TableHeader header = new TableHeader()
.addTableHeaderField("Instant")
.addTableHeaderField("Rolledback Instant")
.addTableHeaderField("Total Files Deleted")
.addTableHeaderField("Time taken in millis")
TableHeader header = new TableHeader().addTableHeaderField("Instant").addTableHeaderField("Rolledback Instant")
.addTableHeaderField("Total Files Deleted").addTableHeaderField("Time taken in millis")
.addTableHeaderField("Total Partitions");
return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows);
}
@@ -89,16 +86,17 @@ public class RollbacksCommand implements CommandMarker {
@CliOption(key = {"limit"}, help = "Limit #rows to be displayed", unspecifiedDefaultValue = "10") Integer limit,
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
@CliOption(key = {
"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly)
@CliOption(key = {"headeronly"}, help = "Print Header Only",
unspecifiedDefaultValue = "false") final boolean headerOnly)
throws IOException {
HoodieActiveTimeline activeTimeline = new RollbackTimeline(HoodieCLI.tableMetadata);
final List<Comparable[]> rows = new ArrayList<>();
HoodieRollbackMetadata metadata = AvroUtils.deserializeAvroMetadata(
activeTimeline.getInstantDetails(new HoodieInstant(State.COMPLETED, ROLLBACK_ACTION, rollbackInstant))
.get(), HoodieRollbackMetadata.class);
activeTimeline.getInstantDetails(new HoodieInstant(State.COMPLETED, ROLLBACK_ACTION, rollbackInstant)).get(),
HoodieRollbackMetadata.class);
metadata.getPartitionMetadata().entrySet().forEach(e -> {
Stream.concat(e.getValue().getSuccessDeleteFiles().stream().map(f -> Pair.of(f, true)),
Stream
.concat(e.getValue().getSuccessDeleteFiles().stream().map(f -> Pair.of(f, true)),
e.getValue().getFailedDeleteFiles().stream().map(f -> Pair.of(f, false)))
.forEach(fileWithDeleteStatus -> {
Comparable[] row = new Comparable[5];
@@ -111,12 +109,8 @@ public class RollbacksCommand implements CommandMarker {
});
});
TableHeader header = new TableHeader()
.addTableHeaderField("Instant")
.addTableHeaderField("Rolledback Instants")
.addTableHeaderField("Partition")
.addTableHeaderField("Deleted File")
.addTableHeaderField("Succeeded");
TableHeader header = new TableHeader().addTableHeaderField("Instant").addTableHeaderField("Rolledback Instants")
.addTableHeaderField("Partition").addTableHeaderField("Deleted File").addTableHeaderField("Succeeded");
return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows);
}
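showRollback flattens successful and failed deletes into a single stream, tagging each file with its delete status before building table rows. The same Stream.concat idiom with JDK-only types (SimpleEntry standing in for Hudi's Pair):

import java.util.AbstractMap.SimpleEntry;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Stream;

public class ConcatSketch {
  public static void main(String[] args) {
    List<String> succeeded = Arrays.asList("file-1.parquet", "file-2.parquet");
    List<String> failed = Arrays.asList("file-3.parquet");
    Stream.concat(succeeded.stream().map(f -> new SimpleEntry<>(f, true)),
        failed.stream().map(f -> new SimpleEntry<>(f, false)))
        .forEach(e -> System.out.println(e.getKey() + " deleted=" + e.getValue()));
  }
}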
View File
@@ -62,8 +62,8 @@ public class SavepointsCommand implements CommandMarker {
@CliAvailabilityIndicator({"savepoint rollback"})
public boolean isRollbackToSavepointAvailable() {
return HoodieCLI.tableMetadata != null && !HoodieCLI.tableMetadata.getActiveTimeline().getSavePointTimeline()
.filterCompletedInstants().empty();
return HoodieCLI.tableMetadata != null
&& !HoodieCLI.tableMetadata.getActiveTimeline().getSavePointTimeline().filterCompletedInstants().empty();
}
@CliCommand(value = "savepoints show", help = "Show the savepoints")
@@ -137,8 +137,8 @@ public class SavepointsCommand implements CommandMarker {
}
private static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath) throws Exception {
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withIndexConfig(
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build();
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath)
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build();
return new HoodieWriteClient(jsc, config, false);
}
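createHoodieClient nests two builders: an index config selecting the BLOOM index, wrapped into the write config. Broken out for readability below, assuming the 0.5.x package layout (the import paths are not shown in the diff and are an assumption here):

import org.apache.hudi.HoodieWriteClient;
import org.apache.hudi.config.HoodieIndexConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.index.HoodieIndex;
import org.apache.spark.api.java.JavaSparkContext;

public class ClientSketch {
  static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath) {
    HoodieIndexConfig indexConfig =
        HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build();
    HoodieWriteConfig config =
        HoodieWriteConfig.newBuilder().withPath(basePath).withIndexConfig(indexConfig).build();
    return new HoodieWriteClient(jsc, config, false); // same three-arg constructor the command uses
  }
}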
View File
@@ -43,8 +43,7 @@ public class SparkMain {
* Commands
*/
enum SparkCommand {
ROLLBACK, DEDUPLICATE, ROLLBACK_TO_SAVEPOINT, SAVEPOINT, IMPORT, UPSERT, COMPACT_SCHEDULE, COMPACT_RUN,
COMPACT_UNSCHEDULE_PLAN, COMPACT_UNSCHEDULE_FILE, COMPACT_VALIDATE, COMPACT_REPAIR
ROLLBACK, DEDUPLICATE, ROLLBACK_TO_SAVEPOINT, SAVEPOINT, IMPORT, UPSERT, COMPACT_SCHEDULE, COMPACT_RUN, COMPACT_UNSCHEDULE_PLAN, COMPACT_UNSCHEDULE_FILE, COMPACT_VALIDATE, COMPACT_REPAIR
}
public static void main(String[] args) throws Exception {
@@ -76,13 +75,12 @@ public class SparkMain {
break;
case COMPACT_RUN:
assert (args.length == 8);
returnCode = compact(jsc, args[1], args[2], args[3], Integer.parseInt(args[4]),
args[5], args[6], Integer.parseInt(args[7]), false);
returnCode = compact(jsc, args[1], args[2], args[3], Integer.parseInt(args[4]), args[5], args[6],
Integer.parseInt(args[7]), false);
break;
case COMPACT_SCHEDULE:
assert (args.length == 5);
returnCode = compact(jsc, args[1], args[2], args[3], 1,
"", args[4], 0, true);
returnCode = compact(jsc, args[1], args[2], args[3], 1, "", args[4], 0, true);
break;
case COMPACT_VALIDATE:
assert (args.length == 7);
@@ -113,8 +111,7 @@ public class SparkMain {
System.exit(returnCode);
}
private static int dataLoad(JavaSparkContext jsc, String command,
String srcPath, String targetPath, String tableName,
private static int dataLoad(JavaSparkContext jsc, String command, String srcPath, String targetPath, String tableName,
String tableType, String rowKey, String partitionKey, int parallelism, String schemaFile, String sparkMaster,
String sparkMemory, int retry) throws Exception {
Config cfg = new Config();
@@ -180,9 +177,9 @@ public class SparkMain {
new HoodieCompactionAdminTool(cfg).run(jsc);
}
private static void doCompactUnscheduleFile(JavaSparkContext jsc, String basePath, String fileId,
String outputPath, int parallelism, String sparkMaster, String sparkMemory, boolean skipValidation,
boolean dryRun) throws Exception {
private static void doCompactUnscheduleFile(JavaSparkContext jsc, String basePath, String fileId, String outputPath,
int parallelism, String sparkMaster, String sparkMemory, boolean skipValidation, boolean dryRun)
throws Exception {
HoodieCompactionAdminTool.Config cfg = new HoodieCompactionAdminTool.Config();
cfg.basePath = basePath;
cfg.operation = Operation.UNSCHEDULE_FILE;
@@ -244,8 +241,8 @@ public class SparkMain {
}
private static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath) throws Exception {
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withIndexConfig(
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build();
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath)
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build();
return new HoodieWriteClient(jsc, config);
}
}
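SparkMain keys off the first CLI argument, asserts an exact argument count per command, and exits with the handler's return code; note that the asserts only fire when the JVM runs with -ea. A trimmed dispatch sketch with illustrative commands and no real handlers:

public class DispatchSketch {
  enum Command { ROLLBACK, COMPACT_SCHEDULE }

  public static void main(String[] args) {
    args = new String[] {"COMPACT_SCHEDULE", "basePath", "tableName", "instant", "sparkMemory"};
    int returnCode = -1;
    switch (Command.valueOf(args[0])) {
      case COMPACT_SCHEDULE:
        assert (args.length == 5); // checked only under -ea
        returnCode = 0; // would delegate to compact(...) here
        break;
      case ROLLBACK:
        assert (args.length == 3);
        returnCode = 0; // would delegate to rollback(...) here
        break;
      default:
        break;
    }
    System.exit(returnCode);
  }
}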
View File
@@ -63,8 +63,9 @@ public class StatsCommand implements CommandMarker {
@CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
@CliOption(key = {"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false")
final boolean headerOnly) throws IOException {
@CliOption(key = {"headeronly"}, help = "Print Header Only",
unspecifiedDefaultValue = "false") final boolean headerOnly)
throws IOException {
long totalRecordsUpserted = 0;
long totalRecordsWritten = 0;
@@ -93,31 +94,26 @@ public class StatsCommand implements CommandMarker {
}
rows.add(new Comparable[] {"Total", totalRecordsUpserted, totalRecordsWritten, waf});
TableHeader header = new TableHeader()
.addTableHeaderField("CommitTime")
.addTableHeaderField("Total Upserted")
.addTableHeaderField("Total Written")
.addTableHeaderField("Write Amplifiation Factor");
TableHeader header = new TableHeader().addTableHeaderField("CommitTime").addTableHeaderField("Total Upserted")
.addTableHeaderField("Total Written").addTableHeaderField("Write Amplifiation Factor");
return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows);
}
private Comparable[] printFileSizeHistogram(String commitTime, Snapshot s) {
return new Comparable[]{commitTime, s.getMin(),
s.getValue(0.1), s.getMedian(),
s.getMean(), s.get95thPercentile(),
s.getMax(), s.size(),
s.getStdDev()};
return new Comparable[] {commitTime, s.getMin(), s.getValue(0.1), s.getMedian(), s.getMean(), s.get95thPercentile(),
s.getMax(), s.size(), s.getStdDev()};
}
@CliCommand(value = "stats filesizes", help = "File Sizes. Display summary stats on sizes of files")
public String fileSizeStats(
@CliOption(key = {"partitionPath"},
help = "regex to select files, eg: 2016/08/02", unspecifiedDefaultValue = "*/*/*") final String globRegex,
@CliOption(key = {"partitionPath"}, help = "regex to select files, eg: 2016/08/02",
unspecifiedDefaultValue = "*/*/*") final String globRegex,
@CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
@CliOption(key = {"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false")
final boolean headerOnly) throws IOException {
@CliOption(key = {"headeronly"}, help = "Print Header Only",
unspecifiedDefaultValue = "false") final boolean headerOnly)
throws IOException {
FileSystem fs = HoodieCLI.fs;
String globPath = String.format("%s/%s/*", HoodieCLI.tableMetadata.getBasePath(), globRegex);
@@ -145,8 +141,8 @@ public class StatsCommand implements CommandMarker {
Snapshot s = globalHistogram.getSnapshot();
rows.add(printFileSizeHistogram("ALL", s));
Function<Object, String> converterFunction = entry ->
NumericUtils.humanReadableByteCount((Double.valueOf(entry.toString())));
Function<Object, String> converterFunction =
entry -> NumericUtils.humanReadableByteCount((Double.valueOf(entry.toString())));
Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();
fieldNameToConverterMap.put("Min", converterFunction);
fieldNameToConverterMap.put("10th", converterFunction);
@@ -156,16 +152,9 @@ public class StatsCommand implements CommandMarker {
fieldNameToConverterMap.put("Max", converterFunction);
fieldNameToConverterMap.put("StdDev", converterFunction);
TableHeader header = new TableHeader()
.addTableHeaderField("CommitTime")
.addTableHeaderField("Min")
.addTableHeaderField("10th")
.addTableHeaderField("50th")
.addTableHeaderField("avg")
.addTableHeaderField("95th")
.addTableHeaderField("Max")
.addTableHeaderField("NumFiles")
.addTableHeaderField("StdDev");
TableHeader header = new TableHeader().addTableHeaderField("CommitTime").addTableHeaderField("Min")
.addTableHeaderField("10th").addTableHeaderField("50th").addTableHeaderField("avg").addTableHeaderField("95th")
.addTableHeaderField("Max").addTableHeaderField("NumFiles").addTableHeaderField("StdDev");
return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows);
}
}
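Every size column in the file-size table is routed through NumericUtils.humanReadableByteCount. Hudi's implementation is not shown in this diff; the snippet below is the widely used binary-prefix formulation, included only to illustrate what the converter plausibly does:

public class ByteFormatSketch {
  // Assumed behavior; Hudi's NumericUtils may differ in rounding or units.
  static String humanReadableByteCount(double bytes) {
    if (bytes < 1024) {
      return String.format("%.1f B", bytes);
    }
    int exp = (int) (Math.log(bytes) / Math.log(1024));
    char prefix = "KMGTPE".charAt(exp - 1);
    return String.format("%.1f %sB", bytes / Math.pow(1024, exp), prefix);
  }

  public static void main(String[] args) {
    System.out.println(humanReadableByteCount(134_217_728d)); // 128.0 MB
  }
}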
View File
@@ -52,8 +52,7 @@ public class HiveUtil {
stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat");
stmt.execute("set hive.stats.autogather=false");
rs = stmt.executeQuery(
"select count(`_hoodie_commit_time`) as cnt from " + dbName + "."
+ source.getTableConfig().getTableName());
"select count(`_hoodie_commit_time`) as cnt from " + dbName + "." + source.getTableConfig().getTableName());
long count = -1;
if (rs.next()) {
count = rs.getLong("cnt");
View File
@@ -40,8 +40,8 @@ public class SparkUtil {
public static SparkLauncher initLauncher(String propertiesFile) throws URISyntaxException {
String currentJar = new File(SparkUtil.class.getProtectionDomain().getCodeSource().getLocation().toURI().getPath())
.getAbsolutePath();
SparkLauncher sparkLauncher = new SparkLauncher().setAppResource(currentJar)
.setMainClass(SparkMain.class.getName());
SparkLauncher sparkLauncher =
new SparkLauncher().setAppResource(currentJar).setMainClass(SparkMain.class.getName());
if (!StringUtils.isNullOrEmpty(propertiesFile)) {
sparkLauncher.setPropertiesFile(propertiesFile);
View File
@@ -26,6 +26,10 @@
<artifactId>hudi-client</artifactId>
<packaging>jar</packaging>
<properties>
<main.basedir>${project.parent.basedir}</main.basedir>
</properties>
<build>
<plugins>
<plugin>
View File
@@ -32,8 +32,8 @@ import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaSparkContext;
/**
* Abstract class taking care of holding common member variables (FileSystem, SparkContext, HoodieConfigs).
* Also manages embedded timeline-server if enabled.
* Abstract class taking care of holding common member variables (FileSystem, SparkContext, HoodieConfigs). Also manages
* embedded timeline-server if enabled.
*/
public abstract class AbstractHoodieClient implements Serializable, AutoCloseable {
@@ -45,10 +45,9 @@ public abstract class AbstractHoodieClient implements Serializable, AutoCloseabl
protected final String basePath;
/**
* Timeline Server has the same lifetime as that of Client.
* Any operations done on the same timeline service will be able to take advantage
* of the cached file-system view. New completed actions will be synced automatically
* in an incremental fashion.
* Timeline Server has the same lifetime as that of Client. Any operations done on the same timeline service will be
* able to take advantage of the cached file-system view. New completed actions will be synced automatically in an
* incremental fashion.
*/
private transient Option<EmbeddedTimelineService> timelineServer;
private final boolean shouldStopTimelineServer;
View File
@@ -69,8 +69,7 @@ public class CompactionAdminClient extends AbstractHoodieClient {
super(jsc, HoodieWriteConfig.newBuilder().withPath(basePath).build());
}
public CompactionAdminClient(JavaSparkContext jsc, String basePath,
Option<EmbeddedTimelineService> timelineServer) {
public CompactionAdminClient(JavaSparkContext jsc, String basePath, Option<EmbeddedTimelineService> timelineServer) {
super(jsc, HoodieWriteConfig.newBuilder().withPath(basePath).build(), timelineServer);
}
@@ -81,8 +80,8 @@ public class CompactionAdminClient extends AbstractHoodieClient {
* @param metaClient Hoodie Table Meta Client
* @param compactionInstant Compaction Instant
*/
public List<ValidationOpResult> validateCompactionPlan(HoodieTableMetaClient metaClient,
String compactionInstant, int parallelism) throws IOException {
public List<ValidationOpResult> validateCompactionPlan(HoodieTableMetaClient metaClient, String compactionInstant,
int parallelism) throws IOException {
HoodieCompactionPlan plan = getCompactionPlan(metaClient, compactionInstant);
HoodieTableFileSystemView fsView =
new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
@@ -112,15 +111,13 @@ public class CompactionAdminClient extends AbstractHoodieClient {
* @param parallelism Parallelism
* @param dryRun Dry Run
*/
public List<RenameOpResult> unscheduleCompactionPlan(
String compactionInstant, boolean skipValidation, int parallelism, boolean dryRun) throws Exception {
public List<RenameOpResult> unscheduleCompactionPlan(String compactionInstant, boolean skipValidation,
int parallelism, boolean dryRun) throws Exception {
HoodieTableMetaClient metaClient = createMetaClient(false);
List<Pair<HoodieLogFile, HoodieLogFile>> renameActions =
getRenamingActionsForUnschedulingCompactionPlan(metaClient, compactionInstant, parallelism,
Option.empty(), skipValidation);
List<Pair<HoodieLogFile, HoodieLogFile>> renameActions = getRenamingActionsForUnschedulingCompactionPlan(metaClient,
compactionInstant, parallelism, Option.empty(), skipValidation);
List<RenameOpResult> res =
runRenamingOps(metaClient, renameActions, parallelism, dryRun);
List<RenameOpResult> res = runRenamingOps(metaClient, renameActions, parallelism, dryRun);
Option<Boolean> success =
Option.fromJavaOptional(res.stream().map(r -> (r.isExecuted() && r.isSuccess())).reduce(Boolean::logicalAnd));
@@ -145,8 +142,8 @@ public class CompactionAdminClient extends AbstractHoodieClient {
}
/**
* Remove a fileId from pending compaction. Removes the associated compaction operation and rename delta-files
* that were generated for that file-id after the compaction operation was scheduled.
* Remove a fileId from pending compaction. Removes the associated compaction operation and rename delta-files that
* were generated for that file-id after the compaction operation was scheduled.
*
* This operation MUST be executed with compactions and writer turned OFF.
*
@@ -154,12 +151,11 @@ public class CompactionAdminClient extends AbstractHoodieClient {
* @param skipValidation Skip validation
* @param dryRun Dry Run Mode
*/
public List<RenameOpResult> unscheduleCompactionFileId(HoodieFileGroupId fgId,
boolean skipValidation, boolean dryRun) throws Exception {
public List<RenameOpResult> unscheduleCompactionFileId(HoodieFileGroupId fgId, boolean skipValidation, boolean dryRun)
throws Exception {
HoodieTableMetaClient metaClient = createMetaClient(false);
List<Pair<HoodieLogFile, HoodieLogFile>> renameActions =
getRenamingActionsForUnschedulingCompactionForFileId(metaClient, fgId,
Option.empty(), skipValidation);
getRenamingActionsForUnschedulingCompactionForFileId(metaClient, fgId, Option.empty(), skipValidation);
List<RenameOpResult> res = runRenamingOps(metaClient, renameActions, 1, dryRun);
@@ -167,15 +163,15 @@ public class CompactionAdminClient extends AbstractHoodieClient {
// Ready to remove this file-Id from compaction request
Pair<String, HoodieCompactionOperation> compactionOperationWithInstant =
CompactionUtils.getAllPendingCompactionOperations(metaClient).get(fgId);
HoodieCompactionPlan plan = CompactionUtils
.getCompactionPlan(metaClient, compactionOperationWithInstant.getKey());
List<HoodieCompactionOperation> newOps = plan.getOperations().stream()
.filter(op -> (!op.getFileId().equals(fgId.getFileId()))
&& (!op.getPartitionPath().equals(fgId.getPartitionPath()))).collect(Collectors.toList());
HoodieCompactionPlan plan =
CompactionUtils.getCompactionPlan(metaClient, compactionOperationWithInstant.getKey());
List<HoodieCompactionOperation> newOps = plan.getOperations().stream().filter(
op -> (!op.getFileId().equals(fgId.getFileId())) && (!op.getPartitionPath().equals(fgId.getPartitionPath())))
.collect(Collectors.toList());
HoodieCompactionPlan newPlan =
HoodieCompactionPlan.newBuilder().setOperations(newOps).setExtraMetadata(plan.getExtraMetadata()).build();
HoodieInstant inflight = new HoodieInstant(State.INFLIGHT, COMPACTION_ACTION,
compactionOperationWithInstant.getLeft());
HoodieInstant inflight =
new HoodieInstant(State.INFLIGHT, COMPACTION_ACTION, compactionOperationWithInstant.getLeft());
Path inflightPath = new Path(metaClient.getMetaPath(), inflight.getFileName());
if (metaClient.getFs().exists(inflightPath)) {
// revert if in inflight state
@@ -189,28 +185,28 @@ public class CompactionAdminClient extends AbstractHoodieClient {
}
/**
* Renames delta files to make file-slices consistent with the timeline as dictated by Hoodie metadata.
* Use when compaction unscheduling fails partially.
* Renames delta files to make file-slices consistent with the timeline as dictated by Hoodie metadata. Use when
* compaction unscheduling fails partially.
*
* This operation MUST be executed with compactions and writer turned OFF.
*
* @param compactionInstant Compaction Instant to be repaired
* @param dryRun Dry Run Mode
*/
public List<RenameOpResult> repairCompaction(String compactionInstant,
int parallelism, boolean dryRun) throws Exception {
public List<RenameOpResult> repairCompaction(String compactionInstant, int parallelism, boolean dryRun)
throws Exception {
HoodieTableMetaClient metaClient = createMetaClient(false);
List<ValidationOpResult> validationResults =
validateCompactionPlan(metaClient, compactionInstant, parallelism);
List<ValidationOpResult> failed = validationResults.stream()
.filter(v -> !v.isSuccess()).collect(Collectors.toList());
List<ValidationOpResult> validationResults = validateCompactionPlan(metaClient, compactionInstant, parallelism);
List<ValidationOpResult> failed =
validationResults.stream().filter(v -> !v.isSuccess()).collect(Collectors.toList());
if (failed.isEmpty()) {
return new ArrayList<>();
}
final HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient,
metaClient.getCommitsAndCompactionTimeline());
List<Pair<HoodieLogFile, HoodieLogFile>> renameActions = failed.stream().flatMap(v ->
getRenamingActionsToAlignWithCompactionOperation(metaClient, compactionInstant,
final HoodieTableFileSystemView fsView =
new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
List<Pair<HoodieLogFile, HoodieLogFile>> renameActions =
failed.stream().flatMap(v -> getRenamingActionsToAlignWithCompactionOperation(metaClient, compactionInstant,
v.getOperation(), Option.of(fsView)).stream()).collect(Collectors.toList());
return runRenamingOps(metaClient, renameActions, parallelism, dryRun);
}
@@ -218,11 +214,10 @@ public class CompactionAdminClient extends AbstractHoodieClient {
/**
* Construct Compaction Plan from compaction instant
*/
private static HoodieCompactionPlan getCompactionPlan(HoodieTableMetaClient metaClient,
String compactionInstant) throws IOException {
HoodieCompactionPlan compactionPlan = AvroUtils.deserializeCompactionPlan(
metaClient.getActiveTimeline().getInstantAuxiliaryDetails(
HoodieTimeline.getCompactionRequestedInstant(compactionInstant)).get());
private static HoodieCompactionPlan getCompactionPlan(HoodieTableMetaClient metaClient, String compactionInstant)
throws IOException {
HoodieCompactionPlan compactionPlan = AvroUtils.deserializeCompactionPlan(metaClient.getActiveTimeline()
.getInstantAuxiliaryDetails(HoodieTimeline.getCompactionRequestedInstant(compactionInstant)).get());
return compactionPlan;
}
@@ -238,20 +233,18 @@ public class CompactionAdminClient extends AbstractHoodieClient {
protected static List<Pair<HoodieLogFile, HoodieLogFile>> getRenamingActionsToAlignWithCompactionOperation(
HoodieTableMetaClient metaClient, String compactionInstant, CompactionOperation op,
Option<HoodieTableFileSystemView> fsViewOpt) {
HoodieTableFileSystemView fileSystemView = fsViewOpt.isPresent() ? fsViewOpt.get() :
new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
HoodieTableFileSystemView fileSystemView = fsViewOpt.isPresent() ? fsViewOpt.get()
: new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
HoodieInstant lastInstant = metaClient.getCommitsAndCompactionTimeline().lastInstant().get();
FileSlice merged =
fileSystemView.getLatestMergedFileSlicesBeforeOrOn(op.getPartitionPath(), lastInstant.getTimestamp())
.filter(fs -> fs.getFileId().equals(op.getFileId())).findFirst().get();
final int maxVersion =
op.getDeltaFilePaths().stream().map(lf -> FSUtils.getFileVersionFromLog(new Path(lf)))
final int maxVersion = op.getDeltaFilePaths().stream().map(lf -> FSUtils.getFileVersionFromLog(new Path(lf)))
.reduce((x, y) -> x > y ? x : y).orElse(0);
List<HoodieLogFile> logFilesToBeMoved =
merged.getLogFiles().filter(lf -> lf.getLogVersion() > maxVersion).collect(Collectors.toList());
return logFilesToBeMoved.stream().map(lf -> {
Preconditions.checkArgument(lf.getLogVersion() - maxVersion > 0,
"Expect new log version to be sane");
Preconditions.checkArgument(lf.getLogVersion() - maxVersion > 0, "Expect new log version to be sane");
HoodieLogFile newLogFile = new HoodieLogFile(new Path(lf.getPath().getParent(),
FSUtils.makeLogFileName(lf.getFileId(), "." + FSUtils.getFileExtensionFromLog(lf.getPath()),
compactionInstant, lf.getLogVersion() - maxVersion, HoodieLogFormat.UNKNOWN_WRITE_TOKEN)));
@@ -285,11 +278,10 @@ public class CompactionAdminClient extends AbstractHoodieClient {
* @param operation Compaction Operation
* @param fsViewOpt File System View
*/
private ValidationOpResult validateCompactionOperation(HoodieTableMetaClient metaClient,
String compactionInstant, CompactionOperation operation, Option<HoodieTableFileSystemView> fsViewOpt)
throws IOException {
HoodieTableFileSystemView fileSystemView = fsViewOpt.isPresent() ? fsViewOpt.get() :
new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
private ValidationOpResult validateCompactionOperation(HoodieTableMetaClient metaClient, String compactionInstant,
CompactionOperation operation, Option<HoodieTableFileSystemView> fsViewOpt) throws IOException {
HoodieTableFileSystemView fileSystemView = fsViewOpt.isPresent() ? fsViewOpt.get()
: new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
Option<HoodieInstant> lastInstant = metaClient.getCommitsAndCompactionTimeline().lastInstant();
try {
if (lastInstant.isPresent()) {
@@ -300,16 +292,15 @@ public class CompactionAdminClient extends AbstractHoodieClient {
FileSlice fs = fileSliceOptional.get();
Option<HoodieDataFile> df = fs.getDataFile();
if (operation.getDataFilePath().isPresent()) {
String expPath = metaClient.getFs().getFileStatus(new Path(operation.getDataFilePath().get())).getPath()
.toString();
Preconditions.checkArgument(df.isPresent(), "Data File must be present. File Slice was : "
+ fs + ", operation :" + operation);
String expPath =
metaClient.getFs().getFileStatus(new Path(operation.getDataFilePath().get())).getPath().toString();
Preconditions.checkArgument(df.isPresent(),
"Data File must be present. File Slice was : " + fs + ", operation :" + operation);
Preconditions.checkArgument(df.get().getPath().equals(expPath),
"Base Path in operation is specified as " + expPath + " but got path " + df.get().getPath());
}
Set<HoodieLogFile> logFilesInFileSlice = fs.getLogFiles().collect(Collectors.toSet());
Set<HoodieLogFile> logFilesInCompactionOp = operation.getDeltaFilePaths().stream()
.map(dp -> {
Set<HoodieLogFile> logFilesInCompactionOp = operation.getDeltaFilePaths().stream().map(dp -> {
try {
FileStatus[] fileStatuses = metaClient.getFs().listStatus(new Path(dp));
Preconditions.checkArgument(fileStatuses.length == 1, "Expect only 1 file-status");
@@ -320,25 +311,23 @@ public class CompactionAdminClient extends AbstractHoodieClient {
throw new HoodieIOException(ioe.getMessage(), ioe);
}
}).collect(Collectors.toSet());
Set<HoodieLogFile> missing =
logFilesInCompactionOp.stream().filter(lf -> !logFilesInFileSlice.contains(lf))
Set<HoodieLogFile> missing = logFilesInCompactionOp.stream().filter(lf -> !logFilesInFileSlice.contains(lf))
.collect(Collectors.toSet());
Preconditions.checkArgument(missing.isEmpty(),
"All log files specified in compaction operation is not present. Missing :" + missing
+ ", Exp :" + logFilesInCompactionOp + ", Got :" + logFilesInFileSlice);
Set<HoodieLogFile> diff =
logFilesInFileSlice.stream().filter(lf -> !logFilesInCompactionOp.contains(lf))
"All log files specified in compaction operation is not present. Missing :" + missing + ", Exp :"
+ logFilesInCompactionOp + ", Got :" + logFilesInFileSlice);
Set<HoodieLogFile> diff = logFilesInFileSlice.stream().filter(lf -> !logFilesInCompactionOp.contains(lf))
.collect(Collectors.toSet());
Preconditions.checkArgument(diff.stream().allMatch(lf -> lf.getBaseCommitTime().equals(compactionInstant)),
"There are some log-files which are neither specified in compaction plan "
+ "nor present after compaction request instant. Some of these :" + diff);
} else {
throw new CompactionValidationException("Unable to find file-slice for file-id (" + operation.getFileId()
+ " Compaction operation is invalid.");
throw new CompactionValidationException(
"Unable to find file-slice for file-id (" + operation.getFileId() + " Compaction operation is invalid.");
}
} else {
throw new CompactionValidationException("Unable to find any committed instant. Compaction Operation may "
+ "be pointing to stale file-slices");
throw new CompactionValidationException(
"Unable to find any committed instant. Compaction Operation may " + "be pointing to stale file-slices");
}
} catch (CompactionValidationException | IllegalArgumentException e) {
return new ValidationOpResult(operation, false, Option.of(e));
@@ -374,8 +363,7 @@ public class CompactionAdminClient extends AbstractHoodieClient {
}).collect();
} else {
log.info("Dry-Run Mode activated for rename operations");
return renameActions.parallelStream()
.map(lfPair -> new RenameOpResult(lfPair, false, false, Option.empty()))
return renameActions.parallelStream().map(lfPair -> new RenameOpResult(lfPair, false, false, Option.empty()))
.collect(Collectors.toList());
}
}
@@ -395,18 +383,18 @@ public class CompactionAdminClient extends AbstractHoodieClient {
protected List<Pair<HoodieLogFile, HoodieLogFile>> getRenamingActionsForUnschedulingCompactionPlan(
HoodieTableMetaClient metaClient, String compactionInstant, int parallelism,
Option<HoodieTableFileSystemView> fsViewOpt, boolean skipValidation) throws IOException {
HoodieTableFileSystemView fsView = fsViewOpt.isPresent() ? fsViewOpt.get() :
new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
HoodieTableFileSystemView fsView = fsViewOpt.isPresent() ? fsViewOpt.get()
: new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
HoodieCompactionPlan plan = getCompactionPlan(metaClient, compactionInstant);
if (plan.getOperations() != null) {
log.info("Number of Compaction Operations :" + plan.getOperations().size()
+ " for instant :" + compactionInstant);
log.info(
"Number of Compaction Operations :" + plan.getOperations().size() + " for instant :" + compactionInstant);
List<CompactionOperation> ops = plan.getOperations().stream()
.map(CompactionOperation::convertFromAvroRecordInstance).collect(Collectors.toList());
return jsc.parallelize(ops, parallelism).flatMap(op -> {
try {
return getRenamingActionsForUnschedulingCompactionOperation(metaClient, compactionInstant,
op, Option.of(fsView), skipValidation).iterator();
return getRenamingActionsForUnschedulingCompactionOperation(metaClient, compactionInstant, op,
Option.of(fsView), skipValidation).iterator();
} catch (IOException ioe) {
throw new HoodieIOException(ioe.getMessage(), ioe);
} catch (CompactionValidationException ve) {
@@ -434,8 +422,8 @@ public class CompactionAdminClient extends AbstractHoodieClient {
HoodieTableMetaClient metaClient, String compactionInstant, CompactionOperation operation,
Option<HoodieTableFileSystemView> fsViewOpt, boolean skipValidation) throws IOException {
List<Pair<HoodieLogFile, HoodieLogFile>> result = new ArrayList<>();
HoodieTableFileSystemView fileSystemView = fsViewOpt.isPresent() ? fsViewOpt.get() :
new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
HoodieTableFileSystemView fileSystemView = fsViewOpt.isPresent() ? fsViewOpt.get()
: new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
if (!skipValidation) {
validateCompactionOperation(metaClient, compactionInstant, operation, Option.of(fileSystemView));
}
@@ -445,13 +433,11 @@ public class CompactionAdminClient extends AbstractHoodieClient {
.filter(fs -> fs.getFileId().equals(operation.getFileId())).findFirst().get();
List<HoodieLogFile> logFilesToRepair =
merged.getLogFiles().filter(lf -> lf.getBaseCommitTime().equals(compactionInstant))
.sorted(HoodieLogFile.getLogFileComparator())
.collect(Collectors.toList());
.sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList());
FileSlice fileSliceForCompaction =
fileSystemView.getLatestFileSlicesBeforeOrOn(operation.getPartitionPath(), operation.getBaseInstantTime(), true)
.filter(fs -> fs.getFileId().equals(operation.getFileId())).findFirst().get();
int maxUsedVersion =
fileSliceForCompaction.getLogFiles().findFirst().map(HoodieLogFile::getLogVersion)
int maxUsedVersion = fileSliceForCompaction.getLogFiles().findFirst().map(HoodieLogFile::getLogVersion)
.orElse(HoodieLogFile.LOGFILE_BASE_VERSION - 1);
String logExtn = fileSliceForCompaction.getLogFiles().findFirst().map(lf -> "." + lf.getFileExtension())
.orElse(HoodieLogFile.DELTA_EXTENSION);
@@ -479,8 +465,8 @@ public class CompactionAdminClient extends AbstractHoodieClient {
* compaction.
*/
public List<Pair<HoodieLogFile, HoodieLogFile>> getRenamingActionsForUnschedulingCompactionForFileId(
HoodieTableMetaClient metaClient, HoodieFileGroupId fgId,
Option<HoodieTableFileSystemView> fsViewOpt, boolean skipValidation) throws IOException {
HoodieTableMetaClient metaClient, HoodieFileGroupId fgId, Option<HoodieTableFileSystemView> fsViewOpt,
boolean skipValidation) throws IOException {
Map<HoodieFileGroupId, Pair<String, HoodieCompactionOperation>> allPendingCompactions =
CompactionUtils.getAllPendingCompactionOperations(metaClient);
if (allPendingCompactions.containsKey(fgId)) {
@@ -496,20 +482,19 @@ public class CompactionAdminClient extends AbstractHoodieClient {
*/
public static class RenameOpResult extends OperationResult<RenameInfo> {
public RenameOpResult() {
public RenameOpResult() {}
public RenameOpResult(Pair<HoodieLogFile, HoodieLogFile> op, boolean success, Option<Exception> exception) {
super(
new RenameInfo(op.getKey().getFileId(), op.getKey().getPath().toString(), op.getRight().getPath().toString()),
success, exception);
}
public RenameOpResult(Pair<HoodieLogFile, HoodieLogFile> op, boolean success,
public RenameOpResult(Pair<HoodieLogFile, HoodieLogFile> op, boolean executed, boolean success,
Option<Exception> exception) {
super(new RenameInfo(op.getKey().getFileId(), op.getKey().getPath().toString(),
op.getRight().getPath().toString()), success, exception);
}
public RenameOpResult(
Pair<HoodieLogFile, HoodieLogFile> op, boolean executed, boolean success,
Option<Exception> exception) {
super(new RenameInfo(op.getKey().getFileId(), op.getKey().getPath().toString(),
op.getRight().getPath().toString()), executed, success, exception);
super(
new RenameInfo(op.getKey().getFileId(), op.getKey().getPath().toString(), op.getRight().getPath().toString()),
executed, success, exception);
}
}
@@ -518,11 +503,9 @@ public class CompactionAdminClient extends AbstractHoodieClient {
*/
public static class ValidationOpResult extends OperationResult<CompactionOperation> {
public ValidationOpResult() {
}
public ValidationOpResult() {}
public ValidationOpResult(
CompactionOperation operation, boolean success, Option<Exception> exception) {
public ValidationOpResult(CompactionOperation operation, boolean success, Option<Exception> exception) {
super(operation, success, exception);
}
}
@@ -533,8 +516,7 @@ public class CompactionAdminClient extends AbstractHoodieClient {
public String srcPath;
public String destPath;
public RenameInfo() {
}
public RenameInfo() {}
public RenameInfo(String fileId, String srcPath, String destPath) {
this.fileId = fileId;
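
The renaming logic above derives the next log-file version from the highest version already present, falling back to LOGFILE_BASE_VERSION - 1 when the file slice has no log files yet. A minimal, self-contained sketch of that arithmetic (LOGFILE_BASE_VERSION assumed to be 1, as in HoodieLogFile):

import java.util.stream.Stream;

public class LogVersionSketch {
  // Assumed default, mirroring HoodieLogFile.LOGFILE_BASE_VERSION.
  static final int LOGFILE_BASE_VERSION = 1;

  // One past the highest existing version; LOGFILE_BASE_VERSION when no log file exists.
  static int nextVersion(Stream<Integer> existingVersions) {
    int maxUsed = existingVersions.max(Integer::compare).orElse(LOGFILE_BASE_VERSION - 1);
    return maxUsed + 1;
  }

  public static void main(String[] args) {
    System.out.println(nextVersion(Stream.of(1, 2, 3))); // 4
    System.out.println(nextVersion(Stream.empty()));     // 1
  }
}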


@@ -58,9 +58,8 @@ public class HoodieReadClient<T extends HoodieRecordPayload> extends AbstractHoo
private static final Logger logger = LogManager.getLogger(HoodieReadClient.class);
/**
* TODO: We need to persist the index type into hoodie.properties and be able to access the index
* just with a simple basepath pointing to the dataset. Until, then just always assume a
* BloomIndex
* TODO: We need to persist the index type into hoodie.properties and be able to access the index just with a simple
* basepath pointing to the dataset. Until then, just always assume a BloomIndex
*/
private final transient HoodieIndex<T> index;
private final HoodieTimeline commitTimeline;
@@ -70,13 +69,11 @@ public class HoodieReadClient<T extends HoodieRecordPayload> extends AbstractHoo
/**
* @param basePath path to Hoodie dataset
*/
public HoodieReadClient(JavaSparkContext jsc, String basePath,
Option<EmbeddedTimelineService> timelineService) {
public HoodieReadClient(JavaSparkContext jsc, String basePath, Option<EmbeddedTimelineService> timelineService) {
this(jsc, HoodieWriteConfig.newBuilder().withPath(basePath)
// by default we use HoodieBloomIndex
.withIndexConfig(
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
.build(), timelineService);
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build(),
timelineService);
}
/**
@@ -130,8 +127,7 @@ public class HoodieReadClient<T extends HoodieRecordPayload> extends AbstractHoo
private void assertSqlContext() {
if (!sqlContextOpt.isPresent()) {
throw new IllegalStateException(
"SQLContext must be set, when performing dataframe operations");
throw new IllegalStateException("SQLContext must be set, when performing dataframe operations");
}
}
@@ -152,17 +148,16 @@ public class HoodieReadClient<T extends HoodieRecordPayload> extends AbstractHoo
*/
public Dataset<Row> readROView(JavaRDD<HoodieKey> hoodieKeys, int parallelism) {
assertSqlContext();
JavaPairRDD<HoodieKey, Option<Pair<String, String>>> lookupResultRDD = index
.fetchRecordLocation(hoodieKeys, jsc, hoodieTable);
JavaPairRDD<HoodieKey, Option<String>> keyToFileRDD = lookupResultRDD
.mapToPair(r -> new Tuple2<>(r._1, convertToDataFilePath(r._2)));
JavaPairRDD<HoodieKey, Option<Pair<String, String>>> lookupResultRDD =
index.fetchRecordLocation(hoodieKeys, jsc, hoodieTable);
JavaPairRDD<HoodieKey, Option<String>> keyToFileRDD =
lookupResultRDD.mapToPair(r -> new Tuple2<>(r._1, convertToDataFilePath(r._2)));
List<String> paths = keyToFileRDD.filter(keyFileTuple -> keyFileTuple._2().isPresent())
.map(keyFileTuple -> keyFileTuple._2().get()).collect();
// record locations might be same for multiple keys, so need a unique list
Set<String> uniquePaths = new HashSet<>(paths);
Dataset<Row> originalDF = sqlContextOpt.get().read()
.parquet(uniquePaths.toArray(new String[uniquePaths.size()]));
Dataset<Row> originalDF = sqlContextOpt.get().read().parquet(uniquePaths.toArray(new String[uniquePaths.size()]));
StructType schema = originalDF.schema();
JavaPairRDD<HoodieKey, Row> keyRowRDD = originalDF.javaRDD().mapToPair(row -> {
HoodieKey key = new HoodieKey(row.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD),
@@ -176,18 +171,16 @@ public class HoodieReadClient<T extends HoodieRecordPayload> extends AbstractHoo
}
/**
* Checks if the given [Keys] exists in the hoodie table and returns [Key, Option[FullFilePath]]
* If the optional FullFilePath value is not present, then the key is not found. If the
* FullFilePath value is present, it is the path component (without scheme) of the URI underlying
* file
* Checks if the given [Keys] exist in the hoodie table and returns [Key, Option[FullFilePath]]. If the optional
* FullFilePath value is not present, then the key is not found. If the FullFilePath value is present, it is the path
* component (without scheme) of the URI of the underlying file
*/
public JavaPairRDD<HoodieKey, Option<String>> checkExists(JavaRDD<HoodieKey> hoodieKeys) {
return index.fetchRecordLocation(hoodieKeys, jsc, hoodieTable);
}
/**
* Filter out HoodieRecords that already exists in the output folder. This is useful in
* deduplication.
* Filter out HoodieRecords that already exist in the output folder. This is useful in deduplication.
*
* @param hoodieRecords Input RDD of Hoodie records.
* @return A subset of hoodieRecords RDD, with existing records filtered out.
@@ -198,27 +191,27 @@ public class HoodieReadClient<T extends HoodieRecordPayload> extends AbstractHoo
}
/**
* Looks up the index and tags each incoming record with a location of a file that contains the
* row (if it is actually present). Input RDD should contain no duplicates if needed.
* Looks up the index and tags each incoming record with a location of a file that contains the row (if it is actually
* present). The input RDD should already be de-duplicated, if needed.
*
* @param hoodieRecords Input RDD of Hoodie records
* @return Tagged RDD of Hoodie records
*/
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> hoodieRecords)
throws HoodieIndexException {
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> hoodieRecords) throws HoodieIndexException {
return index.tagLocation(hoodieRecords, jsc, hoodieTable);
}
/**
* Return all pending compactions with instant time for clients to decide what to compact next.
*
* @return list of (compaction instant time, compaction plan) pairs
*/
public List<Pair<String, HoodieCompactionPlan>> getPendingCompactions() {
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
hoodieTable.getMetaClient().getBasePath(), true);
HoodieTableMetaClient metaClient =
new HoodieTableMetaClient(jsc.hadoopConfiguration(), hoodieTable.getMetaClient().getBasePath(), true);
return CompactionUtils.getAllPendingCompactionPlans(metaClient).stream()
.map(instantWorkloadPair ->
Pair.of(instantWorkloadPair.getKey().getTimestamp(), instantWorkloadPair.getValue()))
.map(
instantWorkloadPair -> Pair.of(instantWorkloadPair.getKey().getTimestamp(), instantWorkloadPair.getValue()))
.collect(Collectors.toList());
}
}
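
A hedged usage sketch of the lookup API above; the three-argument constructor matches the signature in this diff, while the Spark master, dataset path and keys are placeholders (a Hudi dataset must already exist at the path):

import java.util.Arrays;

import org.apache.hudi.HoodieReadClient;
import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.util.Option;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class ReadClientSketch {
  public static void main(String[] args) {
    JavaSparkContext jsc = new JavaSparkContext("local[2]", "read-client-sketch");
    HoodieReadClient<HoodieAvroPayload> client =
        new HoodieReadClient<>(jsc, "/tmp/hoodie/sample-table", Option.empty());
    JavaRDD<HoodieKey> keys =
        jsc.parallelize(Arrays.asList(new HoodieKey("recordKey1", "2019/10/01")));
    // Each key maps to Option.empty() when absent, else the data file path.
    JavaPairRDD<HoodieKey, Option<String>> existence = client.checkExists(keys);
    System.out.println(existence.collectAsMap());
    jsc.stop();
  }
}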


@@ -64,14 +64,11 @@ public class WriteStatus implements Serializable {
}
/**
* Mark write as success, optionally using given parameters for the purpose of calculating some
* aggregate metrics. This method is not meant to cache passed arguments, since WriteStatus
* objects are collected in Spark Driver.
* Mark write as success, optionally using given parameters for the purpose of calculating some aggregate metrics.
* This method is not meant to cache passed arguments, since WriteStatus objects are collected in Spark Driver.
*
* @param record deflated {@code HoodieRecord} containing information that uniquely identifies
* it.
* @param optionalRecordMetadata optional metadata related to data contained in {@link
* HoodieRecord} before deflation.
* @param record deflated {@code HoodieRecord} containing information that uniquely identifies it.
* @param optionalRecordMetadata optional metadata related to data contained in {@link HoodieRecord} before deflation.
*/
public void markSuccess(HoodieRecord record, Option<Map<String, String>> optionalRecordMetadata) {
if (trackSuccessRecords) {
@@ -81,14 +78,11 @@ public class WriteStatus implements Serializable {
}
/**
* Mark write as failed, optionally using given parameters for the purpose of calculating some
* aggregate metrics. This method is not meant to cache passed arguments, since WriteStatus
* objects are collected in Spark Driver.
* Mark write as failed, optionally using given parameters for the purpose of calculating some aggregate metrics. This
* method is not meant to cache passed arguments, since WriteStatus objects are collected in Spark Driver.
*
* @param record deflated {@code HoodieRecord} containing information that uniquely identifies
* it.
* @param optionalRecordMetadata optional metadata related to data contained in {@link
* HoodieRecord} before deflation.
* @param record deflated {@code HoodieRecord} containing information that uniquely identifies it.
* @param optionalRecordMetadata optional metadata related to data contained in {@link HoodieRecord} before deflation.
*/
public void markFailure(HoodieRecord record, Throwable t, Option<Map<String, String>> optionalRecordMetadata) {
if (failedRecords.isEmpty() || (random.nextDouble() <= failureFraction)) {
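
The guard above is the entire sampling strategy: keep the first failure unconditionally, then admit each further failure with probability failureFraction so the driver-side list stays bounded. A standalone sketch of the idea (types and names illustrative):

import java.util.ArrayList;
import java.util.List;
import java.util.Random;

class FailureSampler {
  private final List<String> failedKeys = new ArrayList<>();
  private final double failureFraction; // e.g. 0.1 keeps roughly 10% of failures
  private final Random random = new Random();

  FailureSampler(double failureFraction) {
    this.failureFraction = failureFraction;
  }

  void markFailure(String recordKey) {
    // Always record the first failure; sample the rest.
    if (failedKeys.isEmpty() || random.nextDouble() <= failureFraction) {
      failedKeys.add(recordKey);
    }
  }
}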


@@ -40,10 +40,8 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
// Turn on inline compaction - after a few delta commits, an inline compaction will be run
public static final String INLINE_COMPACT_PROP = "hoodie.compact.inline";
// Run a compaction every N delta commits
public static final String INLINE_COMPACT_NUM_DELTA_COMMITS_PROP =
"hoodie.compact.inline.max" + ".delta.commits";
public static final String CLEANER_FILE_VERSIONS_RETAINED_PROP =
"hoodie.cleaner.fileversions" + ".retained";
public static final String INLINE_COMPACT_NUM_DELTA_COMMITS_PROP = "hoodie.compact.inline.max" + ".delta.commits";
public static final String CLEANER_FILE_VERSIONS_RETAINED_PROP = "hoodie.cleaner.fileversions" + ".retained";
public static final String CLEANER_COMMITS_RETAINED_PROP = "hoodie.cleaner.commits.retained";
public static final String MAX_COMMITS_TO_KEEP_PROP = "hoodie.keep.max.commits";
public static final String MIN_COMMITS_TO_KEEP_PROP = "hoodie.keep.min.commits";
@@ -56,25 +54,21 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
* Configs related to specific table types
**/
// Number of inserts, that will be put each partition/bucket for writing
public static final String COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE =
"hoodie.copyonwrite.insert" + ".split.size";
public static final String COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE = "hoodie.copyonwrite.insert" + ".split.size";
// The rationale to pick the insert parallelism is the following. Writing out 100MB files,
// with at least 1kb records, means 100K records per file. We just overprovision to 500K
public static final String DEFAULT_COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE = String.valueOf(500000);
// Config to control whether we control insert split sizes automatically based on average
// record sizes
public static final String COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS =
"hoodie.copyonwrite.insert" + ".auto.split";
public static final String COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS = "hoodie.copyonwrite.insert" + ".auto.split";
// its off by default
public static final String DEFAULT_COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS = String.valueOf(true);
// This value is used as a guesstimate for the record size, if we can't determine this from
// previous commits
public static final String COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE =
"hoodie.copyonwrite" + ".record.size.estimate";
public static final String COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = "hoodie.copyonwrite" + ".record.size.estimate";
// Used to determine how much more can be packed into a small file, before it exceeds the size
// limit.
public static final String DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = String
.valueOf(1024);
public static final String DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = String.valueOf(1024);
public static final String CLEANER_PARALLELISM = "hoodie.cleaner.parallelism";
public static final String DEFAULT_CLEANER_PARALLELISM = String.valueOf(200);
public static final String TARGET_IO_PER_COMPACTION_IN_MB_PROP = "hoodie.compaction.target.io";
@@ -82,8 +76,7 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
public static final String DEFAULT_TARGET_IO_PER_COMPACTION_IN_MB = String.valueOf(500 * 1024);
public static final String COMPACTION_STRATEGY_PROP = "hoodie.compaction.strategy";
// 200GB of target IO per compaction
public static final String DEFAULT_COMPACTION_STRATEGY = LogFileSizeBasedCompactionStrategy.class
.getName();
public static final String DEFAULT_COMPACTION_STRATEGY = LogFileSizeBasedCompactionStrategy.class.getName();
// used to merge records written to log file
public static final String DEFAULT_PAYLOAD_CLASS = HoodieAvroPayload.class.getName();
public static final String PAYLOAD_CLASS_PROP = "hoodie.compaction.payload.class";
@@ -91,15 +84,12 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
// used to choose a trade off between IO vs Memory when performing compaction process
// Depending on outputfile_size and memory provided, choose true to avoid OOM for large file
// size + small memory
public static final String COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP =
"hoodie.compaction.lazy" + ".block.read";
public static final String COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP = "hoodie.compaction.lazy" + ".block.read";
public static final String DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED = "false";
// used to choose whether to enable reverse log reading (reverse log traversal)
public static final String COMPACTION_REVERSE_LOG_READ_ENABLED_PROP =
"hoodie.compaction" + ".reverse.log.read";
public static final String COMPACTION_REVERSE_LOG_READ_ENABLED_PROP = "hoodie.compaction" + ".reverse.log.read";
public static final String DEFAULT_COMPACTION_REVERSE_LOG_READ_ENABLED = "false";
private static final String DEFAULT_CLEANER_POLICY = HoodieCleaningPolicy.KEEP_LATEST_COMMITS
.name();
private static final String DEFAULT_CLEANER_POLICY = HoodieCleaningPolicy.KEEP_LATEST_COMMITS.name();
private static final String DEFAULT_AUTO_CLEAN = "true";
private static final String DEFAULT_INLINE_COMPACT = "false";
private static final String DEFAULT_INLINE_COMPACT_NUM_DELTA_COMMITS = "1";
@@ -108,8 +98,8 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
private static final String DEFAULT_MAX_COMMITS_TO_KEEP = "30";
private static final String DEFAULT_MIN_COMMITS_TO_KEEP = "20";
private static final String DEFAULT_COMMITS_ARCHIVAL_BATCH_SIZE = String.valueOf(10);
public static final String TARGET_PARTITIONS_PER_DAYBASED_COMPACTION_PROP = "hoodie.compaction.daybased.target"
+ ".partitions";
public static final String TARGET_PARTITIONS_PER_DAYBASED_COMPACTION_PROP =
"hoodie.compaction.daybased.target" + ".partitions";
// 500GB of target IO per compaction (both read and write)
public static final String DEFAULT_TARGET_PARTITIONS_PER_DAYBASED_COMPACTION = String.valueOf(10);
@@ -188,14 +178,12 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
}
public Builder autoTuneInsertSplits(boolean autoTuneInsertSplits) {
props.setProperty(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS,
String.valueOf(autoTuneInsertSplits));
props.setProperty(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS, String.valueOf(autoTuneInsertSplits));
return this;
}
public Builder approxRecordSize(int recordSizeEstimate) {
props.setProperty(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE,
String.valueOf(recordSizeEstimate));
props.setProperty(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE, String.valueOf(recordSizeEstimate));
return this;
}
@@ -215,32 +203,27 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
}
public Builder withTargetIOPerCompactionInMB(long targetIOPerCompactionInMB) {
props.setProperty(TARGET_IO_PER_COMPACTION_IN_MB_PROP,
String.valueOf(targetIOPerCompactionInMB));
props.setProperty(TARGET_IO_PER_COMPACTION_IN_MB_PROP, String.valueOf(targetIOPerCompactionInMB));
return this;
}
public Builder withMaxNumDeltaCommitsBeforeCompaction(int maxNumDeltaCommitsBeforeCompaction) {
props.setProperty(INLINE_COMPACT_NUM_DELTA_COMMITS_PROP,
String.valueOf(maxNumDeltaCommitsBeforeCompaction));
props.setProperty(INLINE_COMPACT_NUM_DELTA_COMMITS_PROP, String.valueOf(maxNumDeltaCommitsBeforeCompaction));
return this;
}
public Builder withCompactionLazyBlockReadEnabled(Boolean compactionLazyBlockReadEnabled) {
props.setProperty(COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP,
String.valueOf(compactionLazyBlockReadEnabled));
props.setProperty(COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP, String.valueOf(compactionLazyBlockReadEnabled));
return this;
}
public Builder withCompactionReverseLogReadEnabled(Boolean compactionReverseLogReadEnabled) {
props.setProperty(COMPACTION_REVERSE_LOG_READ_ENABLED_PROP,
String.valueOf(compactionReverseLogReadEnabled));
props.setProperty(COMPACTION_REVERSE_LOG_READ_ENABLED_PROP, String.valueOf(compactionReverseLogReadEnabled));
return this;
}
public Builder withTargetPartitionsPerDayBasedCompaction(int targetPartitionsPerCompaction) {
props.setProperty(TARGET_PARTITIONS_PER_DAYBASED_COMPACTION_PROP,
String.valueOf(targetPartitionsPerCompaction));
props.setProperty(TARGET_PARTITIONS_PER_DAYBASED_COMPACTION_PROP, String.valueOf(targetPartitionsPerCompaction));
return this;
}
@@ -251,8 +234,7 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
public HoodieCompactionConfig build() {
HoodieCompactionConfig config = new HoodieCompactionConfig(props);
setDefaultOnCondition(props, !props.containsKey(AUTO_CLEAN_PROP), AUTO_CLEAN_PROP,
DEFAULT_AUTO_CLEAN);
setDefaultOnCondition(props, !props.containsKey(AUTO_CLEAN_PROP), AUTO_CLEAN_PROP, DEFAULT_AUTO_CLEAN);
setDefaultOnCondition(props, !props.containsKey(INLINE_COMPACT_PROP), INLINE_COMPACT_PROP,
DEFAULT_INLINE_COMPACT);
setDefaultOnCondition(props, !props.containsKey(INLINE_COMPACT_NUM_DELTA_COMMITS_PROP),
@@ -261,27 +243,25 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
DEFAULT_CLEANER_POLICY);
setDefaultOnCondition(props, !props.containsKey(CLEANER_FILE_VERSIONS_RETAINED_PROP),
CLEANER_FILE_VERSIONS_RETAINED_PROP, DEFAULT_CLEANER_FILE_VERSIONS_RETAINED);
setDefaultOnCondition(props, !props.containsKey(CLEANER_COMMITS_RETAINED_PROP),
CLEANER_COMMITS_RETAINED_PROP, DEFAULT_CLEANER_COMMITS_RETAINED);
setDefaultOnCondition(props, !props.containsKey(CLEANER_COMMITS_RETAINED_PROP), CLEANER_COMMITS_RETAINED_PROP,
DEFAULT_CLEANER_COMMITS_RETAINED);
setDefaultOnCondition(props, !props.containsKey(MAX_COMMITS_TO_KEEP_PROP), MAX_COMMITS_TO_KEEP_PROP,
DEFAULT_MAX_COMMITS_TO_KEEP);
setDefaultOnCondition(props, !props.containsKey(MIN_COMMITS_TO_KEEP_PROP), MIN_COMMITS_TO_KEEP_PROP,
DEFAULT_MIN_COMMITS_TO_KEEP);
setDefaultOnCondition(props, !props.containsKey(PARQUET_SMALL_FILE_LIMIT_BYTES),
PARQUET_SMALL_FILE_LIMIT_BYTES, DEFAULT_PARQUET_SMALL_FILE_LIMIT_BYTES);
setDefaultOnCondition(props, !props.containsKey(PARQUET_SMALL_FILE_LIMIT_BYTES), PARQUET_SMALL_FILE_LIMIT_BYTES,
DEFAULT_PARQUET_SMALL_FILE_LIMIT_BYTES);
setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE),
COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE, DEFAULT_COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE);
setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS),
COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS, DEFAULT_COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS);
setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE),
COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE,
DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE);
COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE, DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE);
setDefaultOnCondition(props, !props.containsKey(CLEANER_PARALLELISM), CLEANER_PARALLELISM,
DEFAULT_CLEANER_PARALLELISM);
setDefaultOnCondition(props, !props.containsKey(COMPACTION_STRATEGY_PROP),
COMPACTION_STRATEGY_PROP, DEFAULT_COMPACTION_STRATEGY);
setDefaultOnCondition(props, !props.containsKey(PAYLOAD_CLASS_PROP),
PAYLOAD_CLASS_PROP, DEFAULT_PAYLOAD_CLASS);
setDefaultOnCondition(props, !props.containsKey(COMPACTION_STRATEGY_PROP), COMPACTION_STRATEGY_PROP,
DEFAULT_COMPACTION_STRATEGY);
setDefaultOnCondition(props, !props.containsKey(PAYLOAD_CLASS_PROP), PAYLOAD_CLASS_PROP, DEFAULT_PAYLOAD_CLASS);
setDefaultOnCondition(props, !props.containsKey(TARGET_IO_PER_COMPACTION_IN_MB_PROP),
TARGET_IO_PER_COMPACTION_IN_MB_PROP, DEFAULT_TARGET_IO_PER_COMPACTION_IN_MB);
setDefaultOnCondition(props, !props.containsKey(COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP),
@@ -299,13 +279,15 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
// commit instant on timeline, that still has not been cleaned. Could miss some data via incr pull
int minInstantsToKeep = Integer.parseInt(props.getProperty(HoodieCompactionConfig.MIN_COMMITS_TO_KEEP_PROP));
int maxInstantsToKeep = Integer.parseInt(props.getProperty(HoodieCompactionConfig.MAX_COMMITS_TO_KEEP_PROP));
int cleanerCommitsRetained = Integer
.parseInt(props.getProperty(HoodieCompactionConfig.CLEANER_COMMITS_RETAINED_PROP));
int cleanerCommitsRetained =
Integer.parseInt(props.getProperty(HoodieCompactionConfig.CLEANER_COMMITS_RETAINED_PROP));
Preconditions.checkArgument(maxInstantsToKeep > minInstantsToKeep);
Preconditions.checkArgument(minInstantsToKeep > cleanerCommitsRetained,
String.format("Increase %s=%d to be greater than %s=%d. Otherwise, there is risk of incremental pull "
+ "missing data from few instants.", HoodieCompactionConfig.MIN_COMMITS_TO_KEEP_PROP,
minInstantsToKeep, HoodieCompactionConfig.CLEANER_COMMITS_RETAINED_PROP, cleanerCommitsRetained));
String.format(
"Increase %s=%d to be greater than %s=%d. Otherwise, there is risk of incremental pull "
+ "missing data from few instants.",
HoodieCompactionConfig.MIN_COMMITS_TO_KEEP_PROP, minInstantsToKeep,
HoodieCompactionConfig.CLEANER_COMMITS_RETAINED_PROP, cleanerCommitsRetained));
return config;
}
}
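
A hedged builder example that satisfies the invariant the Preconditions above enforce, maxInstantsToKeep > minInstantsToKeep > cleanerCommitsRetained; builder method names follow the 0.5.x API but the values are purely illustrative:

import org.apache.hudi.config.HoodieCompactionConfig;

public class CompactionConfigSketch {
  public static void main(String[] args) {
    // 30 (max commits kept) > 20 (min commits kept) > 10 (cleaner retained),
    // so build() passes both checkArgument calls above.
    HoodieCompactionConfig config = HoodieCompactionConfig.newBuilder()
        .retainCommits(10)          // hoodie.cleaner.commits.retained
        .archiveCommitsWith(20, 30) // hoodie.keep.min.commits / hoodie.keep.max.commits
        .build();
    System.out.println(config.getProps());
  }
}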


@@ -32,8 +32,8 @@ public class HoodieHBaseIndexConfig extends DefaultHoodieConfig {
public static final String HBASE_GET_BATCH_SIZE_PROP = "hoodie.index.hbase.get.batch.size";
public static final String HBASE_ZK_ZNODEPARENT = "hoodie.index.hbase.zknode.path";
/**
* Note that if HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE_PROP is set to true, this batch size will not
* be honored for HBase Puts
* Note that if HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE_PROP is set to true, this batch size will not be honored for HBase
* Puts
*/
public static final String HBASE_PUT_BATCH_SIZE_PROP = "hoodie.index.hbase.put.batch.size";
@@ -48,18 +48,16 @@ public class HoodieHBaseIndexConfig extends DefaultHoodieConfig {
public static final String HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE_PROP = "hoodie.index.hbase.put.batch.size.autocompute";
public static final String DEFAULT_HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE = "false";
/**
* Property to set the fraction of the global share of QPS that should be allocated to this job.
* Let's say there are 3 jobs which have input size in terms of number of rows required for
* HbaseIndexing as x, 2x, 3x respectively. Then this fraction for the jobs would be (0.17) 1/6,
* 0.33 (2/6) and 0.5 (3/6) respectively.
* Property to set the fraction of the global share of QPS that should be allocated to this job. Let's say there are 3
* jobs which have input size in terms of number of rows required for HbaseIndexing as x, 2x, 3x respectively. Then
* this fraction for the jobs would be 0.17 (1/6), 0.33 (2/6) and 0.5 (3/6) respectively.
*/
public static final String HBASE_QPS_FRACTION_PROP = "hoodie.index.hbase.qps.fraction";
/**
* Property to set maximum QPS allowed per Region Server. This should be same across various
* jobs. This is intended to limit the aggregate QPS generated across various jobs to an Hbase
* Region Server. It is recommended to set this value based on global indexing throughput needs
* and most importantly, how much the HBase installation in use is able to tolerate without
* Region Servers going down.
* Property to set maximum QPS allowed per Region Server. This should be the same across various jobs. This is intended
* to limit the aggregate QPS generated across various jobs to an HBase Region Server. It is recommended to set this
* value based on global indexing throughput needs and most importantly, how much the HBase installation in use is
* able to tolerate without Region Servers going down.
*/
public static String HBASE_MAX_QPS_PER_REGION_SERVER_PROP = "hoodie.index.hbase.max.qps.per.region.server";
/**
@@ -71,8 +69,7 @@ public class HoodieHBaseIndexConfig extends DefaultHoodieConfig {
*/
public static final int DEFAULT_HBASE_MAX_QPS_PER_REGION_SERVER = 1000;
/**
* Default is 50%, which means a total of 2 jobs can run using HbaseIndex without overwhelming
* Region Servers
* Default is 50%, which means a total of 2 jobs can run using HbaseIndex without overwhelming Region Servers
*/
public static final float DEFAULT_HBASE_QPS_FRACTION = 0.5f;
@@ -218,18 +215,15 @@ public class HoodieHBaseIndexConfig extends DefaultHoodieConfig {
/**
* <p>
* Method to set maximum QPS allowed per Region Server. This should be same across various
* jobs. This is intended to limit the aggregate QPS generated across various jobs to an
* Hbase Region Server.
* Method to set maximum QPS allowed per Region Server. This should be the same across various jobs. This is intended to
* limit the aggregate QPS generated across various jobs to an HBase Region Server.
* </p>
* <p>
* It is recommended to set this value based on your global indexing throughput needs and
* most importantly, how much your HBase installation is able to tolerate without Region
* Servers going down.
* It is recommended to set this value based on your global indexing throughput needs and most importantly, how much
* your HBase installation is able to tolerate without Region Servers going down.
* </p>
*/
public HoodieHBaseIndexConfig.Builder hbaseIndexMaxQPSPerRegionServer(
int maxQPSPerRegionServer) {
public HoodieHBaseIndexConfig.Builder hbaseIndexMaxQPSPerRegionServer(int maxQPSPerRegionServer) {
// This should be same across various jobs
props.setProperty(HoodieHBaseIndexConfig.HBASE_MAX_QPS_PER_REGION_SERVER_PROP,
String.valueOf(maxQPSPerRegionServer));
@@ -238,30 +232,30 @@ public class HoodieHBaseIndexConfig extends DefaultHoodieConfig {
public HoodieHBaseIndexConfig build() {
HoodieHBaseIndexConfig config = new HoodieHBaseIndexConfig(props);
setDefaultOnCondition(props, !props.containsKey(HBASE_GET_BATCH_SIZE_PROP),
HBASE_GET_BATCH_SIZE_PROP, String.valueOf(DEFAULT_HBASE_BATCH_SIZE));
setDefaultOnCondition(props, !props.containsKey(HBASE_PUT_BATCH_SIZE_PROP),
HBASE_PUT_BATCH_SIZE_PROP, String.valueOf(DEFAULT_HBASE_BATCH_SIZE));
setDefaultOnCondition(props, !props.containsKey(HBASE_GET_BATCH_SIZE_PROP), HBASE_GET_BATCH_SIZE_PROP,
String.valueOf(DEFAULT_HBASE_BATCH_SIZE));
setDefaultOnCondition(props, !props.containsKey(HBASE_PUT_BATCH_SIZE_PROP), HBASE_PUT_BATCH_SIZE_PROP,
String.valueOf(DEFAULT_HBASE_BATCH_SIZE));
setDefaultOnCondition(props, !props.containsKey(HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE_PROP),
HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE_PROP, String.valueOf(DEFAULT_HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE));
setDefaultOnCondition(props, !props.containsKey(HBASE_QPS_FRACTION_PROP),
HBASE_QPS_FRACTION_PROP, String.valueOf(DEFAULT_HBASE_QPS_FRACTION));
setDefaultOnCondition(props, !props.containsKey(HBASE_QPS_FRACTION_PROP), HBASE_QPS_FRACTION_PROP,
String.valueOf(DEFAULT_HBASE_QPS_FRACTION));
setDefaultOnCondition(props, !props.containsKey(HBASE_MAX_QPS_PER_REGION_SERVER_PROP),
HBASE_MAX_QPS_PER_REGION_SERVER_PROP, String.valueOf(DEFAULT_HBASE_MAX_QPS_PER_REGION_SERVER));
setDefaultOnCondition(props, !props.containsKey(HOODIE_INDEX_COMPUTE_QPS_DYNAMICALLY),
HOODIE_INDEX_COMPUTE_QPS_DYNAMICALLY, String.valueOf(DEFAULT_HOODIE_INDEX_COMPUTE_QPS_DYNAMICALLY));
setDefaultOnCondition(props, !props.containsKey(HBASE_INDEX_QPS_ALLOCATOR_CLASS),
HBASE_INDEX_QPS_ALLOCATOR_CLASS, String.valueOf(DEFAULT_HBASE_INDEX_QPS_ALLOCATOR_CLASS));
setDefaultOnCondition(props, !props.containsKey(HBASE_INDEX_QPS_ALLOCATOR_CLASS), HBASE_INDEX_QPS_ALLOCATOR_CLASS,
String.valueOf(DEFAULT_HBASE_INDEX_QPS_ALLOCATOR_CLASS));
setDefaultOnCondition(props, !props.containsKey(HOODIE_INDEX_DESIRED_PUTS_TIME_IN_SECS),
HOODIE_INDEX_DESIRED_PUTS_TIME_IN_SECS, String.valueOf(DEFAULT_HOODIE_INDEX_DESIRED_PUTS_TIME_IN_SECS));
setDefaultOnCondition(props, !props.containsKey(HBASE_ZK_PATH_QPS_ROOT),
HBASE_ZK_PATH_QPS_ROOT, String.valueOf(DEFAULT_HBASE_ZK_PATH_QPS_ROOT));
setDefaultOnCondition(props, !props.containsKey(HBASE_ZK_PATH_QPS_ROOT), HBASE_ZK_PATH_QPS_ROOT,
String.valueOf(DEFAULT_HBASE_ZK_PATH_QPS_ROOT));
setDefaultOnCondition(props, !props.containsKey(HOODIE_INDEX_HBASE_ZK_SESSION_TIMEOUT_MS),
HOODIE_INDEX_HBASE_ZK_SESSION_TIMEOUT_MS, String.valueOf(DEFAULT_ZK_SESSION_TIMEOUT_MS));
setDefaultOnCondition(props, !props.containsKey(HOODIE_INDEX_HBASE_ZK_CONNECTION_TIMEOUT_MS),
HOODIE_INDEX_HBASE_ZK_CONNECTION_TIMEOUT_MS, String.valueOf(DEFAULT_ZK_CONNECTION_TIMEOUT_MS));
setDefaultOnCondition(props, !props.containsKey(HBASE_INDEX_QPS_ALLOCATOR_CLASS),
HBASE_INDEX_QPS_ALLOCATOR_CLASS, String.valueOf(DEFAULT_HBASE_INDEX_QPS_ALLOCATOR_CLASS));
setDefaultOnCondition(props, !props.containsKey(HBASE_INDEX_QPS_ALLOCATOR_CLASS), HBASE_INDEX_QPS_ALLOCATOR_CLASS,
String.valueOf(DEFAULT_HBASE_INDEX_QPS_ALLOCATOR_CLASS));
return config;
}
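
The fraction arithmetic from the HBASE_QPS_FRACTION_PROP javadoc above, worked out explicitly: jobs with input sizes x, 2x and 3x split the global QPS budget in proportion to their size:

public class QpsFractionSketch {
  public static void main(String[] args) {
    double[] inputSizes = {1, 2, 3}; // x, 2x, 3x (relative row counts)
    double total = 6;                // x + 2x + 3x
    for (double size : inputSizes) {
      // Prints 0.17, 0.33, 0.50 -- the fractions quoted in the javadoc.
      System.out.printf("qps fraction = %.2f%n", size / total);
    }
  }
}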


@@ -42,8 +42,7 @@ public class HoodieIndexConfig extends DefaultHoodieConfig {
public static final String BLOOM_INDEX_PARALLELISM_PROP = "hoodie.bloom.index.parallelism";
// Disable explicit bloom index parallelism setting by default - hoodie auto computes
public static final String DEFAULT_BLOOM_INDEX_PARALLELISM = "0";
public static final String BLOOM_INDEX_PRUNE_BY_RANGES_PROP =
"hoodie.bloom.index.prune.by" + ".ranges";
public static final String BLOOM_INDEX_PRUNE_BY_RANGES_PROP = "hoodie.bloom.index.prune.by" + ".ranges";
public static final String DEFAULT_BLOOM_INDEX_PRUNE_BY_RANGES = "true";
public static final String BLOOM_INDEX_USE_CACHING_PROP = "hoodie.bloom.index.use.caching";
public static final String DEFAULT_BLOOM_INDEX_USE_CACHING = "true";
@@ -67,8 +66,7 @@ public class HoodieIndexConfig extends DefaultHoodieConfig {
public static final String DEFAULT_HBASE_BATCH_SIZE = "100";
public static final String BLOOM_INDEX_INPUT_STORAGE_LEVEL =
"hoodie.bloom.index.input.storage" + ".level";
public static final String BLOOM_INDEX_INPUT_STORAGE_LEVEL = "hoodie.bloom.index.input.storage" + ".level";
public static final String DEFAULT_BLOOM_INDEX_INPUT_STORAGE_LEVEL = "MEMORY_AND_DISK_SER";
private HoodieIndexConfig(Properties props) {
@@ -175,20 +173,18 @@ public class HoodieIndexConfig extends DefaultHoodieConfig {
public HoodieIndexConfig build() {
HoodieIndexConfig config = new HoodieIndexConfig(props);
setDefaultOnCondition(props, !props.containsKey(INDEX_TYPE_PROP), INDEX_TYPE_PROP,
DEFAULT_INDEX_TYPE);
setDefaultOnCondition(props, !props.containsKey(BLOOM_FILTER_NUM_ENTRIES),
BLOOM_FILTER_NUM_ENTRIES, DEFAULT_BLOOM_FILTER_NUM_ENTRIES);
setDefaultOnCondition(props, !props.containsKey(BLOOM_FILTER_FPP), BLOOM_FILTER_FPP,
DEFAULT_BLOOM_FILTER_FPP);
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_PARALLELISM_PROP),
BLOOM_INDEX_PARALLELISM_PROP, DEFAULT_BLOOM_INDEX_PARALLELISM);
setDefaultOnCondition(props, !props.containsKey(INDEX_TYPE_PROP), INDEX_TYPE_PROP, DEFAULT_INDEX_TYPE);
setDefaultOnCondition(props, !props.containsKey(BLOOM_FILTER_NUM_ENTRIES), BLOOM_FILTER_NUM_ENTRIES,
DEFAULT_BLOOM_FILTER_NUM_ENTRIES);
setDefaultOnCondition(props, !props.containsKey(BLOOM_FILTER_FPP), BLOOM_FILTER_FPP, DEFAULT_BLOOM_FILTER_FPP);
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_PARALLELISM_PROP), BLOOM_INDEX_PARALLELISM_PROP,
DEFAULT_BLOOM_INDEX_PARALLELISM);
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_PRUNE_BY_RANGES_PROP),
BLOOM_INDEX_PRUNE_BY_RANGES_PROP, DEFAULT_BLOOM_INDEX_PRUNE_BY_RANGES);
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_USE_CACHING_PROP),
BLOOM_INDEX_USE_CACHING_PROP, DEFAULT_BLOOM_INDEX_USE_CACHING);
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_INPUT_STORAGE_LEVEL),
BLOOM_INDEX_INPUT_STORAGE_LEVEL, DEFAULT_BLOOM_INDEX_INPUT_STORAGE_LEVEL);
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_USE_CACHING_PROP), BLOOM_INDEX_USE_CACHING_PROP,
DEFAULT_BLOOM_INDEX_USE_CACHING);
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_INPUT_STORAGE_LEVEL), BLOOM_INDEX_INPUT_STORAGE_LEVEL,
DEFAULT_BLOOM_INDEX_INPUT_STORAGE_LEVEL);
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_TREE_BASED_FILTER_PROP),
BLOOM_INDEX_TREE_BASED_FILTER_PROP, DEFAULT_BLOOM_INDEX_TREE_BASED_FILTER);
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_BUCKETIZED_CHECKING_PROP),
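
For intuition on the two Bloom-filter knobs above (BLOOM_FILTER_NUM_ENTRIES and BLOOM_FILTER_FPP), the textbook sizing formula m = -n * ln(p) / (ln 2)^2 gives the bit budget; the concrete values below are assumptions for illustration, not authoritative Hudi defaults:

public class BloomSizingSketch {
  public static void main(String[] args) {
    int numEntries = 60000; // assumed value for BLOOM_FILTER_NUM_ENTRIES
    double fpp = 1.0e-9;    // assumed value for BLOOM_FILTER_FPP
    double bits = -numEntries * Math.log(fpp) / (Math.log(2) * Math.log(2));
    // Roughly 2.6 million bits, i.e. about 316 KB of bloom filter per file.
    System.out.printf("~%.0f bits (~%.0f KB)%n", bits, bits / 8 / 1024);
  }
}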


@@ -41,8 +41,7 @@ public class HoodieMemoryConfig extends DefaultHoodieConfig {
// Default max memory fraction during compaction, excess spills to disk
public static final String DEFAULT_MAX_MEMORY_FRACTION_FOR_COMPACTION = String.valueOf(0.6);
// Default memory size per compaction (used if SparkEnv is absent), excess spills to disk
public static final long DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES =
1024 * 1024 * 1024L; // 1GB
public static final long DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES = 1024 * 1024 * 1024L; // 1GB
// Property to set the max memory for merge
public static final String MAX_MEMORY_FOR_MERGE_PROP = "hoodie.memory.merge.max.size";
// Property to set the max memory for compaction
@@ -88,20 +87,17 @@ public class HoodieMemoryConfig extends DefaultHoodieConfig {
}
public Builder withMaxMemoryFractionPerPartitionMerge(double maxMemoryFractionPerPartitionMerge) {
props.setProperty(MAX_MEMORY_FRACTION_FOR_MERGE_PROP,
String.valueOf(maxMemoryFractionPerPartitionMerge));
props.setProperty(MAX_MEMORY_FRACTION_FOR_MERGE_PROP, String.valueOf(maxMemoryFractionPerPartitionMerge));
return this;
}
public Builder withMaxMemoryFractionPerCompaction(double maxMemoryFractionPerCompaction) {
props.setProperty(MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP,
String.valueOf(maxMemoryFractionPerCompaction));
props.setProperty(MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP, String.valueOf(maxMemoryFractionPerCompaction));
return this;
}
public Builder withMaxDFSStreamBufferSize(int maxStreamBufferSize) {
props.setProperty(MAX_DFS_STREAM_BUFFER_SIZE_PROP,
String.valueOf(maxStreamBufferSize));
props.setProperty(MAX_DFS_STREAM_BUFFER_SIZE_PROP, String.valueOf(maxStreamBufferSize));
return this;
}
@@ -130,19 +126,16 @@ public class HoodieMemoryConfig extends DefaultHoodieConfig {
if (SparkEnv.get() != null) {
// 1 GB is the default conf used by Spark, look at SparkContext.scala
long executorMemoryInBytes = Utils.memoryStringToMb(SparkEnv.get().conf().get(SPARK_EXECUTOR_MEMORY_PROP,
DEFAULT_SPARK_EXECUTOR_MEMORY_MB)) * 1024
* 1024L;
long executorMemoryInBytes = Utils.memoryStringToMb(
SparkEnv.get().conf().get(SPARK_EXECUTOR_MEMORY_PROP, DEFAULT_SPARK_EXECUTOR_MEMORY_MB)) * 1024 * 1024L;
// 0.6 is the default value used by Spark,
// look at {@link
// https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/SparkConf.scala#L507}
double memoryFraction = Double
.valueOf(SparkEnv.get().conf().get(SPARK_EXECUTOR_MEMORY_FRACTION_PROP,
DEFAULT_SPARK_EXECUTOR_MEMORY_FRACTION));
double memoryFraction = Double.valueOf(
SparkEnv.get().conf().get(SPARK_EXECUTOR_MEMORY_FRACTION_PROP, DEFAULT_SPARK_EXECUTOR_MEMORY_FRACTION));
double maxMemoryFractionForMerge = Double.valueOf(maxMemoryFraction);
double userAvailableMemory = executorMemoryInBytes * (1 - memoryFraction);
long maxMemoryForMerge = (long) Math
.floor(userAvailableMemory * maxMemoryFractionForMerge);
long maxMemoryForMerge = (long) Math.floor(userAvailableMemory * maxMemoryFractionForMerge);
return maxMemoryForMerge;
} else {
return DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES;
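
Worked numbers for the fallback sizing above, using a 4 GB executor and the defaults quoted in the comments (Spark memory fraction 0.6, merge fraction 0.6); all figures are illustrative:

public class MergeMemorySketch {
  public static void main(String[] args) {
    long executorMemoryInBytes = 4L * 1024 * 1024 * 1024; // 4 GB executor
    double sparkMemoryFraction = 0.6; // Spark's share, per the comment above
    double mergeFraction = 0.6;       // DEFAULT_MAX_MEMORY_FRACTION_FOR_COMPACTION
    double userAvailableMemory = executorMemoryInBytes * (1 - sparkMemoryFraction);
    long maxMemoryForMerge = (long) Math.floor(userAvailableMemory * mergeFraction);
    // ~1.03e9 bytes: just under 1 GiB of the 4 GiB executor goes to the spillable map.
    System.out.println(maxMemoryForMerge);
  }
}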
@@ -151,29 +144,19 @@ public class HoodieMemoryConfig extends DefaultHoodieConfig {
public HoodieMemoryConfig build() {
HoodieMemoryConfig config = new HoodieMemoryConfig(props);
setDefaultOnCondition(props,
!props.containsKey(MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP),
MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP,
DEFAULT_MAX_MEMORY_FRACTION_FOR_COMPACTION);
setDefaultOnCondition(props,
!props.containsKey(MAX_MEMORY_FRACTION_FOR_MERGE_PROP),
setDefaultOnCondition(props, !props.containsKey(MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP),
MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP, DEFAULT_MAX_MEMORY_FRACTION_FOR_COMPACTION);
setDefaultOnCondition(props, !props.containsKey(MAX_MEMORY_FRACTION_FOR_MERGE_PROP),
MAX_MEMORY_FRACTION_FOR_MERGE_PROP, DEFAULT_MAX_MEMORY_FRACTION_FOR_MERGE);
setDefaultOnCondition(props,
!props.containsKey(MAX_MEMORY_FOR_MERGE_PROP),
MAX_MEMORY_FOR_MERGE_PROP, String.valueOf(
getMaxMemoryAllowedForMerge(props.getProperty(MAX_MEMORY_FRACTION_FOR_MERGE_PROP))));
setDefaultOnCondition(props,
!props.containsKey(MAX_MEMORY_FOR_COMPACTION_PROP),
MAX_MEMORY_FOR_COMPACTION_PROP, String.valueOf(
getMaxMemoryAllowedForMerge(props.getProperty(MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP))));
setDefaultOnCondition(props,
!props.containsKey(MAX_DFS_STREAM_BUFFER_SIZE_PROP),
MAX_DFS_STREAM_BUFFER_SIZE_PROP, String.valueOf(DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE));
setDefaultOnCondition(props,
!props.containsKey(SPILLABLE_MAP_BASE_PATH_PROP),
SPILLABLE_MAP_BASE_PATH_PROP, DEFAULT_SPILLABLE_MAP_BASE_PATH);
setDefaultOnCondition(props,
!props.containsKey(WRITESTATUS_FAILURE_FRACTION_PROP),
setDefaultOnCondition(props, !props.containsKey(MAX_MEMORY_FOR_MERGE_PROP), MAX_MEMORY_FOR_MERGE_PROP,
String.valueOf(getMaxMemoryAllowedForMerge(props.getProperty(MAX_MEMORY_FRACTION_FOR_MERGE_PROP))));
setDefaultOnCondition(props, !props.containsKey(MAX_MEMORY_FOR_COMPACTION_PROP), MAX_MEMORY_FOR_COMPACTION_PROP,
String.valueOf(getMaxMemoryAllowedForMerge(props.getProperty(MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP))));
setDefaultOnCondition(props, !props.containsKey(MAX_DFS_STREAM_BUFFER_SIZE_PROP), MAX_DFS_STREAM_BUFFER_SIZE_PROP,
String.valueOf(DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE));
setDefaultOnCondition(props, !props.containsKey(SPILLABLE_MAP_BASE_PATH_PROP), SPILLABLE_MAP_BASE_PATH_PROP,
DEFAULT_SPILLABLE_MAP_BASE_PATH);
setDefaultOnCondition(props, !props.containsKey(WRITESTATUS_FAILURE_FRACTION_PROP),
WRITESTATUS_FAILURE_FRACTION_PROP, String.valueOf(DEFAULT_WRITESTATUS_FAILURE_FRACTION));
return config;
}


@@ -35,8 +35,7 @@ public class HoodieMetricsConfig extends DefaultHoodieConfig {
public static final String METRICS_ON = METRIC_PREFIX + ".on";
public static final boolean DEFAULT_METRICS_ON = false;
public static final String METRICS_REPORTER_TYPE = METRIC_PREFIX + ".reporter.type";
public static final MetricsReporterType DEFAULT_METRICS_REPORTER_TYPE = MetricsReporterType
.GRAPHITE;
public static final MetricsReporterType DEFAULT_METRICS_REPORTER_TYPE = MetricsReporterType.GRAPHITE;
// Graphite
public static final String GRAPHITE_PREFIX = METRIC_PREFIX + ".graphite";
@@ -103,8 +102,7 @@ public class HoodieMetricsConfig extends DefaultHoodieConfig {
public HoodieMetricsConfig build() {
HoodieMetricsConfig config = new HoodieMetricsConfig(props);
setDefaultOnCondition(props, !props.containsKey(METRICS_ON), METRICS_ON,
String.valueOf(DEFAULT_METRICS_ON));
setDefaultOnCondition(props, !props.containsKey(METRICS_ON), METRICS_ON, String.valueOf(DEFAULT_METRICS_ON));
setDefaultOnCondition(props, !props.containsKey(METRICS_REPORTER_TYPE), METRICS_REPORTER_TYPE,
DEFAULT_METRICS_REPORTER_TYPE.name());
setDefaultOnCondition(props, !props.containsKey(GRAPHITE_SERVER_HOST), GRAPHITE_SERVER_HOST,


@@ -38,8 +38,7 @@ public class HoodieStorageConfig extends DefaultHoodieConfig {
public static final String DEFAULT_PARQUET_PAGE_SIZE_BYTES = String.valueOf(1 * 1024 * 1024);
// used to size log files
public static final String LOGFILE_SIZE_MAX_BYTES = "hoodie.logfile.max.size";
public static final String DEFAULT_LOGFILE_SIZE_MAX_BYTES = String
.valueOf(1024 * 1024 * 1024); // 1 GB
public static final String DEFAULT_LOGFILE_SIZE_MAX_BYTES = String.valueOf(1024 * 1024 * 1024); // 1 GB
// used to size data blocks in log file
public static final String LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES = "hoodie.logfile.data.block.max.size";
public static final String DEFAULT_LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES = String.valueOf(256 * 1024 * 1024); // 256 MB
@@ -122,20 +121,20 @@ public class HoodieStorageConfig extends DefaultHoodieConfig {
public HoodieStorageConfig build() {
HoodieStorageConfig config = new HoodieStorageConfig(props);
setDefaultOnCondition(props, !props.containsKey(PARQUET_FILE_MAX_BYTES),
PARQUET_FILE_MAX_BYTES, DEFAULT_PARQUET_FILE_MAX_BYTES);
setDefaultOnCondition(props, !props.containsKey(PARQUET_BLOCK_SIZE_BYTES),
PARQUET_BLOCK_SIZE_BYTES, DEFAULT_PARQUET_BLOCK_SIZE_BYTES);
setDefaultOnCondition(props, !props.containsKey(PARQUET_PAGE_SIZE_BYTES),
PARQUET_PAGE_SIZE_BYTES, DEFAULT_PARQUET_PAGE_SIZE_BYTES);
setDefaultOnCondition(props, !props.containsKey(PARQUET_FILE_MAX_BYTES), PARQUET_FILE_MAX_BYTES,
DEFAULT_PARQUET_FILE_MAX_BYTES);
setDefaultOnCondition(props, !props.containsKey(PARQUET_BLOCK_SIZE_BYTES), PARQUET_BLOCK_SIZE_BYTES,
DEFAULT_PARQUET_BLOCK_SIZE_BYTES);
setDefaultOnCondition(props, !props.containsKey(PARQUET_PAGE_SIZE_BYTES), PARQUET_PAGE_SIZE_BYTES,
DEFAULT_PARQUET_PAGE_SIZE_BYTES);
setDefaultOnCondition(props, !props.containsKey(LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES),
LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES, DEFAULT_LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES);
setDefaultOnCondition(props, !props.containsKey(LOGFILE_SIZE_MAX_BYTES),
LOGFILE_SIZE_MAX_BYTES, DEFAULT_LOGFILE_SIZE_MAX_BYTES);
setDefaultOnCondition(props, !props.containsKey(PARQUET_COMPRESSION_RATIO),
PARQUET_COMPRESSION_RATIO, DEFAULT_STREAM_COMPRESSION_RATIO);
setDefaultOnCondition(props, !props.containsKey(PARQUET_COMPRESSION_CODEC),
PARQUET_COMPRESSION_CODEC, DEFAULT_PARQUET_COMPRESSION_CODEC);
setDefaultOnCondition(props, !props.containsKey(LOGFILE_SIZE_MAX_BYTES), LOGFILE_SIZE_MAX_BYTES,
DEFAULT_LOGFILE_SIZE_MAX_BYTES);
setDefaultOnCondition(props, !props.containsKey(PARQUET_COMPRESSION_RATIO), PARQUET_COMPRESSION_RATIO,
DEFAULT_STREAM_COMPRESSION_RATIO);
setDefaultOnCondition(props, !props.containsKey(PARQUET_COMPRESSION_CODEC), PARQUET_COMPRESSION_CODEC,
DEFAULT_PARQUET_COMPRESSION_CODEC);
setDefaultOnCondition(props, !props.containsKey(LOGFILE_TO_PARQUET_COMPRESSION_RATIO),
LOGFILE_TO_PARQUET_COMPRESSION_RATIO, DEFAULT_LOGFILE_TO_PARQUET_COMPRESSION_RATIO);
return config;
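
Every build() in these config classes leans on the same helper; a minimal sketch of its assumed semantics (apply the default only when the caller has not supplied the key), exercised with two property keys from the diff above:

import java.util.Properties;

final class DefaultsSketch {
  // Assumed to mirror DefaultHoodieConfig.setDefaultOnCondition as used above.
  static void setDefaultOnCondition(Properties props, boolean condition,
      String key, String defaultValue) {
    if (condition) {
      props.setProperty(key, defaultValue);
    }
  }

  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("hoodie.logfile.max.size", "536870912"); // user-set: 512 MB
    setDefaultOnCondition(props, !props.containsKey("hoodie.logfile.max.size"),
        "hoodie.logfile.max.size", String.valueOf(1024 * 1024 * 1024)); // no-op
    setDefaultOnCondition(props, !props.containsKey("hoodie.logfile.data.block.max.size"),
        "hoodie.logfile.data.block.max.size", String.valueOf(256 * 1024 * 1024)); // applied
    System.out.println(props);
  }
}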


@@ -61,8 +61,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
private static final String DEFAULT_WRITE_STATUS_STORAGE_LEVEL = "MEMORY_AND_DISK_SER";
private static final String HOODIE_AUTO_COMMIT_PROP = "hoodie.auto.commit";
private static final String DEFAULT_HOODIE_AUTO_COMMIT = "true";
private static final String HOODIE_ASSUME_DATE_PARTITIONING_PROP =
"hoodie.assume.date" + ".partitioning";
private static final String HOODIE_ASSUME_DATE_PARTITIONING_PROP = "hoodie.assume.date" + ".partitioning";
private static final String DEFAULT_ASSUME_DATE_PARTITIONING = "false";
private static final String HOODIE_WRITE_STATUS_CLASS_PROP = "hoodie.writestatus.class";
private static final String DEFAULT_HOODIE_WRITE_STATUS_CLASS = WriteStatus.class.getName();
@@ -143,8 +142,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
}
public int getWriteBufferLimitBytes() {
return Integer
.parseInt(props.getProperty(WRITE_BUFFER_LIMIT_BYTES, DEFAULT_WRITE_BUFFER_LIMIT_BYTES));
return Integer.parseInt(props.getProperty(WRITE_BUFFER_LIMIT_BYTES, DEFAULT_WRITE_BUFFER_LIMIT_BYTES));
}
public boolean shouldCombineBeforeInsert() {
@@ -191,18 +189,15 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
* compaction properties
**/
public HoodieCleaningPolicy getCleanerPolicy() {
return HoodieCleaningPolicy
.valueOf(props.getProperty(HoodieCompactionConfig.CLEANER_POLICY_PROP));
return HoodieCleaningPolicy.valueOf(props.getProperty(HoodieCompactionConfig.CLEANER_POLICY_PROP));
}
public int getCleanerFileVersionsRetained() {
return Integer
.parseInt(props.getProperty(HoodieCompactionConfig.CLEANER_FILE_VERSIONS_RETAINED_PROP));
return Integer.parseInt(props.getProperty(HoodieCompactionConfig.CLEANER_FILE_VERSIONS_RETAINED_PROP));
}
public int getCleanerCommitsRetained() {
return Integer
.parseInt(props.getProperty(HoodieCompactionConfig.CLEANER_COMMITS_RETAINED_PROP));
return Integer.parseInt(props.getProperty(HoodieCompactionConfig.CLEANER_COMMITS_RETAINED_PROP));
}
public int getMaxCommitsToKeep() {
@@ -214,23 +209,19 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
}
public int getParquetSmallFileLimit() {
return Integer
.parseInt(props.getProperty(HoodieCompactionConfig.PARQUET_SMALL_FILE_LIMIT_BYTES));
return Integer.parseInt(props.getProperty(HoodieCompactionConfig.PARQUET_SMALL_FILE_LIMIT_BYTES));
}
public int getCopyOnWriteInsertSplitSize() {
return Integer
.parseInt(props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE));
return Integer.parseInt(props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE));
}
public int getCopyOnWriteRecordSizeEstimate() {
return Integer.parseInt(
props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE));
return Integer.parseInt(props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE));
}
public boolean shouldAutoTuneInsertSplits() {
return Boolean.parseBoolean(
props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS));
return Boolean.parseBoolean(props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS));
}
public int getCleanerParallelism() {
@@ -246,28 +237,23 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
}
public int getInlineCompactDeltaCommitMax() {
return Integer
.parseInt(props.getProperty(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS_PROP));
return Integer.parseInt(props.getProperty(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS_PROP));
}
public CompactionStrategy getCompactionStrategy() {
return ReflectionUtils
.loadClass(props.getProperty(HoodieCompactionConfig.COMPACTION_STRATEGY_PROP));
return ReflectionUtils.loadClass(props.getProperty(HoodieCompactionConfig.COMPACTION_STRATEGY_PROP));
}
public Long getTargetIOPerCompactionInMB() {
return Long
.parseLong(props.getProperty(HoodieCompactionConfig.TARGET_IO_PER_COMPACTION_IN_MB_PROP));
return Long.parseLong(props.getProperty(HoodieCompactionConfig.TARGET_IO_PER_COMPACTION_IN_MB_PROP));
}
public Boolean getCompactionLazyBlockReadEnabled() {
return Boolean
.valueOf(props.getProperty(HoodieCompactionConfig.COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP));
return Boolean.valueOf(props.getProperty(HoodieCompactionConfig.COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP));
}
public Boolean getCompactionReverseLogReadEnabled() {
return Boolean.valueOf(
props.getProperty(HoodieCompactionConfig.COMPACTION_REVERSE_LOG_READ_ENABLED_PROP));
return Boolean.valueOf(props.getProperty(HoodieCompactionConfig.COMPACTION_REVERSE_LOG_READ_ENABLED_PROP));
}
public String getPayloadClass() {
@@ -275,13 +261,11 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
}
public int getTargetPartitionsPerDayBasedCompaction() {
return Integer
.parseInt(props.getProperty(HoodieCompactionConfig.TARGET_PARTITIONS_PER_DAYBASED_COMPACTION_PROP));
return Integer.parseInt(props.getProperty(HoodieCompactionConfig.TARGET_PARTITIONS_PER_DAYBASED_COMPACTION_PROP));
}
public int getCommitArchivalBatchSize() {
return Integer
.parseInt(props.getProperty(HoodieCompactionConfig.COMMITS_ARCHIVAL_BATCH_SIZE_PROP));
return Integer.parseInt(props.getProperty(HoodieCompactionConfig.COMMITS_ARCHIVAL_BATCH_SIZE_PROP));
}
/**
@@ -352,9 +336,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
}
/**
* Fraction of the global share of QPS that should be allocated to this job.
* Let's say there are 3 jobs which have input size in terms of number of rows
* required for HbaseIndexing as x, 2x, 3x respectively. Then this fraction for
* Fraction of the global share of QPS that should be allocated to this job. Let's say there are 3 jobs which have
* input size in terms of number of rows required for HbaseIndexing as x, 2x, 3x respectively. Then this fraction for
* the jobs would be 0.17 (1/6), 0.33 (2/6) and 0.5 (3/6) respectively.
*/
public float getHbaseIndexQPSFraction() {
@@ -370,8 +353,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
}
/**
* This should be same across various jobs. This is intended to limit the aggregate
* QPS generated across various Hoodie jobs to an Hbase Region Server
* This should be the same across various jobs. This is intended to limit the aggregate QPS generated across various
* Hoodie jobs to an HBase Region Server
*/
public int getHbaseIndexMaxQPSPerRegionServer() {
return Integer.parseInt(props.getProperty(HoodieHBaseIndexConfig.HBASE_MAX_QPS_PER_REGION_SERVER_PROP));
@@ -382,8 +365,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
}
public boolean getBloomIndexPruneByRanges() {
return Boolean
.parseBoolean(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_PRUNE_BY_RANGES_PROP));
return Boolean.parseBoolean(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_PRUNE_BY_RANGES_PROP));
}
public boolean getBloomIndexUseCaching() {
@@ -403,8 +385,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
}
public StorageLevel getBloomIndexInputStorageLevel() {
return StorageLevel
.fromString(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_INPUT_STORAGE_LEVEL));
return StorageLevel.fromString(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_INPUT_STORAGE_LEVEL));
}
/**
@@ -423,8 +404,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
}
public int getLogFileDataBlockMaxSize() {
return Integer
.parseInt(props.getProperty(HoodieStorageConfig.LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES));
return Integer.parseInt(props.getProperty(HoodieStorageConfig.LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES));
}
public int getLogFileMaxSize() {
@@ -451,8 +431,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
}
public MetricsReporterType getMetricsReporterType() {
return MetricsReporterType
.valueOf(props.getProperty(HoodieMetricsConfig.METRICS_REPORTER_TYPE));
return MetricsReporterType.valueOf(props.getProperty(HoodieMetricsConfig.METRICS_REPORTER_TYPE));
}
public String getGraphiteServerHost() {
@@ -475,9 +454,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
}
public Double getMaxMemoryFractionPerCompaction() {
return Double
.valueOf(
props.getProperty(HoodieMemoryConfig.MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP));
return Double.valueOf(props.getProperty(HoodieMemoryConfig.MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP));
}
public Long getMaxMemoryPerPartitionMerge() {
@@ -637,8 +614,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
}
public Builder withAssumeDatePartitioning(boolean assumeDatePartitioning) {
props.setProperty(HOODIE_ASSUME_DATE_PARTITIONING_PROP,
String.valueOf(assumeDatePartitioning));
props.setProperty(HOODIE_ASSUME_DATE_PARTITIONING_PROP, String.valueOf(assumeDatePartitioning));
return this;
}
@@ -671,48 +647,42 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
public HoodieWriteConfig build() {
// Check for mandatory properties
setDefaultOnCondition(props, !props.containsKey(INSERT_PARALLELISM), INSERT_PARALLELISM,
setDefaultOnCondition(props, !props.containsKey(INSERT_PARALLELISM), INSERT_PARALLELISM, DEFAULT_PARALLELISM);
setDefaultOnCondition(props, !props.containsKey(BULKINSERT_PARALLELISM), BULKINSERT_PARALLELISM,
DEFAULT_PARALLELISM);
setDefaultOnCondition(props, !props.containsKey(BULKINSERT_PARALLELISM),
BULKINSERT_PARALLELISM, DEFAULT_PARALLELISM);
setDefaultOnCondition(props, !props.containsKey(UPSERT_PARALLELISM), UPSERT_PARALLELISM,
DEFAULT_PARALLELISM);
setDefaultOnCondition(props, !props.containsKey(COMBINE_BEFORE_INSERT_PROP),
COMBINE_BEFORE_INSERT_PROP, DEFAULT_COMBINE_BEFORE_INSERT);
setDefaultOnCondition(props, !props.containsKey(COMBINE_BEFORE_UPSERT_PROP),
COMBINE_BEFORE_UPSERT_PROP, DEFAULT_COMBINE_BEFORE_UPSERT);
setDefaultOnCondition(props, !props.containsKey(WRITE_STATUS_STORAGE_LEVEL),
WRITE_STATUS_STORAGE_LEVEL, DEFAULT_WRITE_STATUS_STORAGE_LEVEL);
setDefaultOnCondition(props, !props.containsKey(HOODIE_AUTO_COMMIT_PROP),
HOODIE_AUTO_COMMIT_PROP, DEFAULT_HOODIE_AUTO_COMMIT);
setDefaultOnCondition(props, !props.containsKey(UPSERT_PARALLELISM), UPSERT_PARALLELISM, DEFAULT_PARALLELISM);
setDefaultOnCondition(props, !props.containsKey(COMBINE_BEFORE_INSERT_PROP), COMBINE_BEFORE_INSERT_PROP,
DEFAULT_COMBINE_BEFORE_INSERT);
setDefaultOnCondition(props, !props.containsKey(COMBINE_BEFORE_UPSERT_PROP), COMBINE_BEFORE_UPSERT_PROP,
DEFAULT_COMBINE_BEFORE_UPSERT);
setDefaultOnCondition(props, !props.containsKey(WRITE_STATUS_STORAGE_LEVEL), WRITE_STATUS_STORAGE_LEVEL,
DEFAULT_WRITE_STATUS_STORAGE_LEVEL);
setDefaultOnCondition(props, !props.containsKey(HOODIE_AUTO_COMMIT_PROP), HOODIE_AUTO_COMMIT_PROP,
DEFAULT_HOODIE_AUTO_COMMIT);
setDefaultOnCondition(props, !props.containsKey(HOODIE_ASSUME_DATE_PARTITIONING_PROP),
HOODIE_ASSUME_DATE_PARTITIONING_PROP, DEFAULT_ASSUME_DATE_PARTITIONING);
setDefaultOnCondition(props, !props.containsKey(HOODIE_WRITE_STATUS_CLASS_PROP),
HOODIE_WRITE_STATUS_CLASS_PROP, DEFAULT_HOODIE_WRITE_STATUS_CLASS);
setDefaultOnCondition(props, !props.containsKey(FINALIZE_WRITE_PARALLELISM),
FINALIZE_WRITE_PARALLELISM, DEFAULT_FINALIZE_WRITE_PARALLELISM);
setDefaultOnCondition(props, !props.containsKey(HOODIE_WRITE_STATUS_CLASS_PROP), HOODIE_WRITE_STATUS_CLASS_PROP,
DEFAULT_HOODIE_WRITE_STATUS_CLASS);
setDefaultOnCondition(props, !props.containsKey(FINALIZE_WRITE_PARALLELISM), FINALIZE_WRITE_PARALLELISM,
DEFAULT_FINALIZE_WRITE_PARALLELISM);
setDefaultOnCondition(props, !props.containsKey(EMBEDDED_TIMELINE_SERVER_ENABLED),
EMBEDDED_TIMELINE_SERVER_ENABLED, DEFAULT_EMBEDDED_TIMELINE_SERVER_ENABLED);
setDefaultOnCondition(props, !props.containsKey(INITIAL_CONSISTENCY_CHECK_INTERVAL_MS_PROP),
INITIAL_CONSISTENCY_CHECK_INTERVAL_MS_PROP, String.valueOf(DEFAULT_INITIAL_CONSISTENCY_CHECK_INTERVAL_MS));
setDefaultOnCondition(props, !props.containsKey(MAX_CONSISTENCY_CHECK_INTERVAL_MS_PROP),
MAX_CONSISTENCY_CHECK_INTERVAL_MS_PROP, String.valueOf(DEFAULT_MAX_CONSISTENCY_CHECK_INTERVAL_MS));
setDefaultOnCondition(props, !props.containsKey(MAX_CONSISTENCY_CHECKS_PROP),
MAX_CONSISTENCY_CHECKS_PROP, String.valueOf(DEFAULT_MAX_CONSISTENCY_CHECKS));
setDefaultOnCondition(props, !props.containsKey(MAX_CONSISTENCY_CHECKS_PROP), MAX_CONSISTENCY_CHECKS_PROP,
String.valueOf(DEFAULT_MAX_CONSISTENCY_CHECKS));
setDefaultOnCondition(props, !props.containsKey(FAIL_ON_TIMELINE_ARCHIVING_ENABLED_PROP),
FAIL_ON_TIMELINE_ARCHIVING_ENABLED_PROP, DEFAULT_FAIL_ON_TIMELINE_ARCHIVING_ENABLED);
// Make sure the props is propagated
setDefaultOnCondition(props, !isIndexConfigSet,
HoodieIndexConfig.newBuilder().fromProperties(props).build());
setDefaultOnCondition(props, !isStorageConfigSet,
HoodieStorageConfig.newBuilder().fromProperties(props).build());
setDefaultOnCondition(props, !isIndexConfigSet, HoodieIndexConfig.newBuilder().fromProperties(props).build());
setDefaultOnCondition(props, !isStorageConfigSet, HoodieStorageConfig.newBuilder().fromProperties(props).build());
setDefaultOnCondition(props, !isCompactionConfigSet,
HoodieCompactionConfig.newBuilder().fromProperties(props).build());
setDefaultOnCondition(props, !isMetricsConfigSet,
HoodieMetricsConfig.newBuilder().fromProperties(props).build());
setDefaultOnCondition(props, !isMemoryConfigSet,
HoodieMemoryConfig.newBuilder().fromProperties(props).build());
setDefaultOnCondition(props, !isMetricsConfigSet, HoodieMetricsConfig.newBuilder().fromProperties(props).build());
setDefaultOnCondition(props, !isMemoryConfigSet, HoodieMemoryConfig.newBuilder().fromProperties(props).build());
setDefaultOnCondition(props, !isViewConfigSet,
FileSystemViewStorageConfig.newBuilder().fromProperties(props).build());
setDefaultOnCondition(props, !isConsistencyGuardSet,
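
A hedged end-to-end sketch of the builder above: sub-configs the caller sets explicitly are kept, and the rest are back-filled from the same Properties via the setDefaultOnCondition calls shown; the path and parallelism are placeholders:

import org.apache.hudi.config.HoodieCompactionConfig;
import org.apache.hudi.config.HoodieWriteConfig;

public class WriteConfigSketch {
  public static void main(String[] args) {
    HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()
        .withPath("/tmp/hoodie/sample-table") // placeholder base path
        .withParallelism(2, 2)                // insert / upsert shuffle parallelism
        .withCompactionConfig(HoodieCompactionConfig.newBuilder()
            .retainCommits(10).archiveCommitsWith(20, 30).build())
        .build(); // index, storage, metrics, memory configs fall back to defaults
    System.out.println(writeConfig.getBasePath());
  }
}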


@@ -19,8 +19,9 @@
package org.apache.hudi.exception;
/**
* <p> Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a delta
* commit </p>
* <p>
* Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a delta commit
* </p>
*/
public class HoodieAppendException extends HoodieException {


@@ -19,7 +19,8 @@
package org.apache.hudi.exception;
/**
* <p> Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a Commit
* <p>
* Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a Commit
* </p>
*/
public class HoodieCommitException extends HoodieException {


@@ -20,7 +20,9 @@ package org.apache.hudi.exception;
/**
* <p> Exception thrown when dependent system is not available </p>
* <p>
* Exception thrown when a dependent system is not available
* </p>
*/
public class HoodieDependentSystemUnavailableException extends HoodieException {


@@ -19,8 +19,9 @@
package org.apache.hudi.exception;
/**
* <p> Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a bulk
* insert </p>
* <p>
* Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a bulk insert
* </p>
*/
public class HoodieInsertException extends HoodieException {

View File

@@ -19,8 +19,9 @@
package org.apache.hudi.exception;
/**
* <p> Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a
* incremental upsert </p>
* <p>
* Exception thrown for any higher level errors when <code>HoodieClient</code> is doing an incremental upsert
* </p>
*/
public class HoodieUpsertException extends HoodieException {

View File

@@ -31,16 +31,16 @@ import org.apache.spark.api.java.function.Function2;
/**
* Map function that handles a sorted stream of HoodieRecords
*/
public class BulkInsertMapFunction<T extends HoodieRecordPayload> implements
Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<List<WriteStatus>>> {
public class BulkInsertMapFunction<T extends HoodieRecordPayload>
implements Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<List<WriteStatus>>> {
private String commitTime;
private HoodieWriteConfig config;
private HoodieTable<T> hoodieTable;
private List<String> fileIDPrefixes;
public BulkInsertMapFunction(String commitTime, HoodieWriteConfig config,
HoodieTable<T> hoodieTable, List<String> fileIDPrefixes) {
public BulkInsertMapFunction(String commitTime, HoodieWriteConfig config, HoodieTable<T> hoodieTable,
List<String> fileIDPrefixes) {
this.commitTime = commitTime;
this.config = config;
this.hoodieTable = hoodieTable;

View File

@@ -37,11 +37,10 @@ import org.apache.hudi.io.HoodieWriteHandle;
import org.apache.hudi.table.HoodieTable;
/**
* Lazy Iterable, that writes a stream of HoodieRecords sorted by the partitionPath, into new
* files.
* Lazy Iterable, that writes a stream of HoodieRecords sorted by the partitionPath, into new files.
*/
public class CopyOnWriteLazyInsertIterable<T extends HoodieRecordPayload> extends
LazyIterableIterator<HoodieRecord<T>, List<WriteStatus>> {
public class CopyOnWriteLazyInsertIterable<T extends HoodieRecordPayload>
extends LazyIterableIterator<HoodieRecord<T>, List<WriteStatus>> {
protected final HoodieWriteConfig hoodieConfig;
protected final String commitTime;
@@ -80,25 +79,23 @@ public class CopyOnWriteLazyInsertIterable<T extends HoodieRecordPayload> extend
* Transformer function to help transform a HoodieRecord. This transformer is used by BufferedIterator to offload some
* expensive operations of transformation to the reader thread.
*/
static <T extends HoodieRecordPayload> Function<HoodieRecord<T>,
HoodieInsertValueGenResult<HoodieRecord>> getTransformFunction(Schema schema) {
static <T extends HoodieRecordPayload> Function<HoodieRecord<T>, HoodieInsertValueGenResult<HoodieRecord>> getTransformFunction(
Schema schema) {
return hoodieRecord -> new HoodieInsertValueGenResult(hoodieRecord, schema);
}
@Override
protected void start() {
}
protected void start() {}
@Override
protected List<WriteStatus> computeNext() {
// Executor service used for launching writer thread.
BoundedInMemoryExecutor<HoodieRecord<T>,
HoodieInsertValueGenResult<HoodieRecord>, List<WriteStatus>> bufferedIteratorExecutor = null;
BoundedInMemoryExecutor<HoodieRecord<T>, HoodieInsertValueGenResult<HoodieRecord>, List<WriteStatus>> bufferedIteratorExecutor =
null;
try {
final Schema schema = new Schema.Parser().parse(hoodieConfig.getSchema());
bufferedIteratorExecutor =
new SparkBoundedInMemoryExecutor<>(hoodieConfig, inputItr,
getInsertHandler(), getTransformFunction(schema));
new SparkBoundedInMemoryExecutor<>(hoodieConfig, inputItr, getInsertHandler(), getTransformFunction(schema));
final List<WriteStatus> result = bufferedIteratorExecutor.execute();
assert result != null && !result.isEmpty() && !bufferedIteratorExecutor.isRemaining();
return result;
@@ -112,8 +109,7 @@ public class CopyOnWriteLazyInsertIterable<T extends HoodieRecordPayload> extend
}
@Override
protected void end() {
}
protected void end() {}
protected String getNextFileId(String idPfx) {
return String.format("%s-%d", idPfx, numFilesWritten++);
@@ -124,11 +120,10 @@ public class CopyOnWriteLazyInsertIterable<T extends HoodieRecordPayload> extend
}
/**
* Consumes stream of hoodie records from in-memory queue and
* writes to one or more create-handles
* Consumes stream of hoodie records from in-memory queue and writes to one or more create-handles
*/
protected class CopyOnWriteInsertHandler extends
BoundedInMemoryQueueConsumer<HoodieInsertValueGenResult<HoodieRecord>, List<WriteStatus>> {
protected class CopyOnWriteInsertHandler
extends BoundedInMemoryQueueConsumer<HoodieInsertValueGenResult<HoodieRecord>, List<WriteStatus>> {
protected final List<WriteStatus> statuses = new ArrayList<>();
protected HoodieWriteHandle handle;
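The hand-off above (reader thread transforms, writer thread consumes) is the classic bounded producer/consumer shape. A generic sketch using plain java.util.concurrent, with hypothetical names — not the Hudi BoundedInMemoryExecutor API:

import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.function.Function;

// Sketch: the producer applies an expensive transform before enqueueing,
// so the consumer (writer) thread is not blocked on transformation work.
class BoundedHandoff<I, O> {
  private final BlockingQueue<O> queue = new ArrayBlockingQueue<>(1024); // bounds memory

  void produce(Iterable<I> inputs, Function<I, O> transform) throws InterruptedException {
    for (I in : inputs) {
      queue.put(transform.apply(in)); // blocks when the queue is full (backpressure)
    }
  }

  O consumeOne() throws InterruptedException {
    return queue.take(); // writer thread drains already-transformed records
  }
}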

View File

@@ -21,16 +21,15 @@ package org.apache.hudi.func;
import java.util.Iterator;
/**
* (NOTE: Adapted from Apache SystemML) This class is a generic base class for lazy, single pass
* inputItr classes in order to simplify the implementation of lazy iterators for mapPartitions use
* cases. Note [SPARK-3369], which gives the reasons for backwards compatibility with regard to the
* iterable API despite Spark's single pass nature.
* (NOTE: Adapted from Apache SystemML) This class is a generic base class for lazy, single pass inputItr classes in
* order to simplify the implementation of lazy iterators for mapPartitions use cases. Note [SPARK-3369], which gives
* the reasons for backwards compatibility with regard to the iterable API despite Spark's single pass nature.
* <p>
* Provide a way to obtain an inputItr of type O (output), out of an inputItr of type I (input)
* <p>
* Things to remember: - Assumes Spark calls hasNext() to check for elements, before calling next()
* to obtain them - Assumes hasNext() gets called atleast once. - Concrete Implementation is
* responsible for calling inputIterator.next() and doing the processing in computeNext()
* Things to remember: - Assumes Spark calls hasNext() to check for elements, before calling next() to obtain them -
* Assumes hasNext() gets called at least once. - Concrete Implementation is responsible for calling inputIterator.next()
* and doing the processing in computeNext()
*/
public abstract class LazyIterableIterator<I, O> implements Iterable<O>, Iterator<O> {
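To make the contract concrete, here is a standalone miniature of the same pattern (illustrative code, not from the codebase): subclasses implement only computeNext(), and the class hands itself out as both Iterable and single-pass Iterator, matching how Spark drives mapPartitions.

import java.util.Iterator;

// Standalone sketch of the lazy single-pass pattern (not Hudi code).
abstract class MiniLazyIterator<I, O> implements Iterable<O>, Iterator<O> {
  protected final Iterator<I> inputItr;
  protected MiniLazyIterator(Iterator<I> inputItr) { this.inputItr = inputItr; }
  protected abstract O computeNext();          // pulls from inputItr and transforms
  @Override public Iterator<O> iterator() { return this; }  // single pass only
  @Override public boolean hasNext() { return inputItr.hasNext(); }
  @Override public O next() { return computeNext(); }  // assumes hasNext() was called first
}

// Usage: squares integers lazily, one element at a time.
class Squares extends MiniLazyIterator<Integer, Integer> {
  Squares(Iterator<Integer> in) { super(in); }
  @Override protected Integer computeNext() { int v = inputItr.next(); return v * v; }
}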

View File

@@ -29,11 +29,9 @@ import org.apache.hudi.io.HoodieAppendHandle;
import org.apache.hudi.table.HoodieTable;
/**
* Lazy Iterable, that writes a stream of HoodieRecords sorted by the partitionPath, into new
* log files.
* Lazy Iterable, that writes a stream of HoodieRecords sorted by the partitionPath, into new log files.
*/
public class MergeOnReadLazyInsertIterable<T extends HoodieRecordPayload> extends
CopyOnWriteLazyInsertIterable<T> {
public class MergeOnReadLazyInsertIterable<T extends HoodieRecordPayload> extends CopyOnWriteLazyInsertIterable<T> {
public MergeOnReadLazyInsertIterable(Iterator<HoodieRecord<T>> sortedRecordItr, HoodieWriteConfig config,
String commitTime, HoodieTable<T> hoodieTable, String idPfx) {

View File

@@ -32,8 +32,7 @@ public class OperationResult<T> implements Serializable {
private boolean success;
private Option<Exception> exception;
public OperationResult() {
}
public OperationResult() {}
public OperationResult(T operation, boolean success, Option<Exception> exception) {
this.operation = operation;
@@ -67,11 +66,7 @@ public class OperationResult<T> implements Serializable {
@Override
public String toString() {
return "OperationResult{"
+ "operation=" + operation
+ ", executed=" + executed
+ ", success=" + success
+ ", exception=" + exception
+ '}';
return "OperationResult{" + "operation=" + operation + ", executed=" + executed + ", success=" + success
+ ", exception=" + exception + '}';
}
}

View File

@@ -25,8 +25,8 @@ import org.apache.hudi.exception.HoodieIOException;
import org.apache.parquet.hadoop.ParquetReader;
/**
* This class wraps a parquet reader and provides an iterator based api to
* read from a parquet file. This is used in {@link BoundedInMemoryQueue}
* This class wraps a parquet reader and provides an iterator based api to read from a parquet file. This is used in
* {@link BoundedInMemoryQueue}
*/
public class ParquetReaderIterator<T> implements Iterator<T> {
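The wrapping idea generalizes to any pull-style reader that signals end-of-stream with null; a minimal sketch with one element of read-ahead (Supplier stands in for ParquetReader::read here — hypothetical, for illustration only):

import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.function.Supplier;

// Sketch: adapt a read()-style source that returns null at end-of-stream
// into an Iterator by buffering one element of read-ahead.
class ReaderIterator<T> implements Iterator<T> {
  private final Supplier<T> reader; // stands in for ParquetReader::read
  private T next;

  ReaderIterator(Supplier<T> reader) { this.reader = reader; }

  @Override public boolean hasNext() {
    if (next == null) { next = reader.get(); } // read ahead once
    return next != null;
  }

  @Override public T next() {
    if (!hasNext()) { throw new NoSuchElementException(); }
    T out = next;
    next = null;
    return out;
  }
}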

View File

@@ -36,17 +36,13 @@ public class SparkBoundedInMemoryExecutor<I, O, E> extends BoundedInMemoryExecut
final TaskContext sparkThreadTaskContext;
public SparkBoundedInMemoryExecutor(final HoodieWriteConfig hoodieConfig, final Iterator<I> inputItr,
BoundedInMemoryQueueConsumer<O, E> consumer,
Function<I, O> bufferedIteratorTransform) {
BoundedInMemoryQueueConsumer<O, E> consumer, Function<I, O> bufferedIteratorTransform) {
this(hoodieConfig, new IteratorBasedQueueProducer<>(inputItr), consumer, bufferedIteratorTransform);
}
public SparkBoundedInMemoryExecutor(final HoodieWriteConfig hoodieConfig,
BoundedInMemoryQueueProducer<I> producer,
BoundedInMemoryQueueConsumer<O, E> consumer,
Function<I, O> bufferedIteratorTransform) {
super(hoodieConfig.getWriteBufferLimitBytes(), producer,
Option.of(consumer), bufferedIteratorTransform);
public SparkBoundedInMemoryExecutor(final HoodieWriteConfig hoodieConfig, BoundedInMemoryQueueProducer<I> producer,
BoundedInMemoryQueueConsumer<O, E> consumer, Function<I, O> bufferedIteratorTransform) {
super(hoodieConfig.getWriteBufferLimitBytes(), producer, Option.of(consumer), bufferedIteratorTransform);
this.sparkThreadTaskContext = TaskContext.get();
}

View File

@@ -65,18 +65,18 @@ public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Seri
}
/**
* Checks if the given [Keys] exists in the hoodie table and returns [Key, Option[partitionPath, fileID]]
* If the optional is empty, then the key is not found.
* Checks if the given [Keys] exist in the hoodie table and returns [Key, Option[partitionPath, fileID]]. If the
* optional is empty, then the key is not found.
*/
public abstract JavaPairRDD<HoodieKey, Option<Pair<String, String>>> fetchRecordLocation(
JavaRDD<HoodieKey> hoodieKeys, final JavaSparkContext jsc, HoodieTable<T> hoodieTable);
/**
* Looks up the index and tags each incoming record with a location of a file that contains the
* row (if it is actually present)
* Looks up the index and tags each incoming record with a location of a file that contains the row (if it is actually
* present)
*/
public abstract JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD,
JavaSparkContext jsc, HoodieTable<T> hoodieTable) throws HoodieIndexException;
public abstract JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, JavaSparkContext jsc,
HoodieTable<T> hoodieTable) throws HoodieIndexException;
/**
* Extracts the location of written records, and updates the index.
@@ -84,8 +84,7 @@ public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Seri
* TODO(vc): We may need to propagate the record as well in a WriteStatus class
*/
public abstract JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD, JavaSparkContext jsc,
HoodieTable<T> hoodieTable)
throws HoodieIndexException;
HoodieTable<T> hoodieTable) throws HoodieIndexException;
/**
* Rollback the effects of the commit made at commitTime.
@@ -93,17 +92,17 @@ public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Seri
public abstract boolean rollbackCommit(String commitTime);
/**
* An index is `global` if {@link HoodieKey} to fileID mapping, does not depend on the
* `partitionPath`. Such an implementation is able to obtain the same mapping, for two hoodie keys
* with same `recordKey` but different `partitionPath`
* An index is `global` if the {@link HoodieKey} to fileID mapping does not depend on the `partitionPath`. Such an
* implementation is able to obtain the same mapping, for two hoodie keys with same `recordKey` but different
* `partitionPath`
*
* @return whether or not, the index implementation is global in nature
*/
public abstract boolean isGlobal();
/**
* This is used by storage to determine, if its safe to send inserts, straight to the log, i.e
* having a {@link FileSlice}, with no data file.
* This is used by storage to determine if it's safe to send inserts straight to the log, i.e. having a
* {@link FileSlice}, with no data file.
*
* @return Returns true/false depending on whether the impl has this capability
*/
@@ -111,8 +110,8 @@ public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Seri
/**
* An index is "implicit" with respect to storage, if just writing new data to a file slice,
* updates the index as well. This is used by storage, to save memory footprint in certain cases.
* An index is "implicit" with respect to storage, if just writing new data to a file slice, updates the index as
* well. This is used by storage, to save memory footprint in certain cases.
*/
public abstract boolean isImplicitWithStorage();
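A toy illustration of the global/non-global distinction using plain HashMaps (all names hypothetical; the real API is RDD-based): a global index keys on recordKey alone, a non-global one on (partitionPath, recordKey), so the same record key in two partitions resolves to one mapping in the former and two in the latter.

import java.util.HashMap;
import java.util.Map;

class IndexScopeDemo {
  public static void main(String[] args) {
    // Global: recordKey -> fileId, regardless of partition.
    Map<String, String> global = new HashMap<>();
    global.put("key1", "file-A");

    // Non-global: (partitionPath, recordKey) -> fileId.
    Map<String, String> nonGlobal = new HashMap<>();
    nonGlobal.put("2019/10/01|key1", "file-A");
    nonGlobal.put("2019/10/02|key1", "file-B"); // same key, different partition

    System.out.println(global.get("key1"));               // file-A, from any partition
    System.out.println(nonGlobal.get("2019/10/02|key1")); // file-B
  }
}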

View File

@@ -40,7 +40,9 @@ import org.apache.spark.api.java.function.Function2;
/**
* Hoodie Index implementation backed by an in-memory Hash map. <p> ONLY USE FOR LOCAL TESTING
* Hoodie Index implementation backed by an in-memory Hash map.
* <p>
* ONLY USE FOR LOCAL TESTING
*/
public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
@@ -122,12 +124,10 @@ public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieInde
/**
* Function that tags each HoodieRecord with an existing location, if known.
*/
class LocationTagFunction implements
Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<HoodieRecord<T>>> {
class LocationTagFunction implements Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<HoodieRecord<T>>> {
@Override
public Iterator<HoodieRecord<T>> call(Integer partitionNum,
Iterator<HoodieRecord<T>> hoodieRecordIterator) {
public Iterator<HoodieRecord<T>> call(Integer partitionNum, Iterator<HoodieRecord<T>> hoodieRecordIterator) {
List<HoodieRecord<T>> taggedRecords = new ArrayList<>();
while (hoodieRecordIterator.hasNext()) {
HoodieRecord<T> rec = hoodieRecordIterator.next();

View File

@@ -35,6 +35,7 @@ import org.apache.spark.Partitioner;
* Partitions bloom filter checks by spreading out comparisons across buckets of work.
*
* Each bucket incurs the following cost
*
* <pre>
* 1) Read bloom filter from file footer
* 2) Check keys against bloom filter
@@ -47,6 +48,7 @@ import org.apache.spark.Partitioner;
* could bound the amount of skew to std_dev(numberOfBucketsPerPartition) * cost of (3), lower than sort partitioning.
*
* Approach has two goals :
*
* <pre>
* 1) Pack as many buckets from same file group into same partition, to amortize cost of (1) and (2) further
* 2) Spread buckets across partitions evenly to achieve skew reduction
@@ -76,8 +78,7 @@ public class BucketizedBloomCheckPartitioner extends Partitioner {
Map<String, Integer> bucketsPerFileGroup = new HashMap<>();
// Compute the buckets needed per file group, using simple uniform distribution
fileGroupToComparisons.forEach((f, c) ->
bucketsPerFileGroup.put(f, (int) Math.ceil((c * 1.0) / keysPerBucket)));
fileGroupToComparisons.forEach((f, c) -> bucketsPerFileGroup.put(f, (int) Math.ceil((c * 1.0) / keysPerBucket)));
int totalBuckets = bucketsPerFileGroup.values().stream().mapToInt(i -> i).sum();
// If totalBuckets > targetPartitions, no need to have extra partitions
this.partitions = Math.min(targetPartitions, totalBuckets);
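Worked numbers for the bucket math above (values assumed for illustration): with keysPerBucket = 400, a file group needing 1,000 comparisons gets ceil(1000 / 400) = 3 buckets and one needing 250 gets ceil(250 / 400) = 1, so totalBuckets = 4; with targetPartitions = 10, the partitioner settles on min(10, 4) = 4 partitions.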

View File

@@ -78,12 +78,12 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
}
// Step 1: Extract out thinner JavaPairRDD of (partitionPath, recordKey)
JavaPairRDD<String, String> partitionRecordKeyPairRDD = recordRDD
.mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey()));
JavaPairRDD<String, String> partitionRecordKeyPairRDD =
recordRDD.mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey()));
// Lookup indexes for all the partition/recordkey pair
JavaPairRDD<HoodieKey, HoodieRecordLocation> keyFilenamePairRDD = lookupIndex(partitionRecordKeyPairRDD, jsc,
hoodieTable);
JavaPairRDD<HoodieKey, HoodieRecordLocation> keyFilenamePairRDD =
lookupIndex(partitionRecordKeyPairRDD, jsc, hoodieTable);
// Cache the result, for subsequent stages.
if (config.getBloomIndexUseCaching()) {
@@ -96,8 +96,7 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
// Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys
// Cost: 4 sec.
JavaRDD<HoodieRecord<T>> taggedRecordRDD = tagLocationBacktoRecords(keyFilenamePairRDD,
recordRDD);
JavaRDD<HoodieRecord<T>> taggedRecordRDD = tagLocationBacktoRecords(keyFilenamePairRDD, recordRDD);
if (config.getBloomIndexUseCaching()) {
recordRDD.unpersist(); // unpersist the input Record RDD
@@ -108,8 +107,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
}
/**
* Returns an RDD mapping each HoodieKey with a partitionPath/fileID which contains it. Option.Empty if the key is
* not found.
* Returns an RDD mapping each HoodieKey with a partitionPath/fileID which contains it. Option.Empty if the key is not
* found.
*
* @param hoodieKeys keys to lookup
* @param jsc spark context
@@ -118,12 +117,12 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
@Override
public JavaPairRDD<HoodieKey, Option<Pair<String, String>>> fetchRecordLocation(JavaRDD<HoodieKey> hoodieKeys,
JavaSparkContext jsc, HoodieTable<T> hoodieTable) {
JavaPairRDD<String, String> partitionRecordKeyPairRDD = hoodieKeys
.mapToPair(key -> new Tuple2<>(key.getPartitionPath(), key.getRecordKey()));
JavaPairRDD<String, String> partitionRecordKeyPairRDD =
hoodieKeys.mapToPair(key -> new Tuple2<>(key.getPartitionPath(), key.getRecordKey()));
// Lookup indexes for all the partition/recordkey pair
JavaPairRDD<HoodieKey, HoodieRecordLocation> recordKeyLocationRDD = lookupIndex(partitionRecordKeyPairRDD, jsc,
hoodieTable);
JavaPairRDD<HoodieKey, HoodieRecordLocation> recordKeyLocationRDD =
lookupIndex(partitionRecordKeyPairRDD, jsc, hoodieTable);
JavaPairRDD<HoodieKey, String> keyHoodieKeyPairRDD = hoodieKeys.mapToPair(key -> new Tuple2<>(key, null));
return keyHoodieKeyPairRDD.leftOuterJoin(recordKeyLocationRDD).mapToPair(keyLoc -> {
@@ -149,19 +148,19 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
List<String> affectedPartitionPathList = new ArrayList<>(recordsPerPartition.keySet());
// Step 2: Load all involved files as <Partition, filename> pairs
List<Tuple2<String, BloomIndexFileInfo>> fileInfoList = loadInvolvedFiles(affectedPartitionPathList, jsc,
hoodieTable);
final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo = fileInfoList.stream()
.collect(groupingBy(Tuple2::_1, mapping(Tuple2::_2, toList())));
List<Tuple2<String, BloomIndexFileInfo>> fileInfoList =
loadInvolvedFiles(affectedPartitionPathList, jsc, hoodieTable);
final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo =
fileInfoList.stream().collect(groupingBy(Tuple2::_1, mapping(Tuple2::_2, toList())));
// Step 3: Obtain a RDD, for each incoming record, that already exists, with the file id,
// that contains it.
Map<String, Long> comparisonsPerFileGroup = computeComparisonsPerFileGroup(recordsPerPartition, partitionToFileInfo,
partitionRecordKeyPairRDD);
Map<String, Long> comparisonsPerFileGroup =
computeComparisonsPerFileGroup(recordsPerPartition, partitionToFileInfo, partitionRecordKeyPairRDD);
int safeParallelism = computeSafeParallelism(recordsPerPartition, comparisonsPerFileGroup);
int joinParallelism = determineParallelism(partitionRecordKeyPairRDD.partitions().size(), safeParallelism);
return findMatchingFilesForRecordKeys(partitionToFileInfo, partitionRecordKeyPairRDD, joinParallelism,
hoodieTable, comparisonsPerFileGroup);
return findMatchingFilesForRecordKeys(partitionToFileInfo, partitionRecordKeyPairRDD, joinParallelism, hoodieTable,
comparisonsPerFileGroup);
}
/**
@@ -175,8 +174,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
if (config.getBloomIndexPruneByRanges()) {
// we will just try exploding the input and then count to determine comparisons
// FIX(vc): Only do sampling here and extrapolate?
fileToComparisons = explodeRecordRDDWithFileComparisons(partitionToFileInfo,
partitionRecordKeyPairRDD).mapToPair(t -> t).countByKey();
fileToComparisons = explodeRecordRDDWithFileComparisons(partitionToFileInfo, partitionRecordKeyPairRDD)
.mapToPair(t -> t).countByKey();
} else {
fileToComparisons = new HashMap<>();
partitionToFileInfo.entrySet().stream().forEach(e -> {
@@ -191,34 +190,41 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
/**
* Compute the minimum parallelism needed to play well with the spark 2GB limitation.. The index lookup can be skewed
* in three dimensions : #files, #partitions, #records <p> To be able to smoothly handle skews, we need to compute how
* to split each partitions into subpartitions. We do it here, in a way that keeps the amount of each Spark join
* partition to < 2GB. <p> If {@link HoodieIndexConfig#BLOOM_INDEX_PARALLELISM_PROP} is
* specified as a NON-zero number, then that is used explicitly.
* in three dimensions : #files, #partitions, #records
* <p>
* To be able to smoothly handle skews, we need to compute how to split each partition into subpartitions. We do it
* here, in a way that keeps the amount of each Spark join partition to < 2GB.
* <p>
* If {@link HoodieIndexConfig#BLOOM_INDEX_PARALLELISM_PROP} is specified as a NON-zero number, then that is used
* explicitly.
*/
int computeSafeParallelism(Map<String, Long> recordsPerPartition, Map<String, Long> comparisonsPerFileGroup) {
long totalComparisons = comparisonsPerFileGroup.values().stream().mapToLong(Long::longValue).sum();
long totalFiles = comparisonsPerFileGroup.size();
long totalRecords = recordsPerPartition.values().stream().mapToLong(Long::longValue).sum();
int parallelism = (int) (totalComparisons / MAX_ITEMS_PER_SHUFFLE_PARTITION + 1);
logger.info(String.format("TotalRecords %d, TotalFiles %d, TotalAffectedPartitions %d, TotalComparisons %d, "
+ "SafeParallelism %d", totalRecords, totalFiles, recordsPerPartition.size(), totalComparisons, parallelism));
logger.info(String.format(
"TotalRecords %d, TotalFiles %d, TotalAffectedPartitions %d, TotalComparisons %d, " + "SafeParallelism %d",
totalRecords, totalFiles, recordsPerPartition.size(), totalComparisons, parallelism));
return parallelism;
}
/**
* Its crucial to pick the right parallelism. <p> totalSubPartitions : this is deemed safe limit, to be nice with
* Spark. inputParallelism : typically number of input file splits <p> We pick the max such that, we are always safe,
* but go higher if say a there are a lot of input files. (otherwise, we will fallback to number of partitions in
* input and end up with slow performance)
* It's crucial to pick the right parallelism.
* <p>
* totalSubPartitions : this is deemed a safe limit, to be nice with Spark. inputParallelism : typically number of input
* file splits
* <p>
* We pick the max such that we are always safe, but go higher if, say, there are a lot of input files (otherwise we
* will fall back to the number of partitions in the input and end up with slow performance)
*/
private int determineParallelism(int inputParallelism, int totalSubPartitions) {
// If bloom index parallelism is set, use it to check against the input parallelism and
// take the max
int indexParallelism = Math.max(inputParallelism, config.getBloomIndexParallelism());
int joinParallelism = Math.max(totalSubPartitions, indexParallelism);
logger.info("InputParallelism: ${" + inputParallelism + "}, " + "IndexParallelism: ${" + config
.getBloomIndexParallelism() + "}, " + "TotalSubParts: ${" + totalSubPartitions + "}, "
logger.info("InputParallelism: ${" + inputParallelism + "}, " + "IndexParallelism: ${"
+ config.getBloomIndexParallelism() + "}, " + "TotalSubParts: ${" + totalSubPartitions + "}, "
+ "Join Parallelism set to : " + joinParallelism);
return joinParallelism;
}
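Worked numbers for the two methods above (all values assumed): if the per-file comparison counts sum to 450 million and MAX_ITEMS_PER_SHUFFLE_PARTITION were 100 million, computeSafeParallelism returns 450M / 100M + 1 = 5. If the input RDD has 200 partitions and config.getBloomIndexParallelism() is 0, determineParallelism computes max(200, 0) = 200 and then max(5, 200) = 200, so the join runs at the input parallelism; the safe floor only takes over when it exceeds the input side.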
@@ -231,11 +237,10 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
final HoodieTable hoodieTable) {
// Obtain the latest data files from all the partitions.
List<Pair<String, String>> partitionPathFileIDList = jsc
.parallelize(partitions, Math.max(partitions.size(), 1))
.flatMap(partitionPath -> {
Option<HoodieInstant> latestCommitTime = hoodieTable.getMetaClient().getCommitsTimeline()
.filterCompletedInstants().lastInstant();
List<Pair<String, String>> partitionPathFileIDList =
jsc.parallelize(partitions, Math.max(partitions.size(), 1)).flatMap(partitionPath -> {
Option<HoodieInstant> latestCommitTime =
hoodieTable.getMetaClient().getCommitsTimeline().filterCompletedInstants().lastInstant();
List<Pair<String, String>> filteredFiles = new ArrayList<>();
if (latestCommitTime.isPresent()) {
filteredFiles = hoodieTable.getROFileSystemView()
@@ -259,8 +264,7 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
}).collect();
} else {
return partitionPathFileIDList.stream()
.map(pf -> new Tuple2<>(pf.getKey(), new BloomIndexFileInfo(pf.getValue())))
.collect(toList());
.map(pf -> new Tuple2<>(pf.getKey(), new BloomIndexFileInfo(pf.getValue()))).collect(toList());
}
}
@@ -307,8 +311,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
JavaRDD<Tuple2<String, HoodieKey>> explodeRecordRDDWithFileComparisons(
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
JavaPairRDD<String, String> partitionRecordKeyPairRDD) {
IndexFileFilter indexFileFilter = config.useBloomIndexTreebasedFilter()
? new IntervalTreeBasedIndexFileFilter(partitionToFileIndexInfo)
IndexFileFilter indexFileFilter =
config.useBloomIndexTreebasedFilter() ? new IntervalTreeBasedIndexFileFilter(partitionToFileIndexInfo)
: new ListBasedIndexFileFilter(partitionToFileIndexInfo);
return partitionRecordKeyPairRDD.map(partitionRecordKeyPair -> {
@@ -322,10 +326,12 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
}
/**
* Find out <RowKey, filename> pair. All workload grouped by file-level. <p> Join PairRDD(PartitionPath, RecordKey)
* and PairRDD(PartitionPath, File) & then repartition such that each RDD partition is a file, then for each file, we
* do (1) load bloom filter, (2) load rowKeys, (3) Tag rowKey <p> Make sure the parallelism is atleast the groupby
* parallelism for tagging location
* Find out <RowKey, filename> pair. All workload grouped by file-level.
* <p>
* Join PairRDD(PartitionPath, RecordKey) and PairRDD(PartitionPath, File) & then repartition such that each RDD
* partition is a file, then for each file, we do (1) load bloom filter, (2) load rowKeys, (3) Tag rowKey
* <p>
* Make sure the parallelism is at least the groupby parallelism for tagging location
*/
@VisibleForTesting
JavaPairRDD<HoodieKey, HoodieRecordLocation> findMatchingFilesForRecordKeys(
@@ -336,33 +342,24 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD);
if (config.useBloomIndexBucketizedChecking()) {
Partitioner partitioner = new BucketizedBloomCheckPartitioner(
shuffleParallelism,
fileGroupToComparisons,
config.getBloomIndexKeysPerBucket()
);
Partitioner partitioner = new BucketizedBloomCheckPartitioner(shuffleParallelism, fileGroupToComparisons,
config.getBloomIndexKeysPerBucket());
fileComparisonsRDD = fileComparisonsRDD
.mapToPair(t -> new Tuple2<>(Pair.of(t._1, t._2.getRecordKey()), t))
.repartitionAndSortWithinPartitions(partitioner)
.map(Tuple2::_2);
fileComparisonsRDD = fileComparisonsRDD.mapToPair(t -> new Tuple2<>(Pair.of(t._1, t._2.getRecordKey()), t))
.repartitionAndSortWithinPartitions(partitioner).map(Tuple2::_2);
} else {
fileComparisonsRDD = fileComparisonsRDD.sortBy(Tuple2::_1, true, shuffleParallelism);
}
return fileComparisonsRDD
.mapPartitionsWithIndex(new HoodieBloomIndexCheckFunction(hoodieTable, config), true)
.flatMap(List::iterator)
.filter(lr -> lr.getMatchingRecordKeys().size() > 0)
return fileComparisonsRDD.mapPartitionsWithIndex(new HoodieBloomIndexCheckFunction(hoodieTable, config), true)
.flatMap(List::iterator).filter(lr -> lr.getMatchingRecordKeys().size() > 0)
.flatMapToPair(lookupResult -> lookupResult.getMatchingRecordKeys().stream()
.map(recordKey -> new Tuple2<>(new HoodieKey(recordKey, lookupResult.getPartitionPath()),
new HoodieRecordLocation(lookupResult.getBaseInstantTime(), lookupResult.getFileId())))
.collect(Collectors.toList())
.iterator());
.collect(Collectors.toList()).iterator());
}
HoodieRecord<T> getTaggedRecord(HoodieRecord<T> inputRecord,
Option<HoodieRecordLocation> location) {
HoodieRecord<T> getTaggedRecord(HoodieRecord<T> inputRecord, Option<HoodieRecordLocation> location) {
HoodieRecord<T> record = inputRecord;
if (location.isPresent()) {
// When you have a record in multiple files in the same partition, then rowKeyRecordPairRDD
@@ -383,12 +380,12 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
*/
protected JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(
JavaPairRDD<HoodieKey, HoodieRecordLocation> keyFilenamePairRDD, JavaRDD<HoodieRecord<T>> recordRDD) {
JavaPairRDD<HoodieKey, HoodieRecord<T>> keyRecordPairRDD = recordRDD
.mapToPair(record -> new Tuple2<>(record.getKey(), record));
JavaPairRDD<HoodieKey, HoodieRecord<T>> keyRecordPairRDD =
recordRDD.mapToPair(record -> new Tuple2<>(record.getKey(), record));
// Here as the recordRDD might have more data than rowKeyRDD (some rowKeys' fileId is null),
// so we do left outer join.
return keyRecordPairRDD.leftOuterJoin(keyFilenamePairRDD).values().map(
v1 -> getTaggedRecord(v1._1, Option.ofNullable(v1._2.orNull())));
return keyRecordPairRDD.leftOuterJoin(keyFilenamePairRDD).values()
.map(v1 -> getTaggedRecord(v1._1, Option.ofNullable(v1._2.orNull())));
}
@Override

View File

@@ -34,11 +34,10 @@ import org.apache.spark.api.java.function.Function2;
import scala.Tuple2;
/**
* Function performing actual checking of RDD partition containing (fileId, hoodieKeys) against the
* actual files
* Function performing actual checking of RDD partition containing (fileId, hoodieKeys) against the actual files
*/
public class HoodieBloomIndexCheckFunction implements
Function2<Integer, Iterator<Tuple2<String, HoodieKey>>, Iterator<List<KeyLookupResult>>> {
public class HoodieBloomIndexCheckFunction
implements Function2<Integer, Iterator<Tuple2<String, HoodieKey>>, Iterator<List<KeyLookupResult>>> {
private final HoodieTable hoodieTable;
@@ -59,14 +58,12 @@ public class HoodieBloomIndexCheckFunction implements
private HoodieKeyLookupHandle keyLookupHandle;
LazyKeyCheckIterator(
Iterator<Tuple2<String, HoodieKey>> filePartitionRecordKeyTripletItr) {
LazyKeyCheckIterator(Iterator<Tuple2<String, HoodieKey>> filePartitionRecordKeyTripletItr) {
super(filePartitionRecordKeyTripletItr);
}
@Override
protected void start() {
}
protected void start() {}
@Override
protected List<HoodieKeyLookupHandle.KeyLookupResult> computeNext() {
@@ -113,7 +110,6 @@ public class HoodieBloomIndexCheckFunction implements
}
@Override
protected void end() {
}
protected void end() {}
}
}

View File

@@ -59,8 +59,7 @@ public class HoodieGlobalBloomIndex<T extends HoodieRecordPayload> extends Hoodi
final HoodieTable hoodieTable) {
HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
try {
List<String> allPartitionPaths = FSUtils
.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(),
List<String> allPartitionPaths = FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(),
config.shouldAssumeDatePartitioning());
return super.loadInvolvedFiles(allPartitionPaths, jsc, hoodieTable);
} catch (IOException e) {
@@ -88,8 +87,8 @@ public class HoodieGlobalBloomIndex<T extends HoodieRecordPayload> extends Hoodi
entry.getValue().forEach(indexFile -> indexToPartitionMap.put(indexFile.getFileId(), entry.getKey()));
}
IndexFileFilter indexFileFilter = config.getBloomIndexPruneByRanges()
? new IntervalTreeBasedGlobalIndexFileFilter(partitionToFileIndexInfo)
IndexFileFilter indexFileFilter =
config.getBloomIndexPruneByRanges() ? new IntervalTreeBasedGlobalIndexFileFilter(partitionToFileIndexInfo)
: new ListBasedGlobalIndexFileFilter(partitionToFileIndexInfo);
return partitionRecordKeyPairRDD.map(partitionRecordKeyPair -> {
@@ -109,8 +108,8 @@ public class HoodieGlobalBloomIndex<T extends HoodieRecordPayload> extends Hoodi
@Override
protected JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(
JavaPairRDD<HoodieKey, HoodieRecordLocation> keyFilenamePairRDD, JavaRDD<HoodieRecord<T>> recordRDD) {
JavaPairRDD<String, HoodieRecord<T>> rowKeyRecordPairRDD = recordRDD
.mapToPair(record -> new Tuple2<>(record.getRecordKey(), record));
JavaPairRDD<String, HoodieRecord<T>> rowKeyRecordPairRDD =
recordRDD.mapToPair(record -> new Tuple2<>(record.getRecordKey(), record));
// Here as the recordRDD might have more data than rowKeyRDD (some rowKeys' fileId is null),
// so we do left outer join.

View File

@@ -41,16 +41,16 @@ class IntervalTreeBasedGlobalIndexFileFilter implements IndexFileFilter {
* @param partitionToFileIndexInfo Map of partition to List of {@link BloomIndexFileInfo}s
*/
IntervalTreeBasedGlobalIndexFileFilter(final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo) {
List<BloomIndexFileInfo> allIndexFiles = partitionToFileIndexInfo.values().stream().flatMap(Collection::stream)
.collect(Collectors.toList());
List<BloomIndexFileInfo> allIndexFiles =
partitionToFileIndexInfo.values().stream().flatMap(Collection::stream).collect(Collectors.toList());
// Note that the interval tree implementation doesn't have auto-balancing to ensure logN search time.
// So, we are shuffling the input here hoping the tree will not have any skewness. If not, the tree could be skewed
// which could result in N search time instead of NlogN.
Collections.shuffle(allIndexFiles);
allIndexFiles.forEach(indexFile -> {
if (indexFile.hasKeyRanges()) {
indexLookUpTree.insert(new KeyRangeNode(indexFile.getMinRecordKey(),
indexFile.getMaxRecordKey(), indexFile.getFileId()));
indexLookUpTree
.insert(new KeyRangeNode(indexFile.getMinRecordKey(), indexFile.getMaxRecordKey(), indexFile.getFileId()));
} else {
filesWithNoRanges.add(indexFile.getFileId());
}

View File

@@ -48,8 +48,8 @@ class IntervalTreeBasedIndexFileFilter implements IndexFileFilter {
KeyRangeLookupTree lookUpTree = new KeyRangeLookupTree();
bloomIndexFiles.forEach(indexFileInfo -> {
if (indexFileInfo.hasKeyRanges()) {
lookUpTree.insert(new KeyRangeNode(indexFileInfo.getMinRecordKey(),
indexFileInfo.getMaxRecordKey(), indexFileInfo.getFileId()));
lookUpTree.insert(new KeyRangeNode(indexFileInfo.getMinRecordKey(), indexFileInfo.getMaxRecordKey(),
indexFileInfo.getFileId()));
} else {
if (!partitionToFilesWithNoRanges.containsKey(partition)) {
partitionToFilesWithNoRanges.put(partition, new HashSet<>());

View File

@@ -50,25 +50,16 @@ class KeyRangeLookupTree implements Serializable {
*
* If no root exists, make {@code newNode} as the root and return the new root.
*
* If current root and newNode matches with min record key and max record key,
* merge two nodes. In other words, add files from {@code newNode} to current root.
* Return current root.
* If current root and newNode match on min record key and max record key, merge the two nodes. In other words, add
* files from {@code newNode} to current root. Return current root.
*
* If current root is < newNode
* if current root has no right sub tree
* update current root's right sub tree max and min
* set newNode as right sub tree
* else
* update root's right sub tree min and max with newNode's min and max record key as applicable
* recursively call insert() with root's right subtree as new root
* If current root is < newNode if current root has no right sub tree update current root's right sub tree max and min
* set newNode as right sub tree else update root's right sub tree min and max with newNode's min and max record key
* as applicable recursively call insert() with root's right subtree as new root
*
* else // current root is >= newNode
* if current root has no left sub tree
* update current root's left sub tree max and min
* set newNode as left sub tree
* else
* update root's left sub tree min and max with newNode's min and max record key as applicable
* recursively call insert() with root's left subtree as new root
* else // current root is >= newNode if current root has no left sub tree update current root's left sub tree max and
* min set newNode as left sub tree else update root's left sub tree min and max with newNode's min and max record key
* as applicable recursively call insert() with root's left subtree as new root
*
* @param root refers to the current root of the look up tree
* @param newNode the new {@link KeyRangeNode} to be inserted
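The insert walk the comment describes, restated as a standalone sketch (field and method names are illustrative, and the per-subtree min/max bookkeeping is elided to comments — this is not the actual Hudi implementation):

// Standalone sketch: BST ordered by (min, max) record key, merging on exact range match.
class RangeNode {
  String min, max;                 // min/max record key covered by the file
  java.util.List<String> files = new java.util.ArrayList<>();
  RangeNode left, right;

  RangeNode(String min, String max, String file) {
    this.min = min; this.max = max; this.files.add(file);
  }

  int cmp(RangeNode o) {           // order by (min, max)
    int c = min.compareTo(o.min);
    return c != 0 ? c : max.compareTo(o.max);
  }

  static RangeNode insert(RangeNode root, RangeNode n) {
    if (root == null) {
      return n;                    // empty tree: n becomes the root
    }
    if (root.cmp(n) == 0) {
      root.files.addAll(n.files);  // identical range: merge file lists
    } else if (root.cmp(n) < 0) {
      root.right = insert(root.right, n); // (real impl also refreshes right-subtree min/max hints here)
    } else {
      root.left = insert(root.left, n);   // (and left-subtree min/max hints here)
    }
    return root;
  }
}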

View File

@@ -62,15 +62,10 @@ class KeyRangeNode implements Comparable<KeyRangeNode>, Serializable {
@Override
public String toString() {
return "KeyRangeNode{"
+ "minRecordKey='" + minRecordKey + '\''
+ ", maxRecordKey='" + maxRecordKey + '\''
+ ", fileNameList=" + fileNameList
+ ", rightSubTreeMax='" + rightSubTreeMax + '\''
+ ", leftSubTreeMax='" + leftSubTreeMax + '\''
+ ", rightSubTreeMin='" + rightSubTreeMin + '\''
+ ", leftSubTreeMin='" + leftSubTreeMin + '\''
+ '}';
return "KeyRangeNode{" + "minRecordKey='" + minRecordKey + '\'' + ", maxRecordKey='" + maxRecordKey + '\''
+ ", fileNameList=" + fileNameList + ", rightSubTreeMax='" + rightSubTreeMax + '\'' + ", leftSubTreeMax='"
+ leftSubTreeMax + '\'' + ", rightSubTreeMin='" + rightSubTreeMin + '\'' + ", leftSubTreeMin='" + leftSubTreeMin
+ '\'' + '}';
}
/**
@@ -78,8 +73,8 @@ class KeyRangeNode implements Comparable<KeyRangeNode>, Serializable {
*
* @param that the {@link KeyRangeNode} to be compared with
* @return the result of comparison. 0 if both min and max are equal in both. 1 if this {@link KeyRangeNode} is
* greater than the {@code that} keyRangeNode. -1 if {@code that} keyRangeNode is greater than this {@link
* KeyRangeNode}
* greater than the {@code that} keyRangeNode. -1 if {@code that} keyRangeNode is greater than this
* {@link KeyRangeNode}
*/
@Override
public int compareTo(KeyRangeNode that) {

View File

@@ -30,8 +30,7 @@ class ListBasedGlobalIndexFileFilter extends ListBasedIndexFileFilter {
*
* @param partitionToFileIndexInfo Map of partition to List of {@link BloomIndexFileInfo}
*/
ListBasedGlobalIndexFileFilter(
Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo) {
ListBasedGlobalIndexFileFilter(Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo) {
super(partitionToFileIndexInfo);
}

View File

@@ -68,10 +68,8 @@ import scala.Tuple2;
*/
public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
public static final String DEFAULT_SPARK_EXECUTOR_INSTANCES_CONFIG_NAME =
"spark.executor.instances";
public static final String DEFAULT_SPARK_DYNAMIC_ALLOCATION_ENABLED_CONFIG_NAME =
"spark.dynamicAllocation.enabled";
public static final String DEFAULT_SPARK_EXECUTOR_INSTANCES_CONFIG_NAME = "spark.executor.instances";
public static final String DEFAULT_SPARK_DYNAMIC_ALLOCATION_ENABLED_CONFIG_NAME = "spark.dynamicAllocation.enabled";
public static final String DEFAULT_SPARK_DYNAMIC_ALLOCATION_MAX_EXECUTORS_CONFIG_NAME =
"spark.dynamicAllocation.maxExecutors";
@@ -114,9 +112,8 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
public HBaseIndexQPSResourceAllocator createQPSResourceAllocator(HoodieWriteConfig config) {
try {
logger.info("createQPSResourceAllocator :" + config.getHBaseQPSResourceAllocatorClass());
final HBaseIndexQPSResourceAllocator resourceAllocator =
(HBaseIndexQPSResourceAllocator) ReflectionUtils.loadClass(
config.getHBaseQPSResourceAllocatorClass(), config);
final HBaseIndexQPSResourceAllocator resourceAllocator = (HBaseIndexQPSResourceAllocator) ReflectionUtils
.loadClass(config.getHBaseQPSResourceAllocatorClass(), config);
return resourceAllocator;
} catch (Exception e) {
logger.warn("error while instantiating HBaseIndexQPSResourceAllocator", e);
@@ -143,14 +140,14 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
try {
return ConnectionFactory.createConnection(hbaseConfig);
} catch (IOException e) {
throw new HoodieDependentSystemUnavailableException(
HoodieDependentSystemUnavailableException.HBASE, quorum + ":" + port);
throw new HoodieDependentSystemUnavailableException(HoodieDependentSystemUnavailableException.HBASE,
quorum + ":" + port);
}
}
/**
* Since we are sharing the HbaseConnection across tasks in a JVM, make sure the HbaseConnectio is
* closed when JVM exits
* Since we are sharing the HbaseConnection across tasks in a JVM, make sure the HbaseConnection is closed when the JVM
* exits
*/
private void addShutDownHook() {
Runtime.getRuntime().addShutdownHook(new Thread() {
@@ -172,31 +169,28 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
}
private Get generateStatement(String key) throws IOException {
return new Get(Bytes.toBytes(key)).setMaxVersions(1)
.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN)
.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN)
.addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN);
return new Get(Bytes.toBytes(key)).setMaxVersions(1).addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN)
.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN).addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN);
}
private boolean checkIfValidCommit(HoodieTableMetaClient metaClient, String commitTs) {
HoodieTimeline commitTimeline = metaClient.getActiveTimeline().filterCompletedInstants();
// Check if the last commit ts for this row is 1) present in the timeline or
// 2) is less than the first commit ts in the timeline
return !commitTimeline.empty() && (commitTimeline
.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTs))
|| HoodieTimeline
.compareTimestamps(commitTimeline.firstInstant().get().getTimestamp(), commitTs,
return !commitTimeline.empty()
&& (commitTimeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTs))
|| HoodieTimeline.compareTimestamps(commitTimeline.firstInstant().get().getTimestamp(), commitTs,
HoodieTimeline.GREATER));
}
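Restating the validity rule on plain sorted timestamps (a sketch, not the HoodieTimeline API): a stored commit time is trusted if the completed timeline still contains it, or if it predates the earliest instant the timeline retains, which means it was archived rather than rolled back.

import java.util.TreeSet;

class CommitValidity {
  // Sketch: valid if present in the completed timeline, or older than its first instant.
  static boolean isValidCommit(TreeSet<String> completedCommits, String commitTs) {
    if (completedCommits.isEmpty()) {
      return false;
    }
    return completedCommits.contains(commitTs)
        || completedCommits.first().compareTo(commitTs) > 0; // archived out of the retention window
  }
}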
/**
* Function that tags each HoodieRecord with an existing location, if known.
*/
private Function2<Integer, Iterator<HoodieRecord<T>>,
Iterator<HoodieRecord<T>>> locationTagFunction(HoodieTableMetaClient metaClient) {
private Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<HoodieRecord<T>>> locationTagFunction(
HoodieTableMetaClient metaClient) {
return (Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<HoodieRecord<T>>>)
(partitionNum, hoodieRecordIterator) -> {
return (Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<HoodieRecord<T>>>) (partitionNum,
hoodieRecordIterator) -> {
Integer multiGetBatchSize = config.getHbaseIndexGetBatchSize();
@@ -228,16 +222,12 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
HoodieRecord currentRecord = currentBatchOfRecords.remove(0);
if (result.getRow() != null) {
String keyFromResult = Bytes.toString(result.getRow());
String commitTs = Bytes
.toString(result.getValue(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN));
String fileId = Bytes
.toString(result.getValue(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN));
String partitionPath = Bytes
.toString(result.getValue(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN));
String commitTs = Bytes.toString(result.getValue(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN));
String fileId = Bytes.toString(result.getValue(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN));
String partitionPath = Bytes.toString(result.getValue(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN));
if (checkIfValidCommit(metaClient, commitTs)) {
currentRecord = new HoodieRecord(
new HoodieKey(currentRecord.getRecordKey(), partitionPath),
currentRecord = new HoodieRecord(new HoodieKey(currentRecord.getRecordKey(), partitionPath),
currentRecord.getData());
currentRecord.unseal();
currentRecord.setCurrentLocation(new HoodieRecordLocation(commitTs, fileId));
@@ -255,8 +245,7 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
}
}
} catch (IOException e) {
throw new HoodieIndexException(
"Failed to Tag indexed locations because of exception with HBase Client", e);
throw new HoodieIndexException("Failed to Tag indexed locations because of exception with HBase Client", e);
} finally {
if (hTable != null) {
try {
@@ -310,12 +299,9 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
continue;
}
Put put = new Put(Bytes.toBytes(rec.getRecordKey()));
put.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN,
Bytes.toBytes(loc.get().getInstantTime()));
put.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN,
Bytes.toBytes(loc.get().getFileId()));
put.addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN,
Bytes.toBytes(rec.getPartitionPath()));
put.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN, Bytes.toBytes(loc.get().getInstantTime()));
put.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN, Bytes.toBytes(loc.get().getFileId()));
put.addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN, Bytes.toBytes(rec.getPartitionPath()));
puts.add(put);
} else {
// Delete existing index for a deleted record
@@ -338,8 +324,7 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
writeStatusList.add(writeStatus);
}
} catch (IOException e) {
throw new HoodieIndexException(
"Failed to Update Index locations because of exception with HBase Client", e);
throw new HoodieIndexException("Failed to Update Index locations because of exception with HBase Client", e);
} finally {
if (hTable != null) {
try {
@@ -356,8 +341,7 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
/**
* Helper method to facilitate performing puts and deletes in Hbase
*/
private void doPutsAndDeletes(HTable hTable, List<Put> puts, List<Delete> deletes)
throws IOException {
private void doPutsAndDeletes(HTable hTable, List<Put> puts, List<Delete> deletes) throws IOException {
if (puts.size() > 0) {
hTable.put(puts);
}
@@ -385,58 +369,49 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
final HBaseIndexQPSResourceAllocator hBaseIndexQPSResourceAllocator = createQPSResourceAllocator(this.config);
setPutBatchSize(writeStatusRDD, hBaseIndexQPSResourceAllocator, jsc);
logger.info("multiPutBatchSize: before hbase puts" + multiPutBatchSize);
JavaRDD<WriteStatus> writeStatusJavaRDD = writeStatusRDD.mapPartitionsWithIndex(
updateLocationFunction(), true);
JavaRDD<WriteStatus> writeStatusJavaRDD = writeStatusRDD.mapPartitionsWithIndex(updateLocationFunction(), true);
// caching the index updated status RDD
writeStatusJavaRDD = writeStatusJavaRDD.persist(config.getWriteStatusStorageLevel());
return writeStatusJavaRDD;
}
private void setPutBatchSize(JavaRDD<WriteStatus> writeStatusRDD,
HBaseIndexQPSResourceAllocator hBaseIndexQPSResourceAllocator,
final JavaSparkContext jsc) {
HBaseIndexQPSResourceAllocator hBaseIndexQPSResourceAllocator, final JavaSparkContext jsc) {
if (config.getHbaseIndexPutBatchSizeAutoCompute()) {
SparkConf conf = jsc.getConf();
int maxExecutors = conf.getInt(DEFAULT_SPARK_EXECUTOR_INSTANCES_CONFIG_NAME, 1);
if (conf.getBoolean(DEFAULT_SPARK_DYNAMIC_ALLOCATION_ENABLED_CONFIG_NAME, false)) {
maxExecutors = Math.max(maxExecutors, conf.getInt(
DEFAULT_SPARK_DYNAMIC_ALLOCATION_MAX_EXECUTORS_CONFIG_NAME, 1));
maxExecutors =
Math.max(maxExecutors, conf.getInt(DEFAULT_SPARK_DYNAMIC_ALLOCATION_MAX_EXECUTORS_CONFIG_NAME, 1));
}
/*
Each writeStatus represents status information from a write done in one of the IOHandles.
If a writeStatus has any insert, it implies that the corresponding task contacts HBase for
doing puts, since we only do puts for inserts from HBaseIndex.
* Each writeStatus represents status information from a write done in one of the IOHandles. If a writeStatus has
* any insert, it implies that the corresponding task contacts HBase for doing puts, since we only do puts for
* inserts from HBaseIndex.
*/
final Tuple2<Long, Integer> numPutsParallelismTuple = getHBasePutAccessParallelism(writeStatusRDD);
final long numPuts = numPutsParallelismTuple._1;
final int hbasePutsParallelism = numPutsParallelismTuple._2;
this.numRegionServersForTable = getNumRegionServersAliveForTable();
final float desiredQPSFraction = hBaseIndexQPSResourceAllocator
.calculateQPSFractionForPutsTime(numPuts, this.numRegionServersForTable);
final float desiredQPSFraction =
hBaseIndexQPSResourceAllocator.calculateQPSFractionForPutsTime(numPuts, this.numRegionServersForTable);
logger.info("Desired QPSFraction :" + desiredQPSFraction);
logger.info("Number HBase puts :" + numPuts);
logger.info("Hbase Puts Parallelism :" + hbasePutsParallelism);
final float availableQpsFraction = hBaseIndexQPSResourceAllocator
.acquireQPSResources(desiredQPSFraction, numPuts);
final float availableQpsFraction =
hBaseIndexQPSResourceAllocator.acquireQPSResources(desiredQPSFraction, numPuts);
logger.info("Allocated QPS Fraction :" + availableQpsFraction);
multiPutBatchSize = putBatchSizeCalculator
.getBatchSize(
numRegionServersForTable,
maxQpsPerRegionServer,
hbasePutsParallelism,
maxExecutors,
SLEEP_TIME_MILLISECONDS,
availableQpsFraction);
multiPutBatchSize = putBatchSizeCalculator.getBatchSize(numRegionServersForTable, maxQpsPerRegionServer,
hbasePutsParallelism, maxExecutors, SLEEP_TIME_MILLISECONDS, availableQpsFraction);
logger.info("multiPutBatchSize :" + multiPutBatchSize);
}
}
@VisibleForTesting
public Tuple2<Long, Integer> getHBasePutAccessParallelism(final JavaRDD<WriteStatus> writeStatusRDD) {
final JavaPairRDD<Long, Integer> insertOnlyWriteStatusRDD =
writeStatusRDD.filter(w -> w.getStat().getNumInserts() > 0)
.mapToPair(w -> new Tuple2<>(w.getStat().getNumInserts(), 1));
final JavaPairRDD<Long, Integer> insertOnlyWriteStatusRDD = writeStatusRDD
.filter(w -> w.getStat().getNumInserts() > 0).mapToPair(w -> new Tuple2<>(w.getStat().getNumInserts(), 1));
return insertOnlyWriteStatusRDD.fold(new Tuple2<>(0L, 0), (w, c) -> new Tuple2<>(w._1 + c._1, w._2 + c._2));
}
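For example (numbers assumed): write statuses with 100, 0 and 50 inserts become [(100, 1), (50, 1)] after the filter and mapToPair, and the fold yields (150, 2) — 150 puts to issue, spread across the 2 insert-bearing statuses that bound the parallelism.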
@@ -460,21 +435,25 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
* 16000. We assume requests get distributed to Region Servers uniformly, so each RS gets 1600 requests which
* happens to be 10% of 16667 (maxQPSPerRegionServer), as expected.
* </p>
* <p> Assumptions made here <li> In a batch, writes get evenly distributed to each RS for that
* table. Since we do writes only in the case of inserts and not updates, for this assumption to fail, inserts would
* have to be skewed towards few RS, likelihood of which is less if Hbase table is pre-split and rowKeys are UUIDs
* (random strings). If this assumption fails, then it is possible for some RS to receive more than
* maxQpsPerRegionServer QPS, but for simplicity, we are going ahead with this model, since this is meant to be a
* lightweight distributed throttling mechanism without maintaining a global context. So if this assumption breaks,
* we are hoping the HBase Master relocates hot-spot regions to new Region Servers.
* <p>
* Assumptions made here
* <li>In a batch, writes get evenly distributed to each RS for that table. Since we do writes only in the case of
* inserts and not updates, for this assumption to fail, inserts would have to be skewed towards few RS, likelihood
* of which is less if Hbase table is pre-split and rowKeys are UUIDs (random strings). If this assumption fails,
* then it is possible for some RS to receive more than maxQpsPerRegionServer QPS, but for simplicity, we are going
* ahead with this model, since this is meant to be a lightweight distributed throttling mechanism without
* maintaining a global context. So if this assumption breaks, we are hoping the HBase Master relocates hot-spot
* regions to new Region Servers.
*
* </li> <li> For Region Server stability, throttling at a second level granularity is fine.
* Although, within a second, the sum of queries might be within maxQpsPerRegionServer, there could be peaks at some
* sub second intervals. So, the assumption is that these peaks are tolerated by the Region Server (which at max can
* be maxQpsPerRegionServer). </li> </p>
* </li>
* <li>For Region Server stability, throttling at a second level granularity is fine. Although, within a second, the
* sum of queries might be within maxQpsPerRegionServer, there could be peaks at some sub second intervals. So, the
* assumption is that these peaks are tolerated by the Region Server (which at max can be maxQpsPerRegionServer).
* </li>
* </p>
*/
public int getBatchSize(int numRegionServersForTable, int maxQpsPerRegionServer,
int numTasksDuringPut, int maxExecutors, int sleepTimeMs, float qpsFraction) {
public int getBatchSize(int numRegionServersForTable, int maxQpsPerRegionServer, int numTasksDuringPut,
int maxExecutors, int sleepTimeMs, float qpsFraction) {
int numRSAlive = numRegionServersForTable;
int maxReqPerSec = (int) (qpsFraction * numRSAlive * maxQpsPerRegionServer);
int numTasks = numTasksDuringPut;
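Plugging assumed numbers into the visible part of this computation: with qpsFraction = 0.1, numRSAlive = 3 and maxQpsPerRegionServer = 16667, maxReqPerSec = (int) (0.1 * 3 * 16667) = 5000; the rest of the method (truncated in this hunk) spreads that per-second budget across the put tasks and the sleep window.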
@@ -499,11 +478,9 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
// from the driver, so ok to use a local connection variable.
if (numRegionServersForTable == null) {
try (Connection conn = getHBaseConnection()) {
RegionLocator regionLocator = conn
.getRegionLocator(TableName.valueOf(tableName));
numRegionServersForTable = Math.toIntExact(
regionLocator.getAllRegionLocations().stream().map(e -> e.getServerName()).distinct()
.count());
RegionLocator regionLocator = conn.getRegionLocator(TableName.valueOf(tableName));
numRegionServersForTable = Math
.toIntExact(regionLocator.getAllRegionLocations().stream().map(e -> e.getServerName()).distinct().count());
return numRegionServersForTable;
} catch (IOException e) {
logger.error(e);

View File

@@ -26,8 +26,8 @@ import java.io.Serializable;
public interface HBaseIndexQPSResourceAllocator extends Serializable {
/**
* This method returns the QPS Fraction value that needs to be acquired such that the respective
* HBase index operation can be completed in desiredPutsTime.
* This method returns the QPS Fraction value that needs to be acquired such that the respective HBase index operation
* can be completed in desiredPutsTime.
*
* @param numPuts Number of inserts to be written to HBase index
* @param desiredPutsTimeInSecs Total expected time for the HBase inserts operation

View File

@@ -96,8 +96,8 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieWri
// Total number of new records inserted into the delta file
private long insertRecordsWritten = 0;
public HoodieAppendHandle(HoodieWriteConfig config, String commitTime, HoodieTable<T> hoodieTable,
String fileId, Iterator<HoodieRecord<T>> recordItr) {
public HoodieAppendHandle(HoodieWriteConfig config, String commitTime, HoodieTable<T> hoodieTable, String fileId,
Iterator<HoodieRecord<T>> recordItr) {
super(config, commitTime, fileId, hoodieTable);
writeStatus.setStat(new HoodieDeltaWriteStat());
this.fileId = fileId;
@@ -137,10 +137,8 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieWri
} catch (Exception e) {
logger.error("Error in update task at commit " + instantTime, e);
writeStatus.setGlobalError(e);
throw new HoodieUpsertException(
"Failed to initialize HoodieAppendHandle for FileId: " + fileId + " on commit "
+ instantTime + " on HDFS path " + hoodieTable.getMetaClient().getBasePath()
+ partitionPath, e);
throw new HoodieUpsertException("Failed to initialize HoodieAppendHandle for FileId: " + fileId + " on commit "
+ instantTime + " on HDFS path " + hoodieTable.getMetaClient().getBasePath() + partitionPath, e);
}
Path path = new Path(partitionPath, writer.getLogFile().getFileName());
writeStatus.getStat().setPath(path.toString());
@@ -155,13 +153,11 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieWri
if (avroRecord.isPresent()) {
// Convert GenericRecord to GenericRecord with hoodie commit metadata in schema
avroRecord = Option.of(rewriteRecord((GenericRecord) avroRecord.get()));
String seqId = HoodieRecord.generateSequenceId(instantTime, TaskContext.getPartitionId(),
recordIndex.getAndIncrement());
HoodieAvroUtils
.addHoodieKeyToRecord((GenericRecord) avroRecord.get(), hoodieRecord.getRecordKey(),
String seqId =
HoodieRecord.generateSequenceId(instantTime, TaskContext.getPartitionId(), recordIndex.getAndIncrement());
HoodieAvroUtils.addHoodieKeyToRecord((GenericRecord) avroRecord.get(), hoodieRecord.getRecordKey(),
hoodieRecord.getPartitionPath(), fileId);
HoodieAvroUtils
.addCommitMetadataToRecord((GenericRecord) avroRecord.get(), instantTime, seqId);
HoodieAvroUtils.addCommitMetadataToRecord((GenericRecord) avroRecord.get(), instantTime, seqId);
// If currentLocation is present, then this is an update
if (hoodieRecord.getCurrentLocation() != null) {
updatedRecordsWritten++;
@@ -208,20 +204,18 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieWri
recordList.clear();
}
if (keysToDelete.size() > 0) {
writer = writer.appendBlock(
new HoodieDeleteBlock(keysToDelete.stream().toArray(HoodieKey[]::new), header));
writer = writer.appendBlock(new HoodieDeleteBlock(keysToDelete.stream().toArray(HoodieKey[]::new), header));
keysToDelete.clear();
}
} catch (Exception e) {
throw new HoodieAppendException(
"Failed while appending records to " + currentLogFile.getPath(), e);
throw new HoodieAppendException("Failed while appending records to " + currentLogFile.getPath(), e);
}
}
@Override
public boolean canWrite(HoodieRecord record) {
return config.getParquetMaxFileSize() >= estimatedNumberOfBytesWritten * config
.getLogFileToParquetCompressionRatio();
return config.getParquetMaxFileSize() >= estimatedNumberOfBytesWritten
* config.getLogFileToParquetCompressionRatio();
}
@Override
@@ -262,8 +256,8 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieWri
runtimeStats.setTotalUpsertTime(timer.endTimer());
stat.setRuntimeStats(runtimeStats);
logger.info(String.format("AppendHandle for partitionPath %s fileID %s, took %d ms.",
stat.getPartitionPath(), stat.getFileId(), runtimeStats.getTotalUpsertTime()));
logger.info(String.format("AppendHandle for partitionPath %s fileID %s, took %d ms.", stat.getPartitionPath(),
stat.getFileId(), runtimeStats.getTotalUpsertTime()));
return writeStatus;
} catch (IOException e) {
@@ -282,13 +276,11 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieWri
return HoodieLogFormat.newWriterBuilder()
.onParentPath(FSUtils.getPartitionPath(hoodieTable.getMetaClient().getBasePath(), partitionPath))
.withFileId(fileId).overBaseCommit(baseCommitTime).withLogVersion(
latestLogFile.map(HoodieLogFile::getLogVersion).orElse(HoodieLogFile.LOGFILE_BASE_VERSION))
.withFileId(fileId).overBaseCommit(baseCommitTime)
.withLogVersion(latestLogFile.map(HoodieLogFile::getLogVersion).orElse(HoodieLogFile.LOGFILE_BASE_VERSION))
.withSizeThreshold(config.getLogFileMaxSize()).withFs(fs)
.withLogWriteToken(
latestLogFile.map(x -> FSUtils.getWriteTokenFromLogPath(x.getPath())).orElse(writeToken))
.withRolloverLogWriteToken(writeToken)
.withFileExtension(HoodieLogFile.DELTA_EXTENSION).build();
.withLogWriteToken(latestLogFile.map(x -> FSUtils.getWriteTokenFromLogPath(x.getPath())).orElse(writeToken))
.withRolloverLogWriteToken(writeToken).withFileExtension(HoodieLogFile.DELTA_EXTENSION).build();
}
private void writeToBuffer(HoodieRecord<T> record) {
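
The rewrapped canWrite() above decides whether this file group can still absorb more log data; a worked example with assumed numbers:

// Assumed values, purely to illustrate the canWrite() check above.
long parquetMaxFileSize = 120L * 1024 * 1024;            // config.getParquetMaxFileSize()
long estimatedNumberOfBytesWritten = 300L * 1024 * 1024; // log bytes appended so far
double logToParquetRatio = 0.35;                         // config.getLogFileToParquetCompressionRatio()
// 300 MB of log data is projected to compact to ~105 MB of parquet, under the 120 MB cap:
boolean canWrite = parquetMaxFileSize >= estimatedNumberOfBytesWritten * logToParquetRatio; // true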

View File

@@ -45,9 +45,12 @@ import org.apache.log4j.Logger;
/**
* Cleaner is responsible for garbage collecting older files in a given partition path, such that
 * <p> 1) It provides sufficient time for existing queries running on older versions to close <p>
* 2) It bounds the growth of the files in the file system <p> TODO: Should all cleaning be done
* based on {@link HoodieCommitMetadata}
* <p>
 * 1) It provides sufficient time for existing queries running on older versions to close
* <p>
* 2) It bounds the growth of the files in the file system
* <p>
* TODO: Should all cleaning be done based on {@link HoodieCommitMetadata}
*/
public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
@@ -66,22 +69,21 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
this.config = config;
this.fgIdToPendingCompactionOperations =
((SyncableFileSystemView) hoodieTable.getRTFileSystemView()).getPendingCompactionOperations()
.map(entry -> Pair.of(new HoodieFileGroupId(entry.getValue().getPartitionPath(),
entry.getValue().getFileId()), entry.getValue()))
.map(entry -> Pair.of(
new HoodieFileGroupId(entry.getValue().getPartitionPath(), entry.getValue().getFileId()),
entry.getValue()))
.collect(Collectors.toMap(Pair::getKey, Pair::getValue));
}
/**
* Selects the older versions of files for cleaning, such that it bounds the number of versions of
 * each file. This policy is useful if you are simply interested in querying the table and you
 * don't want too many versions for a single file (i.e., run it with versionsRetained = 1)
* Selects the older versions of files for cleaning, such that it bounds the number of versions of each file. This
 * policy is useful if you are simply interested in querying the table and you don't want too many versions for a
 * single file (i.e., run it with versionsRetained = 1)
*/
private List<String> getFilesToCleanKeepingLatestVersions(String partitionPath)
throws IOException {
logger.info("Cleaning " + partitionPath + ", retaining latest " + config
.getCleanerFileVersionsRetained() + " file versions. ");
List<HoodieFileGroup> fileGroups = fileSystemView.getAllFileGroups(partitionPath)
.collect(Collectors.toList());
private List<String> getFilesToCleanKeepingLatestVersions(String partitionPath) throws IOException {
logger.info("Cleaning " + partitionPath + ", retaining latest " + config.getCleanerFileVersionsRetained()
+ " file versions. ");
List<HoodieFileGroup> fileGroups = fileSystemView.getAllFileGroups(partitionPath).collect(Collectors.toList());
List<String> deletePaths = new ArrayList<>();
// Collect all the datafiles savepointed by all the savepoints
List<String> savepointedFiles = hoodieTable.getSavepoints().stream()
@@ -90,8 +92,8 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
for (HoodieFileGroup fileGroup : fileGroups) {
int keepVersions = config.getCleanerFileVersionsRetained();
// do not cleanup slice required for pending compaction
Iterator<FileSlice> fileSliceIterator = fileGroup.getAllFileSlices()
.filter(fs -> !isFileSliceNeededForPendingCompaction(fs)).iterator();
Iterator<FileSlice> fileSliceIterator =
fileGroup.getAllFileSlices().filter(fs -> !isFileSliceNeededForPendingCompaction(fs)).iterator();
if (isFileGroupInPendingCompaction(fileGroup)) {
// We have already saved the last version of file-groups for pending compaction Id
keepVersions--;
@@ -116,8 +118,8 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
}
if (hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) {
// If merge on read, then clean the log files for the commits as well
deletePaths.addAll(nextSlice.getLogFiles().map(file -> file.getPath().toString())
.collect(Collectors.toList()));
deletePaths
.addAll(nextSlice.getLogFiles().map(file -> file.getPath().toString()).collect(Collectors.toList()));
}
}
}
@@ -126,21 +128,21 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
/**
 * Selects the versions of files for cleaning, such that it <p> - Leaves the latest version of the
 * file untouched - For older versions, - It leaves all the commits untouched which have occurred in
* last <code>config.getCleanerCommitsRetained()</code> commits - It leaves ONE commit before this
* window. We assume that the max(query execution time) == commit_batch_time *
* config.getCleanerCommitsRetained(). This is 12 hours by default. This is essential to leave the
 * file used by the query that's running for the max time. <p> This provides the effect of having
* lookback into all changes that happened in the last X commits. (eg: if you retain 24 commits,
* and commit batch time is 30 mins, then you have 12 hrs of lookback) <p> This policy is the
* default.
 * Selects the versions of files for cleaning, such that it
* <p>
 * - Leaves the latest version of the file untouched - For older versions, - It leaves all the commits untouched which
 * have occurred in the last <code>config.getCleanerCommitsRetained()</code> commits - It leaves ONE commit before this
* window. We assume that the max(query execution time) == commit_batch_time * config.getCleanerCommitsRetained().
 * This is 12 hours by default. This is essential to leave the file used by the query that's running for the max time.
* <p>
* This provides the effect of having lookback into all changes that happened in the last X commits. (eg: if you
* retain 24 commits, and commit batch time is 30 mins, then you have 12 hrs of lookback)
* <p>
* This policy is the default.
*/
private List<String> getFilesToCleanKeepingLatestCommits(String partitionPath)
throws IOException {
private List<String> getFilesToCleanKeepingLatestCommits(String partitionPath) throws IOException {
int commitsRetained = config.getCleanerCommitsRetained();
logger
.info("Cleaning " + partitionPath + ", retaining latest " + commitsRetained + " commits. ");
logger.info("Cleaning " + partitionPath + ", retaining latest " + commitsRetained + " commits. ");
List<String> deletePaths = new ArrayList<>();
// Collect all the datafiles savepointed by all the savepoints
@@ -150,8 +152,7 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
// determine if we have enough commits to start cleaning.
if (commitTimeline.countInstants() > commitsRetained) {
HoodieInstant earliestCommitToRetain = getEarliestCommitToRetain().get();
List<HoodieFileGroup> fileGroups = fileSystemView.getAllFileGroups(partitionPath)
.collect(Collectors.toList());
List<HoodieFileGroup> fileGroups = fileSystemView.getAllFileGroups(partitionPath).collect(Collectors.toList());
for (HoodieFileGroup fileGroup : fileGroups) {
List<FileSlice> fileSliceList = fileGroup.getAllFileSlices().collect(Collectors.toList());
@@ -160,8 +161,8 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
}
String lastVersion = fileSliceList.get(0).getBaseInstantTime();
String lastVersionBeforeEarliestCommitToRetain = getLatestVersionBeforeCommit(fileSliceList,
earliestCommitToRetain);
String lastVersionBeforeEarliestCommitToRetain =
getLatestVersionBeforeCommit(fileSliceList, earliestCommitToRetain);
// Ensure there is more than one version of the file (we only clean old files from updates)
// i.e. always spare the last commit.
@@ -183,16 +184,14 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
}
// Always keep the last commit
if (!isFileSliceNeededForPendingCompaction(aSlice)
&& HoodieTimeline
.compareTimestamps(earliestCommitToRetain.getTimestamp(), fileCommitTime,
HoodieTimeline.GREATER)) {
if (!isFileSliceNeededForPendingCompaction(aSlice) && HoodieTimeline
.compareTimestamps(earliestCommitToRetain.getTimestamp(), fileCommitTime, HoodieTimeline.GREATER)) {
// this is a commit, that should be cleaned.
aFile.ifPresent(hoodieDataFile -> deletePaths.add(hoodieDataFile.getPath()));
if (hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) {
// If merge on read, then clean the log files for the commits as well
deletePaths.addAll(aSlice.getLogFiles().map(file -> file.getPath().toString())
.collect(Collectors.toList()));
deletePaths
.addAll(aSlice.getLogFiles().map(file -> file.getPath().toString()).collect(Collectors.toList()));
}
}
}
@@ -205,12 +204,10 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
/**
* Gets the latest version < commitTime. This version file could still be used by queries.
*/
private String getLatestVersionBeforeCommit(List<FileSlice> fileSliceList,
HoodieInstant commitTime) {
private String getLatestVersionBeforeCommit(List<FileSlice> fileSliceList, HoodieInstant commitTime) {
for (FileSlice file : fileSliceList) {
String fileCommitTime = file.getBaseInstantTime();
if (HoodieTimeline
.compareTimestamps(commitTime.getTimestamp(), fileCommitTime, HoodieTimeline.GREATER)) {
if (HoodieTimeline.compareTimestamps(commitTime.getTimestamp(), fileCommitTime, HoodieTimeline.GREATER)) {
// fileList is sorted in reverse, so the first commit we find <= commitTime is the
// one we want
return fileCommitTime;
@@ -246,14 +243,14 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
int commitsRetained = config.getCleanerCommitsRetained();
if (config.getCleanerPolicy() == HoodieCleaningPolicy.KEEP_LATEST_COMMITS
&& commitTimeline.countInstants() > commitsRetained) {
earliestCommitToRetain = commitTimeline
.nthInstant(commitTimeline.countInstants() - commitsRetained);
earliestCommitToRetain = commitTimeline.nthInstant(commitTimeline.countInstants() - commitsRetained);
}
return earliestCommitToRetain;
}
/**
 * Determine if a file slice needs to be preserved for pending compaction
*
* @param fileSlice File Slice
* @return true if file slice needs to be preserved, false otherwise.
*/
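
To make the lookback arithmetic in the KEEP_LATEST_COMMITS javadoc above concrete, a small sketch using the default numbers it mentions; the variable names are illustrative:

int commitsRetained = 24;           // config.getCleanerCommitsRetained()
int assumedCommitBatchMinutes = 30; // assumed average spacing between commits
int lookbackHours = commitsRetained * assumedCommitBatchMinutes / 60; // = 12 hours
// The cleaner keeps every file version written at or after
//   commitTimeline.nthInstant(commitTimeline.countInstants() - commitsRetained)
// plus ONE version before that window, so a query running for up to 12 hours
// can still read the files it started with.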

View File

@@ -83,9 +83,8 @@ public class HoodieCommitArchiveLog {
try {
if (this.writer == null) {
return HoodieLogFormat.newWriterBuilder().onParentPath(archiveFilePath.getParent())
.withFileId(archiveFilePath.getName())
.withFileExtension(HoodieArchivedLogFile.ARCHIVE_EXTENSION).withFs(metaClient.getFs())
.overBaseCommit("").build();
.withFileId(archiveFilePath.getName()).withFileExtension(HoodieArchivedLogFile.ARCHIVE_EXTENSION)
.withFs(metaClient.getFs()).overBaseCommit("").build();
} else {
return this.writer;
}
@@ -137,8 +136,7 @@ public class HoodieCommitArchiveLog {
// TODO: Handle ROLLBACK_ACTION in future
// ROLLBACK_ACTION is currently not defined in HoodieActiveTimeline
HoodieTimeline cleanAndRollbackTimeline = table.getActiveTimeline()
.getTimelineOfActions(Sets.newHashSet(HoodieTimeline.CLEAN_ACTION))
.filterCompletedInstants();
.getTimelineOfActions(Sets.newHashSet(HoodieTimeline.CLEAN_ACTION)).filterCompletedInstants();
Stream<HoodieInstant> instants = cleanAndRollbackTimeline.getInstants()
.collect(Collectors.groupingBy(s -> s.getAction())).entrySet().stream().map(i -> {
if (i.getValue().size() > maxCommitsToKeep) {
@@ -159,20 +157,16 @@ public class HoodieCommitArchiveLog {
Option<HoodieInstant> firstSavepoint = table.getCompletedSavepointTimeline().firstInstant();
if (!commitTimeline.empty() && commitTimeline.countInstants() > maxCommitsToKeep) {
// Actually do the commits
instants = Stream.concat(instants, commitTimeline.getInstants()
.filter(s -> {
instants = Stream.concat(instants, commitTimeline.getInstants().filter(s -> {
// if no savepoint present, then don't filter
return !(firstSavepoint.isPresent() && HoodieTimeline
.compareTimestamps(firstSavepoint.get().getTimestamp(), s.getTimestamp(),
HoodieTimeline.LESSER_OR_EQUAL));
})
.filter(s -> {
return !(firstSavepoint.isPresent() && HoodieTimeline.compareTimestamps(firstSavepoint.get().getTimestamp(),
s.getTimestamp(), HoodieTimeline.LESSER_OR_EQUAL));
}).filter(s -> {
// Ensure commits >= oldest pending compaction commit are retained
return oldestPendingCompactionInstant.map(instant -> {
return HoodieTimeline.compareTimestamps(instant.getTimestamp(), s.getTimestamp(), HoodieTimeline.GREATER);
}).orElse(true);
})
.limit(commitTimeline.countInstants() - minCommitsToKeep));
}).limit(commitTimeline.countInstants() - minCommitsToKeep));
}
return instants;
@@ -194,12 +188,9 @@ public class HoodieCommitArchiveLog {
}
// Remove older meta-data from auxiliary path too
Option<HoodieInstant> latestCommitted =
Option.fromJavaOptional(archivedInstants.stream()
.filter(i -> {
return i.isCompleted()
&& (i.getAction().equals(HoodieTimeline.COMMIT_ACTION) || (i.getAction().equals(
HoodieTimeline.DELTA_COMMIT_ACTION)));
Option<HoodieInstant> latestCommitted = Option.fromJavaOptional(archivedInstants.stream().filter(i -> {
return i.isCompleted() && (i.getAction().equals(HoodieTimeline.COMMIT_ACTION)
|| (i.getAction().equals(HoodieTimeline.DELTA_COMMIT_ACTION)));
}).max(Comparator.comparing(HoodieInstant::getTimestamp)));
if (latestCommitted.isPresent()) {
success &= deleteAllInstantsOlderorEqualsInAuxMetaFolder(latestCommitted.get());
@@ -214,12 +205,9 @@ public class HoodieCommitArchiveLog {
* @return success if all eligible file deleted successfully
* @throws IOException in case of error
*/
private boolean deleteAllInstantsOlderorEqualsInAuxMetaFolder(HoodieInstant thresholdInstant)
throws IOException {
List<HoodieInstant> instants =
HoodieTableMetaClient.scanHoodieInstantsFromFileSystem(metaClient.getFs(),
new Path(metaClient.getMetaAuxiliaryPath()),
HoodieActiveTimeline.VALID_EXTENSIONS_IN_ACTIVE_TIMELINE);
private boolean deleteAllInstantsOlderorEqualsInAuxMetaFolder(HoodieInstant thresholdInstant) throws IOException {
List<HoodieInstant> instants = HoodieTableMetaClient.scanHoodieInstantsFromFileSystem(metaClient.getFs(),
new Path(metaClient.getMetaAuxiliaryPath()), HoodieActiveTimeline.VALID_EXTENSIONS_IN_ACTIVE_TIMELINE);
List<HoodieInstant> instantsToBeDeleted =
instants.stream().filter(instant1 -> HoodieTimeline.compareTimestamps(instant1.getTimestamp(),
@@ -239,8 +227,7 @@ public class HoodieCommitArchiveLog {
public void archive(List<HoodieInstant> instants) throws HoodieCommitException {
try {
HoodieTimeline commitTimeline = metaClient.getActiveTimeline().getAllCommitsTimeline()
.filterCompletedInstants();
HoodieTimeline commitTimeline = metaClient.getActiveTimeline().getAllCommitsTimeline().filterCompletedInstants();
Schema wrapperSchema = HoodieArchivedMetaEntry.getClassSchema();
log.info("Wrapper schema " + wrapperSchema.toString());
List<IndexedRecord> records = new ArrayList<>();
@@ -277,15 +264,14 @@ public class HoodieCommitArchiveLog {
}
}
private IndexedRecord convertToAvroRecord(HoodieTimeline commitTimeline,
HoodieInstant hoodieInstant) throws IOException {
private IndexedRecord convertToAvroRecord(HoodieTimeline commitTimeline, HoodieInstant hoodieInstant)
throws IOException {
HoodieArchivedMetaEntry archivedMetaWrapper = new HoodieArchivedMetaEntry();
archivedMetaWrapper.setCommitTime(hoodieInstant.getTimestamp());
switch (hoodieInstant.getAction()) {
case HoodieTimeline.CLEAN_ACTION: {
archivedMetaWrapper.setHoodieCleanMetadata(AvroUtils
.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(),
HoodieCleanMetadata.class));
.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(), HoodieCleanMetadata.class));
archivedMetaWrapper.setActionType(ActionType.clean.name());
break;
}
@@ -297,16 +283,14 @@ public class HoodieCommitArchiveLog {
break;
}
case HoodieTimeline.ROLLBACK_ACTION: {
archivedMetaWrapper.setHoodieRollbackMetadata(AvroUtils
.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(),
HoodieRollbackMetadata.class));
archivedMetaWrapper.setHoodieRollbackMetadata(AvroUtils.deserializeAvroMetadata(
commitTimeline.getInstantDetails(hoodieInstant).get(), HoodieRollbackMetadata.class));
archivedMetaWrapper.setActionType(ActionType.rollback.name());
break;
}
case HoodieTimeline.SAVEPOINT_ACTION: {
archivedMetaWrapper.setHoodieSavePointMetadata(AvroUtils
.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(),
HoodieSavepointMetadata.class));
archivedMetaWrapper.setHoodieSavePointMetadata(AvroUtils.deserializeAvroMetadata(
commitTimeline.getInstantDetails(hoodieInstant).get(), HoodieSavepointMetadata.class));
archivedMetaWrapper.setActionType(ActionType.savepoint.name());
break;
}
@@ -328,8 +312,8 @@ public class HoodieCommitArchiveLog {
ObjectMapper mapper = new ObjectMapper();
// Need this to ignore other public get() methods
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
org.apache.hudi.avro.model.HoodieCommitMetadata avroMetaData = mapper
.convertValue(hoodieCommitMetadata, org.apache.hudi.avro.model.HoodieCommitMetadata.class);
org.apache.hudi.avro.model.HoodieCommitMetadata avroMetaData =
mapper.convertValue(hoodieCommitMetadata, org.apache.hudi.avro.model.HoodieCommitMetadata.class);
// Do not archive Rolling Stats, cannot set to null since AVRO will throw null pointer
avroMetaData.getExtraMetadata().put(HoodieRollingStatMetadata.ROLLING_STAT_METADATA_KEY, "");
return avroMetaData;
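
A quick worked example of the archival trigger in the hunks above, with assumed timeline sizes (both thresholds come from configuration):

int maxCommitsToKeep = 30; // archiving triggers only past this many active commits
int minCommitsToKeep = 20; // archiving shrinks the active timeline back to this
int countInstants = 45;    // commitTimeline.countInstants()
if (countInstants > maxCommitsToKeep) {
  int archivable = countInstants - minCommitsToKeep; // at most the 25 oldest commits
  // ...minus any commits at or after the first savepoint, and any commits >= the
  // oldest pending compaction instant, which the stream filters above exclude.
}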

View File

@@ -66,11 +66,10 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieWri
new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath));
partitionMetadata.trySave(TaskContext.getPartitionId());
createMarkerFile(partitionPath);
this.storageWriter = HoodieStorageWriterFactory
.getStorageWriter(commitTime, path, hoodieTable, config, writerSchema);
this.storageWriter =
HoodieStorageWriterFactory.getStorageWriter(commitTime, path, hoodieTable, config, writerSchema);
} catch (IOException e) {
throw new HoodieInsertException(
"Failed to initialize HoodieStorageWriter for path " + path, e);
throw new HoodieInsertException("Failed to initialize HoodieStorageWriter for path " + path, e);
}
logger.info("New CreateHandle for partition :" + partitionPath + " with fileId " + fileId);
}
@@ -136,8 +135,7 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieWri
}
}
} catch (IOException io) {
throw new HoodieInsertException(
"Failed to insert records for path " + path, io);
throw new HoodieInsertException("Failed to insert records for path " + path, io);
}
}
@@ -151,8 +149,8 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieWri
*/
@Override
public WriteStatus close() {
logger.info("Closing the file " + writeStatus.getFileId() + " as we are done with all the records "
+ recordsWritten);
logger
.info("Closing the file " + writeStatus.getFileId() + " as we are done with all the records " + recordsWritten);
try {
storageWriter.close();
@@ -174,8 +172,8 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieWri
stat.setRuntimeStats(runtimeStats);
writeStatus.setStat(stat);
logger.info(String.format("CreateHandle for partitionPath %s fileID %s, took %d ms.",
stat.getPartitionPath(), stat.getFileId(), runtimeStats.getTotalCreateTime()));
logger.info(String.format("CreateHandle for partitionPath %s fileID %s, took %d ms.", stat.getPartitionPath(),
stat.getFileId(), runtimeStats.getTotalCreateTime()));
return writeStatus;
} catch (IOException e) {

View File

@@ -67,15 +67,15 @@ public class HoodieKeyLookupHandle<T extends HoodieRecordPayload> extends Hoodie
/**
* Given a list of row keys and one file, return only row keys existing in that file.
*/
public static List<String> checkCandidatesAgainstFile(Configuration configuration,
List<String> candidateRecordKeys, Path filePath) throws HoodieIndexException {
public static List<String> checkCandidatesAgainstFile(Configuration configuration, List<String> candidateRecordKeys,
Path filePath) throws HoodieIndexException {
List<String> foundRecordKeys = new ArrayList<>();
try {
// Load all rowKeys from the file to double-confirm
if (!candidateRecordKeys.isEmpty()) {
HoodieTimer timer = new HoodieTimer().startTimer();
Set<String> fileRowKeys = ParquetUtils.filterParquetRowKeys(configuration, filePath,
new HashSet<>(candidateRecordKeys));
Set<String> fileRowKeys =
ParquetUtils.filterParquetRowKeys(configuration, filePath, new HashSet<>(candidateRecordKeys));
foundRecordKeys.addAll(fileRowKeys);
logger.info(String.format("Checked keys against file %s, in %d ms. #candidates (%d) #found (%d)", filePath,
timer.endTimer(), candidateRecordKeys.size(), foundRecordKeys.size()));
@@ -112,11 +112,11 @@ public class HoodieKeyLookupHandle<T extends HoodieRecordPayload> extends Hoodie
}
HoodieDataFile dataFile = getLatestDataFile();
List<String> matchingKeys = checkCandidatesAgainstFile(hoodieTable.getHadoopConf(), candidateRecordKeys,
new Path(dataFile.getPath()));
logger.info(String.format("Total records (%d), bloom filter candidates (%d)/fp(%d), actual matches (%d)",
totalKeysChecked, candidateRecordKeys.size(), candidateRecordKeys.size() - matchingKeys.size(),
matchingKeys.size()));
List<String> matchingKeys =
checkCandidatesAgainstFile(hoodieTable.getHadoopConf(), candidateRecordKeys, new Path(dataFile.getPath()));
logger.info(
String.format("Total records (%d), bloom filter candidates (%d)/fp(%d), actual matches (%d)", totalKeysChecked,
candidateRecordKeys.size(), candidateRecordKeys.size() - matchingKeys.size(), matchingKeys.size()));
return new KeyLookupResult(partitionPathFilePair.getRight(), partitionPathFilePair.getLeft(),
dataFile.getCommitTime(), matchingKeys);
}
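
A hypothetical call to the static helper reworked above, to show where it sits in the bloom-filter lookup flow; the keys, path and `hadoopConf` are made up:

// Keys the bloom filter flagged as "possibly present" (false positives allowed).
List<String> candidateRecordKeys = Arrays.asList("uuid-1", "uuid-2", "uuid-3");
List<String> matchingKeys = HoodieKeyLookupHandle.checkCandidatesAgainstFile(
    hadoopConf, candidateRecordKeys, new Path("/data/hoodie/2019/10/10/some-file.parquet"));
// Bloom filters never miss a real key, so candidates minus matches = false positives:
int falsePositives = candidateRecordKeys.size() - matchingKeys.size();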

View File

@@ -71,8 +71,7 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
Iterator<HoodieRecord<T>> recordItr, String fileId) {
super(config, commitTime, fileId, hoodieTable);
String partitionPath = init(fileId, recordItr);
init(fileId, partitionPath,
hoodieTable.getROFileSystemView().getLatestDataFile(partitionPath, fileId).get());
init(fileId, partitionPath, hoodieTable.getROFileSystemView().getLatestDataFile(partitionPath, fileId).get());
}
/**
@@ -83,8 +82,8 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
super(config, commitTime, fileId, hoodieTable);
this.keyToNewRecords = keyToNewRecords;
this.useWriterSchema = true;
init(fileId, keyToNewRecords.get(keyToNewRecords.keySet().stream().findFirst().get())
.getPartitionPath(), dataFileToBeMerged);
init(fileId, keyToNewRecords.get(keyToNewRecords.keySet().stream().findFirst().get()).getPartitionPath(),
dataFileToBeMerged);
}
@@ -160,14 +159,12 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath));
partitionMetadata.trySave(TaskContext.getPartitionId());
oldFilePath = new Path(
config.getBasePath() + "/" + partitionPath + "/" + latestValidFilePath);
oldFilePath = new Path(config.getBasePath() + "/" + partitionPath + "/" + latestValidFilePath);
String relativePath = new Path((partitionPath.isEmpty() ? "" : partitionPath + "/")
+ FSUtils.makeDataFileName(instantTime, writeToken, fileId)).toString();
newFilePath = new Path(config.getBasePath(), relativePath);
logger.info(String
.format("Merging new data into oldPath %s, as newPath %s", oldFilePath.toString(),
logger.info(String.format("Merging new data into oldPath %s, as newPath %s", oldFilePath.toString(),
newFilePath.toString()));
// file name is the same for all records in this bunch
writeStatus.setFileId(fileId);
@@ -180,13 +177,12 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
createMarkerFile(partitionPath);
// Create the writer for writing the new version file
storageWriter = HoodieStorageWriterFactory
.getStorageWriter(instantTime, newFilePath, hoodieTable, config, writerSchema);
storageWriter =
HoodieStorageWriterFactory.getStorageWriter(instantTime, newFilePath, hoodieTable, config, writerSchema);
} catch (IOException io) {
logger.error("Error in update task at commit " + instantTime, io);
writeStatus.setGlobalError(io);
throw new HoodieUpsertException(
"Failed to initialize HoodieUpdateHandle for FileId: " + fileId + " on commit "
throw new HoodieUpsertException("Failed to initialize HoodieUpdateHandle for FileId: " + fileId + " on commit "
+ instantTime + " on path " + hoodieTable.getMetaClient().getBasePath(), io);
}
}
@@ -217,10 +213,8 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
logger.info("Number of entries in MemoryBasedMap => "
+ ((ExternalSpillableMap) keyToNewRecords).getInMemoryMapNumEntries()
+ "Total size in bytes of MemoryBasedMap => "
+ ((ExternalSpillableMap) keyToNewRecords).getCurrentInMemoryMapSize()
+ "Number of entries in DiskBasedMap => "
+ ((ExternalSpillableMap) keyToNewRecords).getDiskBasedMapNumEntries()
+ "Size of file spilled to disk => "
+ ((ExternalSpillableMap) keyToNewRecords).getCurrentInMemoryMapSize() + "Number of entries in DiskBasedMap => "
+ ((ExternalSpillableMap) keyToNewRecords).getDiskBasedMapNumEntries() + "Size of file spilled to disk => "
+ ((ExternalSpillableMap) keyToNewRecords).getSizeOfFileOnDiskInBytes());
return partitionPath;
}
@@ -258,8 +252,7 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
}
/**
 * Go through an old record. Here, if we detect that a newer version shows up, we write the new one to
 * the file.
 * Go through an old record. Here, if we detect that a newer version shows up, we write the new one to the file.
*/
public void write(GenericRecord oldRecord) {
String key = oldRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
@@ -269,12 +262,12 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
// writing the first record. So make a copy of the record to be merged
HoodieRecord<T> hoodieRecord = new HoodieRecord<>(keyToNewRecords.get(key));
try {
Option<IndexedRecord> combinedAvroRecord = hoodieRecord.getData()
.combineAndGetUpdateValue(oldRecord, useWriterSchema ? writerSchema : originalSchema);
Option<IndexedRecord> combinedAvroRecord =
hoodieRecord.getData().combineAndGetUpdateValue(oldRecord, useWriterSchema ? writerSchema : originalSchema);
if (writeUpdateRecord(hoodieRecord, combinedAvroRecord)) {
/* ONLY WHEN
* 1) we have an update for this key AND
 * 2) We are able to successfully write the combined new value
/*
 * ONLY WHEN 1) we have an update for this key AND 2) We are able to successfully write the combined new
* value
*
* We no longer need to copy the old record over.
*/
@@ -282,26 +275,24 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
}
writtenRecordKeys.add(key);
} catch (Exception e) {
throw new HoodieUpsertException(
"Failed to combine/merge new record with old value in storage, for new record {"
throw new HoodieUpsertException("Failed to combine/merge new record with old value in storage, for new record {"
+ keyToNewRecords.get(key) + "}, old value {" + oldRecord + "}", e);
}
}
if (copyOldRecord) {
// this should work as it is, since this is an existing record
String errMsg = "Failed to merge old record into new file for key " + key + " from old file "
+ getOldFilePath() + " to new file " + newFilePath;
String errMsg = "Failed to merge old record into new file for key " + key + " from old file " + getOldFilePath()
+ " to new file " + newFilePath;
try {
storageWriter.writeAvro(key, oldRecord);
} catch (ClassCastException e) {
logger.error("Schema mismatch when rewriting old record " + oldRecord + " from file "
+ getOldFilePath() + " to file " + newFilePath + " with writerSchema " + writerSchema
.toString(true));
logger.error("Schema mismatch when rewriting old record " + oldRecord + " from file " + getOldFilePath()
+ " to file " + newFilePath + " with writerSchema " + writerSchema.toString(true));
throw new HoodieUpsertException(errMsg, e);
} catch (IOException e) {
logger.error("Failed to merge old record into new file for key " + key + " from old file "
+ getOldFilePath() + " to new file " + newFilePath, e);
logger.error("Failed to merge old record into new file for key " + key + " from old file " + getOldFilePath()
+ " to new file " + newFilePath, e);
throw new HoodieUpsertException(errMsg, e);
}
recordsWritten++;
@@ -344,8 +335,8 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
runtimeStats.setTotalUpsertTime(timer.endTimer());
stat.setRuntimeStats(runtimeStats);
logger.info(String.format("MergeHandle for partitionPath %s fileID %s, took %d ms.",
stat.getPartitionPath(), stat.getFileId(), runtimeStats.getTotalUpsertTime()));
logger.info(String.format("MergeHandle for partitionPath %s fileID %s, took %d ms.", stat.getPartitionPath(),
stat.getFileId(), runtimeStats.getTotalUpsertTime()));
return writeStatus;
} catch (IOException e) {
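
The control flow of write(oldRecord) above is easy to lose among the rewrapped lines; a simplified restatement of the merge rule, with the defensive record copy and error handling elided (the method name here is hypothetical):

private void writeMerged(GenericRecord oldRecord) throws IOException {
  String key = oldRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
  if (keyToNewRecords.containsKey(key)) {
    HoodieRecord<T> newRecord = keyToNewRecords.get(key);
    Option<IndexedRecord> combined =
        newRecord.getData().combineAndGetUpdateValue(oldRecord, useWriterSchema ? writerSchema : originalSchema);
    if (writeUpdateRecord(newRecord, combined)) {
      return; // combined value written; the old record is superseded
    }
  }
  storageWriter.writeAvro(key, oldRecord); // no newer version: carry the old record forward unchanged
}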

View File

@@ -61,8 +61,7 @@ public abstract class HoodieWriteHandle<T extends HoodieRecordPayload> extends H
this.writerSchema = createHoodieWriteSchema(originalSchema);
this.timer = new HoodieTimer().startTimer();
this.writeStatus = (WriteStatus) ReflectionUtils.loadClass(config.getWriteStatusClassName(),
!hoodieTable.getIndex().isImplicitWithStorage(),
config.getWriteStatusFailureFraction());
!hoodieTable.getIndex().isImplicitWithStorage(), config.getWriteStatusFailureFraction());
}
/**

View File

@@ -45,15 +45,12 @@ public interface HoodieCompactor extends Serializable {
* @return Compaction Plan
* @throws IOException when encountering errors
*/
HoodieCompactionPlan generateCompactionPlan(JavaSparkContext jsc,
HoodieTable hoodieTable, HoodieWriteConfig config, String compactionCommitTime,
Set<HoodieFileGroupId> fgIdsInPendingCompactions)
throws IOException;
HoodieCompactionPlan generateCompactionPlan(JavaSparkContext jsc, HoodieTable hoodieTable, HoodieWriteConfig config,
String compactionCommitTime, Set<HoodieFileGroupId> fgIdsInPendingCompactions) throws IOException;
/**
* Execute compaction operations and report back status
*/
JavaRDD<WriteStatus> compact(JavaSparkContext jsc,
HoodieCompactionPlan compactionPlan, HoodieTable hoodieTable, HoodieWriteConfig config,
String compactionInstantTime) throws IOException;
JavaRDD<WriteStatus> compact(JavaSparkContext jsc, HoodieCompactionPlan compactionPlan, HoodieTable hoodieTable,
HoodieWriteConfig config, String compactionInstantTime) throws IOException;
}
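
The two signatures above define a plan-then-execute contract; a minimal driver sketch, assuming the Spark context, table, config and instant time are already in scope:

HoodieCompactor compactor = new HoodieRealtimeTableCompactor();
HoodieCompactionPlan plan = compactor.generateCompactionPlan(jsc, hoodieTable, config,
    compactionInstantTime, fgIdsInPendingCompactions); // step 1: pick file groups to compact
if (plan != null && plan.getOperations() != null && !plan.getOperations().isEmpty()) {
  JavaRDD<WriteStatus> statuses =
      compactor.compact(jsc, plan, hoodieTable, config, compactionInstantTime); // step 2: run them
  statuses.count(); // forces execution here; real callers inspect and commit the results
}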

View File

@@ -63,9 +63,9 @@ import org.apache.spark.util.AccumulatorV2;
import org.apache.spark.util.LongAccumulator;
/**
* HoodieRealtimeTableCompactor compacts a hoodie table with merge on read storage. Computes all
 * possible compactions, passes them through a CompactionFilter and executes all the compactions and
 * writes a new version of base files and makes a normal commit
* HoodieRealtimeTableCompactor compacts a hoodie table with merge on read storage. Computes all possible compactions,
 * passes them through a CompactionFilter and executes all the compactions and writes a new version of base files and
 * makes a normal commit
*
* @see HoodieCompactor
*/
@@ -78,9 +78,8 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
private AccumulatorV2<Long, Long> totalFileSlices;
@Override
public JavaRDD<WriteStatus> compact(JavaSparkContext jsc,
HoodieCompactionPlan compactionPlan, HoodieTable hoodieTable, HoodieWriteConfig config,
String compactionInstantTime) throws IOException {
public JavaRDD<WriteStatus> compact(JavaSparkContext jsc, HoodieCompactionPlan compactionPlan,
HoodieTable hoodieTable, HoodieWriteConfig config, String compactionInstantTime) throws IOException {
if (compactionPlan == null || (compactionPlan.getOperations() == null)
|| (compactionPlan.getOperations().isEmpty())) {
return jsc.emptyRDD();
@@ -88,41 +87,36 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
// Compacting is very similar to applying updates to an existing file
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc);
List<CompactionOperation> operations = compactionPlan.getOperations().stream().map(
CompactionOperation::convertFromAvroRecordInstance).collect(toList());
List<CompactionOperation> operations = compactionPlan.getOperations().stream()
.map(CompactionOperation::convertFromAvroRecordInstance).collect(toList());
log.info("Compactor compacting " + operations + " files");
return jsc.parallelize(operations, operations.size())
.map(s -> compact(table, metaClient, config, s, compactionInstantTime))
.flatMap(List::iterator);
.map(s -> compact(table, metaClient, config, s, compactionInstantTime)).flatMap(List::iterator);
}
private List<WriteStatus> compact(HoodieCopyOnWriteTable hoodieCopyOnWriteTable, HoodieTableMetaClient metaClient,
HoodieWriteConfig config,
CompactionOperation operation, String commitTime) throws IOException {
HoodieWriteConfig config, CompactionOperation operation, String commitTime) throws IOException {
FileSystem fs = metaClient.getFs();
Schema readerSchema = HoodieAvroUtils
.addMetadataFields(new Schema.Parser().parse(config.getSchema()));
Schema readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()));
log.info("Compacting base " + operation.getDataFilePath() + " with delta files " + operation
.getDeltaFilePaths() + " for commit " + commitTime);
log.info("Compacting base " + operation.getDataFilePath() + " with delta files " + operation.getDeltaFilePaths()
+ " for commit " + commitTime);
// TODO - FIX THIS
// Reads the entire avro file. Always only specific blocks should be read from the avro file
// (failure recovery).
// Load all the delta commits since the last compaction commit and get all the blocks to be
// loaded and load it using CompositeAvroLogReader
// Since a DeltaCommit is not defined yet, reading all the records. revisit this soon.
String maxInstantTime = metaClient.getActiveTimeline()
.getTimelineOfActions(
Sets.newHashSet(HoodieTimeline.COMMIT_ACTION, HoodieTimeline.ROLLBACK_ACTION,
HoodieTimeline.DELTA_COMMIT_ACTION))
String maxInstantTime = metaClient
.getActiveTimeline().getTimelineOfActions(Sets.newHashSet(HoodieTimeline.COMMIT_ACTION,
HoodieTimeline.ROLLBACK_ACTION, HoodieTimeline.DELTA_COMMIT_ACTION))
.filterCompletedInstants().lastInstant().get().getTimestamp();
log.info("MaxMemoryPerCompaction => " + config.getMaxMemoryPerCompaction());
HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs,
metaClient.getBasePath(), operation.getDeltaFilePaths(), readerSchema, maxInstantTime,
config.getMaxMemoryPerCompaction(), config.getCompactionLazyBlockReadEnabled(),
config.getCompactionReverseLogReadEnabled(), config.getMaxDFSStreamBufferSize(),
config.getSpillableMapBasePath());
HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, metaClient.getBasePath(),
operation.getDeltaFilePaths(), readerSchema, maxInstantTime, config.getMaxMemoryPerCompaction(),
config.getCompactionLazyBlockReadEnabled(), config.getCompactionReverseLogReadEnabled(),
config.getMaxDFSStreamBufferSize(), config.getSpillableMapBasePath());
if (!scanner.iterator().hasNext()) {
return Lists.<WriteStatus>newArrayList();
}
@@ -134,21 +128,20 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
// If the dataFile is present, there is a base parquet file; perform updates, else perform inserts into a
// new base parquet file.
if (oldDataFileOpt.isPresent()) {
result = hoodieCopyOnWriteTable
.handleUpdate(commitTime, operation.getFileId(), scanner.getRecords(), oldDataFileOpt.get());
result = hoodieCopyOnWriteTable.handleUpdate(commitTime, operation.getFileId(), scanner.getRecords(),
oldDataFileOpt.get());
} else {
result = hoodieCopyOnWriteTable
.handleInsert(commitTime, operation.getPartitionPath(), operation.getFileId(), scanner.iterator());
result = hoodieCopyOnWriteTable.handleInsert(commitTime, operation.getPartitionPath(), operation.getFileId(),
scanner.iterator());
}
Iterable<List<WriteStatus>> resultIterable = () -> result;
return StreamSupport.stream(resultIterable.spliterator(), false).flatMap(Collection::stream)
.peek(s -> {
return StreamSupport.stream(resultIterable.spliterator(), false).flatMap(Collection::stream).peek(s -> {
s.getStat().setTotalUpdatedRecordsCompacted(scanner.getNumMergedRecordsInLog());
s.getStat().setTotalLogFilesCompacted(scanner.getTotalLogFiles());
s.getStat().setTotalLogRecords(scanner.getTotalLogRecords());
s.getStat().setPartitionPath(operation.getPartitionPath());
s.getStat().setTotalLogSizeCompacted(operation.getMetrics().get(
CompactionStrategy.TOTAL_LOG_FILE_SIZE).longValue());
s.getStat()
.setTotalLogSizeCompacted(operation.getMetrics().get(CompactionStrategy.TOTAL_LOG_FILE_SIZE).longValue());
s.getStat().setTotalLogBlocks(scanner.getTotalLogBlocks());
s.getStat().setTotalCorruptLogBlock(scanner.getTotalCorruptBlocks());
s.getStat().setTotalRollbackBlocks(scanner.getTotalRollbacks());
@@ -159,27 +152,24 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
}
@Override
public HoodieCompactionPlan generateCompactionPlan(JavaSparkContext jsc,
HoodieTable hoodieTable, HoodieWriteConfig config, String compactionCommitTime,
Set<HoodieFileGroupId> fgIdsInPendingCompactions) throws IOException {
public HoodieCompactionPlan generateCompactionPlan(JavaSparkContext jsc, HoodieTable hoodieTable,
HoodieWriteConfig config, String compactionCommitTime, Set<HoodieFileGroupId> fgIdsInPendingCompactions)
throws IOException {
totalLogFiles = new LongAccumulator();
totalFileSlices = new LongAccumulator();
jsc.sc().register(totalLogFiles);
jsc.sc().register(totalFileSlices);
Preconditions
.checkArgument(hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ,
"HoodieRealtimeTableCompactor can only compact table of type "
+ HoodieTableType.MERGE_ON_READ + " and not " + hoodieTable.getMetaClient()
.getTableType().name());
Preconditions.checkArgument(hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ,
"HoodieRealtimeTableCompactor can only compact table of type " + HoodieTableType.MERGE_ON_READ + " and not "
+ hoodieTable.getMetaClient().getTableType().name());
// TODO : check if maxMemory is not greater than JVM or spark.executor memory
// TODO - rollback any compactions in flight
HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
log.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommitTime);
List<String> partitionPaths = FSUtils
.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(),
List<String> partitionPaths = FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(),
config.shouldAssumeDatePartitioning());
// filter the partition paths if needed to reduce list status
@@ -192,16 +182,12 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
RealtimeView fileSystemView = hoodieTable.getRTFileSystemView();
log.info("Compaction looking for files to compact in " + partitionPaths + " partitions");
List<HoodieCompactionOperation> operations =
jsc.parallelize(partitionPaths, partitionPaths.size())
List<HoodieCompactionOperation> operations = jsc.parallelize(partitionPaths, partitionPaths.size())
.flatMap((FlatMapFunction<String, CompactionOperation>) partitionPath -> fileSystemView
.getLatestFileSlices(partitionPath)
.filter(slice ->
!fgIdsInPendingCompactions.contains(slice.getFileGroupId()))
.map(
s -> {
List<HoodieLogFile> logFiles = s.getLogFiles().sorted(HoodieLogFile
.getLogFileComparator()).collect(Collectors.toList());
.filter(slice -> !fgIdsInPendingCompactions.contains(slice.getFileGroupId())).map(s -> {
List<HoodieLogFile> logFiles =
s.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList());
totalLogFiles.add((long) logFiles.size());
totalFileSlices.add(1L);
// Avro generated classes are not inheriting Serializable. Using CompactionOperation POJO
@@ -210,10 +196,8 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
Option<HoodieDataFile> dataFile = s.getDataFile();
return new CompactionOperation(dataFile, partitionPath, logFiles,
config.getCompactionStrategy().captureMetrics(config, dataFile, partitionPath, logFiles));
})
.filter(c -> !c.getDeltaFilePaths().isEmpty())
.collect(toList()).iterator()).collect().stream().map(CompactionUtils::buildHoodieCompactionOperation)
.collect(toList());
}).filter(c -> !c.getDeltaFilePaths().isEmpty()).collect(toList()).iterator())
.collect().stream().map(CompactionUtils::buildHoodieCompactionOperation).collect(toList());
log.info("Total of " + operations.size() + " compactions are retrieved");
log.info("Total number of latest files slices " + totalFileSlices.value());
log.info("Total number of log files " + totalLogFiles.value());
@@ -222,11 +206,11 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
// compactions only
HoodieCompactionPlan compactionPlan = config.getCompactionStrategy().generateCompactionPlan(config, operations,
CompactionUtils.getAllPendingCompactionPlans(metaClient).stream().map(Pair::getValue).collect(toList()));
Preconditions.checkArgument(compactionPlan.getOperations().stream().noneMatch(
Preconditions.checkArgument(
compactionPlan.getOperations().stream().noneMatch(
op -> fgIdsInPendingCompactions.contains(new HoodieFileGroupId(op.getPartitionPath(), op.getFileId()))),
"Bad Compaction Plan. FileId MUST NOT have multiple pending compactions. "
+ "Please fix your strategy implementation."
+ "FileIdsWithPendingCompactions :" + fgIdsInPendingCompactions
+ "Please fix your strategy implementation." + "FileIdsWithPendingCompactions :" + fgIdsInPendingCompactions
+ ", Selected workload :" + compactionPlan);
if (compactionPlan.getOperations().isEmpty()) {
log.warn("After filtering, Nothing to compact for " + metaClient.getBasePath());

View File

@@ -25,8 +25,8 @@ import org.apache.hudi.avro.model.HoodieCompactionPlan;
import org.apache.hudi.config.HoodieWriteConfig;
/**
* CompactionStrategy which looks at total IO to be done for the compaction (read + write) and
* limits the list of compactions to be under a configured limit on the IO
* CompactionStrategy which looks at total IO to be done for the compaction (read + write) and limits the list of
* compactions to be under a configured limit on the IO
*
* @see CompactionStrategy
*/
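
Only the javadoc of BoundedIOCompactionStrategy appears in this excerpt, so here is a sketch of the bounding idea it describes; the config accessor and metric key are assumptions, not this commit's code:

long targetIOInMB = writeConfig.getTargetIOPerCompactionInMB(); // assumed accessor
long plannedIOInMB = 0;
List<HoodieCompactionOperation> bounded = new ArrayList<>();
for (HoodieCompactionOperation op : operations) {
  plannedIOInMB += op.getMetrics().get("TOTAL_IO").longValue(); // assumed metric key
  if (plannedIOInMB > targetIOInMB) {
    break; // IO budget exhausted; later operations wait for the next compaction round
  }
  bounded.add(op);
}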

View File

@@ -32,11 +32,10 @@ import org.apache.hudi.config.HoodieWriteConfig;
/**
* This strategy ensures that the last N partitions are picked up even if there are later partitions created for the
* dataset. lastNPartitions is defined as the N partitions before the currentDate.
* currentDay = 2018/01/01
* The dataset has partitions for 2018/02/02 and 2018/03/03 beyond the currentDay
* This strategy will pick up the following partitions for compaction :
* (2018/01/01, allPartitionsInRange[(2018/01/01 - lastNPartitions) to 2018/01/01), 2018/02/02, 2018/03/03)
 * dataset. lastNPartitions is defined as the N partitions before the currentDate. For example, if currentDay =
 * 2018/01/01 and the dataset has partitions for 2018/02/02 and 2018/03/03 beyond the currentDay, this strategy will
 * pick up the following partitions for compaction: (2018/01/01, allPartitionsInRange[(2018/01/01 - lastNPartitions) to 2018/01/01),
* 2018/02/02, 2018/03/03)
*/
public class BoundedPartitionAwareCompactionStrategy extends DayBasedCompactionStrategy {
@@ -46,15 +45,14 @@ public class BoundedPartitionAwareCompactionStrategy extends DayBasedCompactionS
public List<HoodieCompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig,
List<HoodieCompactionOperation> operations, List<HoodieCompactionPlan> pendingCompactionPlans) {
// The earliest partition to compact - current day minus the target partitions limit
String earliestPartitionPathToCompact = dateFormat.format(
getDateAtOffsetFromToday(-1 * writeConfig.getTargetPartitionsPerDayBasedCompaction()));
String earliestPartitionPathToCompact =
dateFormat.format(getDateAtOffsetFromToday(-1 * writeConfig.getTargetPartitionsPerDayBasedCompaction()));
// Filter out all partitions greater than earliestPartitionPathToCompact
List<HoodieCompactionOperation> eligibleCompactionOperations = operations.stream()
.collect(Collectors.groupingBy(HoodieCompactionOperation::getPartitionPath)).entrySet().stream()
.sorted(Map.Entry.comparingByKey(comparator))
List<HoodieCompactionOperation> eligibleCompactionOperations =
operations.stream().collect(Collectors.groupingBy(HoodieCompactionOperation::getPartitionPath)).entrySet()
.stream().sorted(Map.Entry.comparingByKey(comparator))
.filter(e -> comparator.compare(earliestPartitionPathToCompact, e.getKey()) >= 0)
.flatMap(e -> e.getValue().stream())
.collect(Collectors.toList());
.flatMap(e -> e.getValue().stream()).collect(Collectors.toList());
return eligibleCompactionOperations;
}
@@ -62,13 +60,12 @@ public class BoundedPartitionAwareCompactionStrategy extends DayBasedCompactionS
@Override
public List<String> filterPartitionPaths(HoodieWriteConfig writeConfig, List<String> partitionPaths) {
// The earliest partition to compact - current day minus the target partitions limit
String earliestPartitionPathToCompact = dateFormat.format(
getDateAtOffsetFromToday(-1 * writeConfig.getTargetPartitionsPerDayBasedCompaction()));
String earliestPartitionPathToCompact =
dateFormat.format(getDateAtOffsetFromToday(-1 * writeConfig.getTargetPartitionsPerDayBasedCompaction()));
// Get all partitions and sort them
List<String> filteredPartitionPaths = partitionPaths.stream().map(partition -> partition.replace("/", "-"))
.sorted(Comparator.reverseOrder()).map(partitionPath -> partitionPath.replace("-", "/"))
.filter(e -> comparator.compare(earliestPartitionPathToCompact, e) >= 0)
.collect(Collectors.toList());
.filter(e -> comparator.compare(earliestPartitionPathToCompact, e) >= 0).collect(Collectors.toList());
return filteredPartitionPaths;
}
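
A worked example of the earliestPartitionPathToCompact computation above, with an assumed current day and partition target:

Calendar cal = Calendar.getInstance();
cal.set(2018, Calendar.JANUARY, 1);  // assume "today" is 2018/01/01
cal.add(Calendar.DAY_OF_MONTH, -30); // assume getTargetPartitionsPerDayBasedCompaction() == 30
SimpleDateFormat fmt = new SimpleDateFormat("yyyy/MM/dd", Locale.ENGLISH);
String earliestPartitionPathToCompact = fmt.format(cal.getTime()); // "2017/12/02"
// The filter above keeps every partition at or after 2017/12/02, which also
// admits future-dated partitions such as 2018/02/02 and 2018/03/03.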

View File

@@ -32,11 +32,10 @@ import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.io.compact.HoodieRealtimeTableCompactor;
/**
* Strategy for compaction. Pluggable implementation to define how compaction should be done. The
 * overridden implementations of this abstract class can capture the relevant metrics to order
 * and filter the final list of compaction operations to run in a single compaction.
 * Implementations of CompactionStrategy cannot hold any state. Different instantiations can be
* passed in every time
 * Strategy for compaction. Pluggable implementation to define how compaction should be done. The overridden
 * implementations of this abstract class can capture the relevant metrics to order and filter the final list of
 * compaction operations to run in a single compaction. Implementations of CompactionStrategy cannot hold any state.
 * Different instantiations can be passed in every time
*
* @see HoodieRealtimeTableCompactor
*/
@@ -49,8 +48,8 @@ public abstract class CompactionStrategy implements Serializable {
public static final String TOTAL_LOG_FILES = "TOTAL_LOG_FILES";
/**
* Callback hook when a HoodieCompactionOperation is created. Individual strategies can capture the
* metrics they need to decide on the priority.
* Callback hook when a HoodieCompactionOperation is created. Individual strategies can capture the metrics they need
* to decide on the priority.
*
* @param dataFile - Base file to compact
* @param partitionPath - Partition path
@@ -65,11 +64,11 @@ public abstract class CompactionStrategy implements Serializable {
Long totalLogFileSize = logFiles.stream().map(HoodieLogFile::getFileSize).filter(size -> size >= 0)
.reduce((size1, size2) -> size1 + size2).orElse(0L);
// Total read will be the base file + all the log files
Long totalIORead = FSUtils.getSizeInMB((dataFile.isPresent() ? dataFile.get().getFileSize() : 0L)
+ totalLogFileSize);
Long totalIORead =
FSUtils.getSizeInMB((dataFile.isPresent() ? dataFile.get().getFileSize() : 0L) + totalLogFileSize);
// Total write will be similar to the size of the base file
Long totalIOWrite = FSUtils
.getSizeInMB(dataFile.isPresent() ? dataFile.get().getFileSize() : defaultMaxParquetFileSize);
Long totalIOWrite =
FSUtils.getSizeInMB(dataFile.isPresent() ? dataFile.get().getFileSize() : defaultMaxParquetFileSize);
// Total IO will be the IO for read + write
Long totalIO = totalIORead + totalIOWrite;
// Save these metrics and we will use during the filter
@@ -95,8 +94,7 @@ public abstract class CompactionStrategy implements Serializable {
List<HoodieCompactionOperation> operations, List<HoodieCompactionPlan> pendingCompactionPlans) {
// Strategy implementation can overload this method to set specific compactor-id
return HoodieCompactionPlan.newBuilder()
.setOperations(orderAndFilter(writeConfig, operations, pendingCompactionPlans))
.build();
.setOperations(orderAndFilter(writeConfig, operations, pendingCompactionPlans)).build();
}
/**
@@ -109,13 +107,13 @@ public abstract class CompactionStrategy implements Serializable {
* @return list of compactions to perform in this run
*/
public List<HoodieCompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig,
List<HoodieCompactionOperation> operations,
List<HoodieCompactionPlan> pendingCompactionPlans) {
List<HoodieCompactionOperation> operations, List<HoodieCompactionPlan> pendingCompactionPlans) {
return operations;
}
/**
* Filter the partition paths based on compaction strategy
*
* @param writeConfig
* @param allPartitionPaths
* @return
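
The captureMetrics hunk above encodes the IO model in three computations; a worked example with assumed file sizes:

long baseFileSize = 100L * 1024 * 1024;            // assumed 100 MB base file
long totalLogFileSize = (20L + 10L) * 1024 * 1024; // assumed 20 MB + 10 MB log files
Long totalIORead = FSUtils.getSizeInMB(baseFileSize + totalLogFileSize); // 130: read base + logs
Long totalIOWrite = FSUtils.getSizeInMB(baseFileSize);                   // 100: rewrite the base file
Long totalIO = totalIORead + totalIOWrite;                               // 230 MB charged to the budget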

View File

@@ -34,21 +34,18 @@ import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieException;
/**
* This strategy orders compactions in reverse order of creation of Hive Partitions. It helps to
 * compact data in the latest partitions first and then older ones, capped at the Total_IO allowed.
 * This strategy orders compactions in reverse order of creation of Hive Partitions. It helps to compact data in the
 * latest partitions first and then older ones, capped at the Total_IO allowed.
*/
public class DayBasedCompactionStrategy extends CompactionStrategy {
// For now, use SimpleDateFormat as default partition format
protected static String datePartitionFormat = "yyyy/MM/dd";
// Sorts compaction in LastInFirstCompacted order
protected static Comparator<String> comparator = (String leftPartition,
String rightPartition) -> {
protected static Comparator<String> comparator = (String leftPartition, String rightPartition) -> {
try {
Date left = new SimpleDateFormat(datePartitionFormat, Locale.ENGLISH)
.parse(leftPartition);
Date right = new SimpleDateFormat(datePartitionFormat, Locale.ENGLISH)
.parse(rightPartition);
Date left = new SimpleDateFormat(datePartitionFormat, Locale.ENGLISH).parse(leftPartition);
Date right = new SimpleDateFormat(datePartitionFormat, Locale.ENGLISH).parse(rightPartition);
return left.after(right) ? -1 : right.after(left) ? 1 : 0;
} catch (ParseException e) {
throw new HoodieException("Invalid Partition Date Format", e);
@@ -68,8 +65,7 @@ public class DayBasedCompactionStrategy extends CompactionStrategy {
List<HoodieCompactionOperation> filteredList = operations.stream()
.collect(Collectors.groupingBy(HoodieCompactionOperation::getPartitionPath)).entrySet().stream()
.sorted(Map.Entry.comparingByKey(comparator)).limit(writeConfig.getTargetPartitionsPerDayBasedCompaction())
.flatMap(e -> e.getValue().stream())
.collect(Collectors.toList());
.flatMap(e -> e.getValue().stream()).collect(Collectors.toList());
return filteredList;
}
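
To see the LastInFirstCompacted ordering above in action, a tiny sketch (assuming package-local access to the protected comparator):

List<String> partitions = new ArrayList<>(Arrays.asList("2018/01/01", "2018/03/03", "2018/02/02"));
partitions.sort(DayBasedCompactionStrategy.comparator);
// -> [2018/03/03, 2018/02/02, 2018/01/01]: the newest partition compacts first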

View File

@@ -30,14 +30,14 @@ import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
/**
* LogFileSizeBasedCompactionStrategy orders the compactions based on the total log files size and
* limits the compactions within a configured IO bound
* LogFileSizeBasedCompactionStrategy orders the compactions based on the total log files size and limits the
* compactions within a configured IO bound
*
* @see BoundedIOCompactionStrategy
* @see CompactionStrategy
*/
public class LogFileSizeBasedCompactionStrategy extends BoundedIOCompactionStrategy implements
Comparator<HoodieCompactionOperation> {
public class LogFileSizeBasedCompactionStrategy extends BoundedIOCompactionStrategy
implements Comparator<HoodieCompactionOperation> {
private static final String TOTAL_LOG_FILE_SIZE = "TOTAL_LOG_FILE_SIZE";
@@ -47,9 +47,8 @@ public class LogFileSizeBasedCompactionStrategy extends BoundedIOCompactionStrat
Map<String, Double> metrics = super.captureMetrics(config, dataFile, partitionPath, logFiles);
// Total size of all the log files
Long totalLogFileSize = logFiles.stream().map(HoodieLogFile::getFileSize)
.filter(size -> size >= 0).reduce((size1, size2) -> size1 + size2)
.orElse(0L);
Long totalLogFileSize = logFiles.stream().map(HoodieLogFile::getFileSize).filter(size -> size >= 0)
.reduce((size1, size2) -> size1 + size2).orElse(0L);
// save the metrics needed during the ordering
metrics.put(TOTAL_LOG_FILE_SIZE, totalLogFileSize.doubleValue());
return metrics;
@@ -59,9 +58,8 @@ public class LogFileSizeBasedCompactionStrategy extends BoundedIOCompactionStrat
public List<HoodieCompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig,
List<HoodieCompactionOperation> operations, List<HoodieCompactionPlan> pendingCompactionPlans) {
// Order the operations based on the reverse size of the logs and limit them by the IO
return super
.orderAndFilter(writeConfig,
operations.stream().sorted(this).collect(Collectors.toList()), pendingCompactionPlans);
return super.orderAndFilter(writeConfig, operations.stream().sorted(this).collect(Collectors.toList()),
pendingCompactionPlans);
}
@Override
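
This class sorts operations with itself as the Comparator before applying the inherited IO bound; its compare method is cut off by the hunk, but the intended order can be sketched as follows (metric key per the constant declared above):

// Illustrative stand-in for the Comparator this class implements:
// a larger total log size sorts first, so the fattest log backlogs compact soonest.
operations.sort((left, right) -> Long.compare(
    right.getMetrics().get("TOTAL_LOG_FILE_SIZE").longValue(),
    left.getMetrics().get("TOTAL_LOG_FILE_SIZE").longValue()));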

View File

@@ -24,9 +24,8 @@ import org.apache.hudi.avro.model.HoodieCompactionPlan;
import org.apache.hudi.config.HoodieWriteConfig;
/**
* UnBoundedCompactionStrategy will not change ordering or filter any compaction. It is a
 * pass-through and will compact all the base files which have a log file. This usually means
 * no intelligence is applied to compaction.
* UnBoundedCompactionStrategy will not change ordering or filter any compaction. It is a pass-through and will compact
 * all the base files which have a log file. This usually means no intelligence is applied to compaction.
*
* @see CompactionStrategy
*/

View File

@@ -27,12 +27,11 @@ import org.apache.hudi.avro.model.HoodieCompactionPlan;
import org.apache.hudi.config.HoodieWriteConfig;
/**
* UnBoundedPartitionAwareCompactionStrategy is a custom UnBounded Strategy.
* This will filter all the partitions that are eligible to be compacted by a
* {@link BoundedPartitionAwareCompactionStrategy} and return the result.
* This is done so that a long running UnBoundedPartitionAwareCompactionStrategy does not step over partitions
* in a shorter running BoundedPartitionAwareCompactionStrategy. Essentially, this is an inverse of the
* partitions chosen in BoundedPartitionAwareCompactionStrategy
* UnBoundedPartitionAwareCompactionStrategy is a custom UnBounded Strategy. This will filter all the partitions that
* are eligible to be compacted by a {@link BoundedPartitionAwareCompactionStrategy} and return the result. This is done
* so that a long running UnBoundedPartitionAwareCompactionStrategy does not step over partitions in a shorter running
* BoundedPartitionAwareCompactionStrategy. Essentially, this is an inverse of the partitions chosen in
* BoundedPartitionAwareCompactionStrategy
*
* @see CompactionStrategy
*/
@@ -41,10 +40,10 @@ public class UnBoundedPartitionAwareCompactionStrategy extends CompactionStrateg
@Override
public List<HoodieCompactionOperation> orderAndFilter(HoodieWriteConfig config,
final List<HoodieCompactionOperation> operations, final List<HoodieCompactionPlan> pendingCompactionWorkloads) {
BoundedPartitionAwareCompactionStrategy boundedPartitionAwareCompactionStrategy
= new BoundedPartitionAwareCompactionStrategy();
List<HoodieCompactionOperation> operationsToExclude = boundedPartitionAwareCompactionStrategy
.orderAndFilter(config, operations, pendingCompactionWorkloads);
BoundedPartitionAwareCompactionStrategy boundedPartitionAwareCompactionStrategy =
new BoundedPartitionAwareCompactionStrategy();
List<HoodieCompactionOperation> operationsToExclude =
boundedPartitionAwareCompactionStrategy.orderAndFilter(config, operations, pendingCompactionWorkloads);
List<HoodieCompactionOperation> allOperations = new ArrayList<>(operations);
allOperations.removeAll(operationsToExclude);
return allOperations;
@@ -52,13 +51,13 @@ public class UnBoundedPartitionAwareCompactionStrategy extends CompactionStrateg
@Override
public List<String> filterPartitionPaths(HoodieWriteConfig writeConfig, List<String> partitionPaths) {
List<String> allPartitionPaths = partitionPaths.stream().map(partition -> partition.replace("/", "-"))
.sorted(Comparator.reverseOrder()).map(partitionPath -> partitionPath.replace("-", "/"))
.collect(Collectors.toList());
BoundedPartitionAwareCompactionStrategy boundedPartitionAwareCompactionStrategy
= new BoundedPartitionAwareCompactionStrategy();
List<String> partitionsToExclude = boundedPartitionAwareCompactionStrategy.filterPartitionPaths(writeConfig,
partitionPaths);
List<String> allPartitionPaths =
partitionPaths.stream().map(partition -> partition.replace("/", "-")).sorted(Comparator.reverseOrder())
.map(partitionPath -> partitionPath.replace("-", "/")).collect(Collectors.toList());
BoundedPartitionAwareCompactionStrategy boundedPartitionAwareCompactionStrategy =
new BoundedPartitionAwareCompactionStrategy();
List<String> partitionsToExclude =
boundedPartitionAwareCompactionStrategy.filterPartitionPaths(writeConfig, partitionPaths);
allPartitionPaths.removeAll(partitionsToExclude);
return allPartitionPaths;
}

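A hedged sketch of the inverse-set idea above: compute what a bounded, day-based strategy would pick (the newest N day partitions) and return the complement. Partition names and the bound are made-up values:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;

public class InversePartitionsSketch {
  public static void main(String[] args) {
    List<String> partitions = Arrays.asList("2019/10/01", "2019/10/02", "2019/10/03", "2019/10/04");
    int targetPartitions = 2; // stand-in for the day-based target in the write config
    // What the bounded strategy would choose: the most recent day partitions.
    List<String> bounded = partitions.stream()
        .sorted(Comparator.reverseOrder())
        .limit(targetPartitions)
        .collect(Collectors.toList());
    // The unbounded partition-aware strategy returns everything the bounded one skipped.
    List<String> unbounded = new ArrayList<>(partitions);
    unbounded.removeAll(bounded);
    System.out.println(bounded);   // [2019/10/04, 2019/10/03]
    System.out.println(unbounded); // [2019/10/01, 2019/10/02]
  }
}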
View File

@@ -32,9 +32,8 @@ public class HoodieParquetConfig {
private Configuration hadoopConf;
private double compressionRatio;
public HoodieParquetConfig(HoodieAvroWriteSupport writeSupport,
CompressionCodecName compressionCodecName, int blockSize, int pageSize, long maxFileSize,
Configuration hadoopConf, double compressionRatio) {
public HoodieParquetConfig(HoodieAvroWriteSupport writeSupport, CompressionCodecName compressionCodecName,
int blockSize, int pageSize, long maxFileSize, Configuration hadoopConf, double compressionRatio) {
this.writeSupport = writeSupport;
this.compressionCodecName = compressionCodecName;
this.blockSize = blockSize;

View File

@@ -36,11 +36,11 @@ import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.spark.TaskContext;
/**
 * HoodieParquetWriter extends the ParquetWriter to help limit the size of the underlying file. Provides
 * a way to check if the current file can take more records with the <code>canWrite()</code> method.
 * HoodieParquetWriter extends the ParquetWriter to help limit the size of the underlying file. Provides a way to check
 * if the current file can take more records with the <code>canWrite()</code> method.
*/
public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends IndexedRecord> extends
ParquetWriter<IndexedRecord> implements HoodieStorageWriter<R> {
public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends IndexedRecord>
extends ParquetWriter<IndexedRecord> implements HoodieStorageWriter<R> {
private static AtomicLong recordIndex = new AtomicLong(1);
@@ -52,24 +52,22 @@ public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends Indexe
private final Schema schema;
public HoodieParquetWriter(String commitTime, Path file, HoodieParquetConfig parquetConfig,
Schema schema) throws IOException {
public HoodieParquetWriter(String commitTime, Path file, HoodieParquetConfig parquetConfig, Schema schema)
throws IOException {
super(HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()),
ParquetFileWriter.Mode.CREATE, parquetConfig.getWriteSupport(),
parquetConfig.getCompressionCodecName(), parquetConfig.getBlockSize(),
parquetConfig.getPageSize(), parquetConfig.getPageSize(),
ParquetFileWriter.Mode.CREATE, parquetConfig.getWriteSupport(), parquetConfig.getCompressionCodecName(),
parquetConfig.getBlockSize(), parquetConfig.getPageSize(), parquetConfig.getPageSize(),
ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED, ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED,
ParquetWriter.DEFAULT_WRITER_VERSION,
registerFileSystem(file, parquetConfig.getHadoopConf()));
ParquetWriter.DEFAULT_WRITER_VERSION, registerFileSystem(file, parquetConfig.getHadoopConf()));
this.file = HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf());
this.fs = (HoodieWrapperFileSystem) this.file
.getFileSystem(registerFileSystem(file, parquetConfig.getHadoopConf()));
this.fs =
(HoodieWrapperFileSystem) this.file.getFileSystem(registerFileSystem(file, parquetConfig.getHadoopConf()));
// We cannot accurately measure the snappy compressed output file size. We are choosing a
// conservative 10%
// TODO - compute this compression ratio dynamically by looking at the bytes written to the
// stream and the actual file size reported by HDFS
this.maxFileSize = parquetConfig.getMaxFileSize() + Math
.round(parquetConfig.getMaxFileSize() * parquetConfig.getCompressionRatio());
this.maxFileSize = parquetConfig.getMaxFileSize()
+ Math.round(parquetConfig.getMaxFileSize() * parquetConfig.getCompressionRatio());
this.writeSupport = parquetConfig.getWriteSupport();
this.commitTime = commitTime;
this.schema = schema;
@@ -85,10 +83,10 @@ public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends Indexe
@Override
public void writeAvroWithMetadata(R avroRecord, HoodieRecord record) throws IOException {
String seqId = HoodieRecord.generateSequenceId(commitTime, TaskContext.getPartitionId(),
recordIndex.getAndIncrement());
HoodieAvroUtils.addHoodieKeyToRecord((GenericRecord) avroRecord, record.getRecordKey(),
record.getPartitionPath(), file.getName());
String seqId =
HoodieRecord.generateSequenceId(commitTime, TaskContext.getPartitionId(), recordIndex.getAndIncrement());
HoodieAvroUtils.addHoodieKeyToRecord((GenericRecord) avroRecord, record.getRecordKey(), record.getPartitionPath(),
file.getName());
HoodieAvroUtils.addCommitMetadataToRecord((GenericRecord) avroRecord, commitTime, seqId);
super.write(avroRecord);
writeSupport.add(record.getRecordKey());

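Worked numbers for the size budget computed above, assuming a 120MB max file size and the conservative 10% snappy ratio the comment mentions (both values illustrative):

public class MaxFileSizeSketch {
  public static void main(String[] args) {
    long maxFileSize = 120L * 1024 * 1024;     // configured target parquet size
    double compressionRatio = 0.1;             // conservative snappy estimate
    long budget = maxFileSize + Math.round(maxFileSize * compressionRatio);
    System.out.println(budget);                // 138412032 bytes, i.e. 132MB of stream budget
    long bytesWritten = 130L * 1024 * 1024;    // what the wrapper filesystem has counted so far
    System.out.println(bytesWritten < budget); // true: a canWrite()-style check still passes
  }
}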
View File

@@ -36,8 +36,8 @@ import org.apache.parquet.avro.AvroSchemaConverter;
public class HoodieStorageWriterFactory {
public static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieStorageWriter<R> getStorageWriter(
String commitTime, Path path, HoodieTable<T> hoodieTable,
HoodieWriteConfig config, Schema schema) throws IOException {
String commitTime, Path path, HoodieTable<T> hoodieTable, HoodieWriteConfig config, Schema schema)
throws IOException {
final String name = path.getName();
final String extension = FSUtils.isLogFile(path) ? HOODIE_LOG.getFileExtension() : FSUtils.getFileExtension(name);
if (PARQUET.getFileExtension().equals(extension)) {
@@ -46,19 +46,16 @@ public class HoodieStorageWriterFactory {
throw new UnsupportedOperationException(extension + " format not supported yet.");
}
private static <T extends HoodieRecordPayload,
R extends IndexedRecord> HoodieStorageWriter<R> newParquetStorageWriter(String commitTime, Path path,
HoodieWriteConfig config, Schema schema, HoodieTable hoodieTable) throws IOException {
BloomFilter filter = new BloomFilter(config.getBloomFilterNumEntries(),
config.getBloomFilterFPP());
HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(
new AvroSchemaConverter().convert(schema), schema, filter);
private static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieStorageWriter<R> newParquetStorageWriter(
String commitTime, Path path, HoodieWriteConfig config, Schema schema, HoodieTable hoodieTable)
throws IOException {
BloomFilter filter = new BloomFilter(config.getBloomFilterNumEntries(), config.getBloomFilterFPP());
HoodieAvroWriteSupport writeSupport =
new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter);
HoodieParquetConfig parquetConfig =
new HoodieParquetConfig(writeSupport, config.getParquetCompressionCodec(),
config.getParquetBlockSize(), config.getParquetPageSize(),
config.getParquetMaxFileSize(), hoodieTable.getHadoopConf(),
config.getParquetCompressionRatio());
HoodieParquetConfig parquetConfig = new HoodieParquetConfig(writeSupport, config.getParquetCompressionCodec(),
config.getParquetBlockSize(), config.getParquetPageSize(), config.getParquetMaxFileSize(),
hoodieTable.getHadoopConf(), config.getParquetCompressionRatio());
return new HoodieParquetWriter<>(commitTime, path, parquetConfig, schema);
}

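A simplified sketch of the extension dispatch in this factory; StorageWriter and the stub are stand-ins for the Hudi types:

import java.util.Locale;

public class WriterFactorySketch {
  interface StorageWriter { /* writeAvro(...), canWrite(), close() in the real API */ }
  static class ParquetWriterStub implements StorageWriter {}

  // Pick a writer implementation from the file extension, mirroring the factory shape above.
  static StorageWriter forPath(String fileName) {
    String ext = fileName.substring(fileName.lastIndexOf('.')).toLowerCase(Locale.ROOT);
    if (".parquet".equals(ext)) {
      return new ParquetWriterStub();
    }
    throw new UnsupportedOperationException(ext + " format not supported yet.");
  }

  public static void main(String[] args) {
    System.out.println(forPath("fileId_1_20191010.parquet").getClass().getSimpleName());
  }
}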
View File

@@ -118,8 +118,8 @@ public class HoodieMetrics {
return indexTimer == null ? null : indexTimer.time();
}
public void updateCommitMetrics(long commitEpochTimeInMs, long durationInMs,
HoodieCommitMetadata metadata, String actionType) {
public void updateCommitMetrics(long commitEpochTimeInMs, long durationInMs, HoodieCommitMetadata metadata,
String actionType) {
if (config.isMetricsOn()) {
long totalPartitionsWritten = metadata.fetchTotalPartitionsWritten();
long totalFilesInsert = metadata.fetchTotalFilesInsert();
@@ -154,9 +154,8 @@ public class HoodieMetrics {
public void updateRollbackMetrics(long durationInMs, long numFilesDeleted) {
if (config.isMetricsOn()) {
logger.info(String
.format("Sending rollback metrics (duration=%d, numFilesDeleted=%d)", durationInMs,
numFilesDeleted));
logger.info(
String.format("Sending rollback metrics (duration=%d, numFilesDeleted=%d)", durationInMs, numFilesDeleted));
Metrics.registerGauge(getMetricsName("rollback", "duration"), durationInMs);
Metrics.registerGauge(getMetricsName("rollback", "numFilesDeleted"), numFilesDeleted);
}
@@ -164,9 +163,8 @@ public class HoodieMetrics {
public void updateCleanMetrics(long durationInMs, int numFilesDeleted) {
if (config.isMetricsOn()) {
logger.info(String
.format("Sending clean metrics (duration=%d, numFilesDeleted=%d)", durationInMs,
numFilesDeleted));
logger.info(
String.format("Sending clean metrics (duration=%d, numFilesDeleted=%d)", durationInMs, numFilesDeleted));
Metrics.registerGauge(getMetricsName("clean", "duration"), durationInMs);
Metrics.registerGauge(getMetricsName("clean", "numFilesDeleted"), numFilesDeleted);
}
@@ -174,9 +172,8 @@ public class HoodieMetrics {
public void updateFinalizeWriteMetrics(long durationInMs, long numFilesFinalized) {
if (config.isMetricsOn()) {
logger.info(String
.format("Sending finalize write metrics (duration=%d, numFilesFinalized=%d)",
durationInMs, numFilesFinalized));
logger.info(String.format("Sending finalize write metrics (duration=%d, numFilesFinalized=%d)", durationInMs,
numFilesFinalized));
Metrics.registerGauge(getMetricsName("finalize", "duration"), durationInMs);
Metrics.registerGauge(getMetricsName("finalize", "numFilesFinalized"), numFilesFinalized);
}
@@ -184,10 +181,8 @@ public class HoodieMetrics {
public void updateIndexMetrics(final String action, final long durationInMs) {
if (config.isMetricsOn()) {
logger.info(String
.format("Sending index metrics (%s.duration, %d)",action, durationInMs));
Metrics.registerGauge(getMetricsName("index", String.format("%s.duration", action)),
durationInMs);
logger.info(String.format("Sending index metrics (%s.duration, %d)", action, durationInMs));
Metrics.registerGauge(getMetricsName("index", String.format("%s.duration", action)), durationInMs);
}
}

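The gauges above follow the standard Dropwizard pattern; a minimal sketch with com.codahale.metrics, assuming (not confirmed by this diff) that Metrics.registerGauge wraps a MetricRegistry:

import com.codahale.metrics.Gauge;
import com.codahale.metrics.MetricRegistry;

public class GaugeSketch {
  public static void main(String[] args) {
    MetricRegistry registry = new MetricRegistry();
    long durationInMs = 4200L; // e.g. a finished rollback; value illustrative
    // A point-in-time gauge: the registry pulls the value when a reporter fires.
    registry.register("hoodie.rollback.duration", (Gauge<Long>) () -> durationInMs);
    System.out.println(registry.getGauges().get("hoodie.rollback.duration").getValue()); // 4200
  }
}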
View File

@@ -26,12 +26,10 @@ import java.io.Closeable;
public class InMemoryMetricsReporter extends MetricsReporter {
@Override
public void start() {
}
public void start() {}
@Override
public void report() {
}
public void report() {}
@Override
public Closeable getReporter() {

View File

@@ -30,8 +30,7 @@ import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
/**
 * Implementation of Graphite reporter, which connects to the Graphite server, and sends metrics to
 * that server.
 * Implementation of Graphite reporter, which connects to the Graphite server, and sends metrics to that server.
*/
public class MetricsGraphiteReporter extends MetricsReporter {
@@ -50,8 +49,7 @@ public class MetricsGraphiteReporter extends MetricsReporter {
this.serverHost = config.getGraphiteServerHost();
this.serverPort = config.getGraphiteServerPort();
if (serverHost == null || serverPort == 0) {
throw new RuntimeException(String
.format("Graphite cannot be initialized with serverHost[%s] and serverPort[%s].",
throw new RuntimeException(String.format("Graphite cannot be initialized with serverHost[%s] and serverPort[%s].",
serverHost, serverPort));
}
@@ -84,8 +82,7 @@ public class MetricsGraphiteReporter extends MetricsReporter {
private GraphiteReporter createGraphiteReport() {
Graphite graphite = new Graphite(new InetSocketAddress(serverHost, serverPort));
String reporterPrefix = config.getGraphiteMetricPrefix();
return GraphiteReporter.forRegistry(registry).prefixedWith(reporterPrefix)
.convertRatesTo(TimeUnit.SECONDS).convertDurationsTo(TimeUnit.MILLISECONDS)
.filter(MetricFilter.ALL).build(graphite);
return GraphiteReporter.forRegistry(registry).prefixedWith(reporterPrefix).convertRatesTo(TimeUnit.SECONDS)
.convertDurationsTo(TimeUnit.MILLISECONDS).filter(MetricFilter.ALL).build(graphite);
}
}
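For context, a standalone Dropwizard setup matching the builder chain in createGraphiteReport; host, port, prefix, and interval are illustrative:

import java.net.InetSocketAddress;
import java.util.concurrent.TimeUnit;
import com.codahale.metrics.MetricFilter;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.graphite.Graphite;
import com.codahale.metrics.graphite.GraphiteReporter;

public class GraphiteSketch {
  public static void main(String[] args) {
    MetricRegistry registry = new MetricRegistry();
    Graphite graphite = new Graphite(new InetSocketAddress("localhost", 2003));
    GraphiteReporter reporter = GraphiteReporter.forRegistry(registry)
        .prefixedWith("hoodie")                    // metric name prefix, illustrative
        .convertRatesTo(TimeUnit.SECONDS)
        .convertDurationsTo(TimeUnit.MILLISECONDS)
        .filter(MetricFilter.ALL)
        .build(graphite);
    reporter.start(30, TimeUnit.SECONDS);          // ship metrics every 30 seconds
  }
}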

View File

@@ -19,8 +19,7 @@
package org.apache.hudi.metrics;
/**
* Types of the reporter. Right now we only support Graphite. We can include JMX and CSV in the
* future.
* Types of the reporter. Right now we only support Graphite. We can include JMX and CSV in the future.
*/
public enum MetricsReporterType {
GRAPHITE, INMEMORY

View File

@@ -82,8 +82,7 @@ import scala.Tuple2;
/**
* Implementation of a very heavily read-optimized Hoodie Table where
* <p>
* INSERTS - Produce new files, block aligned to desired size (or) Merge with the smallest existing
* file, to expand it
* INSERTS - Produce new files, block aligned to desired size (or) Merge with the smallest existing file, to expand it
* <p>
* UPDATES - Produce a new version of the file, just replacing the updated records with new values
*/
@@ -95,11 +94,9 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
super(config, jsc);
}
private static PairFlatMapFunction<Iterator<Tuple2<String, String>>, String,
PartitionCleanStat> deleteFilesFunc(
private static PairFlatMapFunction<Iterator<Tuple2<String, String>>, String, PartitionCleanStat> deleteFilesFunc(
HoodieTable table) {
return (PairFlatMapFunction<Iterator<Tuple2<String, String>>, String, PartitionCleanStat>)
iter -> {
return (PairFlatMapFunction<Iterator<Tuple2<String, String>>, String, PartitionCleanStat>) iter -> {
Map<String, PartitionCleanStat> partitionCleanStatMap = new HashMap<>();
FileSystem fs = table.getMetaClient().getFs();
@@ -116,8 +113,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
partitionCleanStat.addDeletedFileResult(deletePathStr, deletedFileResult);
}
return partitionCleanStatMap.entrySet().stream()
.map(e -> new Tuple2<>(e.getKey(), e.getValue()))
return partitionCleanStatMap.entrySet().stream().map(e -> new Tuple2<>(e.getKey(), e.getValue()))
.collect(Collectors.toList()).iterator();
};
}
@@ -131,8 +127,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
};
}
private static Boolean deleteFileAndGetResult(FileSystem fs, String deletePathStr)
throws IOException {
private static Boolean deleteFileAndGetResult(FileSystem fs, String deletePathStr) throws IOException {
Path deletePath = new Path(deletePathStr);
logger.debug("Working on delete path :" + deletePath);
boolean deleteResult = fs.delete(deletePath, false);
@@ -171,8 +166,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
throw new HoodieNotSupportedException("Compaction is not supported from a CopyOnWrite table");
}
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileId,
Iterator<HoodieRecord<T>> recordItr) throws IOException {
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileId, Iterator<HoodieRecord<T>> recordItr)
throws IOException {
// This is needed since sometimes some buckets are never picked in getPartition() and end up with 0 records
if (!recordItr.hasNext()) {
logger.info("Empty partition with fileId => " + fileId);
@@ -190,17 +185,16 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
return handleUpdateInternal(upsertHandle, commitTime, fileId);
}
protected Iterator<List<WriteStatus>> handleUpdateInternal(HoodieMergeHandle upsertHandle,
String commitTime, String fileId)
throws IOException {
protected Iterator<List<WriteStatus>> handleUpdateInternal(HoodieMergeHandle upsertHandle, String commitTime,
String fileId) throws IOException {
if (upsertHandle.getOldFilePath() == null) {
throw new HoodieUpsertException(
"Error in finding the old file path at commit " + commitTime + " for fileId: " + fileId);
} else {
AvroReadSupport.setAvroReadSchema(getHadoopConf(), upsertHandle.getWriterSchema());
BoundedInMemoryExecutor<GenericRecord, GenericRecord, Void> wrapper = null;
try (ParquetReader<IndexedRecord> reader = AvroParquetReader.<IndexedRecord>builder(upsertHandle.getOldFilePath())
.withConf(getHadoopConf()).build()) {
try (ParquetReader<IndexedRecord> reader =
AvroParquetReader.<IndexedRecord>builder(upsertHandle.getOldFilePath()).withConf(getHadoopConf()).build()) {
wrapper = new SparkBoundedInMemoryExecutor(config, new ParquetReaderIterator(reader),
new UpdateHandler(upsertHandle), x -> x);
wrapper.execute();
@@ -216,15 +210,13 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
// TODO(vc): This needs to be revisited
if (upsertHandle.getWriteStatus().getPartitionPath() == null) {
logger.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath()
+ ", " + upsertHandle.getWriteStatus());
logger.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath() + ", "
+ upsertHandle.getWriteStatus());
}
return Collections.singletonList(Collections.singletonList(upsertHandle.getWriteStatus()))
.iterator();
return Collections.singletonList(Collections.singletonList(upsertHandle.getWriteStatus())).iterator();
}
protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileId,
Iterator<HoodieRecord<T>> recordItr) {
protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileId, Iterator<HoodieRecord<T>> recordItr) {
return new HoodieMergeHandle<>(config, commitTime, this, recordItr, fileId);
}
@@ -233,8 +225,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
return new HoodieMergeHandle<>(config, commitTime, this, keyToNewRecords, fileId, dataFileToBeMerged);
}
public Iterator<List<WriteStatus>> handleInsert(String commitTime, String idPfx,
Iterator<HoodieRecord<T>> recordItr) throws Exception {
public Iterator<List<WriteStatus>> handleInsert(String commitTime, String idPfx, Iterator<HoodieRecord<T>> recordItr)
throws Exception {
// This is needed since sometimes some buckets are never picked in getPartition() and end up with 0 records
if (!recordItr.hasNext()) {
logger.info("Empty partition");
@@ -245,16 +237,16 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
public Iterator<List<WriteStatus>> handleInsert(String commitTime, String partitionPath, String fileId,
Iterator<HoodieRecord<T>> recordItr) {
HoodieCreateHandle createHandle = new HoodieCreateHandle(config, commitTime, this, partitionPath, fileId,
recordItr);
HoodieCreateHandle createHandle =
new HoodieCreateHandle(config, commitTime, this, partitionPath, fileId, recordItr);
createHandle.write();
return Collections.singletonList(Collections.singletonList(createHandle.close())).iterator();
}
@SuppressWarnings("unchecked")
@Override
public Iterator<List<WriteStatus>> handleUpsertPartition(String commitTime, Integer partition,
Iterator recordItr, Partitioner partitioner) {
public Iterator<List<WriteStatus>> handleUpsertPartition(String commitTime, Integer partition, Iterator recordItr,
Partitioner partitioner) {
UpsertPartitioner upsertPartitioner = (UpsertPartitioner) partitioner;
BucketInfo binfo = upsertPartitioner.getBucketInfo(partition);
BucketType btype = binfo.bucketType;
@@ -264,8 +256,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
} else if (btype.equals(BucketType.UPDATE)) {
return handleUpdate(commitTime, binfo.fileIdPrefix, recordItr);
} else {
throw new HoodieUpsertException(
"Unknown bucketType " + btype + " for partition :" + partition);
throw new HoodieUpsertException("Unknown bucketType " + btype + " for partition :" + partition);
}
} catch (Throwable t) {
String msg = "Error upserting bucketType " + btype + " for partition :" + partition;
@@ -275,15 +266,14 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
}
@Override
public Iterator<List<WriteStatus>> handleInsertPartition(String commitTime, Integer partition,
Iterator recordItr, Partitioner partitioner) {
public Iterator<List<WriteStatus>> handleInsertPartition(String commitTime, Integer partition, Iterator recordItr,
Partitioner partitioner) {
return handleUpsertPartition(commitTime, partition, recordItr, partitioner);
}
/**
* Performs cleaning of partition paths according to cleaning policy and returns the number of
* files cleaned. Handles skews in partitions to clean by making files to clean as the unit of
* task distribution.
* Performs cleaning of partition paths according to cleaning policy and returns the number of files cleaned. Handles
* skews in partitions to clean by making files to clean as the unit of task distribution.
*
* @throws IllegalArgumentException if unknown cleaning policy is provided
*/
@@ -291,11 +281,9 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
public List<HoodieCleanStat> clean(JavaSparkContext jsc) {
try {
FileSystem fs = getMetaClient().getFs();
List<String> partitionsToClean = FSUtils
.getAllPartitionPaths(fs, getMetaClient().getBasePath(),
config.shouldAssumeDatePartitioning());
logger.info("Partitions to clean up : " + partitionsToClean + ", with policy " + config
.getCleanerPolicy());
List<String> partitionsToClean =
FSUtils.getAllPartitionPaths(fs, getMetaClient().getBasePath(), config.shouldAssumeDatePartitioning());
logger.info("Partitions to clean up : " + partitionsToClean + ", with policy " + config.getCleanerPolicy());
if (partitionsToClean.isEmpty()) {
logger.info("Nothing to clean here mom. It is already clean");
return Collections.emptyList();
@@ -307,12 +295,10 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
}
/**
* Common method used for cleaning out parquet files under a partition path during rollback of a
* set of commits
* Common method used for cleaning out parquet files under a partition path during rollback of a set of commits
*/
protected Map<FileStatus, Boolean> deleteCleanedFiles(Map<FileStatus, Boolean> results, String partitionPath,
PathFilter filter)
throws IOException {
PathFilter filter) throws IOException {
logger.info("Cleaning path " + partitionPath);
FileSystem fs = getMetaClient().getFs();
FileStatus[] toBeDeleted = fs.listStatus(FSUtils.getPartitionPath(config.getBasePath(), partitionPath), filter);
@@ -325,12 +311,10 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
}
/**
* Common method used for cleaning out parquet files under a partition path during rollback of a
* set of commits
* Common method used for cleaning out parquet files under a partition path during rollback of a set of commits
*/
protected Map<FileStatus, Boolean> deleteCleanedFiles(Map<FileStatus, Boolean> results, String commit, String
partitionPath)
throws IOException {
protected Map<FileStatus, Boolean> deleteCleanedFiles(Map<FileStatus, Boolean> results, String commit,
String partitionPath) throws IOException {
logger.info("Cleaning path " + partitionPath);
FileSystem fs = getMetaClient().getFs();
PathFilter filter = (path) -> {
@@ -354,8 +338,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
throws IOException {
String actionType = metaClient.getCommitActionType();
HoodieActiveTimeline activeTimeline = this.getActiveTimeline();
List<String> inflights = this.getInflightCommitTimeline().getInstants()
.map(HoodieInstant::getTimestamp).collect(Collectors.toList());
List<String> inflights =
this.getInflightCommitTimeline().getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList());
// Atomically unpublish the commits
if (!inflights.contains(commit)) {
activeTimeline.revertToInflight(new HoodieInstant(false, actionType, commit));
@@ -364,10 +348,9 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
// delete all the data files for this commit
logger.info("Clean out all parquet files generated for commit: " + commit);
List<HoodieRollbackStat> stats = jsc.parallelize(FSUtils
.getAllPartitionPaths(metaClient.getFs(), getMetaClient().getBasePath(),
config.shouldAssumeDatePartitioning()))
.map((Function<String, HoodieRollbackStat>) partitionPath -> {
List<HoodieRollbackStat> stats =
jsc.parallelize(FSUtils.getAllPartitionPaths(metaClient.getFs(), getMetaClient().getBasePath(),
config.shouldAssumeDatePartitioning())).map((Function<String, HoodieRollbackStat>) partitionPath -> {
// Scan all partitions files with this commit time
final Map<FileStatus, Boolean> filesToDeletedStatus = new HashMap<>();
deleteCleanedFiles(filesToDeletedStatus, commit, partitionPath);
@@ -376,13 +359,13 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
}).collect();
// Delete Inflight instant if enabled
deleteInflightInstant(deleteInstants, activeTimeline,
new HoodieInstant(true, actionType, commit));
deleteInflightInstant(deleteInstants, activeTimeline, new HoodieInstant(true, actionType, commit));
return stats;
}
/**
* Delete Inflight instant if enabled
*
* @param deleteInstant Enable Deletion of Inflight instant
* @param activeTimeline Hoodie active timeline
* @param instantToBeDeleted Instant to be deleted
@@ -401,30 +384,27 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
}
}
private List<HoodieCleanStat> cleanPartitionPaths(List<String> partitionsToClean,
JavaSparkContext jsc) {
private List<HoodieCleanStat> cleanPartitionPaths(List<String> partitionsToClean, JavaSparkContext jsc) {
int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism());
logger.info("Using cleanerParallelism: " + cleanerParallelism);
List<Tuple2<String, PartitionCleanStat>> partitionCleanStats = jsc
.parallelize(partitionsToClean, cleanerParallelism)
.flatMapToPair(getFilesToDeleteFunc(this, config))
.parallelize(partitionsToClean, cleanerParallelism).flatMapToPair(getFilesToDeleteFunc(this, config))
.repartition(cleanerParallelism) // repartition to remove skews
.mapPartitionsToPair(deleteFilesFunc(this)).reduceByKey(
// merge partition level clean stats below
(Function2<PartitionCleanStat, PartitionCleanStat, PartitionCleanStat>) (e1, e2) -> e1
.merge(e2)).collect();
(Function2<PartitionCleanStat, PartitionCleanStat, PartitionCleanStat>) (e1, e2) -> e1.merge(e2))
.collect();
Map<String, PartitionCleanStat> partitionCleanStatsMap = partitionCleanStats.stream()
.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2));
Map<String, PartitionCleanStat> partitionCleanStatsMap =
partitionCleanStats.stream().collect(Collectors.toMap(Tuple2::_1, Tuple2::_2));
HoodieCleanHelper cleaner = new HoodieCleanHelper(this, config);
// Return PartitionCleanStat for each partition passed.
return partitionsToClean.stream().map(partitionPath -> {
PartitionCleanStat partitionCleanStat =
(partitionCleanStatsMap.containsKey(partitionPath)) ? partitionCleanStatsMap
.get(partitionPath) : new PartitionCleanStat(partitionPath);
return HoodieCleanStat.newBuilder().withPolicy(config.getCleanerPolicy())
.withPartitionPath(partitionPath)
(partitionCleanStatsMap.containsKey(partitionPath)) ? partitionCleanStatsMap.get(partitionPath)
: new PartitionCleanStat(partitionPath);
return HoodieCleanStat.newBuilder().withPolicy(config.getCleanerPolicy()).withPartitionPath(partitionPath)
.withEarliestCommitRetained(cleaner.getEarliestCommitToRetain())
.withDeletePathPattern(partitionCleanStat.deletePathPatterns)
.withSuccessfulDeletes(partitionCleanStat.successDeleteFiles)
@@ -453,8 +433,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
}
@Override
protected void finish() {
}
protected void finish() {}
@Override
protected Void getResult() {
@@ -487,8 +466,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
private PartitionCleanStat merge(PartitionCleanStat other) {
if (!this.partitionPath.equals(other.partitionPath)) {
throw new RuntimeException(String
.format("partitionPath is not a match: (%s, %s)", partitionPath, other.partitionPath));
throw new RuntimeException(
String.format("partitionPath is not a match: (%s, %s)", partitionPath, other.partitionPath));
}
successDeleteFiles.addAll(other.successDeleteFiles);
deletePathPatterns.addAll(other.deletePathPatterns);
@@ -516,8 +495,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
}
/**
 * Helper class for an insert bucket along with the weight [0.0, 1.0] that defines the fraction of
 * incoming inserts that should be allocated to the bucket
 * Helper class for an insert bucket along with the weight [0.0, 1.0] that defines the fraction of incoming inserts
 * that should be allocated to the bucket
*/
class InsertBucket implements Serializable {
@@ -563,8 +542,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
*/
List<SmallFile> smallFiles = new ArrayList<SmallFile>();
/**
 * Total number of RDD partitions; determined by the total buckets we want to pack the incoming
 * workload into
 * Total number of RDD partitions; determined by the total buckets we want to pack the incoming workload into
*/
private int totalBuckets = 0;
/**
@@ -599,8 +577,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
assignUpdates(profile);
assignInserts(profile);
logger.info(
"Total Buckets :" + totalBuckets + ", " + "buckets info => " + bucketInfoMap + ", \n"
logger.info("Total Buckets :" + totalBuckets + ", " + "buckets info => " + bucketInfoMap + ", \n"
+ "Partition to insert buckets => " + partitionPathToInsertBuckets + ", \n"
+ "UpdateLocations mapped to buckets =>" + updateLocationToBucket);
}
@@ -608,8 +585,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
private void assignUpdates(WorkloadProfile profile) {
// each update location gets a partition
WorkloadStat gStat = profile.getGlobalStat();
for (Map.Entry<String, Pair<String, Long>> updateLocEntry : gStat.getUpdateLocationToCount()
.entrySet()) {
for (Map.Entry<String, Pair<String, Long>> updateLocEntry : gStat.getUpdateLocationToCount().entrySet()) {
addUpdateBucket(updateLocEntry.getKey());
}
}
@@ -628,8 +604,9 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
private void assignInserts(WorkloadProfile profile) {
// for new inserts, compute buckets depending on how many records we have for each partition
Set<String> partitionPaths = profile.getPartitionPaths();
long averageRecordSize = averageBytesPerRecord(metaClient.getActiveTimeline().getCommitTimeline()
.filterCompletedInstants(), config.getCopyOnWriteRecordSizeEstimate());
long averageRecordSize =
averageBytesPerRecord(metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(),
config.getCopyOnWriteRecordSizeEstimate());
logger.info("AvgRecordSize => " + averageRecordSize);
for (String partitionPath : partitionPaths) {
WorkloadStat pStat = profile.getWorkloadStat(partitionPath);
@@ -644,20 +621,17 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
// first try packing this into one of the smallFiles
for (SmallFile smallFile : smallFiles) {
long recordsToAppend = Math
.min((config.getParquetMaxFileSize() - smallFile.sizeBytes) / averageRecordSize,
long recordsToAppend = Math.min((config.getParquetMaxFileSize() - smallFile.sizeBytes) / averageRecordSize,
totalUnassignedInserts);
if (recordsToAppend > 0 && totalUnassignedInserts > 0) {
// create a new bucket or re-use an existing bucket
int bucket;
if (updateLocationToBucket.containsKey(smallFile.location.getFileId())) {
bucket = updateLocationToBucket.get(smallFile.location.getFileId());
logger.info("Assigning " + recordsToAppend + " inserts to existing update bucket "
+ bucket);
logger.info("Assigning " + recordsToAppend + " inserts to existing update bucket " + bucket);
} else {
bucket = addUpdateBucket(smallFile.location.getFileId());
logger.info(
"Assigning " + recordsToAppend + " inserts to new update bucket " + bucket);
logger.info("Assigning " + recordsToAppend + " inserts to new update bucket " + bucket);
}
bucketNumbers.add(bucket);
recordsPerBucket.add(recordsToAppend);
@@ -673,10 +647,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
}
int insertBuckets = (int) Math.ceil((1.0 * totalUnassignedInserts) / insertRecordsPerBucket);
logger.info(
"After small file assignment: unassignedInserts => " + totalUnassignedInserts
+ ", totalInsertBuckets => " + insertBuckets + ", recordsPerBucket => "
+ insertRecordsPerBucket);
logger.info("After small file assignment: unassignedInserts => " + totalUnassignedInserts
+ ", totalInsertBuckets => " + insertBuckets + ", recordsPerBucket => " + insertRecordsPerBucket);
for (int b = 0; b < insertBuckets; b++) {
bucketNumbers.add(totalBuckets);
recordsPerBucket.add(totalUnassignedInserts / insertBuckets);
@@ -696,8 +668,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
bkt.weight = (1.0 * recordsPerBucket.get(i)) / pStat.getNumInserts();
insertBuckets.add(bkt);
}
logger.info(
"Total insert buckets for partition path " + partitionPath + " => " + insertBuckets);
logger.info("Total insert buckets for partition path " + partitionPath + " => " + insertBuckets);
partitionPathToInsertBuckets.put(partitionPath, insertBuckets);
}
}
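Worked numbers for the small-file packing math above; all config values are hypothetical:

public class InsertPackingSketch {
  public static void main(String[] args) {
    long parquetMaxFileSize = 120L * 1024 * 1024;
    long smallFileSizeBytes = 100L * 1024 * 1024; // an existing under-sized parquet file
    long averageRecordSize = 1024;                // estimated from past commit metadata
    long totalUnassignedInserts = 50_000;

    // Top up the small file first: 20MB of headroom / 1KB per record = 20480 records.
    long recordsToAppend = Math.min(
        (parquetMaxFileSize - smallFileSizeBytes) / averageRecordSize, totalUnassignedInserts);
    System.out.println(recordsToAppend);          // 20480

    // The remainder is split into new insert buckets of a configured size.
    long remaining = totalUnassignedInserts - recordsToAppend;
    long insertRecordsPerBucket = 20_000;         // hypothetical config value
    int insertBuckets = (int) Math.ceil((1.0 * remaining) / insertRecordsPerBucket);
    System.out.println(insertBuckets);            // 2 new buckets for the remaining 29520
  }
}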
@@ -716,15 +687,13 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
if (!commitTimeline.empty()) { // if we have some commits
HoodieInstant latestCommitTime = commitTimeline.lastInstant().get();
List<HoodieDataFile> allFiles = getROFileSystemView()
.getLatestDataFilesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp())
.collect(Collectors.toList());
.getLatestDataFilesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp()).collect(Collectors.toList());
for (HoodieDataFile file : allFiles) {
if (file.getFileSize() < config.getParquetSmallFileLimit()) {
String filename = file.getFileName();
SmallFile sf = new SmallFile();
sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename),
FSUtils.getFileId(filename));
sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename));
sf.sizeBytes = file.getFileSize();
smallFileLocations.add(sf);
// Update the global small files list
@@ -751,19 +720,18 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
@Override
public int getPartition(Object key) {
Tuple2<HoodieKey, Option<HoodieRecordLocation>> keyLocation = (Tuple2<HoodieKey,
Option<HoodieRecordLocation>>) key;
Tuple2<HoodieKey, Option<HoodieRecordLocation>> keyLocation =
(Tuple2<HoodieKey, Option<HoodieRecordLocation>>) key;
if (keyLocation._2().isPresent()) {
HoodieRecordLocation location = keyLocation._2().get();
return updateLocationToBucket.get(location.getFileId());
} else {
List<InsertBucket> targetBuckets = partitionPathToInsertBuckets
.get(keyLocation._1().getPartitionPath());
List<InsertBucket> targetBuckets = partitionPathToInsertBuckets.get(keyLocation._1().getPartitionPath());
// pick the target bucket to use based on the weights.
double totalWeight = 0.0;
final long totalInserts = Math.max(1, globalStat.getNumInserts());
final long hashOfKey = Hashing.md5()
.hashString(keyLocation._1().getRecordKey(), StandardCharsets.UTF_8).asLong();
final long hashOfKey =
Hashing.md5().hashString(keyLocation._1().getRecordKey(), StandardCharsets.UTF_8).asLong();
final double r = 1.0 * Math.floorMod(hashOfKey, totalInserts) / totalInserts;
for (InsertBucket insertBucket : targetBuckets) {
totalWeight += insertBucket.weight;
@@ -782,8 +750,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
}
/**
* Obtains the average record size based on records written during previous commits. Used for
* estimating how many records pack into one file.
* Obtains the average record size based on records written during previous commits. Used for estimating how many
* records pack into one file.
*/
protected static long averageBytesPerRecord(HoodieTimeline commitTimeline, int defaultRecordSizeEstimate) {
long avgSize = defaultRecordSizeEstimate;

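A hedged sketch of the weighted bucket choice in getPartition above: hash the record key into [0, 1) and walk the cumulative bucket weights until one exceeds it; String.hashCode stands in for the md5 hash used by the real code:

public class WeightedBucketSketch {
  public static void main(String[] args) {
    double[] weights = {0.5, 0.3, 0.2}; // per-bucket insert weights for one partition, summing to 1.0
    long totalInserts = 1000;
    long hashOfKey = "some-record-key".hashCode();
    double r = 1.0 * Math.floorMod(hashOfKey, totalInserts) / totalInserts;
    double totalWeight = 0.0;
    int chosen = -1;
    for (int i = 0; i < weights.length; i++) {
      totalWeight += weights[i];
      if (totalWeight > r) { // first bucket whose cumulative weight passes r
        chosen = i;
        break;
      }
    }
    System.out.println("r=" + r + " -> bucket " + chosen);
  }
}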
View File

@@ -73,15 +73,21 @@ import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
/**
* Implementation of a more real-time read-optimized Hoodie Table where <p> INSERTS - Same as
* HoodieCopyOnWriteTable - Produce new files, block aligned to desired size (or) Merge with the
* smallest existing file, to expand it </p> <p> UPDATES - Appends the changes to a rolling log file
* maintained per file Id. Compaction merges the log file into the base file. </p> <p> WARNING - MOR
* table type does not support nested rollbacks, every rollback must be followed by an attempted
* commit action </p>
* Implementation of a more real-time read-optimized Hoodie Table where
* <p>
* INSERTS - Same as HoodieCopyOnWriteTable - Produce new files, block aligned to desired size (or) Merge with the
* smallest existing file, to expand it
* </p>
* <p>
* UPDATES - Appends the changes to a rolling log file maintained per file Id. Compaction merges the log file into the
* base file.
* </p>
* <p>
* WARNING - MOR table type does not support nested rollbacks, every rollback must be followed by an attempted commit
* action
* </p>
*/
public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
HoodieCopyOnWriteTable<T> {
public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends HoodieCopyOnWriteTable<T> {
private static Logger logger = LogManager.getLogger(HoodieMergeOnReadTable.class);
@@ -102,27 +108,24 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
}
@Override
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileId,
Iterator<HoodieRecord<T>> recordItr) throws IOException {
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileId, Iterator<HoodieRecord<T>> recordItr)
throws IOException {
logger.info("Merging updates for commit " + commitTime + " for file " + fileId);
if (!index.canIndexLogFiles() && mergeOnReadUpsertPartitioner.getSmallFileIds().contains(fileId)) {
logger.info(
"Small file corrections for updates for commit " + commitTime + " for file " + fileId);
logger.info("Small file corrections for updates for commit " + commitTime + " for file " + fileId);
return super.handleUpdate(commitTime, fileId, recordItr);
} else {
HoodieAppendHandle<T> appendHandle = new HoodieAppendHandle<>(config, commitTime, this,
fileId, recordItr);
HoodieAppendHandle<T> appendHandle = new HoodieAppendHandle<>(config, commitTime, this, fileId, recordItr);
appendHandle.doAppend();
appendHandle.close();
return Collections.singletonList(Collections.singletonList(appendHandle.getWriteStatus()))
.iterator();
return Collections.singletonList(Collections.singletonList(appendHandle.getWriteStatus())).iterator();
}
}
@Override
public Iterator<List<WriteStatus>> handleInsert(String commitTime, String idPfx,
Iterator<HoodieRecord<T>> recordItr) throws Exception {
public Iterator<List<WriteStatus>> handleInsert(String commitTime, String idPfx, Iterator<HoodieRecord<T>> recordItr)
throws Exception {
// If canIndexLogFiles, write inserts to log files else write inserts to parquet files
if (index.canIndexLogFiles()) {
return new MergeOnReadLazyInsertIterable<>(recordItr, config, commitTime, this, idPfx);
@@ -134,8 +137,8 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
@Override
public HoodieCompactionPlan scheduleCompaction(JavaSparkContext jsc, String instantTime) {
logger.info("Checking if compaction needs to be run on " + config.getBasePath());
Option<HoodieInstant> lastCompaction = getActiveTimeline().getCommitTimeline()
.filterCompletedInstants().lastInstant();
Option<HoodieInstant> lastCompaction =
getActiveTimeline().getCommitTimeline().filterCompletedInstants().lastInstant();
String deltaCommitsSinceTs = "0";
if (lastCompaction.isPresent()) {
deltaCommitsSinceTs = lastCompaction.get().getTimestamp();
@@ -145,8 +148,8 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
.findInstantsAfter(deltaCommitsSinceTs, Integer.MAX_VALUE).countInstants();
if (config.getInlineCompactDeltaCommitMax() > deltaCommitsSinceLastCompaction) {
logger.info("Not running compaction as only " + deltaCommitsSinceLastCompaction
+ " delta commits was found since last compaction " + deltaCommitsSinceTs
+ ". Waiting for " + config.getInlineCompactDeltaCommitMax());
+ " delta commits was found since last compaction " + deltaCommitsSinceTs + ". Waiting for "
+ config.getInlineCompactDeltaCommitMax());
return new HoodieCompactionPlan();
}
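A toy version of the delta-commit gate above; the threshold is a hypothetical config value:

public class CompactionGateSketch {
  public static void main(String[] args) {
    int inlineCompactDeltaCommitMax = 5;     // compact only after 5 delta commits
    int deltaCommitsSinceLastCompaction = 3; // what the timeline scan counted
    if (inlineCompactDeltaCommitMax > deltaCommitsSinceLastCompaction) {
      System.out.println("Not running compaction; waiting for " + inlineCompactDeltaCommitMax);
    } else {
      System.out.println("Generate a compaction plan");
    }
  }
}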
@@ -186,9 +189,8 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
// Atomically un-publish all non-inflight commits
Option<HoodieInstant> commitOrCompactionOption = Option.fromJavaOptional(this.getActiveTimeline()
.getTimelineOfActions(Sets.newHashSet(HoodieActiveTimeline.COMMIT_ACTION,
HoodieActiveTimeline.DELTA_COMMIT_ACTION, HoodieActiveTimeline.COMPACTION_ACTION)).getInstants()
.filter(i -> commit.equals(i.getTimestamp()))
.findFirst());
HoodieActiveTimeline.DELTA_COMMIT_ACTION, HoodieActiveTimeline.COMPACTION_ACTION))
.getInstants().filter(i -> commit.equals(i.getTimestamp())).findFirst());
HoodieInstant instantToRollback = commitOrCompactionOption.get();
// Atomically un-publish all non-inflight commits
if (!instantToRollback.isInflight()) {
@@ -196,10 +198,9 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
}
logger.info("Unpublished " + commit);
Long startTime = System.currentTimeMillis();
List<HoodieRollbackStat> allRollbackStats = jsc.parallelize(FSUtils
.getAllPartitionPaths(this.metaClient.getFs(), this.getMetaClient().getBasePath(),
config.shouldAssumeDatePartitioning()))
.map((Function<String, HoodieRollbackStat>) partitionPath -> {
List<HoodieRollbackStat> allRollbackStats =
jsc.parallelize(FSUtils.getAllPartitionPaths(this.metaClient.getFs(), this.getMetaClient().getBasePath(),
config.shouldAssumeDatePartitioning())).map((Function<String, HoodieRollbackStat>) partitionPath -> {
HoodieActiveTimeline activeTimeline = this.getActiveTimeline().reload();
HoodieRollbackStat hoodieRollbackStats = null;
// Need to put the path filter here since Filter is not serializable
@@ -222,10 +223,11 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
case HoodieTimeline.COMMIT_ACTION:
try {
// Rollback of a commit should delete the newly created parquet files along with any log
// files created with this as baseCommit. This is required to support multi-rollbacks in a MOR table.
// files created with this as baseCommit. This is required to support multi-rollbacks in a MOR
// table.
super.deleteCleanedFiles(filesToDeletedStatus, partitionPath, filter);
hoodieRollbackStats = HoodieRollbackStat.newBuilder()
.withPartitionPath(partitionPath).withDeletedFileResults(filesToDeletedStatus).build();
hoodieRollbackStats = HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
.withDeletedFileResults(filesToDeletedStatus).build();
break;
} catch (IOException io) {
throw new UncheckedIOException("Failed to rollback for commit " + commit, io);
@@ -233,25 +235,28 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
case HoodieTimeline.COMPACTION_ACTION:
try {
// If there is no delta commit present after the current commit (if compaction), no action, else we
// need to make sure that a compaction commit rollback also deletes any log files written as part of the
// need to make sure that a compaction commit rollback also deletes any log files written as part of
// the
// succeeding deltacommit.
boolean higherDeltaCommits = !activeTimeline.getDeltaCommitTimeline()
.filterCompletedInstants().findInstantsAfter(commit, 1).empty();
boolean higherDeltaCommits = !activeTimeline.getDeltaCommitTimeline().filterCompletedInstants()
.findInstantsAfter(commit, 1).empty();
if (higherDeltaCommits) {
// Rollback of a compaction action with no higher deltacommit means that the compaction is scheduled
// and has not yet finished. In this scenario we should delete only the newly created parquet files
// Rollback of a compaction action with no higher deltacommit means that the compaction is
// scheduled
// and has not yet finished. In this scenario we should delete only the newly created parquet
// files
// and not corresponding base commit log files created with this as baseCommit since updates would
// have been written to the log files.
super.deleteCleanedFiles(filesToDeletedStatus, commit, partitionPath);
hoodieRollbackStats = HoodieRollbackStat.newBuilder()
.withPartitionPath(partitionPath).withDeletedFileResults(filesToDeletedStatus).build();
hoodieRollbackStats = HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
.withDeletedFileResults(filesToDeletedStatus).build();
} else {
// No deltacommits present after this compaction commit (inflight or requested). In this case, we
// can also delete any log files that were created with this compaction commit as base
// commit.
super.deleteCleanedFiles(filesToDeletedStatus, partitionPath, filter);
hoodieRollbackStats = HoodieRollbackStat.newBuilder()
.withPartitionPath(partitionPath).withDeletedFileResults(filesToDeletedStatus).build();
hoodieRollbackStats = HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
.withDeletedFileResults(filesToDeletedStatus).build();
}
break;
} catch (IOException io) {
@@ -261,12 +266,14 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
// --------------------------------------------------------------------------------------------------
// (A) The following cases are possible if index.canIndexLogFiles and/or index.isGlobal
// --------------------------------------------------------------------------------------------------
// (A.1) Failed first commit - Inserts were written to log files and HoodieWriteStat has no entries. In
// (A.1) Failed first commit - Inserts were written to log files and HoodieWriteStat has no entries.
// In
// this scenario we would want to delete these log files.
// (A.2) Failed recurring commit - Inserts/Updates written to log files. In this scenario,
// HoodieWriteStat will have the baseCommitTime for the first log file written, add rollback blocks.
// (A.3) Rollback triggered for first commit - Inserts were written to the log files but the commit is
// being reverted. In this scenario, HoodieWriteStat will be `null` for the attribute prevCommitTime,
// and hence will end up deleting these log files. This is done so there are no orphan log files
// lying around.
// (A.4) Rollback triggered for recurring commits - Inserts/Updates are being rolled back, the actions
@@ -274,7 +281,8 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
// ---------------------------------------------------------------------------------------------------
// (B) The following cases are possible if !index.canIndexLogFiles and/or !index.isGlobal
// ---------------------------------------------------------------------------------------------------
// (B.1) Failed first commit - Inserts were written to parquet files and HoodieWriteStat has no entries.
// (B.1) Failed first commit - Inserts were written to parquet files and HoodieWriteStat has no
// entries.
// In this scenario, we delete all the parquet files written for the failed commit.
// (B.2) Failed recurring commits - Inserts were written to parquet files and updates to log files. In
// this scenario, perform (A.1) and for updates written to log files, write rollback blocks.
@@ -282,10 +290,11 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
// (B.4) Rollback triggered for recurring commits - Same as (B.2) plus we need to delete the log files
// as well if the base parquet file gets deleted.
try {
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(
metaClient.getCommitTimeline().getInstantDetails(
new HoodieInstant(true, instantToRollback.getAction(), instantToRollback.getTimestamp()))
.get(), HoodieCommitMetadata.class);
HoodieCommitMetadata commitMetadata =
HoodieCommitMetadata.fromBytes(
metaClient.getCommitTimeline().getInstantDetails(new HoodieInstant(true,
instantToRollback.getAction(), instantToRollback.getTimestamp())).get(),
HoodieCommitMetadata.class);
// read commit file and (either append delete blocks or delete file)
Map<FileStatus, Long> filesToNumBlocksRollback = new HashMap<>();
@@ -294,8 +303,7 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
// We do not know fileIds for inserts (first inserts are either log files or parquet files),
// delete all files for the corresponding failed commit, if present (same as COW)
super.deleteCleanedFiles(filesToDeletedStatus, partitionPath, filter);
final Set<String> deletedFiles = filesToDeletedStatus.entrySet().stream()
.map(entry -> {
final Set<String> deletedFiles = filesToDeletedStatus.entrySet().stream().map(entry -> {
Path filePath = entry.getKey().getPath();
return FSUtils.getFileIdFromFilePath(filePath);
}).collect(Collectors.toSet());
@@ -316,8 +324,8 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
}).filter(Objects::nonNull).collect();
// Delete Inflight instants if enabled
deleteInflightInstant(deleteInstants, this.getActiveTimeline(), new HoodieInstant(true, instantToRollback
.getAction(), instantToRollback.getTimestamp()));
deleteInflightInstant(deleteInstants, this.getActiveTimeline(),
new HoodieInstant(true, instantToRollback.getAction(), instantToRollback.getTimestamp()));
logger.debug("Time(in ms) taken to finish rollback " + (System.currentTimeMillis() - startTime));
@@ -332,8 +340,8 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
}
/**
* UpsertPartitioner for MergeOnRead table type, this allows auto correction of small parquet
* files to larger ones without the need for an index in the logFile.
* UpsertPartitioner for MergeOnRead table type, this allows auto correction of small parquet files to larger ones
* without the need for an index in the logFile.
*/
class MergeOnReadUpsertPartitioner extends HoodieCopyOnWriteTable.UpsertPartitioner {
@@ -361,20 +369,22 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
// TODO : choose last N small files since there can be multiple small files written to a single partition
// by different spark partitions in a single batch
Option<FileSlice> smallFileSlice = Option.fromJavaOptional(getRTFileSystemView()
.getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp(), false).filter(
fileSlice -> fileSlice.getLogFiles().count() < 1
&& fileSlice.getDataFile().get().getFileSize() < config
.getParquetSmallFileLimit()).sorted((FileSlice left, FileSlice right) ->
left.getDataFile().get().getFileSize() < right.getDataFile().get().getFileSize()
? -1 : 1).findFirst());
.getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp(), false)
.filter(fileSlice -> fileSlice.getLogFiles().count() < 1
&& fileSlice.getDataFile().get().getFileSize() < config.getParquetSmallFileLimit())
.sorted((FileSlice left,
FileSlice right) -> left.getDataFile().get().getFileSize() < right.getDataFile().get().getFileSize()
? -1
: 1)
.findFirst());
if (smallFileSlice.isPresent()) {
allSmallFileSlices.add(smallFileSlice.get());
}
} else {
// If we can index log files, we can add more inserts to log files for fileIds including those under
// pending compaction.
List<FileSlice> allFileSlices = getRTFileSystemView()
.getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp(), true)
List<FileSlice> allFileSlices =
getRTFileSystemView().getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp(), true)
.collect(Collectors.toList());
for (FileSlice fileSlice : allFileSlices) {
if (isSmallFile(partitionPath, fileSlice)) {
@@ -408,8 +418,7 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
}
public List<String> getSmallFileIds() {
return (List<String>) smallFiles.stream()
.map(smallFile -> ((SmallFile) smallFile).location.getFileId())
return (List<String>) smallFiles.stream().map(smallFile -> ((SmallFile) smallFile).location.getFileId())
.collect(Collectors.toList());
}
@@ -417,8 +426,8 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
if (!fileSlice.getDataFile().isPresent()) {
return convertLogFilesSizeToExpectedParquetSize(fileSlice.getLogFiles().collect(Collectors.toList()));
} else {
return fileSlice.getDataFile().get().getFileSize() + convertLogFilesSizeToExpectedParquetSize(fileSlice
.getLogFiles().collect(Collectors.toList()));
return fileSlice.getDataFile().get().getFileSize()
+ convertLogFilesSizeToExpectedParquetSize(fileSlice.getLogFiles().collect(Collectors.toList()));
}
}
@@ -431,13 +440,12 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
@VisibleForTesting
public long convertLogFilesSizeToExpectedParquetSize(List<HoodieLogFile> hoodieLogFiles) {
long totalSizeOfLogFiles = hoodieLogFiles.stream().map(hoodieLogFile -> hoodieLogFile.getFileSize())
.filter(size -> size > 0)
.reduce((a, b) -> (a + b)).orElse(0L);
.filter(size -> size > 0).reduce((a, b) -> (a + b)).orElse(0L);
// Here we assume that if there is no base parquet file, all log files contain only inserts.
// We can then just get the parquet equivalent size of these log files, compare that with
// {@link config.getParquetMaxFileSize()} and decide if there is scope to insert more rows
long logFilesEquivalentParquetFileSize = (long) (totalSizeOfLogFiles * config
.getLogFileToParquetCompressionRatio());
long logFilesEquivalentParquetFileSize =
(long) (totalSizeOfLogFiles * config.getLogFileToParquetCompressionRatio());
return logFilesEquivalentParquetFileSize;
}
}
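Worked numbers for convertLogFilesSizeToExpectedParquetSize above; the compression ratio is a hypothetical config value:

public class LogToParquetSizeSketch {
  public static void main(String[] args) {
    long[] logFileSizes = {64L << 20, 32L << 20, -1L}; // -1 models an unknown size, filtered out
    long totalSizeOfLogFiles = 0;
    for (long size : logFileSizes) {
      if (size > 0) {
        totalSizeOfLogFiles += size;
      }
    }
    double logFileToParquetCompressionRatio = 0.35;    // hypothetical config value
    long expected = (long) (totalSizeOfLogFiles * logFileToParquetCompressionRatio);
    System.out.println(expected); // 35232153 bytes (~33.6MB) of parquet-equivalent data
  }
}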
@@ -447,8 +455,8 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
Map<HeaderMetadataType, String> header = Maps.newHashMap();
header.put(HeaderMetadataType.INSTANT_TIME, metaClient.getActiveTimeline().lastInstant().get().getTimestamp());
header.put(HeaderMetadataType.TARGET_INSTANT_TIME, commit);
header.put(HeaderMetadataType.COMMAND_BLOCK_TYPE, String.valueOf(HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK
.ordinal()));
header.put(HeaderMetadataType.COMMAND_BLOCK_TYPE,
String.valueOf(HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal()));
return header;
}
@@ -462,8 +470,7 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
// baseCommit always by listing the file slice
Map<String, String> fileIdToBaseCommitTimeForLogMap = this.getRTFileSystemView().getLatestFileSlices(partitionPath)
.collect(Collectors.toMap(FileSlice::getFileId, FileSlice::getBaseInstantTime));
commitMetadata.getPartitionToWriteStats().get(partitionPath).stream()
.filter(wStat -> {
commitMetadata.getPartitionToWriteStats().get(partitionPath).stream().filter(wStat -> {
// Filter out stats without prevCommit since they are all inserts
return wStat != null && wStat.getPrevCommit() != HoodieWriteStat.NULL_COMMIT && wStat.getPrevCommit() != null
&& !deletedFiles.contains(wStat.getFileId());
@@ -473,10 +480,9 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
if (null != baseCommitTime) {
boolean success = false;
try {
writer = HoodieLogFormat.newWriterBuilder().onParentPath(
FSUtils.getPartitionPath(this.getMetaClient().getBasePath(), partitionPath))
.withFileId(wStat.getFileId()).overBaseCommit(baseCommitTime)
.withFs(this.metaClient.getFs())
writer = HoodieLogFormat.newWriterBuilder()
.onParentPath(FSUtils.getPartitionPath(this.getMetaClient().getBasePath(), partitionPath))
.withFileId(wStat.getFileId()).overBaseCommit(baseCommitTime).withFs(this.metaClient.getFs())
.withFileExtension(HoodieLogFile.DELTA_EXTENSION).build();
// generate metadata
Map<HeaderMetadataType, String> header = generateHeader(commit);
@@ -484,8 +490,7 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
writer = writer.appendBlock(new HoodieCommandBlock(header));
success = true;
} catch (IOException | InterruptedException io) {
throw new HoodieRollbackException(
"Failed to rollback for commit " + commit, io);
throw new HoodieRollbackException("Failed to rollback for commit " + commit, io);
} finally {
try {
if (writer != null) {
@@ -495,8 +500,8 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
// This step is intentionally done after writer is closed. Guarantees that
// getFileStatus would reflect correct stats and FileNotFoundException is not thrown in
// cloud storage: HUDI-168

filesToNumBlocksRollback.put(this.getMetaClient().getFs()
.getFileStatus(writer.getLogFile().getPath()), 1L);
filesToNumBlocksRollback.put(this.getMetaClient().getFs().getFileStatus(writer.getLogFile().getPath()),
1L);
}
} catch (IOException io) {
throw new UncheckedIOException(io);
@@ -504,9 +509,7 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
}
}
});
return HoodieRollbackStat.newBuilder()
.withPartitionPath(partitionPath)
.withDeletedFileResults(filesToDeletedStatus)
return HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath).withDeletedFileResults(filesToDeletedStatus)
.withRollbackBlockAppendResults(filesToNumBlocksRollback).build();
}

View File

@@ -82,22 +82,21 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
protected HoodieTable(HoodieWriteConfig config, JavaSparkContext jsc) {
this.config = config;
this.hadoopConfiguration = new SerializableConfiguration(jsc.hadoopConfiguration());
this.viewManager = FileSystemViewManager.createViewManager(
new SerializableConfiguration(jsc.hadoopConfiguration()), config.getViewStorageConfig());
this.viewManager = FileSystemViewManager.createViewManager(new SerializableConfiguration(jsc.hadoopConfiguration()),
config.getViewStorageConfig());
this.metaClient = ClientUtils.createMetaClient(jsc, config, true);
this.index = HoodieIndex.createIndex(config, jsc);
}
private synchronized FileSystemViewManager getViewManager() {
if (null == viewManager) {
viewManager = FileSystemViewManager.createViewManager(hadoopConfiguration,
config.getViewStorageConfig());
viewManager = FileSystemViewManager.createViewManager(hadoopConfiguration, config.getViewStorageConfig());
}
return viewManager;
}
public static <T extends HoodieRecordPayload> HoodieTable<T> getHoodieTable(
HoodieTableMetaClient metaClient, HoodieWriteConfig config, JavaSparkContext jsc) {
public static <T extends HoodieRecordPayload> HoodieTable<T> getHoodieTable(HoodieTableMetaClient metaClient,
HoodieWriteConfig config, JavaSparkContext jsc) {
switch (metaClient.getTableType()) {
case COPY_ON_WRITE:
return new HoodieCopyOnWriteTable<>(config, jsc);
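
The hunk cuts the switch off after the COPY_ON_WRITE case. Based on the class names that appear elsewhere in this commit, a plausible sketch of the full dispatch; the default branch is an assumption, not verbatim from this diff:

switch (metaClient.getTableType()) {
  case COPY_ON_WRITE:
    return new HoodieCopyOnWriteTable<>(config, jsc);
  case MERGE_ON_READ:
    return new HoodieMergeOnReadTable<>(config, jsc);
  default:
    throw new HoodieException("Unsupported table type: " + metaClient.getTableType());
}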
@@ -202,8 +201,7 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
* Get the list of savepoints in this table
*/
public List<String> getSavepoints() {
return getCompletedSavepointTimeline().getInstants().map(HoodieInstant::getTimestamp)
.collect(Collectors.toList());
return getCompletedSavepointTimeline().getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList());
}
/**
@@ -214,18 +212,14 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
throw new HoodieSavepointException(
"Could not get data files for savepoint " + savepointTime + ". No such savepoint.");
}
HoodieInstant instant = new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION,
savepointTime);
HoodieInstant instant = new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION, savepointTime);
HoodieSavepointMetadata metadata = null;
try {
metadata = AvroUtils
.deserializeHoodieSavepointMetadata(getActiveTimeline().getInstantDetails(instant).get());
metadata = AvroUtils.deserializeHoodieSavepointMetadata(getActiveTimeline().getInstantDetails(instant).get());
} catch (IOException e) {
throw new HoodieSavepointException(
"Could not get savepointed data files for savepoint " + savepointTime, e);
throw new HoodieSavepointException("Could not get savepointed data files for savepoint " + savepointTime, e);
}
return metadata.getPartitionMetadata().values().stream()
.flatMap(s -> s.getSavepointDataFile().stream());
return metadata.getPartitionMetadata().values().stream().flatMap(s -> s.getSavepointDataFile().stream());
}
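
An illustrative use of the stream returned above, assuming the enclosing method is the savepointed-data-files lookup; the method name and instant value are assumptions, since the signature sits outside this hunk:

// Collect the savepointed files into a set so that, for example, a cleaner
// can exclude them from deletion. Hypothetical usage only.
Set<String> savepointedFiles = table.getSavepointedDataFiles("20190101120000")
    .collect(Collectors.toSet());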
public HoodieActiveTimeline getActiveTimeline() {
@@ -242,17 +236,18 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
/**
* Perform the ultimate IO for a given upserted (RDD) partition
*/
public abstract Iterator<List<WriteStatus>> handleUpsertPartition(String commitTime,
Integer partition, Iterator<HoodieRecord<T>> recordIterator, Partitioner partitioner);
public abstract Iterator<List<WriteStatus>> handleUpsertPartition(String commitTime, Integer partition,
Iterator<HoodieRecord<T>> recordIterator, Partitioner partitioner);
/**
* Perform the ultimate IO for a given inserted (RDD) partition
*/
public abstract Iterator<List<WriteStatus>> handleInsertPartition(String commitTime,
Integer partition, Iterator<HoodieRecord<T>> recordIterator, Partitioner partitioner);
public abstract Iterator<List<WriteStatus>> handleInsertPartition(String commitTime, Integer partition,
Iterator<HoodieRecord<T>> recordIterator, Partitioner partitioner);
/**
* Schedule compaction for the instant time
*
* @param jsc Spark Context
* @param instantTime Instant Time for scheduling compaction
* @return
@@ -260,8 +255,7 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
public abstract HoodieCompactionPlan scheduleCompaction(JavaSparkContext jsc, String instantTime);
/**
* Run Compaction on the table. Compaction arranges the data so that it is optimized for data
* access
* Run Compaction on the table. Compaction arranges the data so that it is optimized for data access
*
* @param jsc Spark Context
* @param compactionInstantTime Instant Time
@@ -276,9 +270,9 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
public abstract List<HoodieCleanStat> clean(JavaSparkContext jsc);
/**
   * Rollback the (inflight/committed) record changes with the given commit time. Four steps: (1)
   * Atomically unpublish this commit (2) clean indexing data (3) clean newly generated parquet files
   * / log blocks (4) Finally, delete the .<action>.commit or .<action>.inflight file if deleteInstants = true
   * Rollback the (inflight/committed) record changes with the given commit time. Four steps: (1) Atomically unpublish
   * this commit (2) clean indexing data (3) clean newly generated parquet files / log blocks (4) Finally, delete the
   * .<action>.commit or .<action>.inflight file if deleteInstants = true
*/
public abstract List<HoodieRollbackStat> rollback(JavaSparkContext jsc, String commit, boolean deleteInstants)
throws IOException;
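
A schematic of the four steps named in this javadoc, with hypothetical helper names; this is a sketch of the contract, not the body of any concrete implementation in this commit:

public List<HoodieRollbackStat> rollback(JavaSparkContext jsc, String commit, boolean deleteInstants)
    throws IOException {
  // (1) Atomically unpublish the commit so readers stop seeing its data.
  unpublishCommit(commit);                                              // hypothetical helper
  // (2) Clean indexing data that points at the rolled-back records.
  cleanIndexEntries(jsc, commit);                                       // hypothetical helper
  // (3) Delete newly generated parquet files / append rollback log blocks.
  List<HoodieRollbackStat> stats = deleteDataAndLogBlocks(jsc, commit); // hypothetical helper
  // (4) Finally, delete the .<action>.commit or .<action>.inflight file itself.
  if (deleteInstants) {
    deleteInstantFiles(commit);                                         // hypothetical helper
  }
  return stats;
}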
@@ -297,6 +291,7 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
/**
* Delete Marker directory corresponding to an instant
*
* @param instantTs Instant Time
*/
protected void deleteMarkerDir(String instantTs) {
@@ -343,13 +338,12 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
// Contains the list of partially created files. These need to be cleaned up.
invalidDataPaths.removeAll(validDataPaths);
if (!invalidDataPaths.isEmpty()) {
logger.info("Removing duplicate data files created due to spark retries before committing. Paths="
+ invalidDataPaths);
logger.info(
"Removing duplicate data files created due to spark retries before committing. Paths=" + invalidDataPaths);
}
Map<String, List<Pair<String, String>>> groupByPartition = invalidDataPaths.stream()
.map(dp -> Pair.of(new Path(dp).getParent().toString(), dp))
.collect(Collectors.groupingBy(Pair::getKey));
.map(dp -> Pair.of(new Path(dp).getParent().toString(), dp)).collect(Collectors.groupingBy(Pair::getKey));
if (!groupByPartition.isEmpty()) {
// Ensure all files in the delete list are actually present. This is mandatory for an eventually consistent FS.
@@ -394,6 +388,7 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
/**
* Ensures all files passed either appear or disappear
*
* @param jsc JavaSparkContext
* @param groupByPartition Files grouped by partition
* @param visibility Appear/Disappear

View File

@@ -23,13 +23,11 @@ import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.spark.api.java.JavaRDD;
/**
 * Repartition input records into at least the expected number of output spark partitions. It should
 * give the following guarantees: an output spark partition will have records from only one hoodie partition,
 * and the average number of records per output spark partition should be almost equal to (#inputRecords /
 * #outputSparkPartitions) to avoid possible skews.
 * Repartition input records into at least the expected number of output spark partitions. It should give the following
 * guarantees: an output spark partition will have records from only one hoodie partition, and the average number of
 * records per output spark partition should be almost equal to (#inputRecords / #outputSparkPartitions) to avoid
 * possible skews.
*/
public interface UserDefinedBulkInsertPartitioner<T extends HoodieRecordPayload> {
JavaRDD<HoodieRecord<T>> repartitionRecords(JavaRDD<HoodieRecord<T>> records,
int outputSparkPartitions);
JavaRDD<HoodieRecord<T>> repartitionRecords(JavaRDD<HoodieRecord<T>> records, int outputSparkPartitions);
}
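
A minimal implementation sketch of this interface (not part of this commit): sorting by partition path and letting Spark range-partition on the sort key approximates both guarantees, keeping records of one hoodie partition together and balancing counts across the requested number of partitions:

import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.spark.api.java.JavaRDD;

public class SortByPartitionPathPartitioner<T extends HoodieRecordPayload>
    implements UserDefinedBulkInsertPartitioner<T> {

  @Override
  public JavaRDD<HoodieRecord<T>> repartitionRecords(JavaRDD<HoodieRecord<T>> records,
      int outputSparkPartitions) {
    // Range partitioning on the sort key groups records from the same hoodie
    // partition; boundary partitions may still straddle two hoodie partitions.
    return records.sortBy(HoodieRecord::getPartitionPath, true, outputSparkPartitions);
  }
}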

View File

@@ -30,8 +30,7 @@ import org.apache.spark.api.java.JavaRDD;
import scala.Tuple2;
/**
* Information about incoming records for upsert/insert obtained either via sampling or
* introspecting the data fully
* Information about incoming records for upsert/insert obtained either via sampling or introspecting the data fully
* <p>
* TODO(vc): Think about obtaining this directly from index.tagLocation
*/
@@ -62,11 +61,10 @@ public class WorkloadProfile<T extends HoodieRecordPayload> implements Serializa
Map<Tuple2<String, Option<HoodieRecordLocation>>, Long> partitionLocationCounts = taggedRecords
.mapToPair(record -> new Tuple2<>(
new Tuple2<>(record.getPartitionPath(), Option.ofNullable(record.getCurrentLocation())),
record)).countByKey();
new Tuple2<>(record.getPartitionPath(), Option.ofNullable(record.getCurrentLocation())), record))
.countByKey();
for (Map.Entry<Tuple2<String, Option<HoodieRecordLocation>>, Long> e : partitionLocationCounts
.entrySet()) {
for (Map.Entry<Tuple2<String, Option<HoodieRecordLocation>>, Long> e : partitionLocationCounts.entrySet()) {
String partitionPath = e.getKey()._1();
Long count = e.getValue();
Option<HoodieRecordLocation> locOption = e.getKey()._2();

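The profiling step above counts tagged records per (partitionPath, location) key via countByKey. A minimal variant that collapses the same information into insert-vs-update counts per partition; a record with no current location is treated as an insert, and the variable names here are assumptions:

// Fragment from inside a profiling method; relies on the scala.Tuple2 import
// already present in this file.
Map<Tuple2<String, Boolean>, Long> insertUpdateCounts = taggedRecords
    .mapToPair(record -> new Tuple2<>(
        new Tuple2<>(record.getPartitionPath(), record.getCurrentLocation() == null),
        record))
    .countByKey();
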
View File

@@ -41,7 +41,8 @@ import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
/**
 * Driver program that uses the Hoodie client with a synthetic workload, and performs basic operations. <p>
 * Driver program that uses the Hoodie client with a synthetic workload, and performs basic operations.
 * <p>
*/
public class HoodieClientExample {
@@ -82,18 +83,15 @@ public class HoodieClientExample {
Path path = new Path(tablePath);
FileSystem fs = FSUtils.getFs(tablePath, jsc.hadoopConfiguration());
if (!fs.exists(path)) {
HoodieTableMetaClient
.initTableType(jsc.hadoopConfiguration(), tablePath, HoodieTableType.valueOf(tableType), tableName,
HoodieAvroPayload.class.getName());
HoodieTableMetaClient.initTableType(jsc.hadoopConfiguration(), tablePath, HoodieTableType.valueOf(tableType),
tableName, HoodieAvroPayload.class.getName());
}
// Create the write client to write some records in
HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(tablePath)
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
.forTable(tableName)
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2).forTable(tableName)
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(IndexType.BLOOM).build())
.withCompactionConfig(
HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 3).build()).build();
.withCompactionConfig(HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 3).build()).build();
HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
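
From here the example driver would run a synthetic write round; a plausible continuation consistent with the APIs visible elsewhere in this commit (the generator variable, record count, and parallelism are assumptions):

// Illustrative continuation, assuming a HoodieTestDataGenerator named dataGen.
String newCommitTime = client.startCommit();
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 100);
client.upsert(jsc.parallelize(records, 1), newCommitTime);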
/**

View File

@@ -74,6 +74,7 @@ public abstract class HoodieClientTestHarness extends HoodieCommonTestHarness im
/**
* Cleans up the resource group for the subclasses of {@link TestHoodieClientBase}.
*
* @throws IOException
*/
public void cleanupResources() throws IOException {
@@ -84,8 +85,7 @@ public abstract class HoodieClientTestHarness extends HoodieCommonTestHarness im
}
/**
* Initializes the Spark contexts ({@link JavaSparkContext} and {@link SQLContext})
* with the given application name.
* Initializes the Spark contexts ({@link JavaSparkContext} and {@link SQLContext}) with the given application name.
*
* @param appName The specified application name.
*/
@@ -99,8 +99,8 @@ public abstract class HoodieClientTestHarness extends HoodieCommonTestHarness im
}
/**
* Initializes the Spark contexts ({@link JavaSparkContext} and {@link SQLContext})
* with a default name <b>TestHoodieClient</b>.
* Initializes the Spark contexts ({@link JavaSparkContext} and {@link SQLContext}) with a default name
* <b>TestHoodieClient</b>.
*/
protected void initSparkContexts() {
initSparkContexts("TestHoodieClient");
@@ -155,8 +155,8 @@ public abstract class HoodieClientTestHarness extends HoodieCommonTestHarness im
}
/**
* Initializes an instance of {@link HoodieTableMetaClient} with a special table type
* specified by {@code getTableType()}.
* Initializes an instance of {@link HoodieTableMetaClient} with a special table type specified by
* {@code getTableType()}.
*
* @throws IOException
*/

View File

@@ -73,15 +73,14 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
private HoodieWriteConfig.Builder getConfigBuilder(Boolean autoCommit) {
return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
.withAutoCommit(autoCommit).withAssumeDatePartitioning(true).withCompactionConfig(
HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024 * 1024).withInlineCompaction(false)
.withMaxNumDeltaCommitsBeforeCompaction(1).build())
.withAutoCommit(autoCommit).withAssumeDatePartitioning(true)
.withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024 * 1024)
.withInlineCompaction(false).withMaxNumDeltaCommitsBeforeCompaction(1).build())
.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024 * 1024).build())
.forTable("test-trip-table")
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
.withEmbeddedTimelineServerEnabled(true).withFileSystemViewConfig(
FileSystemViewStorageConfig.newBuilder().withStorageType(FileSystemViewStorageType.EMBEDDED_KV_STORE)
.build());
.withEmbeddedTimelineServerEnabled(true).withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder()
.withStorageType(FileSystemViewStorageType.EMBEDDED_KV_STORE).build());
}
@Test
@@ -97,8 +96,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
int numRecs = 2000;
List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime),
records, cfg, true, new ArrayList<>());
runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true,
new ArrayList<>());
// Schedule compaction but do not run them
scheduleCompaction(compactionInstantTime, client, cfg);
@@ -158,8 +157,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
try (HoodieWriteClient client = getHoodieWriteClient(cfg, true);) {
List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime),
records, cfg, true, new ArrayList<>());
records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true,
new ArrayList<>());
// Schedule compaction but do not run them
scheduleCompaction(compactionInstantTime, client, cfg);
@@ -182,15 +181,13 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
// Validate
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
inflightInstant =
metaClient.getActiveTimeline().filterInflightsExcludingCompaction().firstInstant().get();
inflightInstant = metaClient.getActiveTimeline().filterInflightsExcludingCompaction().firstInstant().get();
assertTrue("inflight instant has expected instant time",
inflightInstant.getTimestamp().equals(nextInflightInstantTime));
assertTrue("Expect only one inflight instant",
metaClient.getActiveTimeline().filterInflightsExcludingCompaction().getInstants().count() == 1);
// Expect pending Compaction to be present
pendingCompactionInstant =
metaClient.getActiveTimeline().filterPendingCompactionTimeline().firstInstant().get();
pendingCompactionInstant = metaClient.getActiveTimeline().filterPendingCompactionTimeline().firstInstant().get();
assertTrue("Pending Compaction instant has expected instant time",
pendingCompactionInstant.getTimestamp().equals(compactionInstantTime));
}
@@ -211,8 +208,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
int numRecs = 2000;
List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime),
records, cfg, true, new ArrayList<>());
records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true,
new ArrayList<>());
// Schedule and mark compaction instant as inflight
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
@@ -221,8 +218,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
moveCompactionFromRequestedToInflight(compactionInstantTime, client, cfg);
// Complete ingestions
runNextDeltaCommits(client, Arrays.asList(thirdInstantTime, fourthInstantTime),
records, cfg, false, Arrays.asList(compactionInstantTime));
runNextDeltaCommits(client, Arrays.asList(thirdInstantTime, fourthInstantTime), records, cfg, false,
Arrays.asList(compactionInstantTime));
// execute inflight compaction
executeCompaction(compactionInstantTime, client, hoodieTable, cfg, numRecs, true);
@@ -242,8 +239,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
int numRecs = 2000;
List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime),
records, cfg, true, new ArrayList<>());
records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true,
new ArrayList<>());
// Schedule compaction but do not run them
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
@@ -256,8 +253,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
boolean gotException = false;
try {
runNextDeltaCommits(client, Arrays.asList(failedInstantTime),
records, cfg, false, Arrays.asList(compactionInstantTime));
runNextDeltaCommits(client, Arrays.asList(failedInstantTime), records, cfg, false,
Arrays.asList(compactionInstantTime));
} catch (IllegalArgumentException iex) {
// Latest pending compaction instant time must be earlier than this instant time. Should fail here
gotException = true;
@@ -279,8 +276,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
int numRecs = 2000;
List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime),
records, cfg, true, new ArrayList<>());
records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true,
new ArrayList<>());
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
createNextDeltaCommit(inflightInstantTime, records, client, metaClient, cfg, true);
@@ -315,8 +312,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
int numRecs = 2000;
List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime),
records, cfg, true, new ArrayList<>());
records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true,
new ArrayList<>());
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
boolean gotException = false;
@@ -337,8 +334,7 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
} catch (IllegalArgumentException iex) {
gotException = true;
}
assertTrue("Compaction Instant to be scheduled cannot have same timestamp as committed instant",
gotException);
assertTrue("Compaction Instant to be scheduled cannot have same timestamp as committed instant", gotException);
compactionInstantTime = "006";
scheduleCompaction(compactionInstantTime, client, cfg);
@@ -349,8 +345,7 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
} catch (IllegalArgumentException iex) {
gotException = true;
}
assertTrue("Compaction Instant to be scheduled cannot have same timestamp as a pending compaction",
gotException);
assertTrue("Compaction Instant to be scheduled cannot have same timestamp as a pending compaction", gotException);
}
@Test
@@ -365,8 +360,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
int numRecs = 2000;
List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime),
records, cfg, true, new ArrayList<>());
records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true,
new ArrayList<>());
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
HoodieTable hoodieTable = getHoodieTable(metaClient, cfg);
@@ -389,15 +384,15 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
int numRecs = 2000;
List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime),
records, cfg, true, new ArrayList<>());
records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true,
new ArrayList<>());
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
HoodieTable hoodieTable = getHoodieTable(metaClient, cfg);
scheduleCompaction(compactionInstantTime, client, cfg);
runNextDeltaCommits(client, Arrays.asList(thirdInstantTime, fourthInstantTime),
records, cfg, false, Arrays.asList(compactionInstantTime));
runNextDeltaCommits(client, Arrays.asList(thirdInstantTime, fourthInstantTime), records, cfg, false,
Arrays.asList(compactionInstantTime));
executeCompaction(compactionInstantTime, client, hoodieTable, cfg, numRecs, true);
}
}
@@ -428,8 +423,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
}
private List<HoodieRecord> runNextDeltaCommits(HoodieWriteClient client, List<String> deltaInstants,
List<HoodieRecord> records, HoodieWriteConfig cfg, boolean insertFirst,
List<String> expPendingCompactionInstants) throws Exception {
List<HoodieRecord> records, HoodieWriteConfig cfg, boolean insertFirst, List<String> expPendingCompactionInstants)
throws Exception {
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
List<Pair<HoodieInstant, HoodieCompactionPlan>> pendingCompactions =
@@ -476,8 +471,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
HoodieWriteConfig cfg) throws IOException {
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
HoodieInstant compactionInstant = HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime);
HoodieCompactionPlan workload = AvroUtils.deserializeCompactionPlan(
metaClient.getActiveTimeline().getInstantAuxiliaryDetails(compactionInstant).get());
HoodieCompactionPlan workload = AvroUtils
.deserializeCompactionPlan(metaClient.getActiveTimeline().getInstantAuxiliaryDetails(compactionInstant).get());
metaClient.getActiveTimeline().transitionCompactionRequestedToInflight(compactionInstant);
HoodieInstant instant = metaClient.getActiveTimeline().reload().filterPendingCompactionTimeline().getInstants()
.filter(in -> in.getTimestamp().equals(compactionInstantTime)).findAny().get();
@@ -489,27 +484,23 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
client.scheduleCompactionAtInstant(compactionInstantTime, Option.empty());
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
HoodieInstant instant = metaClient.getActiveTimeline().filterPendingCompactionTimeline().lastInstant().get();
assertEquals("Last compaction instant must be the one set",
instant.getTimestamp(), compactionInstantTime);
assertEquals("Last compaction instant must be the one set", instant.getTimestamp(), compactionInstantTime);
}
private void scheduleAndExecuteCompaction(String compactionInstantTime,
HoodieWriteClient client, HoodieTable table, HoodieWriteConfig cfg, int expectedNumRecs,
boolean hasDeltaCommitAfterPendingCompaction) throws IOException {
private void scheduleAndExecuteCompaction(String compactionInstantTime, HoodieWriteClient client, HoodieTable table,
HoodieWriteConfig cfg, int expectedNumRecs, boolean hasDeltaCommitAfterPendingCompaction) throws IOException {
scheduleCompaction(compactionInstantTime, client, cfg);
executeCompaction(compactionInstantTime, client, table, cfg, expectedNumRecs, hasDeltaCommitAfterPendingCompaction);
}
private void executeCompaction(String compactionInstantTime,
HoodieWriteClient client, HoodieTable table, HoodieWriteConfig cfg, int expectedNumRecs,
boolean hasDeltaCommitAfterPendingCompaction) throws IOException {
private void executeCompaction(String compactionInstantTime, HoodieWriteClient client, HoodieTable table,
HoodieWriteConfig cfg, int expectedNumRecs, boolean hasDeltaCommitAfterPendingCompaction) throws IOException {
client.compact(compactionInstantTime);
List<FileSlice> fileSliceList = getCurrentLatestFileSlices(table, cfg);
assertTrue("Ensure latest file-slices are not empty", fileSliceList.stream().findAny().isPresent());
assertFalse("Verify all file-slices have base-instant same as compaction instant",
fileSliceList.stream().filter(fs -> !fs.getBaseInstantTime().equals(compactionInstantTime))
.findAny().isPresent());
assertFalse("Verify all file-slices have base-instant same as compaction instant", fileSliceList.stream()
.filter(fs -> !fs.getBaseInstantTime().equals(compactionInstantTime)).findAny().isPresent());
assertFalse("Verify all file-slices have data-files",
fileSliceList.stream().filter(fs -> !fs.getDataFile().isPresent()).findAny().isPresent());
@@ -522,12 +513,11 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
}
// verify that there is a commit
table = getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath(), true), cfg);
table = getHoodieTable(new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath(), true), cfg);
HoodieTimeline timeline = table.getMetaClient().getCommitTimeline().filterCompletedInstants();
String latestCompactionCommitTime = timeline.lastInstant().get().getTimestamp();
assertEquals("Expect compaction instant time to be the latest commit time",
latestCompactionCommitTime, compactionInstantTime);
assertEquals("Expect compaction instant time to be the latest commit time", latestCompactionCommitTime,
compactionInstantTime);
Assert.assertEquals("Must contain expected records", expectedNumRecs,
HoodieClientTestUtils.readSince(basePath, sqlContext, timeline, "000").count());
@@ -546,8 +536,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
client.commit(instantTime, statuses);
}
Option<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().reload().getDeltaCommitTimeline()
.filterCompletedInstants().lastInstant();
Option<HoodieInstant> deltaCommit =
metaClient.getActiveTimeline().reload().getDeltaCommitTimeline().filterCompletedInstants().lastInstant();
if (skipCommit && !cfg.shouldAutoCommit()) {
assertTrue("Delta commit should not be latest instant",
deltaCommit.get().getTimestamp().compareTo(instantTime) < 0);
@@ -560,8 +550,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
private List<HoodieDataFile> getCurrentLatestDataFiles(HoodieTable table, HoodieWriteConfig cfg) throws IOException {
FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(table.getMetaClient().getFs(), cfg.getBasePath());
HoodieTableFileSystemView
view = new HoodieTableFileSystemView(table.getMetaClient(), table.getCompletedCommitsTimeline(), allFiles);
HoodieTableFileSystemView view =
new HoodieTableFileSystemView(table.getMetaClient(), table.getCompletedCommitsTimeline(), allFiles);
List<HoodieDataFile> dataFilesToRead = view.getLatestDataFiles().collect(Collectors.toList());
return dataFilesToRead;
}
@@ -569,9 +559,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
private List<FileSlice> getCurrentLatestFileSlices(HoodieTable table, HoodieWriteConfig cfg) throws IOException {
HoodieTableFileSystemView view = new HoodieTableFileSystemView(table.getMetaClient(),
table.getMetaClient().getActiveTimeline().reload().getCommitsAndCompactionTimeline());
List<FileSlice> fileSliceList =
Arrays.asList(HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS).stream().flatMap(partition ->
view.getLatestFileSlices(partition)).collect(Collectors.toList());
List<FileSlice> fileSliceList = Arrays.asList(HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS).stream()
.flatMap(partition -> view.getLatestFileSlices(partition)).collect(Collectors.toList());
return fileSliceList;
}

View File

@@ -93,16 +93,13 @@ public class TestCleaner extends TestHoodieClientBase {
* @param insertFn Insertion API for testing
* @throws Exception in case of error
*/
private String insertFirstBigBatchForClientCleanerTest(
HoodieWriteConfig cfg,
HoodieWriteClient client,
private String insertFirstBigBatchForClientCleanerTest(HoodieWriteConfig cfg, HoodieWriteClient client,
Function2<List<HoodieRecord>, String, Integer> recordGenFunction,
Function3<JavaRDD<WriteStatus>, HoodieWriteClient, JavaRDD<HoodieRecord>, String> insertFn) throws Exception {
/**
 * do a big insert
 * (this is basically the same as the insert part of upsert, just adding it here so we can
 * catch breakages in insert(), if the implementation diverges.)
 * do a big insert (this is basically the same as the insert part of upsert, just adding it here so we can catch
 * breakages in insert(), if the implementation diverges.)
*/
String newCommitTime = client.startCommit();
@@ -145,8 +142,8 @@ public class TestCleaner extends TestHoodieClientBase {
*/
@Test
public void testInsertPreppedAndCleanByVersions() throws Exception {
testInsertAndCleanByVersions(HoodieWriteClient::insertPreppedRecords,
HoodieWriteClient::upsertPreppedRecords, true);
testInsertAndCleanByVersions(HoodieWriteClient::insertPreppedRecords, HoodieWriteClient::upsertPreppedRecords,
true);
}
/**
@@ -178,15 +175,13 @@ public class TestCleaner extends TestHoodieClientBase {
*/
private void testInsertAndCleanByVersions(
Function3<JavaRDD<WriteStatus>, HoodieWriteClient, JavaRDD<HoodieRecord>, String> insertFn,
Function3<JavaRDD<WriteStatus>, HoodieWriteClient, JavaRDD<HoodieRecord>, String> upsertFn,
boolean isPreppedAPI
) throws Exception {
Function3<JavaRDD<WriteStatus>, HoodieWriteClient, JavaRDD<HoodieRecord>, String> upsertFn, boolean isPreppedAPI)
throws Exception {
int maxVersions = 2; // keep up to 2 versions for each file
HoodieWriteConfig cfg = getConfigBuilder().withCompactionConfig(
HoodieCompactionConfig.newBuilder().withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS)
.retainFileVersions(maxVersions).build())
.withParallelism(1, 1).withBulkInsertParallelism(1)
.withFinalizeWriteParallelism(1)
HoodieWriteConfig cfg = getConfigBuilder()
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
.withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(maxVersions).build())
.withParallelism(1, 1).withBulkInsertParallelism(1).withFinalizeWriteParallelism(1)
.withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build())
.build();
try (HoodieWriteClient client = getHoodieWriteClient(cfg);) {
@@ -204,8 +199,7 @@ public class TestCleaner extends TestHoodieClientBase {
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig(), jsc);
for (String partitionPath : dataGen.getPartitionPaths()) {
TableFileSystemView fsView = table.getFileSystemView();
Option<Boolean> added = Option.fromJavaOptional(fsView.getAllFileGroups(partitionPath).findFirst()
.map(fg -> {
Option<Boolean> added = Option.fromJavaOptional(fsView.getAllFileGroups(partitionPath).findFirst().map(fg -> {
fg.getLatestFileSlice().map(fs -> compactionFileIdToLatestFileSlice.put(fg.getFileGroupId(), fs));
return true;
}));
@@ -234,8 +228,7 @@ public class TestCleaner extends TestHoodieClientBase {
client.startCommitWithTime(newInstantTime);
List<HoodieRecord> records = recordUpsertGenWrappedFunction.apply(newInstantTime, 100);
List<WriteStatus> statuses =
upsertFn.apply(client, jsc.parallelize(records, 1), newInstantTime).collect();
List<WriteStatus> statuses = upsertFn.apply(client, jsc.parallelize(records, 1), newInstantTime).collect();
// Verify there are no errors
assertNoWriteErrors(statuses);
@@ -249,8 +242,8 @@ public class TestCleaner extends TestHoodieClientBase {
// compute all the versions of all files, from time 0
HashMap<String, TreeSet<String>> fileIdToVersions = new HashMap<>();
for (HoodieInstant entry : timeline.getInstants().collect(Collectors.toList())) {
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
.fromBytes(timeline.getInstantDetails(entry).get(), HoodieCommitMetadata.class);
HoodieCommitMetadata commitMetadata =
HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(entry).get(), HoodieCommitMetadata.class);
for (HoodieWriteStat wstat : commitMetadata.getWriteStats(partitionPath)) {
if (!fileIdToVersions.containsKey(wstat.getFileId())) {
@@ -267,8 +260,8 @@ public class TestCleaner extends TestHoodieClientBase {
// Ensure latest file-slice selected for compaction is retained
Option<HoodieDataFile> dataFileForCompactionPresent =
Option.fromJavaOptional(fileGroup.getAllDataFiles().filter(df -> {
return compactionFileIdToLatestFileSlice.get(fileGroup.getFileGroupId())
.getBaseInstantTime().equals(df.getCommitTime());
return compactionFileIdToLatestFileSlice.get(fileGroup.getFileGroupId()).getBaseInstantTime()
.equals(df.getCommitTime());
}).findAny());
Assert.assertTrue("Data File selected for compaction is retained",
dataFileForCompactionPresent.isPresent());
@@ -310,8 +303,7 @@ public class TestCleaner extends TestHoodieClientBase {
*/
@Test
public void testInsertPreppedAndCleanByCommits() throws Exception {
testInsertAndCleanByCommits(HoodieWriteClient::insertPreppedRecords,
HoodieWriteClient::upsertPreppedRecords, true);
testInsertAndCleanByCommits(HoodieWriteClient::insertPreppedRecords, HoodieWriteClient::upsertPreppedRecords, true);
}
/**
@@ -343,15 +335,13 @@ public class TestCleaner extends TestHoodieClientBase {
*/
private void testInsertAndCleanByCommits(
Function3<JavaRDD<WriteStatus>, HoodieWriteClient, JavaRDD<HoodieRecord>, String> insertFn,
Function3<JavaRDD<WriteStatus>, HoodieWriteClient, JavaRDD<HoodieRecord>, String> upsertFn,
boolean isPreppedAPI
) throws Exception {
Function3<JavaRDD<WriteStatus>, HoodieWriteClient, JavaRDD<HoodieRecord>, String> upsertFn, boolean isPreppedAPI)
throws Exception {
int maxCommits = 3; // keep up to 3 commits from the past
HoodieWriteConfig cfg = getConfigBuilder().withCompactionConfig(
HoodieCompactionConfig.newBuilder()
HoodieWriteConfig cfg = getConfigBuilder()
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
.withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainCommits(maxCommits).build())
.withParallelism(1, 1).withBulkInsertParallelism(1)
.withFinalizeWriteParallelism(1)
.withParallelism(1, 1).withBulkInsertParallelism(1).withFinalizeWriteParallelism(1)
.withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build())
.build();
HoodieWriteClient client = getHoodieWriteClient(cfg);
@@ -370,8 +360,7 @@ public class TestCleaner extends TestHoodieClientBase {
client.startCommitWithTime(newCommitTime);
List<HoodieRecord> records = recordUpsertGenWrappedFunction.apply(newCommitTime, 100);
List<WriteStatus> statuses =
upsertFn.apply(client, jsc.parallelize(records, 1), newCommitTime).collect();
List<WriteStatus> statuses = upsertFn.apply(client, jsc.parallelize(records, 1), newCommitTime).collect();
// Verify there are no errors
assertNoWriteErrors(statuses);
@@ -381,9 +370,9 @@ public class TestCleaner extends TestHoodieClientBase {
Option<HoodieInstant> earliestRetainedCommit = activeTimeline.nthFromLastInstant(maxCommits - 1);
Set<HoodieInstant> acceptableCommits = activeTimeline.getInstants().collect(Collectors.toSet());
if (earliestRetainedCommit.isPresent()) {
acceptableCommits.removeAll(
activeTimeline.findInstantsInRange("000", earliestRetainedCommit.get().getTimestamp()).getInstants()
.collect(Collectors.toSet()));
acceptableCommits
.removeAll(activeTimeline.findInstantsInRange("000", earliestRetainedCommit.get().getTimestamp())
.getInstants().collect(Collectors.toSet()));
acceptableCommits.add(earliestRetainedCommit.get());
}
@@ -412,18 +401,19 @@ public class TestCleaner extends TestHoodieClientBase {
*/
@Test
public void testKeepLatestFileVersions() throws IOException {
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true)
.withCompactionConfig(HoodieCompactionConfig.newBuilder().withCleanerPolicy(
HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(1).build())
HoodieWriteConfig config =
HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true)
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
.withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(1).build())
.build();
// make 1 commit, with 1 file per partition
HoodieTestUtils.createCommitFiles(basePath, "000");
String file1P0C0 = HoodieTestUtils
.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000");
String file1P1C0 = HoodieTestUtils
.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000");
String file1P0C0 =
HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000");
String file1P1C0 =
HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000");
metaClient = HoodieTableMetaClient.reload(metaClient);
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config, jsc);
@@ -434,24 +424,22 @@ public class TestCleaner extends TestHoodieClientBase {
assertEquals("Must not clean any files", 0,
getCleanStat(hoodieCleanStatsOne, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles()
.size());
assertTrue(HoodieTestUtils
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0C0));
assertTrue(HoodieTestUtils
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000", file1P1C0));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000",
file1P0C0));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000",
file1P1C0));
// make next commit, with 1 insert & 1 update per partition
HoodieTestUtils.createCommitFiles(basePath, "001");
metaClient = HoodieTableMetaClient.reload(metaClient);
table = HoodieTable.getHoodieTable(metaClient, config, jsc);
String file2P0C1 = HoodieTestUtils
.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001"); // insert
String file2P1C1 = HoodieTestUtils
.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001"); // insert
HoodieTestUtils
.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0C0); // update
HoodieTestUtils
.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001", file1P1C0); // update
String file2P0C1 =
HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001"); // insert
String file2P1C1 =
HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001"); // insert
HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0C0); // update
HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001", file1P1C0); // update
List<HoodieCleanStat> hoodieCleanStatsTwo = table.clean(jsc);
assertEquals("Must clean 1 file", 1,
@@ -460,47 +448,44 @@ public class TestCleaner extends TestHoodieClientBase {
assertEquals("Must clean 1 file", 1,
getCleanStat(hoodieCleanStatsTwo, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles()
.size());
assertTrue(HoodieTestUtils
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file2P0C1));
assertTrue(HoodieTestUtils
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001", file2P1C1));
assertFalse(HoodieTestUtils
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0C0));
assertFalse(HoodieTestUtils
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000", file1P1C0));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001",
file2P0C1));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001",
file2P1C1));
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000",
file1P0C0));
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH,
"000", file1P1C0));
// make next commit, with 2 updates to existing files, and 1 insert
HoodieTestUtils.createCommitFiles(basePath, "002");
metaClient = HoodieTableMetaClient.reload(metaClient);
table = HoodieTable.getHoodieTable(metaClient, config, jsc);
HoodieTestUtils
.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file1P0C0); // update
HoodieTestUtils
.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file2P0C1); // update
String file3P0C2 = HoodieTestUtils
.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002");
HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file1P0C0); // update
HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file2P0C1); // update
String file3P0C2 =
HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002");
List<HoodieCleanStat> hoodieCleanStatsThree = table.clean(jsc);
assertEquals("Must clean two files", 2,
getCleanStat(hoodieCleanStatsThree, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH)
.getSuccessDeleteFiles().size());
assertFalse(HoodieTestUtils
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0C0));
assertFalse(HoodieTestUtils
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file2P0C1));
assertTrue(HoodieTestUtils
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file3P0C2));
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001",
file1P0C0));
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001",
file2P0C1));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002",
file3P0C2));
// No cleaning on partially written file, with no commit.
HoodieTestUtils
.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003", file3P0C2); // update
HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003", file3P0C2); // update
List<HoodieCleanStat> hoodieCleanStatsFour = table.clean(jsc);
assertEquals("Must not clean any files", 0,
getCleanStat(hoodieCleanStatsFour, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH).getSuccessDeleteFiles()
.size());
assertTrue(HoodieTestUtils
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file3P0C2));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002",
file3P0C2));
}
/**
@@ -509,37 +494,33 @@ public class TestCleaner extends TestHoodieClientBase {
@Test
public void testKeepLatestFileVersionsMOR() throws IOException {
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true)
.withCompactionConfig(HoodieCompactionConfig.newBuilder().withCleanerPolicy(
HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(1).build())
HoodieWriteConfig config =
HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true)
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
.withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(1).build())
.build();
HoodieTableMetaClient metaClient = HoodieTestUtils.init(jsc.hadoopConfiguration(), basePath,
HoodieTableType.MERGE_ON_READ);
HoodieTableMetaClient metaClient =
HoodieTestUtils.init(jsc.hadoopConfiguration(), basePath, HoodieTableType.MERGE_ON_READ);
// Make 3 files, one base file and 2 log files associated with base file
String file1P0 = HoodieTestUtils
.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000");
String file2P0L0 = HoodieTestUtils
.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0,
Option.empty());
String file2P0L1 = HoodieTestUtils
.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0,
Option.of(2));
String file1P0 =
HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000");
String file2P0L0 = HoodieTestUtils.createNewLogFile(fs, basePath,
HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0, Option.empty());
String file2P0L1 = HoodieTestUtils.createNewLogFile(fs, basePath,
HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0, Option.of(2));
// make 1 compaction commit
HoodieTestUtils.createCompactionCommitFiles(fs, basePath, "000");
// Make 4 files, one base file and 3 log files associated with base file
HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0);
file2P0L0 = HoodieTestUtils
.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0,
Option.empty());
file2P0L0 = HoodieTestUtils
.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0,
Option.of(2));
file2P0L0 = HoodieTestUtils
.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0,
Option.of(3));
file2P0L0 = HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH,
"001", file1P0, Option.empty());
file2P0L0 = HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH,
"001", file1P0, Option.of(2));
file2P0L0 = HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH,
"001", file1P0, Option.of(3));
// make 1 compaction commit
HoodieTestUtils.createCompactionCommitFiles(fs, basePath, "001");
@@ -548,16 +529,12 @@ public class TestCleaner extends TestHoodieClientBase {
assertEquals("Must clean three files, one parquet and 2 log files", 3,
getCleanStat(hoodieCleanStats, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH).getSuccessDeleteFiles()
.size());
assertFalse(HoodieTestUtils
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0));
assertFalse(
HoodieTestUtils
.doesLogFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file2P0L0,
Option.empty()));
assertFalse(
HoodieTestUtils
.doesLogFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file2P0L0,
Option.of(2)));
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000",
file1P0));
assertFalse(HoodieTestUtils.doesLogFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000",
file2P0L0, Option.empty()));
assertFalse(HoodieTestUtils.doesLogFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000",
file2P0L0, Option.of(2)));
}
/**
@@ -566,16 +543,17 @@ public class TestCleaner extends TestHoodieClientBase {
@Test
public void testKeepLatestCommits() throws IOException {
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true)
.withCompactionConfig(HoodieCompactionConfig.newBuilder().withCleanerPolicy(
HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build()).build();
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
.withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build())
.build();
// make 1 commit, with 1 file per partition
HoodieTestUtils.createCommitFiles(basePath, "000");
String file1P0C0 = HoodieTestUtils
.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000");
String file1P1C0 = HoodieTestUtils
.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000");
String file1P0C0 =
HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000");
String file1P1C0 =
HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000");
metaClient = HoodieTableMetaClient.reload(metaClient);
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config, jsc);
@@ -587,24 +565,22 @@ public class TestCleaner extends TestHoodieClientBase {
assertEquals("Must not clean any files", 0,
getCleanStat(hoodieCleanStatsOne, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles()
.size());
assertTrue(HoodieTestUtils
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0C0));
assertTrue(HoodieTestUtils
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000", file1P1C0));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000",
file1P0C0));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000",
file1P1C0));
// make next commit, with 1 insert & 1 update per partition
HoodieTestUtils.createCommitFiles(basePath, "001");
metaClient = HoodieTableMetaClient.reload(metaClient);
table = HoodieTable.getHoodieTable(metaClient, config, jsc);
String file2P0C1 = HoodieTestUtils
.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001"); // insert
String file2P1C1 = HoodieTestUtils
.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001"); // insert
HoodieTestUtils
.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0C0); // update
HoodieTestUtils
.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001", file1P1C0); // update
String file2P0C1 =
HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001"); // insert
String file2P1C1 =
HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001"); // insert
HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0C0); // update
HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001", file1P1C0); // update
List<HoodieCleanStat> hoodieCleanStatsTwo = table.clean(jsc);
assertEquals("Must not clean any files", 0,
@@ -613,78 +589,73 @@ public class TestCleaner extends TestHoodieClientBase {
assertEquals("Must not clean any files", 0,
getCleanStat(hoodieCleanStatsTwo, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles()
.size());
assertTrue(HoodieTestUtils
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file2P0C1));
assertTrue(HoodieTestUtils
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001", file2P1C1));
assertTrue(HoodieTestUtils
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0C0));
assertTrue(HoodieTestUtils
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000", file1P1C0));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001",
file2P0C1));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001",
file2P1C1));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000",
file1P0C0));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000",
file1P1C0));
// make next commit, with 2 updates to existing files, and 1 insert
HoodieTestUtils.createCommitFiles(basePath, "002");
metaClient = HoodieTableMetaClient.reload(metaClient);
table = HoodieTable.getHoodieTable(metaClient, config, jsc);
HoodieTestUtils
.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file1P0C0); // update
HoodieTestUtils
.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file2P0C1); // update
String file3P0C2 = HoodieTestUtils
.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002");
HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file1P0C0); // update
HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file2P0C1); // update
String file3P0C2 =
HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002");
List<HoodieCleanStat> hoodieCleanStatsThree = table.clean(jsc);
assertEquals("Must not clean any file. We have to keep 1 version before the latest commit time to keep", 0,
getCleanStat(hoodieCleanStatsThree, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH)
.getSuccessDeleteFiles().size());
assertTrue(HoodieTestUtils
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0C0));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000",
file1P0C0));
// make next commit, with 2 updates to existing files, and 1 insert
HoodieTestUtils.createCommitFiles(basePath, "003");
metaClient = HoodieTableMetaClient.reload(metaClient);
table = HoodieTable.getHoodieTable(metaClient, config, jsc);
HoodieTestUtils
.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003", file1P0C0); // update
HoodieTestUtils
.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003", file2P0C1); // update
String file4P0C3 = HoodieTestUtils
.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003");
HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003", file1P0C0); // update
HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003", file2P0C1); // update
String file4P0C3 =
HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003");
List<HoodieCleanStat> hoodieCleanStatsFour = table.clean(jsc);
assertEquals("Must not clean one old file", 1,
getCleanStat(hoodieCleanStatsFour, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH).getSuccessDeleteFiles()
.size());
assertFalse(HoodieTestUtils
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0C0));
assertTrue(HoodieTestUtils
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0C0));
assertTrue(HoodieTestUtils
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file1P0C0));
assertTrue(HoodieTestUtils
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file2P0C1));
assertTrue(HoodieTestUtils
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file2P0C1));
assertTrue(HoodieTestUtils
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file3P0C2));
assertTrue(HoodieTestUtils
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003", file4P0C3));
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000",
file1P0C0));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001",
file1P0C0));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002",
file1P0C0));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001",
file2P0C1));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002",
file2P0C1));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002",
file3P0C2));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003",
file4P0C3));
// No cleaning on a partially written file with no commit.
HoodieTestUtils
.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "004", file3P0C2); // update
HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "004", file3P0C2); // update
List<HoodieCleanStat> hoodieCleanStatsFive = table.clean(jsc);
assertEquals("Must not clean any files", 0,
getCleanStat(hoodieCleanStatsFive, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH).getSuccessDeleteFiles()
.size());
assertTrue(HoodieTestUtils
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0C0));
assertTrue(HoodieTestUtils
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file2P0C1));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001",
file1P0C0));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001",
file2P0C1));
}
/**
@@ -711,8 +682,9 @@ public class TestCleaner extends TestHoodieClientBase {
@Test
public void testCleaningWithZeroPartitonPaths() throws IOException {
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true)
.withCompactionConfig(HoodieCompactionConfig.newBuilder().withCleanerPolicy(
HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build()).build();
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
.withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build())
.build();
// Make a commit, although there are no partitionPaths.
// Example use-case of this is when a client wants to create a table
@@ -732,8 +704,9 @@ public class TestCleaner extends TestHoodieClientBase {
@Test
public void testCleaningSkewedPartitons() throws IOException {
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true)
.withCompactionConfig(HoodieCompactionConfig.newBuilder().withCleanerPolicy(
HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build()).build();
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
.withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build())
.build();
Map<Long, Long> stageOneShuffleReadTaskRecordsCountMap = new HashMap<>();
// Since clean involves repartition in order to uniformly distribute data,
@@ -783,22 +756,20 @@ public class TestCleaner extends TestHoodieClientBase {
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config, jsc);
List<HoodieCleanStat> hoodieCleanStats = table.clean(jsc);
assertEquals(100,
getCleanStat(hoodieCleanStats, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH).getSuccessDeleteFiles()
.size());
assertEquals(10,
getCleanStat(hoodieCleanStats, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles()
.size());
assertEquals(10,
getCleanStat(hoodieCleanStats, HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH).getSuccessDeleteFiles()
.size());
assertEquals(100, getCleanStat(hoodieCleanStats, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH)
.getSuccessDeleteFiles().size());
assertEquals(10, getCleanStat(hoodieCleanStats, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH)
.getSuccessDeleteFiles().size());
assertEquals(10, getCleanStat(hoodieCleanStats, HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH)
.getSuccessDeleteFiles().size());
// 3 tasks are expected since the number of partitions is 3
assertEquals(3, stageOneShuffleReadTaskRecordsCountMap.keySet().size());
// Sum of all records processed = total number of files to clean (100 + 10 + 10 = 120)
assertEquals(120,
stageOneShuffleReadTaskRecordsCountMap.values().stream().reduce((a, b) -> a + b).get().intValue());
assertTrue("The skew in handling files to clean is not removed. "
assertTrue(
"The skew in handling files to clean is not removed. "
+ "Each task should handle more records than the partitionPath with least files "
+ "and less records than the partitionPath with most files.",
stageOneShuffleReadTaskRecordsCountMap.values().stream().filter(a -> a > 10 && a < 100).count() == 3);
@@ -811,8 +782,9 @@ public class TestCleaner extends TestHoodieClientBase {
@Test
public void testKeepLatestCommitsWithPendingCompactions() throws IOException {
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true)
.withCompactionConfig(HoodieCompactionConfig.newBuilder().withCleanerPolicy(
HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build()).build();
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
.withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build())
.build();
// Deletions:
// . FileId    Parquet   Logs   Total   Retained Commits
//   FileId7   5         10     15      009, 011
@@ -830,9 +802,11 @@ public class TestCleaner extends TestHoodieClientBase {
*/
@Test
public void testKeepLatestVersionsWithPendingCompactions() throws IOException {
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true)
.withCompactionConfig(HoodieCompactionConfig.newBuilder().withCleanerPolicy(
HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(2).build()).build();
HoodieWriteConfig config =
HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true)
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
.withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(2).build())
.build();
// Deletions:
// . FileId    Parquet   Logs   Total   Retained Commits
//   FileId7   5         10     15      009, 011
@@ -853,8 +827,8 @@ public class TestCleaner extends TestHoodieClientBase {
*/
public void testPendingCompactions(HoodieWriteConfig config, int expNumFilesDeleted,
int expNumFilesUnderCompactionDeleted) throws IOException {
HoodieTableMetaClient metaClient = HoodieTestUtils.init(jsc.hadoopConfiguration(), basePath,
HoodieTableType.MERGE_ON_READ);
HoodieTableMetaClient metaClient =
HoodieTestUtils.init(jsc.hadoopConfiguration(), basePath, HoodieTableType.MERGE_ON_READ);
String[] instants = new String[] {"000", "001", "003", "005", "007", "009", "011", "013"};
String[] compactionInstants = new String[] {"002", "004", "006", "008", "010"};
Map<String, String> expFileIdToPendingCompaction = new HashMap<>();
@@ -870,13 +844,11 @@ public class TestCleaner extends TestHoodieClientBase {
// compactions
// FileIds 2-5 will be under compaction
int maxNumFileIds = 7;
String[] fileIds = new String[]
{"fileId1", "fileId2", "fileId3", "fileId4", "fileId5", "fileId6", "fileId7"};
String[] fileIds = new String[] {"fileId1", "fileId2", "fileId3", "fileId4", "fileId5", "fileId6", "fileId7"};
int maxNumFileIdsForCompaction = 4;
for (int i = 0; i < maxNumFileIds; i++) {
final String fileId = HoodieTestUtils
.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, instants[0],
fileIds[i]);
final String fileId = HoodieTestUtils.createDataFile(basePath,
HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, instants[0], fileIds[i]);
HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, instants[0],
fileId, Option.empty());
HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, instants[0],
@@ -887,8 +859,8 @@ public class TestCleaner extends TestHoodieClientBase {
expFileIdToPendingCompaction.put(fileId, compactionInstants[j]);
metaClient = HoodieTableMetaClient.reload(metaClient);
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config, jsc);
FileSlice slice = table.getRTFileSystemView().getLatestFileSlices(
HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH)
FileSlice slice =
table.getRTFileSystemView().getLatestFileSlices(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH)
.filter(fs -> fs.getFileId().equals(fileId)).findFirst().get();
List<FileSlice> slices = new ArrayList<>();
if (compactionInstantsToFileSlices.containsKey(compactionInstants[j])) {
@@ -898,20 +870,16 @@ public class TestCleaner extends TestHoodieClientBase {
compactionInstantsToFileSlices.put(compactionInstants[j], slices);
// Add log-files to simulate delta-commits after pending compaction
HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH,
compactionInstants[j],
fileId, Option.empty());
compactionInstants[j], fileId, Option.empty());
HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH,
compactionInstants[j],
fileId, Option.of(2));
compactionInstants[j], fileId, Option.of(2));
} else {
HoodieTestUtils
.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, instants[j], fileId);
HoodieTestUtils
.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, instants[j], fileId,
Option.empty());
HoodieTestUtils
.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, instants[j], fileId,
Option.of(2));
HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, instants[j],
fileId);
HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH,
instants[j], fileId, Option.empty());
HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH,
instants[j], fileId, Option.of(2));
fileIdToLatestInstantBeforeCompaction.put(fileId, instants[j]);
}
}
@@ -921,9 +889,8 @@ public class TestCleaner extends TestHoodieClientBase {
for (String instant : compactionInstants) {
List<FileSlice> fileSliceList = compactionInstantsToFileSlices.get(instant);
if (null != fileSliceList) {
HoodieTestUtils.createCompactionRequest(metaClient, instant,
fileSliceList.stream().map(fs -> Pair.of(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, fs))
.collect(Collectors.toList()));
HoodieTestUtils.createCompactionRequest(metaClient, instant, fileSliceList.stream()
.map(fs -> Pair.of(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, fs)).collect(Collectors.toList()));
}
}
@@ -939,22 +906,19 @@ public class TestCleaner extends TestHoodieClientBase {
expFileIdToPendingCompaction.entrySet().stream().forEach(entry -> {
String fileId = entry.getKey();
String baseInstantForCompaction = fileIdToLatestInstantBeforeCompaction.get(fileId);
Option<FileSlice> fileSliceForCompaction =
Option.fromJavaOptional(
hoodieTable.getRTFileSystemView().getLatestFileSlicesBeforeOrOn(
HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH,
baseInstantForCompaction, true).filter(fs -> fs.getFileId().equals(fileId)).findFirst());
Option<FileSlice> fileSliceForCompaction = Option.fromJavaOptional(hoodieTable.getRTFileSystemView()
.getLatestFileSlicesBeforeOrOn(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, baseInstantForCompaction,
true)
.filter(fs -> fs.getFileId().equals(fileId)).findFirst());
Assert.assertTrue("Base Instant for Compaction must be preserved", fileSliceForCompaction.isPresent());
Assert.assertTrue("FileSlice has data-file", fileSliceForCompaction.get().getDataFile().isPresent());
Assert.assertEquals("FileSlice has log-files", 2,
fileSliceForCompaction.get().getLogFiles().count());
Assert.assertEquals("FileSlice has log-files", 2, fileSliceForCompaction.get().getLogFiles().count());
});
// Test for progress (Did we clean some files?)
long numFilesUnderCompactionDeleted =
hoodieCleanStats.stream().flatMap(cleanStat -> {
return convertPathToFileIdWithCommitTime(newMetaClient, cleanStat.getDeletePathPatterns()).map(
fileIdWithCommitTime -> {
long numFilesUnderCompactionDeleted = hoodieCleanStats.stream().flatMap(cleanStat -> {
return convertPathToFileIdWithCommitTime(newMetaClient, cleanStat.getDeletePathPatterns())
.map(fileIdWithCommitTime -> {
if (expFileIdToPendingCompaction.containsKey(fileIdWithCommitTime.getKey())) {
Assert.assertTrue("Deleted instant time must be less than pending compaction",
HoodieTimeline.compareTimestamps(
@@ -965,12 +929,12 @@ public class TestCleaner extends TestHoodieClientBase {
return false;
});
}).filter(x -> x).count();
long numDeleted = hoodieCleanStats.stream()
.flatMap(cleanStat -> cleanStat.getDeletePathPatterns().stream()).count();
long numDeleted =
hoodieCleanStats.stream().flatMap(cleanStat -> cleanStat.getDeletePathPatterns().stream()).count();
// Tighter check for regression
Assert.assertEquals("Correct number of files deleted", expNumFilesDeleted, numDeleted);
Assert.assertEquals("Correct number of files under compaction deleted",
expNumFilesUnderCompactionDeleted, numFilesUnderCompactionDeleted);
Assert.assertEquals("Correct number of files under compaction deleted", expNumFilesUnderCompactionDeleted,
numFilesUnderCompactionDeleted);
}
/**
@@ -991,6 +955,7 @@ public class TestCleaner extends TestHoodieClientBase {
/***
* Helper method to return temporary files count
*
* @return Number of temporary files found
* @throws IOException in case of error
*/
@@ -1004,19 +969,17 @@ public class TestCleaner extends TestHoodieClientBase {
return count;
}
private Stream<Pair<String, String>> convertPathToFileIdWithCommitTime(
final HoodieTableMetaClient metaClient, List<String> paths) {
Predicate<String> roFilePredicate = path ->
path.contains(metaClient.getTableConfig().getROFileFormat().getFileExtension());
Predicate<String> rtFilePredicate = path ->
path.contains(metaClient.getTableConfig().getRTFileFormat().getFileExtension());
Stream<Pair<String, String>> stream1 = paths.stream().filter(roFilePredicate)
.map(fullPath -> {
private Stream<Pair<String, String>> convertPathToFileIdWithCommitTime(final HoodieTableMetaClient metaClient,
List<String> paths) {
Predicate<String> roFilePredicate =
path -> path.contains(metaClient.getTableConfig().getROFileFormat().getFileExtension());
Predicate<String> rtFilePredicate =
path -> path.contains(metaClient.getTableConfig().getRTFileFormat().getFileExtension());
Stream<Pair<String, String>> stream1 = paths.stream().filter(roFilePredicate).map(fullPath -> {
String fileName = Paths.get(fullPath).getFileName().toString();
return Pair.of(FSUtils.getFileId(fileName), FSUtils.getCommitTime(fileName));
});
Stream<Pair<String, String>> stream2 = paths.stream().filter(rtFilePredicate)
.map(path -> {
Stream<Pair<String, String>> stream2 = paths.stream().filter(rtFilePredicate).map(path -> {
return Pair.of(FSUtils.getFileIdFromLogPath(new Path(path)),
FSUtils.getBaseCommitTimeFromLogPath(new Path(path)));
});