[HUDI-296] Explore use of spotless to auto fix formatting errors (#945)
- Add spotless format fixing to project
- One time reformatting for conformity
- Build fails for formatting changes and mvn spotless:apply autofixes them
This commit is contained in:
@@ -30,6 +30,7 @@
|
|||||||
<properties>
|
<properties>
|
||||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||||
<checkstyle.skip>true</checkstyle.skip>
|
<checkstyle.skip>true</checkstyle.skip>
|
||||||
|
<main.basedir>${project.parent.parent.basedir}</main.basedir>
|
||||||
</properties>
|
</properties>
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -30,6 +30,7 @@
|
|||||||
<properties>
|
<properties>
|
||||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||||
<checkstyle.skip>true</checkstyle.skip>
|
<checkstyle.skip>true</checkstyle.skip>
|
||||||
|
<main.basedir>${project.parent.parent.basedir}</main.basedir>
|
||||||
</properties>
|
</properties>
|
||||||
|
|
||||||
<dependencyManagement>
|
<dependencyManagement>
|
||||||
|
|||||||
@@ -30,6 +30,7 @@
|
|||||||
<properties>
|
<properties>
|
||||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||||
<checkstyle.skip>true</checkstyle.skip>
|
<checkstyle.skip>true</checkstyle.skip>
|
||||||
|
<main.basedir>${project.parent.parent.basedir}</main.basedir>
|
||||||
</properties>
|
</properties>
|
||||||
|
|
||||||
<dependencyManagement>
|
<dependencyManagement>
|
||||||
|
|||||||
@@ -30,6 +30,7 @@
|
|||||||
<properties>
|
<properties>
|
||||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||||
<checkstyle.skip>true</checkstyle.skip>
|
<checkstyle.skip>true</checkstyle.skip>
|
||||||
|
<main.basedir>${project.parent.parent.basedir}</main.basedir>
|
||||||
</properties>
|
</properties>
|
||||||
|
|
||||||
<dependencyManagement>
|
<dependencyManagement>
|
||||||
|
|||||||
@@ -30,6 +30,7 @@
|
|||||||
<properties>
|
<properties>
|
||||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||||
<checkstyle.skip>true</checkstyle.skip>
|
<checkstyle.skip>true</checkstyle.skip>
|
||||||
|
<main.basedir>${project.parent.parent.basedir}</main.basedir>
|
||||||
</properties>
|
</properties>
|
||||||
|
|
||||||
<dependencyManagement>
|
<dependencyManagement>
|
||||||
|
|||||||
@@ -56,6 +56,7 @@
|
|||||||
<docker.presto.version>0.217</docker.presto.version>
|
<docker.presto.version>0.217</docker.presto.version>
|
||||||
<dockerfile.maven.version>1.4.3</dockerfile.maven.version>
|
<dockerfile.maven.version>1.4.3</dockerfile.maven.version>
|
||||||
<checkstyle.skip>true</checkstyle.skip>
|
<checkstyle.skip>true</checkstyle.skip>
|
||||||
|
<main.basedir>${project.parent.basedir}</main.basedir>
|
||||||
</properties>
|
</properties>
|
||||||
|
|
||||||
<build>
|
<build>
|
||||||
|
|||||||
@@ -32,6 +32,7 @@
|
|||||||
<properties>
|
<properties>
|
||||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||||
<checkstyle.skip>true</checkstyle.skip>
|
<checkstyle.skip>true</checkstyle.skip>
|
||||||
|
<main.basedir>${project.parent.parent.basedir}</main.basedir>
|
||||||
</properties>
|
</properties>
|
||||||
|
|
||||||
<dependencyManagement>
|
<dependencyManagement>
|
||||||
|
|||||||
@@ -30,6 +30,7 @@
|
|||||||
<properties>
|
<properties>
|
||||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||||
<checkstyle.skip>true</checkstyle.skip>
|
<checkstyle.skip>true</checkstyle.skip>
|
||||||
|
<main.basedir>${project.parent.parent.basedir}</main.basedir>
|
||||||
</properties>
|
</properties>
|
||||||
|
|
||||||
<dependencyManagement>
|
<dependencyManagement>
|
||||||
|
|||||||
@@ -30,6 +30,7 @@
|
|||||||
<properties>
|
<properties>
|
||||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||||
<checkstyle.skip>true</checkstyle.skip>
|
<checkstyle.skip>true</checkstyle.skip>
|
||||||
|
<main.basedir>${project.parent.parent.basedir}</main.basedir>
|
||||||
</properties>
|
</properties>
|
||||||
|
|
||||||
<dependencyManagement>
|
<dependencyManagement>
|
||||||
|
|||||||
@@ -30,6 +30,7 @@
|
|||||||
<properties>
|
<properties>
|
||||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||||
<checkstyle.skip>true</checkstyle.skip>
|
<checkstyle.skip>true</checkstyle.skip>
|
||||||
|
<main.basedir>${project.parent.parent.basedir}</main.basedir>
|
||||||
</properties>
|
</properties>
|
||||||
|
|
||||||
<dependencyManagement>
|
<dependencyManagement>
|
||||||
|
|||||||
@@ -30,6 +30,7 @@
|
|||||||
<properties>
|
<properties>
|
||||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||||
<checkstyle.skip>true</checkstyle.skip>
|
<checkstyle.skip>true</checkstyle.skip>
|
||||||
|
<main.basedir>${project.parent.parent.basedir}</main.basedir>
|
||||||
</properties>
|
</properties>
|
||||||
|
|
||||||
<dependencyManagement>
|
<dependencyManagement>
|
||||||
|
|||||||
@@ -29,6 +29,7 @@
|
|||||||
<properties>
|
<properties>
|
||||||
<spring.shell.version>1.2.0.RELEASE</spring.shell.version>
|
<spring.shell.version>1.2.0.RELEASE</spring.shell.version>
|
||||||
<jar.mainclass>org.springframework.shell.Bootstrap</jar.mainclass>
|
<jar.mainclass>org.springframework.shell.Bootstrap</jar.mainclass>
|
||||||
|
<main.basedir>${project.parent.basedir}</main.basedir>
|
||||||
</properties>
|
</properties>
|
||||||
|
|
||||||
<repositories>
|
<repositories>
|
||||||
|
|||||||
@@ -52,19 +52,16 @@ public class HoodiePrintHelper {
|
|||||||
* @param rows List of rows
|
* @param rows List of rows
|
||||||
* @return Serialized form for printing
|
* @return Serialized form for printing
|
||||||
*/
|
*/
|
||||||
public static String print(TableHeader rowHeader,
|
public static String print(TableHeader rowHeader, Map<String, Function<Object, String>> fieldNameToConverterMap,
|
||||||
Map<String, Function<Object, String>> fieldNameToConverterMap,
|
String sortByField, boolean isDescending, Integer limit, boolean headerOnly, List<Comparable[]> rows) {
|
||||||
String sortByField, boolean isDescending, Integer limit, boolean headerOnly,
|
|
||||||
List<Comparable[]> rows) {
|
|
||||||
|
|
||||||
if (headerOnly) {
|
if (headerOnly) {
|
||||||
return HoodiePrintHelper.print(rowHeader);
|
return HoodiePrintHelper.print(rowHeader);
|
||||||
}
|
}
|
||||||
|
|
||||||
Table table = new Table(rowHeader, fieldNameToConverterMap,
|
Table table =
|
||||||
Option.ofNullable(sortByField.isEmpty() ? null : sortByField),
|
new Table(rowHeader, fieldNameToConverterMap, Option.ofNullable(sortByField.isEmpty() ? null : sortByField),
|
||||||
Option.ofNullable(isDescending),
|
Option.ofNullable(isDescending), Option.ofNullable(limit <= 0 ? null : limit)).addAllRows(rows).flip();
|
||||||
Option.ofNullable(limit <= 0 ? null : limit)).addAllRows(rows).flip();
|
|
||||||
|
|
||||||
return HoodiePrintHelper.print(table);
|
return HoodiePrintHelper.print(table);
|
||||||
}
|
}
|
||||||
@@ -79,9 +76,8 @@ public class HoodiePrintHelper {
|
|||||||
String[] header = new String[buffer.getFieldNames().size()];
|
String[] header = new String[buffer.getFieldNames().size()];
|
||||||
buffer.getFieldNames().toArray(header);
|
buffer.getFieldNames().toArray(header);
|
||||||
|
|
||||||
String[][] rows = buffer.getRenderRows().stream()
|
String[][] rows =
|
||||||
.map(l -> l.stream().toArray(String[]::new))
|
buffer.getRenderRows().stream().map(l -> l.stream().toArray(String[]::new)).toArray(String[][]::new);
|
||||||
.toArray(String[][]::new);
|
|
||||||
return printTextTable(header, rows);
|
return printTextTable(header, rows);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -94,7 +90,7 @@ public class HoodiePrintHelper {
|
|||||||
private static String print(TableHeader header) {
|
private static String print(TableHeader header) {
|
||||||
String[] head = new String[header.getFieldNames().size()];
|
String[] head = new String[header.getFieldNames().size()];
|
||||||
header.getFieldNames().toArray(head);
|
header.getFieldNames().toArray(head);
|
||||||
return printTextTable(head, new String[][]{});
|
return printTextTable(head, new String[][] {});
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -31,8 +31,7 @@ import java.util.stream.IntStream;
|
|||||||
import org.apache.hudi.common.util.Option;
|
import org.apache.hudi.common.util.Option;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Table to be rendered. This class takes care of ordering
|
* Table to be rendered. This class takes care of ordering rows and limiting before renderer renders it.
|
||||||
* rows and limiting before renderer renders it.
|
|
||||||
*/
|
*/
|
||||||
public class Table implements Iterable<List<String>> {
|
public class Table implements Iterable<List<String>> {
|
||||||
|
|
||||||
@@ -53,11 +52,8 @@ public class Table implements Iterable<List<String>> {
|
|||||||
// Rows ready for Rendering
|
// Rows ready for Rendering
|
||||||
private List<List<String>> renderRows;
|
private List<List<String>> renderRows;
|
||||||
|
|
||||||
public Table(TableHeader rowHeader,
|
public Table(TableHeader rowHeader, Map<String, Function<Object, String>> fieldNameToConverterMap,
|
||||||
Map<String, Function<Object, String>> fieldNameToConverterMap,
|
Option<String> orderingFieldNameOptional, Option<Boolean> isDescendingOptional, Option<Integer> limitOptional) {
|
||||||
Option<String> orderingFieldNameOptional,
|
|
||||||
Option<Boolean> isDescendingOptional,
|
|
||||||
Option<Integer> limitOptional) {
|
|
||||||
this.rowHeader = rowHeader;
|
this.rowHeader = rowHeader;
|
||||||
this.fieldNameToConverterMap = fieldNameToConverterMap;
|
this.fieldNameToConverterMap = fieldNameToConverterMap;
|
||||||
this.orderingFieldNameOptional = orderingFieldNameOptional;
|
this.orderingFieldNameOptional = orderingFieldNameOptional;
|
||||||
@@ -68,6 +64,7 @@ public class Table implements Iterable<List<String>> {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Main API to add row to the table
|
* Main API to add row to the table
|
||||||
|
*
|
||||||
* @param row Row
|
* @param row Row
|
||||||
*/
|
*/
|
||||||
public Table add(List<Comparable> row) {
|
public Table add(List<Comparable> row) {
|
||||||
@@ -86,6 +83,7 @@ public class Table implements Iterable<List<String>> {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Add all rows
|
* Add all rows
|
||||||
|
*
|
||||||
* @param rows Rows to be aded
|
* @param rows Rows to be aded
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
@@ -96,6 +94,7 @@ public class Table implements Iterable<List<String>> {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Add all rows
|
* Add all rows
|
||||||
|
*
|
||||||
* @param rows Rows to be added
|
* @param rows Rows to be added
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
@@ -115,6 +114,7 @@ public class Table implements Iterable<List<String>> {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Sorting of rows by a specified field
|
* Sorting of rows by a specified field
|
||||||
|
*
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
private List<List<Comparable>> orderRows() {
|
private List<List<Comparable>> orderRows() {
|
||||||
|
|||||||
@@ -59,8 +59,8 @@ public class ArchivedCommitsCommand implements CommandMarker {
|
|||||||
@CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
|
@CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
|
||||||
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
|
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
|
||||||
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
|
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
|
||||||
@CliOption(key = {
|
@CliOption(key = {"headeronly"}, help = "Print Header Only",
|
||||||
"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly)
|
unspecifiedDefaultValue = "false") final boolean headerOnly)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
System.out.println("===============> Showing only " + limit + " archived commits <===============");
|
System.out.println("===============> Showing only " + limit + " archived commits <===============");
|
||||||
String basePath = HoodieCLI.tableMetadata.getBasePath();
|
String basePath = HoodieCLI.tableMetadata.getBasePath();
|
||||||
@@ -71,12 +71,12 @@ public class ArchivedCommitsCommand implements CommandMarker {
|
|||||||
FileStatus[] fsStatuses = FSUtils.getFs(basePath, HoodieCLI.conf).globStatus(archivePath);
|
FileStatus[] fsStatuses = FSUtils.getFs(basePath, HoodieCLI.conf).globStatus(archivePath);
|
||||||
List<Comparable[]> allStats = new ArrayList<>();
|
List<Comparable[]> allStats = new ArrayList<>();
|
||||||
for (FileStatus fs : fsStatuses) {
|
for (FileStatus fs : fsStatuses) {
|
||||||
//read the archived file
|
// read the archived file
|
||||||
Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(basePath, HoodieCLI.conf),
|
Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(basePath, HoodieCLI.conf),
|
||||||
new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema());
|
new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema());
|
||||||
|
|
||||||
List<IndexedRecord> readRecords = new ArrayList<>();
|
List<IndexedRecord> readRecords = new ArrayList<>();
|
||||||
//read the avro blocks
|
// read the avro blocks
|
||||||
while (reader.hasNext()) {
|
while (reader.hasNext()) {
|
||||||
HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next();
|
HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next();
|
||||||
List<IndexedRecord> records = blk.getRecords();
|
List<IndexedRecord> records = blk.getRecords();
|
||||||
@@ -86,9 +86,8 @@ public class ArchivedCommitsCommand implements CommandMarker {
|
|||||||
.filter(r -> r.get("actionType").toString().equals(HoodieTimeline.COMMIT_ACTION)
|
.filter(r -> r.get("actionType").toString().equals(HoodieTimeline.COMMIT_ACTION)
|
||||||
|| r.get("actionType").toString().equals(HoodieTimeline.DELTA_COMMIT_ACTION))
|
|| r.get("actionType").toString().equals(HoodieTimeline.DELTA_COMMIT_ACTION))
|
||||||
.flatMap(r -> {
|
.flatMap(r -> {
|
||||||
HoodieCommitMetadata metadata =
|
HoodieCommitMetadata metadata = (HoodieCommitMetadata) SpecificData.get()
|
||||||
(HoodieCommitMetadata) SpecificData.get().deepCopy(HoodieCommitMetadata.SCHEMA$,
|
.deepCopy(HoodieCommitMetadata.SCHEMA$, r.get("hoodieCommitMetadata"));
|
||||||
r.get("hoodieCommitMetadata"));
|
|
||||||
final String instantTime = r.get("commitTime").toString();
|
final String instantTime = r.get("commitTime").toString();
|
||||||
final String action = r.get("actionType").toString();
|
final String action = r.get("actionType").toString();
|
||||||
return metadata.getPartitionToWriteStats().values().stream().flatMap(hoodieWriteStats -> {
|
return metadata.getPartitionToWriteStats().values().stream().flatMap(hoodieWriteStats -> {
|
||||||
@@ -118,22 +117,13 @@ public class ArchivedCommitsCommand implements CommandMarker {
|
|||||||
allStats.addAll(readCommits);
|
allStats.addAll(readCommits);
|
||||||
reader.close();
|
reader.close();
|
||||||
}
|
}
|
||||||
TableHeader header = new TableHeader().addTableHeaderField("action")
|
TableHeader header = new TableHeader().addTableHeaderField("action").addTableHeaderField("instant")
|
||||||
.addTableHeaderField("instant")
|
.addTableHeaderField("partition").addTableHeaderField("file_id").addTableHeaderField("prev_instant")
|
||||||
.addTableHeaderField("partition")
|
.addTableHeaderField("num_writes").addTableHeaderField("num_inserts").addTableHeaderField("num_deletes")
|
||||||
.addTableHeaderField("file_id")
|
.addTableHeaderField("num_update_writes").addTableHeaderField("total_log_files")
|
||||||
.addTableHeaderField("prev_instant")
|
.addTableHeaderField("total_log_blocks").addTableHeaderField("total_corrupt_log_blocks")
|
||||||
.addTableHeaderField("num_writes")
|
.addTableHeaderField("total_rollback_blocks").addTableHeaderField("total_log_records")
|
||||||
.addTableHeaderField("num_inserts")
|
.addTableHeaderField("total_updated_records_compacted").addTableHeaderField("total_write_bytes")
|
||||||
.addTableHeaderField("num_deletes")
|
|
||||||
.addTableHeaderField("num_update_writes")
|
|
||||||
.addTableHeaderField("total_log_files")
|
|
||||||
.addTableHeaderField("total_log_blocks")
|
|
||||||
.addTableHeaderField("total_corrupt_log_blocks")
|
|
||||||
.addTableHeaderField("total_rollback_blocks")
|
|
||||||
.addTableHeaderField("total_log_records")
|
|
||||||
.addTableHeaderField("total_updated_records_compacted")
|
|
||||||
.addTableHeaderField("total_write_bytes")
|
|
||||||
.addTableHeaderField("total_write_errors");
|
.addTableHeaderField("total_write_errors");
|
||||||
|
|
||||||
return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, allStats);
|
return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, allStats);
|
||||||
@@ -141,41 +131,39 @@ public class ArchivedCommitsCommand implements CommandMarker {
|
|||||||
|
|
||||||
@CliCommand(value = "show archived commits", help = "Read commits from archived files and show details")
|
@CliCommand(value = "show archived commits", help = "Read commits from archived files and show details")
|
||||||
public String showCommits(
|
public String showCommits(
|
||||||
@CliOption(key = {"skipMetadata"}, help = "Skip displaying commit metadata", unspecifiedDefaultValue = "true")
|
@CliOption(key = {"skipMetadata"}, help = "Skip displaying commit metadata",
|
||||||
boolean skipMetadata,
|
unspecifiedDefaultValue = "true") boolean skipMetadata,
|
||||||
@CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "10") final Integer limit,
|
@CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "10") final Integer limit,
|
||||||
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
|
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
|
||||||
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
|
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
|
||||||
@CliOption(key = {
|
@CliOption(key = {"headeronly"}, help = "Print Header Only",
|
||||||
"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly)
|
unspecifiedDefaultValue = "false") final boolean headerOnly)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
|
||||||
System.out.println("===============> Showing only " + limit + " archived commits <===============");
|
System.out.println("===============> Showing only " + limit + " archived commits <===============");
|
||||||
String basePath = HoodieCLI.tableMetadata.getBasePath();
|
String basePath = HoodieCLI.tableMetadata.getBasePath();
|
||||||
FileStatus[] fsStatuses = FSUtils.getFs(basePath, HoodieCLI.conf)
|
FileStatus[] fsStatuses =
|
||||||
.globStatus(new Path(basePath + "/.hoodie/.commits_.archive*"));
|
FSUtils.getFs(basePath, HoodieCLI.conf).globStatus(new Path(basePath + "/.hoodie/.commits_.archive*"));
|
||||||
List<Comparable[]> allCommits = new ArrayList<>();
|
List<Comparable[]> allCommits = new ArrayList<>();
|
||||||
for (FileStatus fs : fsStatuses) {
|
for (FileStatus fs : fsStatuses) {
|
||||||
//read the archived file
|
// read the archived file
|
||||||
HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(basePath, HoodieCLI.conf),
|
HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(basePath, HoodieCLI.conf),
|
||||||
new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema());
|
new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema());
|
||||||
|
|
||||||
List<IndexedRecord> readRecords = new ArrayList<>();
|
List<IndexedRecord> readRecords = new ArrayList<>();
|
||||||
//read the avro blocks
|
// read the avro blocks
|
||||||
while (reader.hasNext()) {
|
while (reader.hasNext()) {
|
||||||
HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next();
|
HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next();
|
||||||
List<IndexedRecord> records = blk.getRecords();
|
List<IndexedRecord> records = blk.getRecords();
|
||||||
readRecords.addAll(records);
|
readRecords.addAll(records);
|
||||||
}
|
}
|
||||||
List<Comparable[]> readCommits = readRecords.stream().map(r -> (GenericRecord) r).map(r ->
|
List<Comparable[]> readCommits = readRecords.stream().map(r -> (GenericRecord) r)
|
||||||
readCommit(r, skipMetadata))
|
.map(r -> readCommit(r, skipMetadata)).collect(Collectors.toList());
|
||||||
.collect(Collectors.toList());
|
|
||||||
allCommits.addAll(readCommits);
|
allCommits.addAll(readCommits);
|
||||||
reader.close();
|
reader.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
TableHeader header = new TableHeader().addTableHeaderField("CommitTime")
|
TableHeader header = new TableHeader().addTableHeaderField("CommitTime").addTableHeaderField("CommitType");
|
||||||
.addTableHeaderField("CommitType");
|
|
||||||
|
|
||||||
if (!skipMetadata) {
|
if (!skipMetadata) {
|
||||||
header = header.addTableHeaderField("CommitDetails");
|
header = header.addTableHeaderField("CommitDetails");
|
||||||
|
|||||||
@@ -63,8 +63,8 @@ public class CleansCommand implements CommandMarker {
|
|||||||
@CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
|
@CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
|
||||||
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
|
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
|
||||||
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
|
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
|
||||||
@CliOption(key = {
|
@CliOption(key = {"headeronly"}, help = "Print Header Only",
|
||||||
"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly)
|
unspecifiedDefaultValue = "false") final boolean headerOnly)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
|
||||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||||
@@ -74,17 +74,15 @@ public class CleansCommand implements CommandMarker {
|
|||||||
Collections.reverse(cleans);
|
Collections.reverse(cleans);
|
||||||
for (int i = 0; i < cleans.size(); i++) {
|
for (int i = 0; i < cleans.size(); i++) {
|
||||||
HoodieInstant clean = cleans.get(i);
|
HoodieInstant clean = cleans.get(i);
|
||||||
HoodieCleanMetadata cleanMetadata = AvroUtils
|
HoodieCleanMetadata cleanMetadata =
|
||||||
.deserializeHoodieCleanMetadata(timeline.getInstantDetails(clean).get());
|
AvroUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(clean).get());
|
||||||
rows.add(new Comparable[]{clean.getTimestamp(), cleanMetadata.getEarliestCommitToRetain(),
|
rows.add(new Comparable[] {clean.getTimestamp(), cleanMetadata.getEarliestCommitToRetain(),
|
||||||
cleanMetadata.getTotalFilesDeleted(), cleanMetadata.getTimeTakenInMillis()});
|
cleanMetadata.getTotalFilesDeleted(), cleanMetadata.getTimeTakenInMillis()});
|
||||||
}
|
}
|
||||||
|
|
||||||
TableHeader header = new TableHeader()
|
TableHeader header =
|
||||||
.addTableHeaderField("CleanTime")
|
new TableHeader().addTableHeaderField("CleanTime").addTableHeaderField("EarliestCommandRetained")
|
||||||
.addTableHeaderField("EarliestCommandRetained")
|
.addTableHeaderField("Total Files Deleted").addTableHeaderField("Total Time Taken");
|
||||||
.addTableHeaderField("Total Files Deleted")
|
|
||||||
.addTableHeaderField("Total Time Taken");
|
|
||||||
return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows);
|
return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -95,13 +93,12 @@ public class CleansCommand implements CommandMarker {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@CliCommand(value = "clean showpartitions", help = "Show partition level details of a clean")
|
@CliCommand(value = "clean showpartitions", help = "Show partition level details of a clean")
|
||||||
public String showCleanPartitions(
|
public String showCleanPartitions(@CliOption(key = {"clean"}, help = "clean to show") final String commitTime,
|
||||||
@CliOption(key = {"clean"}, help = "clean to show") final String commitTime,
|
|
||||||
@CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
|
@CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
|
||||||
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
|
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
|
||||||
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
|
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
|
||||||
@CliOption(key = {
|
@CliOption(key = {"headeronly"}, help = "Print Header Only",
|
||||||
"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly)
|
unspecifiedDefaultValue = "false") final boolean headerOnly)
|
||||||
throws Exception {
|
throws Exception {
|
||||||
|
|
||||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||||
@@ -112,8 +109,8 @@ public class CleansCommand implements CommandMarker {
|
|||||||
return "Clean " + commitTime + " not found in metadata " + timeline;
|
return "Clean " + commitTime + " not found in metadata " + timeline;
|
||||||
}
|
}
|
||||||
|
|
||||||
HoodieCleanMetadata cleanMetadata = AvroUtils.deserializeHoodieCleanMetadata(
|
HoodieCleanMetadata cleanMetadata =
|
||||||
timeline.getInstantDetails(cleanInstant).get());
|
AvroUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(cleanInstant).get());
|
||||||
List<Comparable[]> rows = new ArrayList<>();
|
List<Comparable[]> rows = new ArrayList<>();
|
||||||
for (Map.Entry<String, HoodieCleanPartitionMetadata> entry : cleanMetadata.getPartitionMetadata().entrySet()) {
|
for (Map.Entry<String, HoodieCleanPartitionMetadata> entry : cleanMetadata.getPartitionMetadata().entrySet()) {
|
||||||
String path = entry.getKey();
|
String path = entry.getKey();
|
||||||
@@ -121,14 +118,11 @@ public class CleansCommand implements CommandMarker {
|
|||||||
String policy = stats.getPolicy();
|
String policy = stats.getPolicy();
|
||||||
Integer totalSuccessDeletedFiles = stats.getSuccessDeleteFiles().size();
|
Integer totalSuccessDeletedFiles = stats.getSuccessDeleteFiles().size();
|
||||||
Integer totalFailedDeletedFiles = stats.getFailedDeleteFiles().size();
|
Integer totalFailedDeletedFiles = stats.getFailedDeleteFiles().size();
|
||||||
rows.add(new Comparable[]{path, policy, totalSuccessDeletedFiles, totalFailedDeletedFiles});
|
rows.add(new Comparable[] {path, policy, totalSuccessDeletedFiles, totalFailedDeletedFiles});
|
||||||
}
|
}
|
||||||
|
|
||||||
TableHeader header = new TableHeader()
|
TableHeader header = new TableHeader().addTableHeaderField("Partition Path").addTableHeaderField("Cleaning policy")
|
||||||
.addTableHeaderField("Partition Path")
|
.addTableHeaderField("Total Files Successfully Deleted").addTableHeaderField("Total Failed Deletions");
|
||||||
.addTableHeaderField("Cleaning policy")
|
|
||||||
.addTableHeaderField("Total Files Successfully Deleted")
|
|
||||||
.addTableHeaderField("Total Failed Deletions");
|
|
||||||
return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows);
|
return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -69,12 +69,13 @@ public class CommitsCommand implements CommandMarker {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@CliCommand(value = "commits show", help = "Show the commits")
|
@CliCommand(value = "commits show", help = "Show the commits")
|
||||||
public String showCommits(@CliOption(key = {
|
public String showCommits(
|
||||||
"limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
|
@CliOption(key = {"limit"}, mandatory = false, help = "Limit commits",
|
||||||
|
unspecifiedDefaultValue = "-1") final Integer limit,
|
||||||
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
|
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
|
||||||
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
|
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
|
||||||
@CliOption(key = {
|
@CliOption(key = {"headeronly"}, help = "Print Header Only",
|
||||||
"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly)
|
unspecifiedDefaultValue = "false") final boolean headerOnly)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
|
||||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||||
@@ -84,16 +85,12 @@ public class CommitsCommand implements CommandMarker {
|
|||||||
Collections.reverse(commits);
|
Collections.reverse(commits);
|
||||||
for (int i = 0; i < commits.size(); i++) {
|
for (int i = 0; i < commits.size(); i++) {
|
||||||
HoodieInstant commit = commits.get(i);
|
HoodieInstant commit = commits.get(i);
|
||||||
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(commit).get(),
|
HoodieCommitMetadata commitMetadata =
|
||||||
HoodieCommitMetadata.class);
|
HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(commit).get(), HoodieCommitMetadata.class);
|
||||||
rows.add(new Comparable[]{commit.getTimestamp(),
|
rows.add(new Comparable[] {commit.getTimestamp(), commitMetadata.fetchTotalBytesWritten(),
|
||||||
commitMetadata.fetchTotalBytesWritten(),
|
commitMetadata.fetchTotalFilesInsert(), commitMetadata.fetchTotalFilesUpdated(),
|
||||||
commitMetadata.fetchTotalFilesInsert(),
|
commitMetadata.fetchTotalPartitionsWritten(), commitMetadata.fetchTotalRecordsWritten(),
|
||||||
commitMetadata.fetchTotalFilesUpdated(),
|
commitMetadata.fetchTotalUpdateRecordsWritten(), commitMetadata.fetchTotalWriteErrors()});
|
||||||
commitMetadata.fetchTotalPartitionsWritten(),
|
|
||||||
commitMetadata.fetchTotalRecordsWritten(),
|
|
||||||
commitMetadata.fetchTotalUpdateRecordsWritten(),
|
|
||||||
commitMetadata.fetchTotalWriteErrors()});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();
|
Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();
|
||||||
@@ -101,15 +98,10 @@ public class CommitsCommand implements CommandMarker {
|
|||||||
return NumericUtils.humanReadableByteCount((Double.valueOf(entry.toString())));
|
return NumericUtils.humanReadableByteCount((Double.valueOf(entry.toString())));
|
||||||
});
|
});
|
||||||
|
|
||||||
TableHeader header = new TableHeader()
|
TableHeader header = new TableHeader().addTableHeaderField("CommitTime").addTableHeaderField("Total Bytes Written")
|
||||||
.addTableHeaderField("CommitTime")
|
.addTableHeaderField("Total Files Added").addTableHeaderField("Total Files Updated")
|
||||||
.addTableHeaderField("Total Bytes Written")
|
.addTableHeaderField("Total Partitions Written").addTableHeaderField("Total Records Written")
|
||||||
.addTableHeaderField("Total Files Added")
|
.addTableHeaderField("Total Update Records Written").addTableHeaderField("Total Errors");
|
||||||
.addTableHeaderField("Total Files Updated")
|
|
||||||
.addTableHeaderField("Total Partitions Written")
|
|
||||||
.addTableHeaderField("Total Records Written")
|
|
||||||
.addTableHeaderField("Total Update Records Written")
|
|
||||||
.addTableHeaderField("Total Errors");
|
|
||||||
return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows);
|
return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -132,8 +124,8 @@ public class CommitsCommand implements CommandMarker {
|
|||||||
}
|
}
|
||||||
|
|
||||||
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
||||||
sparkLauncher
|
sparkLauncher.addAppArgs(SparkMain.SparkCommand.ROLLBACK.toString(), commitTime,
|
||||||
.addAppArgs(SparkMain.SparkCommand.ROLLBACK.toString(), commitTime, HoodieCLI.tableMetadata.getBasePath());
|
HoodieCLI.tableMetadata.getBasePath());
|
||||||
Process process = sparkLauncher.launch();
|
Process process = sparkLauncher.launch();
|
||||||
InputStreamConsumer.captureOutput(process);
|
InputStreamConsumer.captureOutput(process);
|
||||||
int exitCode = process.waitFor();
|
int exitCode = process.waitFor();
|
||||||
@@ -146,13 +138,12 @@ public class CommitsCommand implements CommandMarker {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@CliCommand(value = "commit showpartitions", help = "Show partition level details of a commit")
|
@CliCommand(value = "commit showpartitions", help = "Show partition level details of a commit")
|
||||||
public String showCommitPartitions(
|
public String showCommitPartitions(@CliOption(key = {"commit"}, help = "Commit to show") final String commitTime,
|
||||||
@CliOption(key = {"commit"}, help = "Commit to show") final String commitTime,
|
|
||||||
@CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
|
@CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
|
||||||
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
|
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
|
||||||
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
|
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
|
||||||
@CliOption(key = {
|
@CliOption(key = {"headeronly"}, help = "Print Header Only",
|
||||||
"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly)
|
unspecifiedDefaultValue = "false") final boolean headerOnly)
|
||||||
throws Exception {
|
throws Exception {
|
||||||
|
|
||||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||||
@@ -185,8 +176,7 @@ public class CommitsCommand implements CommandMarker {
|
|||||||
totalBytesWritten += stat.getTotalWriteBytes();
|
totalBytesWritten += stat.getTotalWriteBytes();
|
||||||
totalWriteErrors += stat.getTotalWriteErrors();
|
totalWriteErrors += stat.getTotalWriteErrors();
|
||||||
}
|
}
|
||||||
rows.add(new Comparable[]{path, totalFilesAdded, totalFilesUpdated,
|
rows.add(new Comparable[] {path, totalFilesAdded, totalFilesUpdated, totalRecordsInserted, totalRecordsUpdated,
|
||||||
totalRecordsInserted, totalRecordsUpdated,
|
|
||||||
totalBytesWritten, totalWriteErrors});
|
totalBytesWritten, totalWriteErrors});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -195,26 +185,21 @@ public class CommitsCommand implements CommandMarker {
|
|||||||
return NumericUtils.humanReadableByteCount((Long.valueOf(entry.toString())));
|
return NumericUtils.humanReadableByteCount((Long.valueOf(entry.toString())));
|
||||||
});
|
});
|
||||||
|
|
||||||
TableHeader header = new TableHeader()
|
TableHeader header = new TableHeader().addTableHeaderField("Partition Path")
|
||||||
.addTableHeaderField("Partition Path")
|
.addTableHeaderField("Total Files Added").addTableHeaderField("Total Files Updated")
|
||||||
.addTableHeaderField("Total Files Added")
|
.addTableHeaderField("Total Records Inserted").addTableHeaderField("Total Records Updated")
|
||||||
.addTableHeaderField("Total Files Updated")
|
.addTableHeaderField("Total Bytes Written").addTableHeaderField("Total Errors");
|
||||||
.addTableHeaderField("Total Records Inserted")
|
|
||||||
.addTableHeaderField("Total Records Updated")
|
|
||||||
.addTableHeaderField("Total Bytes Written")
|
|
||||||
.addTableHeaderField("Total Errors");
|
|
||||||
|
|
||||||
return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows);
|
return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows);
|
||||||
}
|
}
|
||||||
|
|
||||||
@CliCommand(value = "commit showfiles", help = "Show file level details of a commit")
|
@CliCommand(value = "commit showfiles", help = "Show file level details of a commit")
|
||||||
public String showCommitFiles(
|
public String showCommitFiles(@CliOption(key = {"commit"}, help = "Commit to show") final String commitTime,
|
||||||
@CliOption(key = {"commit"}, help = "Commit to show") final String commitTime,
|
|
||||||
@CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
|
@CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
|
||||||
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
|
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
|
||||||
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
|
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
|
||||||
@CliOption(key = {
|
@CliOption(key = {"headeronly"}, help = "Print Header Only",
|
||||||
"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly)
|
unspecifiedDefaultValue = "false") final boolean headerOnly)
|
||||||
throws Exception {
|
throws Exception {
|
||||||
|
|
||||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||||
@@ -231,23 +216,15 @@ public class CommitsCommand implements CommandMarker {
|
|||||||
String path = entry.getKey();
|
String path = entry.getKey();
|
||||||
List<HoodieWriteStat> stats = entry.getValue();
|
List<HoodieWriteStat> stats = entry.getValue();
|
||||||
for (HoodieWriteStat stat : stats) {
|
for (HoodieWriteStat stat : stats) {
|
||||||
rows.add(new Comparable[]{path, stat.getFileId(), stat.getPrevCommit(), stat.getNumUpdateWrites(),
|
rows.add(new Comparable[] {path, stat.getFileId(), stat.getPrevCommit(), stat.getNumUpdateWrites(),
|
||||||
stat.getNumWrites(), stat.getTotalWriteBytes(),
|
stat.getNumWrites(), stat.getTotalWriteBytes(), stat.getTotalWriteErrors(), stat.getFileSizeInBytes()});
|
||||||
stat.getTotalWriteErrors(),
|
|
||||||
stat.getFileSizeInBytes()
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
TableHeader header = new TableHeader()
|
TableHeader header = new TableHeader().addTableHeaderField("Partition Path").addTableHeaderField("File ID")
|
||||||
.addTableHeaderField("Partition Path")
|
.addTableHeaderField("Previous Commit").addTableHeaderField("Total Records Updated")
|
||||||
.addTableHeaderField("File ID")
|
.addTableHeaderField("Total Records Written").addTableHeaderField("Total Bytes Written")
|
||||||
.addTableHeaderField("Previous Commit")
|
.addTableHeaderField("Total Errors").addTableHeaderField("File Size");
|
||||||
.addTableHeaderField("Total Records Updated")
|
|
||||||
.addTableHeaderField("Total Records Written")
|
|
||||||
.addTableHeaderField("Total Bytes Written")
|
|
||||||
.addTableHeaderField("Total Errors")
|
|
||||||
.addTableHeaderField("File Size");
|
|
||||||
|
|
||||||
return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows);
|
return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows);
|
||||||
}
|
}
|
||||||
@@ -270,8 +247,8 @@ public class CommitsCommand implements CommandMarker {
|
|||||||
String sourceLatestCommit =
|
String sourceLatestCommit =
|
||||||
sourceTimeline.getInstants().iterator().hasNext() ? "0" : sourceTimeline.lastInstant().get().getTimestamp();
|
sourceTimeline.getInstants().iterator().hasNext() ? "0" : sourceTimeline.lastInstant().get().getTimestamp();
|
||||||
|
|
||||||
if (sourceLatestCommit != null && HoodieTimeline.compareTimestamps(targetLatestCommit, sourceLatestCommit,
|
if (sourceLatestCommit != null
|
||||||
HoodieTimeline.GREATER)) {
|
&& HoodieTimeline.compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) {
|
||||||
// source is behind the target
|
// source is behind the target
|
||||||
List<String> commitsToCatchup = targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE)
|
List<String> commitsToCatchup = targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE)
|
||||||
.getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList());
|
.getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList());
|
||||||
|
|||||||
@@ -75,16 +75,15 @@ public class CompactionCommand implements CommandMarker {
|
|||||||
|
|
||||||
@CliCommand(value = "compactions show all", help = "Shows all compactions that are in active timeline")
|
@CliCommand(value = "compactions show all", help = "Shows all compactions that are in active timeline")
|
||||||
public String compactionsAll(
|
public String compactionsAll(
|
||||||
@CliOption(key = {
|
@CliOption(key = {"includeExtraMetadata"}, help = "Include extra metadata",
|
||||||
"includeExtraMetadata"}, help = "Include extra metadata", unspecifiedDefaultValue = "false") final
|
unspecifiedDefaultValue = "false") final boolean includeExtraMetadata,
|
||||||
boolean includeExtraMetadata,
|
@CliOption(key = {"limit"}, mandatory = false, help = "Limit commits",
|
||||||
@CliOption(key = {
|
unspecifiedDefaultValue = "-1") final Integer limit,
|
||||||
"limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
|
|
||||||
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
|
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
|
||||||
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
|
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
|
||||||
@CliOption(key = {
|
@CliOption(key = {"headeronly"}, help = "Print Header Only",
|
||||||
"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final
|
unspecifiedDefaultValue = "false") final boolean headerOnly)
|
||||||
boolean headerOnly) throws IOException {
|
throws IOException {
|
||||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||||
HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionTimeline();
|
HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionTimeline();
|
||||||
HoodieTimeline commitTimeline = activeTimeline.getCommitTimeline().filterCompletedInstants();
|
HoodieTimeline commitTimeline = activeTimeline.getCommitTimeline().filterCompletedInstants();
|
||||||
@@ -99,15 +98,14 @@ public class CompactionCommand implements CommandMarker {
|
|||||||
if (!instant.getAction().equals(HoodieTimeline.COMPACTION_ACTION)) {
|
if (!instant.getAction().equals(HoodieTimeline.COMPACTION_ACTION)) {
|
||||||
try {
|
try {
|
||||||
// This could be a completed compaction. Assume a compaction request file is present but skip if fails
|
// This could be a completed compaction. Assume a compaction request file is present but skip if fails
|
||||||
workload = AvroUtils.deserializeCompactionPlan(
|
workload = AvroUtils.deserializeCompactionPlan(activeTimeline
|
||||||
activeTimeline.getInstantAuxiliaryDetails(
|
.getInstantAuxiliaryDetails(HoodieTimeline.getCompactionRequestedInstant(instant.getTimestamp())).get());
|
||||||
HoodieTimeline.getCompactionRequestedInstant(instant.getTimestamp())).get());
|
|
||||||
} catch (HoodieIOException ioe) {
|
} catch (HoodieIOException ioe) {
|
||||||
// SKIP
|
// SKIP
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
workload = AvroUtils.deserializeCompactionPlan(activeTimeline.getInstantAuxiliaryDetails(
|
workload = AvroUtils.deserializeCompactionPlan(activeTimeline
|
||||||
HoodieTimeline.getCompactionRequestedInstant(instant.getTimestamp())).get());
|
.getInstantAuxiliaryDetails(HoodieTimeline.getCompactionRequestedInstant(instant.getTimestamp())).get());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (null != workload) {
|
if (null != workload) {
|
||||||
@@ -116,22 +114,18 @@ public class CompactionCommand implements CommandMarker {
|
|||||||
state = State.COMPLETED;
|
state = State.COMPLETED;
|
||||||
}
|
}
|
||||||
if (includeExtraMetadata) {
|
if (includeExtraMetadata) {
|
||||||
rows.add(new Comparable[]{instant.getTimestamp(),
|
rows.add(new Comparable[] {instant.getTimestamp(), state.toString(),
|
||||||
state.toString(),
|
|
||||||
workload.getOperations() == null ? 0 : workload.getOperations().size(),
|
workload.getOperations() == null ? 0 : workload.getOperations().size(),
|
||||||
workload.getExtraMetadata().toString()});
|
workload.getExtraMetadata().toString()});
|
||||||
} else {
|
} else {
|
||||||
rows.add(new Comparable[]{instant.getTimestamp(),
|
rows.add(new Comparable[] {instant.getTimestamp(), state.toString(),
|
||||||
state.toString(),
|
|
||||||
workload.getOperations() == null ? 0 : workload.getOperations().size()});
|
workload.getOperations() == null ? 0 : workload.getOperations().size()});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();
|
Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();
|
||||||
TableHeader header = new TableHeader()
|
TableHeader header = new TableHeader().addTableHeaderField("Compaction Instant Time").addTableHeaderField("State")
|
||||||
.addTableHeaderField("Compaction Instant Time")
|
|
||||||
.addTableHeaderField("State")
|
|
||||||
.addTableHeaderField("Total FileIds to be Compacted");
|
.addTableHeaderField("Total FileIds to be Compacted");
|
||||||
if (includeExtraMetadata) {
|
if (includeExtraMetadata) {
|
||||||
header = header.addTableHeaderField("Extra Metadata");
|
header = header.addTableHeaderField("Extra Metadata");
|
||||||
@@ -141,48 +135,37 @@ public class CompactionCommand implements CommandMarker {
|
|||||||
|
|
||||||
@CliCommand(value = "compaction show", help = "Shows compaction details for a specific compaction instant")
|
@CliCommand(value = "compaction show", help = "Shows compaction details for a specific compaction instant")
|
||||||
public String compactionShow(
|
public String compactionShow(
|
||||||
@CliOption(key = "instant", mandatory = true, help = "Base path for the target hoodie dataset") final
|
@CliOption(key = "instant", mandatory = true,
|
||||||
String compactionInstantTime,
|
help = "Base path for the target hoodie dataset") final String compactionInstantTime,
|
||||||
@CliOption(key = {
|
@CliOption(key = {"limit"}, mandatory = false, help = "Limit commits",
|
||||||
"limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
|
unspecifiedDefaultValue = "-1") final Integer limit,
|
||||||
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
|
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
|
||||||
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
|
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
|
||||||
@CliOption(key = {
|
@CliOption(key = {"headeronly"}, help = "Print Header Only",
|
||||||
"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly)
|
unspecifiedDefaultValue = "false") final boolean headerOnly)
|
||||||
throws Exception {
|
throws Exception {
|
||||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||||
HoodieCompactionPlan workload = AvroUtils.deserializeCompactionPlan(
|
HoodieCompactionPlan workload = AvroUtils.deserializeCompactionPlan(activeTimeline
|
||||||
activeTimeline.getInstantAuxiliaryDetails(
|
.getInstantAuxiliaryDetails(HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime)).get());
|
||||||
HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime)).get());
|
|
||||||
|
|
||||||
List<Comparable[]> rows = new ArrayList<>();
|
List<Comparable[]> rows = new ArrayList<>();
|
||||||
if ((null != workload) && (null != workload.getOperations())) {
|
if ((null != workload) && (null != workload.getOperations())) {
|
||||||
for (HoodieCompactionOperation op : workload.getOperations()) {
|
for (HoodieCompactionOperation op : workload.getOperations()) {
|
||||||
rows.add(new Comparable[]{op.getPartitionPath(),
|
rows.add(new Comparable[] {op.getPartitionPath(), op.getFileId(), op.getBaseInstantTime(), op.getDataFilePath(),
|
||||||
op.getFileId(),
|
op.getDeltaFilePaths().size(), op.getMetrics() == null ? "" : op.getMetrics().toString()});
|
||||||
op.getBaseInstantTime(),
|
|
||||||
op.getDataFilePath(),
|
|
||||||
op.getDeltaFilePaths().size(),
|
|
||||||
op.getMetrics() == null ? "" : op.getMetrics().toString()
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();
|
Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();
|
||||||
TableHeader header = new TableHeader()
|
TableHeader header = new TableHeader().addTableHeaderField("Partition Path").addTableHeaderField("File Id")
|
||||||
.addTableHeaderField("Partition Path")
|
.addTableHeaderField("Base Instant").addTableHeaderField("Data File Path")
|
||||||
.addTableHeaderField("File Id")
|
.addTableHeaderField("Total Delta Files").addTableHeaderField("getMetrics");
|
||||||
.addTableHeaderField("Base Instant")
|
|
||||||
.addTableHeaderField("Data File Path")
|
|
||||||
.addTableHeaderField("Total Delta Files")
|
|
||||||
.addTableHeaderField("getMetrics");
|
|
||||||
return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows);
|
return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows);
|
||||||
}
|
}
|
||||||
|
|
||||||
@CliCommand(value = "compaction schedule", help = "Schedule Compaction")
|
@CliCommand(value = "compaction schedule", help = "Schedule Compaction")
|
||||||
public String scheduleCompact(
|
public String scheduleCompact(@CliOption(key = "sparkMemory", unspecifiedDefaultValue = "1G",
|
||||||
@CliOption(key = "sparkMemory", unspecifiedDefaultValue = "1G", help = "Spark executor memory")
|
help = "Spark executor memory") final String sparkMemory) throws Exception {
|
||||||
final String sparkMemory) throws Exception {
|
|
||||||
boolean initialized = HoodieCLI.initConf();
|
boolean initialized = HoodieCLI.initConf();
|
||||||
HoodieCLI.initFS(initialized);
|
HoodieCLI.initFS(initialized);
|
||||||
|
|
||||||
@@ -190,8 +173,8 @@ public class CompactionCommand implements CommandMarker {
|
|||||||
String compactionInstantTime = HoodieActiveTimeline.createNewCommitTime();
|
String compactionInstantTime = HoodieActiveTimeline.createNewCommitTime();
|
||||||
|
|
||||||
if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
|
if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
|
||||||
String sparkPropertiesPath = Utils.getDefaultPropertiesFile(
|
String sparkPropertiesPath =
|
||||||
scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
|
Utils.getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
|
||||||
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
||||||
sparkLauncher.addAppArgs(SparkCommand.COMPACT_SCHEDULE.toString(), HoodieCLI.tableMetadata.getBasePath(),
|
sparkLauncher.addAppArgs(SparkCommand.COMPACT_SCHEDULE.toString(), HoodieCLI.tableMetadata.getBasePath(),
|
||||||
HoodieCLI.tableMetadata.getTableConfig().getTableName(), compactionInstantTime, sparkMemory);
|
HoodieCLI.tableMetadata.getTableConfig().getTableName(), compactionInstantTime, sparkMemory);
|
||||||
@@ -209,33 +192,34 @@ public class CompactionCommand implements CommandMarker {
|
|||||||
|
|
||||||
@CliCommand(value = "compaction run", help = "Run Compaction for given instant time")
|
@CliCommand(value = "compaction run", help = "Run Compaction for given instant time")
|
||||||
public String compact(
|
public String compact(
|
||||||
@CliOption(key = {"parallelism"}, mandatory = true, help = "Parallelism for hoodie compaction")
|
@CliOption(key = {"parallelism"}, mandatory = true,
|
||||||
final String parallelism,
|
help = "Parallelism for hoodie compaction") final String parallelism,
|
||||||
@CliOption(key = "schemaFilePath", mandatory = true, help = "Path for Avro schema file")
|
@CliOption(key = "schemaFilePath", mandatory = true,
|
||||||
final String schemaFilePath,
|
help = "Path for Avro schema file") final String schemaFilePath,
|
||||||
@CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G", help = "Spark executor memory")
|
@CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G",
|
||||||
final String sparkMemory,
|
help = "Spark executor memory") final String sparkMemory,
|
||||||
@CliOption(key = "retry", unspecifiedDefaultValue = "1", help = "Number of retries")
|
@CliOption(key = "retry", unspecifiedDefaultValue = "1", help = "Number of retries") final String retry,
|
||||||
final String retry,
|
@CliOption(key = "compactionInstant", mandatory = false,
|
||||||
@CliOption(key = "compactionInstant", mandatory = false, help = "Base path for the target hoodie dataset")
|
help = "Base path for the target hoodie dataset") String compactionInstantTime)
|
||||||
String compactionInstantTime) throws Exception {
|
throws Exception {
|
||||||
boolean initialized = HoodieCLI.initConf();
|
boolean initialized = HoodieCLI.initConf();
|
||||||
HoodieCLI.initFS(initialized);
|
HoodieCLI.initFS(initialized);
|
||||||
|
|
||||||
if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
|
if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
|
||||||
if (null == compactionInstantTime) {
|
if (null == compactionInstantTime) {
|
||||||
// pick outstanding one with lowest timestamp
|
// pick outstanding one with lowest timestamp
|
||||||
Option<String> firstPendingInstant = HoodieCLI.tableMetadata.reloadActiveTimeline()
|
Option<String> firstPendingInstant =
|
||||||
.filterCompletedAndCompactionInstants().filter(instant -> instant.getAction()
|
HoodieCLI.tableMetadata.reloadActiveTimeline().filterCompletedAndCompactionInstants()
|
||||||
.equals(HoodieTimeline.COMPACTION_ACTION)).firstInstant().map(HoodieInstant::getTimestamp);
|
.filter(instant -> instant.getAction().equals(HoodieTimeline.COMPACTION_ACTION)).firstInstant()
|
||||||
|
.map(HoodieInstant::getTimestamp);
|
||||||
if (!firstPendingInstant.isPresent()) {
|
if (!firstPendingInstant.isPresent()) {
|
||||||
return "NO PENDING COMPACTION TO RUN";
|
return "NO PENDING COMPACTION TO RUN";
|
||||||
}
|
}
|
||||||
compactionInstantTime = firstPendingInstant.get();
|
compactionInstantTime = firstPendingInstant.get();
|
||||||
}
|
}
|
||||||
|
|
||||||
String sparkPropertiesPath = Utils.getDefaultPropertiesFile(
|
String sparkPropertiesPath =
|
||||||
scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
|
Utils.getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
|
||||||
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
||||||
sparkLauncher.addAppArgs(SparkCommand.COMPACT_RUN.toString(), HoodieCLI.tableMetadata.getBasePath(),
|
sparkLauncher.addAppArgs(SparkCommand.COMPACT_RUN.toString(), HoodieCLI.tableMetadata.getBasePath(),
|
||||||
HoodieCLI.tableMetadata.getTableConfig().getTableName(), compactionInstantTime, parallelism, schemaFilePath,
|
HoodieCLI.tableMetadata.getTableConfig().getTableName(), compactionInstantTime, parallelism, schemaFilePath,
|
||||||
@@ -279,8 +263,8 @@ public class CompactionCommand implements CommandMarker {
|
|||||||
@CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit,
|
@CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit,
|
||||||
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField,
|
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField,
|
||||||
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending,
|
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending,
|
||||||
@CliOption(key = {
|
@CliOption(key = {"headeronly"}, help = "Print Header Only",
|
||||||
"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") boolean headerOnly)
|
unspecifiedDefaultValue = "false") boolean headerOnly)
|
||||||
throws Exception {
|
throws Exception {
|
||||||
boolean initialized = HoodieCLI.initConf();
|
boolean initialized = HoodieCLI.initConf();
|
||||||
HoodieCLI.initFS(initialized);
|
HoodieCLI.initFS(initialized);
|
||||||
@@ -290,12 +274,11 @@ public class CompactionCommand implements CommandMarker {
|
|||||||
String output = null;
|
String output = null;
|
||||||
if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
|
if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
|
||||||
try {
|
try {
|
||||||
String sparkPropertiesPath = Utils.getDefaultPropertiesFile(
|
String sparkPropertiesPath = Utils
|
||||||
scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
|
.getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
|
||||||
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
||||||
sparkLauncher.addAppArgs(SparkCommand.COMPACT_VALIDATE.toString(),
|
sparkLauncher.addAppArgs(SparkCommand.COMPACT_VALIDATE.toString(), HoodieCLI.tableMetadata.getBasePath(),
|
||||||
HoodieCLI.tableMetadata.getBasePath(), compactionInstant, outputPathStr, parallelism, master,
|
compactionInstant, outputPathStr, parallelism, master, sparkMemory);
|
||||||
sparkMemory);
|
|
||||||
Process process = sparkLauncher.launch();
|
Process process = sparkLauncher.launch();
|
||||||
InputStreamConsumer.captureOutput(process);
|
InputStreamConsumer.captureOutput(process);
|
||||||
int exitCode = process.waitFor();
|
int exitCode = process.waitFor();
|
||||||
@@ -307,8 +290,7 @@ public class CompactionCommand implements CommandMarker {
|
|||||||
String message = "\n\n\t COMPACTION PLAN " + (valid ? "VALID" : "INVALID") + "\n\n";
|
String message = "\n\n\t COMPACTION PLAN " + (valid ? "VALID" : "INVALID") + "\n\n";
|
||||||
List<Comparable[]> rows = new ArrayList<>();
|
List<Comparable[]> rows = new ArrayList<>();
|
||||||
res.stream().forEach(r -> {
|
res.stream().forEach(r -> {
|
||||||
Comparable[] row = new Comparable[]{r.getOperation().getFileId(),
|
Comparable[] row = new Comparable[] {r.getOperation().getFileId(), r.getOperation().getBaseInstantTime(),
|
||||||
r.getOperation().getBaseInstantTime(),
|
|
||||||
r.getOperation().getDataFilePath().isPresent() ? r.getOperation().getDataFilePath().get() : "",
|
r.getOperation().getDataFilePath().isPresent() ? r.getOperation().getDataFilePath().get() : "",
|
||||||
r.getOperation().getDeltaFilePaths().size(), r.isSuccess(),
|
r.getOperation().getDeltaFilePaths().size(), r.isSuccess(),
|
||||||
r.getException().isPresent() ? r.getException().get().getMessage() : ""};
|
r.getException().isPresent() ? r.getException().get().getMessage() : ""};
|
||||||
@@ -316,12 +298,8 @@ public class CompactionCommand implements CommandMarker {
|
|||||||
});
|
});
|
||||||
|
|
||||||
Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();
|
Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();
|
||||||
TableHeader header = new TableHeader()
|
TableHeader header = new TableHeader().addTableHeaderField("File Id").addTableHeaderField("Base Instant Time")
|
||||||
.addTableHeaderField("File Id")
|
.addTableHeaderField("Base Data File").addTableHeaderField("Num Delta Files").addTableHeaderField("Valid")
|
||||||
.addTableHeaderField("Base Instant Time")
|
|
||||||
.addTableHeaderField("Base Data File")
|
|
||||||
.addTableHeaderField("Num Delta Files")
|
|
||||||
.addTableHeaderField("Valid")
|
|
||||||
.addTableHeaderField("Error");
|
.addTableHeaderField("Error");
|
||||||
|
|
||||||
output = message + HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit,
|
output = message + HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit,
|
||||||
@@ -349,8 +327,8 @@ public class CompactionCommand implements CommandMarker {
|
|||||||
@CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit,
|
@CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit,
|
||||||
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField,
|
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField,
|
||||||
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending,
|
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending,
|
||||||
@CliOption(key = {
|
@CliOption(key = {"headeronly"}, help = "Print Header Only",
|
||||||
"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") boolean headerOnly)
|
unspecifiedDefaultValue = "false") boolean headerOnly)
|
||||||
throws Exception {
|
throws Exception {
|
||||||
boolean initialized = HoodieCLI.initConf();
|
boolean initialized = HoodieCLI.initConf();
|
||||||
HoodieCLI.initFS(initialized);
|
HoodieCLI.initFS(initialized);
|
||||||
@@ -360,12 +338,12 @@ public class CompactionCommand implements CommandMarker {
|
|||||||
String output = "";
|
String output = "";
|
||||||
if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
|
if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
|
||||||
try {
|
try {
|
||||||
String sparkPropertiesPath = Utils.getDefaultPropertiesFile(
|
String sparkPropertiesPath = Utils
|
||||||
scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
|
.getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
|
||||||
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
||||||
sparkLauncher.addAppArgs(SparkCommand.COMPACT_UNSCHEDULE_PLAN.toString(),
|
sparkLauncher.addAppArgs(SparkCommand.COMPACT_UNSCHEDULE_PLAN.toString(), HoodieCLI.tableMetadata.getBasePath(),
|
||||||
HoodieCLI.tableMetadata.getBasePath(), compactionInstant, outputPathStr, parallelism, master,
|
compactionInstant, outputPathStr, parallelism, master, sparkMemory, Boolean.valueOf(skipV).toString(),
|
||||||
sparkMemory, Boolean.valueOf(skipV).toString(), Boolean.valueOf(dryRun).toString());
|
Boolean.valueOf(dryRun).toString());
|
||||||
Process process = sparkLauncher.launch();
|
Process process = sparkLauncher.launch();
|
||||||
InputStreamConsumer.captureOutput(process);
|
InputStreamConsumer.captureOutput(process);
|
||||||
int exitCode = process.waitFor();
|
int exitCode = process.waitFor();
|
||||||
@@ -373,8 +351,8 @@ public class CompactionCommand implements CommandMarker {
|
|||||||
return "Failed to unschedule compaction for " + compactionInstant;
|
return "Failed to unschedule compaction for " + compactionInstant;
|
||||||
}
|
}
|
||||||
List<RenameOpResult> res = deSerializeOperationResult(outputPathStr, HoodieCLI.fs);
|
List<RenameOpResult> res = deSerializeOperationResult(outputPathStr, HoodieCLI.fs);
|
||||||
output = getRenamesToBePrinted(res, limit, sortByField, descending, headerOnly,
|
output =
|
||||||
"unschedule pending compaction");
|
getRenamesToBePrinted(res, limit, sortByField, descending, headerOnly, "unschedule pending compaction");
|
||||||
} finally {
|
} finally {
|
||||||
// Delete tmp file used to serialize result
|
// Delete tmp file used to serialize result
|
||||||
if (HoodieCLI.fs.exists(outputPath)) {
|
if (HoodieCLI.fs.exists(outputPath)) {
|
||||||
@@ -407,12 +385,12 @@ public class CompactionCommand implements CommandMarker {
|
|||||||
String output = "";
|
String output = "";
|
||||||
if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
|
if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
|
||||||
try {
|
try {
|
||||||
String sparkPropertiesPath = Utils.getDefaultPropertiesFile(
|
String sparkPropertiesPath = Utils
|
||||||
scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
|
.getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
|
||||||
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
||||||
sparkLauncher.addAppArgs(SparkCommand.COMPACT_UNSCHEDULE_FILE.toString(),
|
sparkLauncher.addAppArgs(SparkCommand.COMPACT_UNSCHEDULE_FILE.toString(), HoodieCLI.tableMetadata.getBasePath(),
|
||||||
HoodieCLI.tableMetadata.getBasePath(), fileId, outputPathStr, "1", master,
|
fileId, outputPathStr, "1", master, sparkMemory, Boolean.valueOf(skipV).toString(),
|
||||||
sparkMemory, Boolean.valueOf(skipV).toString(), Boolean.valueOf(dryRun).toString());
|
Boolean.valueOf(dryRun).toString());
|
||||||
Process process = sparkLauncher.launch();
|
Process process = sparkLauncher.launch();
|
||||||
InputStreamConsumer.captureOutput(process);
|
InputStreamConsumer.captureOutput(process);
|
||||||
int exitCode = process.waitFor();
|
int exitCode = process.waitFor();
|
||||||
@@ -445,8 +423,8 @@ public class CompactionCommand implements CommandMarker {
|
|||||||
@CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit,
|
@CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit,
|
||||||
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField,
|
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField,
|
||||||
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending,
|
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending,
|
||||||
@CliOption(key = {
|
@CliOption(key = {"headeronly"}, help = "Print Header Only",
|
||||||
"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") boolean headerOnly)
|
unspecifiedDefaultValue = "false") boolean headerOnly)
|
||||||
throws Exception {
|
throws Exception {
|
||||||
boolean initialized = HoodieCLI.initConf();
|
boolean initialized = HoodieCLI.initConf();
|
||||||
HoodieCLI.initFS(initialized);
|
HoodieCLI.initFS(initialized);
|
||||||
@@ -455,12 +433,11 @@ public class CompactionCommand implements CommandMarker {
|
|||||||
String output = "";
|
String output = "";
|
||||||
if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
|
if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
|
||||||
try {
|
try {
|
||||||
String sparkPropertiesPath = Utils.getDefaultPropertiesFile(
|
String sparkPropertiesPath = Utils
|
||||||
scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
|
.getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
|
||||||
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
||||||
sparkLauncher.addAppArgs(SparkCommand.COMPACT_REPAIR.toString(),
|
sparkLauncher.addAppArgs(SparkCommand.COMPACT_REPAIR.toString(), HoodieCLI.tableMetadata.getBasePath(),
|
||||||
HoodieCLI.tableMetadata.getBasePath(), compactionInstant, outputPathStr, parallelism, master,
|
compactionInstant, outputPathStr, parallelism, master, sparkMemory, Boolean.valueOf(dryRun).toString());
|
||||||
sparkMemory, Boolean.valueOf(dryRun).toString());
|
|
||||||
Process process = sparkLauncher.launch();
|
Process process = sparkLauncher.launch();
|
||||||
InputStreamConsumer.captureOutput(process);
|
InputStreamConsumer.captureOutput(process);
|
||||||
int exitCode = process.waitFor();
|
int exitCode = process.waitFor();
|
||||||
@@ -481,41 +458,35 @@ public class CompactionCommand implements CommandMarker {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private String getRenamesToBePrinted(List<RenameOpResult> res, Integer limit,
|
private String getRenamesToBePrinted(List<RenameOpResult> res, Integer limit, String sortByField, boolean descending,
|
||||||
String sortByField, boolean descending, boolean headerOnly, String operation) {
|
boolean headerOnly, String operation) {
|
||||||
|
|
||||||
Option<Boolean> result = Option.fromJavaOptional(
|
Option<Boolean> result =
|
||||||
res.stream().map(r -> r.isExecuted() && r.isSuccess()).reduce(Boolean::logicalAnd));
|
Option.fromJavaOptional(res.stream().map(r -> r.isExecuted() && r.isSuccess()).reduce(Boolean::logicalAnd));
|
||||||
if (result.isPresent()) {
|
if (result.isPresent()) {
|
||||||
System.out.println("There were some file renames that needed to be done to " + operation);
|
System.out.println("There were some file renames that needed to be done to " + operation);
|
||||||
|
|
||||||
if (result.get()) {
|
if (result.get()) {
|
||||||
System.out.println("All renames successfully completed to " + operation + " done !!");
|
System.out.println("All renames successfully completed to " + operation + " done !!");
|
||||||
} else {
|
} else {
|
||||||
System.out.println("Some renames failed. DataSet could be in inconsistent-state. "
|
System.out
|
||||||
+ "Try running compaction repair");
|
.println("Some renames failed. DataSet could be in inconsistent-state. " + "Try running compaction repair");
|
||||||
}
|
}
|
||||||
|
|
||||||
List<Comparable[]> rows = new ArrayList<>();
|
List<Comparable[]> rows = new ArrayList<>();
|
||||||
res.stream().forEach(r -> {
|
res.stream().forEach(r -> {
|
||||||
Comparable[] row = new Comparable[] {
|
Comparable[] row =
|
||||||
r.getOperation().fileId, r.getOperation().srcPath, r.getOperation().destPath,
|
new Comparable[] {r.getOperation().fileId, r.getOperation().srcPath, r.getOperation().destPath,
|
||||||
r.isExecuted(), r.isSuccess(), r.getException().isPresent() ? r.getException().get().getMessage() : ""
|
r.isExecuted(), r.isSuccess(), r.getException().isPresent() ? r.getException().get().getMessage() : ""};
|
||||||
};
|
|
||||||
rows.add(row);
|
rows.add(row);
|
||||||
});
|
});
|
||||||
|
|
||||||
Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();
|
Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();
|
||||||
TableHeader header = new TableHeader()
|
TableHeader header = new TableHeader().addTableHeaderField("File Id").addTableHeaderField("Source File Path")
|
||||||
.addTableHeaderField("File Id")
|
.addTableHeaderField("Destination File Path").addTableHeaderField("Rename Executed?")
|
||||||
.addTableHeaderField("Source File Path")
|
.addTableHeaderField("Rename Succeeded?").addTableHeaderField("Error");
|
||||||
.addTableHeaderField("Destination File Path")
|
|
||||||
.addTableHeaderField("Rename Executed?")
|
|
||||||
.addTableHeaderField("Rename Succeeded?")
|
|
||||||
.addTableHeaderField("Error");
|
|
||||||
|
|
||||||
return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending,
|
return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows);
|
||||||
limit, headerOnly, rows);
|
|
||||||
} else {
|
} else {
|
||||||
return "No File renames needed to " + operation + ". Operation successful.";
|
return "No File renames needed to " + operation + ". Operation successful.";
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -52,13 +52,12 @@ public class DatasetsCommand implements CommandMarker {
|
|||||||
@CliOption(key = {"maxCheckIntervalMs"}, mandatory = false, unspecifiedDefaultValue = "300000",
|
@CliOption(key = {"maxCheckIntervalMs"}, mandatory = false, unspecifiedDefaultValue = "300000",
|
||||||
help = "Max wait time for eventual consistency") final Integer maxConsistencyIntervalMs,
|
help = "Max wait time for eventual consistency") final Integer maxConsistencyIntervalMs,
|
||||||
@CliOption(key = {"maxCheckIntervalMs"}, mandatory = false, unspecifiedDefaultValue = "7",
|
@CliOption(key = {"maxCheckIntervalMs"}, mandatory = false, unspecifiedDefaultValue = "7",
|
||||||
help = "Max checks for eventual consistency") final Integer maxConsistencyChecks) throws IOException {
|
help = "Max checks for eventual consistency") final Integer maxConsistencyChecks)
|
||||||
HoodieCLI.setConsistencyGuardConfig(
|
throws IOException {
|
||||||
ConsistencyGuardConfig.newBuilder()
|
HoodieCLI
|
||||||
.withConsistencyCheckEnabled(eventuallyConsistent)
|
.setConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(eventuallyConsistent)
|
||||||
.withInitialConsistencyCheckIntervalMs(initialConsistencyIntervalMs)
|
.withInitialConsistencyCheckIntervalMs(initialConsistencyIntervalMs)
|
||||||
.withMaxConsistencyCheckIntervalMs(maxConsistencyIntervalMs)
|
.withMaxConsistencyCheckIntervalMs(maxConsistencyIntervalMs).withMaxConsistencyChecks(maxConsistencyChecks)
|
||||||
.withMaxConsistencyChecks(maxConsistencyChecks)
|
|
||||||
.build());
|
.build());
|
||||||
HoodieCLI.initConf();
|
HoodieCLI.initConf();
|
||||||
HoodieCLI.connectTo(path);
|
HoodieCLI.connectTo(path);
|
||||||
@@ -70,8 +69,8 @@ public class DatasetsCommand implements CommandMarker {
|
|||||||
/**
|
/**
|
||||||
* Create a Hoodie Table if it does not exist
|
* Create a Hoodie Table if it does not exist
|
||||||
*
|
*
|
||||||
* @param path Base Path
|
* @param path Base Path
|
||||||
* @param name Hoodie Table Name
|
* @param name Hoodie Table Name
|
||||||
* @param tableTypeStr Hoodie Table Type
|
* @param tableTypeStr Hoodie Table Type
|
||||||
* @param payloadClass Payload Class
|
* @param payloadClass Payload Class
|
||||||
*/
|
*/
|
||||||
@@ -82,7 +81,8 @@ public class DatasetsCommand implements CommandMarker {
|
|||||||
@CliOption(key = {"tableType"}, unspecifiedDefaultValue = "COPY_ON_WRITE",
|
@CliOption(key = {"tableType"}, unspecifiedDefaultValue = "COPY_ON_WRITE",
|
||||||
help = "Hoodie Table Type. Must be one of : COPY_ON_WRITE or MERGE_ON_READ") final String tableTypeStr,
|
help = "Hoodie Table Type. Must be one of : COPY_ON_WRITE or MERGE_ON_READ") final String tableTypeStr,
|
||||||
@CliOption(key = {"payloadClass"}, unspecifiedDefaultValue = "org.apache.hudi.common.model.HoodieAvroPayload",
|
@CliOption(key = {"payloadClass"}, unspecifiedDefaultValue = "org.apache.hudi.common.model.HoodieAvroPayload",
|
||||||
help = "Payload Class") final String payloadClass) throws IOException {
|
help = "Payload Class") final String payloadClass)
|
||||||
|
throws IOException {
|
||||||
|
|
||||||
boolean initialized = HoodieCLI.initConf();
|
boolean initialized = HoodieCLI.initConf();
|
||||||
HoodieCLI.initFS(initialized);
|
HoodieCLI.initFS(initialized);
|
||||||
@@ -117,15 +117,13 @@ public class DatasetsCommand implements CommandMarker {
|
|||||||
*/
|
*/
|
||||||
@CliCommand(value = "desc", help = "Describle Hoodie Table properties")
|
@CliCommand(value = "desc", help = "Describle Hoodie Table properties")
|
||||||
public String descTable() {
|
public String descTable() {
|
||||||
TableHeader header = new TableHeader()
|
TableHeader header = new TableHeader().addTableHeaderField("Property").addTableHeaderField("Value");
|
||||||
.addTableHeaderField("Property")
|
|
||||||
.addTableHeaderField("Value");
|
|
||||||
List<Comparable[]> rows = new ArrayList<>();
|
List<Comparable[]> rows = new ArrayList<>();
|
||||||
rows.add(new Comparable[]{"basePath", HoodieCLI.tableMetadata.getBasePath()});
|
rows.add(new Comparable[] {"basePath", HoodieCLI.tableMetadata.getBasePath()});
|
||||||
rows.add(new Comparable[]{"metaPath", HoodieCLI.tableMetadata.getMetaPath()});
|
rows.add(new Comparable[] {"metaPath", HoodieCLI.tableMetadata.getMetaPath()});
|
||||||
rows.add(new Comparable[]{"fileSystem", HoodieCLI.tableMetadata.getFs().getScheme()});
|
rows.add(new Comparable[] {"fileSystem", HoodieCLI.tableMetadata.getFs().getScheme()});
|
||||||
HoodieCLI.tableMetadata.getTableConfig().getProps().entrySet().forEach(e -> {
|
HoodieCLI.tableMetadata.getTableConfig().getProps().entrySet().forEach(e -> {
|
||||||
rows.add(new Comparable[]{e.getKey(), e.getValue()});
|
rows.add(new Comparable[] {e.getKey(), e.getValue()});
|
||||||
});
|
});
|
||||||
return HoodiePrintHelper.print(header, new HashMap<>(), "", false, -1, false, rows);
|
return HoodiePrintHelper.print(header, new HashMap<>(), "", false, -1, false, rows);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -52,24 +52,23 @@ public class FileSystemViewCommand implements CommandMarker {
|
|||||||
|
|
||||||
@CliCommand(value = "show fsview all", help = "Show entire file-system view")
|
@CliCommand(value = "show fsview all", help = "Show entire file-system view")
|
||||||
public String showAllFileSlices(
|
public String showAllFileSlices(
|
||||||
@CliOption(key = {"pathRegex"},
|
@CliOption(key = {"pathRegex"}, help = "regex to select files, eg: 2016/08/02",
|
||||||
help = "regex to select files, eg: 2016/08/02", unspecifiedDefaultValue = "*/*/*") String globRegex,
|
unspecifiedDefaultValue = "*/*/*") String globRegex,
|
||||||
@CliOption(key = {"readOptimizedOnly"}, help = "Only display read-optimized view",
|
@CliOption(key = {"readOptimizedOnly"}, help = "Only display read-optimized view",
|
||||||
unspecifiedDefaultValue = "false") boolean readOptimizedOnly,
|
unspecifiedDefaultValue = "false") boolean readOptimizedOnly,
|
||||||
@CliOption(key = {"maxInstant"}, help = "File-Slices upto this instant are displayed",
|
@CliOption(key = {"maxInstant"}, help = "File-Slices upto this instant are displayed",
|
||||||
unspecifiedDefaultValue = "") String maxInstant,
|
unspecifiedDefaultValue = "") String maxInstant,
|
||||||
@CliOption(key = {
|
@CliOption(key = {"includeMax"}, help = "Include Max Instant",
|
||||||
"includeMax"}, help = "Include Max Instant", unspecifiedDefaultValue = "false") boolean includeMaxInstant,
|
unspecifiedDefaultValue = "false") boolean includeMaxInstant,
|
||||||
@CliOption(key = {
|
@CliOption(key = {"includeInflight"}, help = "Include Inflight Instants",
|
||||||
"includeInflight"}, help = "Include Inflight Instants", unspecifiedDefaultValue = "false")
|
unspecifiedDefaultValue = "false") boolean includeInflight,
|
||||||
boolean includeInflight,
|
@CliOption(key = {"excludeCompaction"}, help = "Exclude compaction Instants",
|
||||||
@CliOption(key = {"excludeCompaction"}, help = "Exclude compaction Instants", unspecifiedDefaultValue = "false")
|
unspecifiedDefaultValue = "false") boolean excludeCompaction,
|
||||||
boolean excludeCompaction,
|
|
||||||
@CliOption(key = {"limit"}, help = "Limit rows to be displayed", unspecifiedDefaultValue = "-1") Integer limit,
|
@CliOption(key = {"limit"}, help = "Limit rows to be displayed", unspecifiedDefaultValue = "-1") Integer limit,
|
||||||
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
|
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
|
||||||
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
|
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
|
||||||
@CliOption(key = {
|
@CliOption(key = {"headeronly"}, help = "Print Header Only",
|
||||||
"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly)
|
unspecifiedDefaultValue = "false") final boolean headerOnly)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
|
||||||
HoodieTableFileSystemView fsView = buildFileSystemView(globRegex, maxInstant, readOptimizedOnly, includeMaxInstant,
|
HoodieTableFileSystemView fsView = buildFileSystemView(globRegex, maxInstant, readOptimizedOnly, includeMaxInstant,
|
||||||
@@ -97,15 +96,10 @@ public class FileSystemViewCommand implements CommandMarker {
|
|||||||
fieldNameToConverterMap.put("Total Delta File Size", converterFunction);
|
fieldNameToConverterMap.put("Total Delta File Size", converterFunction);
|
||||||
fieldNameToConverterMap.put("Data-File Size", converterFunction);
|
fieldNameToConverterMap.put("Data-File Size", converterFunction);
|
||||||
|
|
||||||
TableHeader header = new TableHeader()
|
TableHeader header = new TableHeader().addTableHeaderField("Partition").addTableHeaderField("FileId")
|
||||||
.addTableHeaderField("Partition")
|
.addTableHeaderField("Base-Instant").addTableHeaderField("Data-File").addTableHeaderField("Data-File Size");
|
||||||
.addTableHeaderField("FileId")
|
|
||||||
.addTableHeaderField("Base-Instant")
|
|
||||||
.addTableHeaderField("Data-File")
|
|
||||||
.addTableHeaderField("Data-File Size");
|
|
||||||
if (!readOptimizedOnly) {
|
if (!readOptimizedOnly) {
|
||||||
header = header.addTableHeaderField("Num Delta Files")
|
header = header.addTableHeaderField("Num Delta Files").addTableHeaderField("Total Delta File Size")
|
||||||
.addTableHeaderField("Total Delta File Size")
|
|
||||||
.addTableHeaderField("Delta Files");
|
.addTableHeaderField("Delta Files");
|
||||||
}
|
}
|
||||||
return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows);
|
return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows);
|
||||||
@@ -113,25 +107,24 @@ public class FileSystemViewCommand implements CommandMarker {
|
|||||||
|
|
||||||
@CliCommand(value = "show fsview latest", help = "Show latest file-system view")
|
@CliCommand(value = "show fsview latest", help = "Show latest file-system view")
|
||||||
public String showLatestFileSlices(
|
public String showLatestFileSlices(
|
||||||
@CliOption(key = {"partitionPath"},
|
@CliOption(key = {"partitionPath"}, help = "A valid paritition path", mandatory = true) String partition,
|
||||||
help = "A valid paritition path", mandatory = true) String partition,
|
|
||||||
@CliOption(key = {"readOptimizedOnly"}, help = "Only display read-optimized view",
|
@CliOption(key = {"readOptimizedOnly"}, help = "Only display read-optimized view",
|
||||||
unspecifiedDefaultValue = "false") boolean readOptimizedOnly,
|
unspecifiedDefaultValue = "false") boolean readOptimizedOnly,
|
||||||
@CliOption(key = {"maxInstant"}, help = "File-Slices upto this instant are displayed",
|
@CliOption(key = {"maxInstant"}, help = "File-Slices upto this instant are displayed",
|
||||||
unspecifiedDefaultValue = "") String maxInstant,
|
unspecifiedDefaultValue = "") String maxInstant,
|
||||||
@CliOption(key = {"merge"}, help = "Merge File Slices due to pending compaction",
|
@CliOption(key = {"merge"}, help = "Merge File Slices due to pending compaction",
|
||||||
unspecifiedDefaultValue = "true") final boolean merge,
|
unspecifiedDefaultValue = "true") final boolean merge,
|
||||||
@CliOption(key = {"includeMax"}, help = "Include Max Instant", unspecifiedDefaultValue = "false")
|
@CliOption(key = {"includeMax"}, help = "Include Max Instant",
|
||||||
boolean includeMaxInstant,
|
unspecifiedDefaultValue = "false") boolean includeMaxInstant,
|
||||||
@CliOption(key = {"includeInflight"}, help = "Include Inflight Instants", unspecifiedDefaultValue = "false")
|
@CliOption(key = {"includeInflight"}, help = "Include Inflight Instants",
|
||||||
boolean includeInflight,
|
unspecifiedDefaultValue = "false") boolean includeInflight,
|
||||||
@CliOption(key = {"excludeCompaction"}, help = "Exclude compaction Instants", unspecifiedDefaultValue = "false")
|
@CliOption(key = {"excludeCompaction"}, help = "Exclude compaction Instants",
|
||||||
boolean excludeCompaction,
|
unspecifiedDefaultValue = "false") boolean excludeCompaction,
|
||||||
@CliOption(key = {"limit"}, help = "Limit rows to be displayed", unspecifiedDefaultValue = "-1") Integer limit,
|
@CliOption(key = {"limit"}, help = "Limit rows to be displayed", unspecifiedDefaultValue = "-1") Integer limit,
|
||||||
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
|
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
|
||||||
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
|
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
|
||||||
@CliOption(key = {
|
@CliOption(key = {"headeronly"}, help = "Print Header Only",
|
||||||
"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly)
|
unspecifiedDefaultValue = "false") final boolean headerOnly)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
|
||||||
HoodieTableFileSystemView fsView = buildFileSystemView(partition, maxInstant, readOptimizedOnly, includeMaxInstant,
|
HoodieTableFileSystemView fsView = buildFileSystemView(partition, maxInstant, readOptimizedOnly, includeMaxInstant,
|
||||||
@@ -163,28 +156,25 @@ public class FileSystemViewCommand implements CommandMarker {
|
|||||||
if (!readOptimizedOnly) {
|
if (!readOptimizedOnly) {
|
||||||
row[idx++] = fs.getLogFiles().count();
|
row[idx++] = fs.getLogFiles().count();
|
||||||
row[idx++] = fs.getLogFiles().mapToLong(lf -> lf.getFileSize()).sum();
|
row[idx++] = fs.getLogFiles().mapToLong(lf -> lf.getFileSize()).sum();
|
||||||
long logFilesScheduledForCompactionTotalSize = fs.getLogFiles()
|
long logFilesScheduledForCompactionTotalSize =
|
||||||
.filter(lf -> lf.getBaseCommitTime().equals(fs.getBaseInstantTime()))
|
fs.getLogFiles().filter(lf -> lf.getBaseCommitTime().equals(fs.getBaseInstantTime()))
|
||||||
.mapToLong(lf -> lf.getFileSize()).sum();
|
.mapToLong(lf -> lf.getFileSize()).sum();
|
||||||
row[idx++] = logFilesScheduledForCompactionTotalSize;
|
row[idx++] = logFilesScheduledForCompactionTotalSize;
|
||||||
|
|
||||||
long logFilesUnscheduledTotalSize = fs.getLogFiles()
|
long logFilesUnscheduledTotalSize =
|
||||||
.filter(lf -> !lf.getBaseCommitTime().equals(fs.getBaseInstantTime()))
|
fs.getLogFiles().filter(lf -> !lf.getBaseCommitTime().equals(fs.getBaseInstantTime()))
|
||||||
.mapToLong(lf -> lf.getFileSize()).sum();
|
.mapToLong(lf -> lf.getFileSize()).sum();
|
||||||
row[idx++] = logFilesUnscheduledTotalSize;
|
row[idx++] = logFilesUnscheduledTotalSize;
|
||||||
|
|
||||||
double logSelectedForCompactionToBaseRatio =
|
double logSelectedForCompactionToBaseRatio =
|
||||||
dataFileSize > 0 ? logFilesScheduledForCompactionTotalSize / (dataFileSize * 1.0) : -1;
|
dataFileSize > 0 ? logFilesScheduledForCompactionTotalSize / (dataFileSize * 1.0) : -1;
|
||||||
row[idx++] = logSelectedForCompactionToBaseRatio;
|
row[idx++] = logSelectedForCompactionToBaseRatio;
|
||||||
double logUnscheduledToBaseRatio =
|
double logUnscheduledToBaseRatio = dataFileSize > 0 ? logFilesUnscheduledTotalSize / (dataFileSize * 1.0) : -1;
|
||||||
dataFileSize > 0 ? logFilesUnscheduledTotalSize / (dataFileSize * 1.0) : -1;
|
|
||||||
row[idx++] = logUnscheduledToBaseRatio;
|
row[idx++] = logUnscheduledToBaseRatio;
|
||||||
|
|
||||||
row[idx++] = fs.getLogFiles()
|
row[idx++] = fs.getLogFiles().filter(lf -> lf.getBaseCommitTime().equals(fs.getBaseInstantTime()))
|
||||||
.filter(lf -> lf.getBaseCommitTime().equals(fs.getBaseInstantTime()))
|
|
||||||
.collect(Collectors.toList()).toString();
|
.collect(Collectors.toList()).toString();
|
||||||
row[idx++] = fs.getLogFiles()
|
row[idx++] = fs.getLogFiles().filter(lf -> !lf.getBaseCommitTime().equals(fs.getBaseInstantTime()))
|
||||||
.filter(lf -> !lf.getBaseCommitTime().equals(fs.getBaseInstantTime()))
|
|
||||||
.collect(Collectors.toList()).toString();
|
.collect(Collectors.toList()).toString();
|
||||||
}
|
}
|
||||||
rows.add(row);
|
rows.add(row);
|
||||||
@@ -200,16 +190,11 @@ public class FileSystemViewCommand implements CommandMarker {
|
|||||||
fieldNameToConverterMap.put("Delta Size - compaction unscheduled", converterFunction);
|
fieldNameToConverterMap.put("Delta Size - compaction unscheduled", converterFunction);
|
||||||
}
|
}
|
||||||
|
|
||||||
TableHeader header = new TableHeader()
|
TableHeader header = new TableHeader().addTableHeaderField("Partition").addTableHeaderField("FileId")
|
||||||
.addTableHeaderField("Partition")
|
.addTableHeaderField("Base-Instant").addTableHeaderField("Data-File").addTableHeaderField("Data-File Size");
|
||||||
.addTableHeaderField("FileId")
|
|
||||||
.addTableHeaderField("Base-Instant")
|
|
||||||
.addTableHeaderField("Data-File")
|
|
||||||
.addTableHeaderField("Data-File Size");
|
|
||||||
|
|
||||||
if (!readOptimizedOnly) {
|
if (!readOptimizedOnly) {
|
||||||
header = header.addTableHeaderField("Num Delta Files")
|
header = header.addTableHeaderField("Num Delta Files").addTableHeaderField("Total Delta Size")
|
||||||
.addTableHeaderField("Total Delta Size")
|
|
||||||
.addTableHeaderField("Delta Size - compaction scheduled")
|
.addTableHeaderField("Delta Size - compaction scheduled")
|
||||||
.addTableHeaderField("Delta Size - compaction unscheduled")
|
.addTableHeaderField("Delta Size - compaction unscheduled")
|
||||||
.addTableHeaderField("Delta To Base Ratio - compaction scheduled")
|
.addTableHeaderField("Delta To Base Ratio - compaction scheduled")
|
||||||
@@ -222,19 +207,20 @@ public class FileSystemViewCommand implements CommandMarker {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Build File System View
|
* Build File System View
|
||||||
|
*
|
||||||
* @param globRegex Path Regex
|
* @param globRegex Path Regex
|
||||||
* @param maxInstant Max Instants to be used for displaying file-instants
|
* @param maxInstant Max Instants to be used for displaying file-instants
|
||||||
* @param readOptimizedOnly Include only read optimized view
|
* @param readOptimizedOnly Include only read optimized view
|
||||||
* @param includeMaxInstant Include Max instant
|
* @param includeMaxInstant Include Max instant
|
||||||
* @param includeInflight Include inflight instants
|
* @param includeInflight Include inflight instants
|
||||||
* @param excludeCompaction Exclude Compaction instants
|
* @param excludeCompaction Exclude Compaction instants
|
||||||
* @return
|
* @return
|
||||||
* @throws IOException
|
* @throws IOException
|
||||||
*/
|
*/
|
||||||
private HoodieTableFileSystemView buildFileSystemView(String globRegex, String maxInstant, boolean readOptimizedOnly,
|
private HoodieTableFileSystemView buildFileSystemView(String globRegex, String maxInstant, boolean readOptimizedOnly,
|
||||||
boolean includeMaxInstant, boolean includeInflight, boolean excludeCompaction) throws IOException {
|
boolean includeMaxInstant, boolean includeInflight, boolean excludeCompaction) throws IOException {
|
||||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(HoodieCLI.tableMetadata.getHadoopConf(),
|
HoodieTableMetaClient metaClient =
|
||||||
HoodieCLI.tableMetadata.getBasePath(), true);
|
new HoodieTableMetaClient(HoodieCLI.tableMetadata.getHadoopConf(), HoodieCLI.tableMetadata.getBasePath(), true);
|
||||||
FileSystem fs = HoodieCLI.fs;
|
FileSystem fs = HoodieCLI.fs;
|
||||||
String globPath = String.format("%s/%s/*", HoodieCLI.tableMetadata.getBasePath(), globRegex);
|
String globPath = String.format("%s/%s/*", HoodieCLI.tableMetadata.getBasePath(), globRegex);
|
||||||
FileStatus[] statuses = fs.globStatus(new Path(globPath));
|
FileStatus[] statuses = fs.globStatus(new Path(globPath));
|
||||||
|
|||||||
@@ -43,17 +43,17 @@ public class HDFSParquetImportCommand implements CommandMarker {
|
|||||||
@CliOption(key = "upsert", mandatory = false, unspecifiedDefaultValue = "false",
|
@CliOption(key = "upsert", mandatory = false, unspecifiedDefaultValue = "false",
|
||||||
help = "Uses upsert API instead of the default insert API of WriteClient") boolean useUpsert,
|
help = "Uses upsert API instead of the default insert API of WriteClient") boolean useUpsert,
|
||||||
@CliOption(key = "srcPath", mandatory = true, help = "Base path for the input dataset") final String srcPath,
|
@CliOption(key = "srcPath", mandatory = true, help = "Base path for the input dataset") final String srcPath,
|
||||||
@CliOption(key = "targetPath", mandatory = true, help = "Base path for the target hoodie dataset") final String
|
@CliOption(key = "targetPath", mandatory = true,
|
||||||
targetPath,
|
help = "Base path for the target hoodie dataset") final String targetPath,
|
||||||
@CliOption(key = "tableName", mandatory = true, help = "Table name") final String tableName,
|
@CliOption(key = "tableName", mandatory = true, help = "Table name") final String tableName,
|
||||||
@CliOption(key = "tableType", mandatory = true, help = "Table type") final String tableType,
|
@CliOption(key = "tableType", mandatory = true, help = "Table type") final String tableType,
|
||||||
@CliOption(key = "rowKeyField", mandatory = true, help = "Row key field name") final String rowKeyField,
|
@CliOption(key = "rowKeyField", mandatory = true, help = "Row key field name") final String rowKeyField,
|
||||||
@CliOption(key = "partitionPathField", mandatory = true, help = "Partition path field name") final String
|
@CliOption(key = "partitionPathField", mandatory = true,
|
||||||
partitionPathField,
|
help = "Partition path field name") final String partitionPathField,
|
||||||
@CliOption(key = {
|
@CliOption(key = {"parallelism"}, mandatory = true,
|
||||||
"parallelism"}, mandatory = true, help = "Parallelism for hoodie insert") final String parallelism,
|
help = "Parallelism for hoodie insert") final String parallelism,
|
||||||
@CliOption(key = "schemaFilePath", mandatory = true, help = "Path for Avro schema file") final String
|
@CliOption(key = "schemaFilePath", mandatory = true,
|
||||||
schemaFilePath,
|
help = "Path for Avro schema file") final String schemaFilePath,
|
||||||
@CliOption(key = "format", mandatory = true, help = "Format for the input data") final String format,
|
@CliOption(key = "format", mandatory = true, help = "Format for the input data") final String format,
|
||||||
@CliOption(key = "sparkMemory", mandatory = true, help = "Spark executor memory") final String sparkMemory,
|
@CliOption(key = "sparkMemory", mandatory = true, help = "Spark executor memory") final String sparkMemory,
|
||||||
@CliOption(key = "retry", mandatory = true, help = "Number of retries") final String retry) throws Exception {
|
@CliOption(key = "retry", mandatory = true, help = "Number of retries") final String retry) throws Exception {
|
||||||
@@ -62,8 +62,8 @@ public class HDFSParquetImportCommand implements CommandMarker {
|
|||||||
|
|
||||||
boolean initialized = HoodieCLI.initConf();
|
boolean initialized = HoodieCLI.initConf();
|
||||||
HoodieCLI.initFS(initialized);
|
HoodieCLI.initFS(initialized);
|
||||||
String sparkPropertiesPath = Utils.getDefaultPropertiesFile(
|
String sparkPropertiesPath =
|
||||||
JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala());
|
Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala());
|
||||||
|
|
||||||
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
||||||
|
|
||||||
@@ -72,8 +72,8 @@ public class HDFSParquetImportCommand implements CommandMarker {
|
|||||||
cmd = SparkCommand.UPSERT.toString();
|
cmd = SparkCommand.UPSERT.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
sparkLauncher.addAppArgs(cmd, srcPath, targetPath, tableName, tableType, rowKeyField,
|
sparkLauncher.addAppArgs(cmd, srcPath, targetPath, tableName, tableType, rowKeyField, partitionPathField,
|
||||||
partitionPathField, parallelism, schemaFilePath, sparkMemory, retry);
|
parallelism, schemaFilePath, sparkMemory, retry);
|
||||||
Process process = sparkLauncher.launch();
|
Process process = sparkLauncher.launch();
|
||||||
InputStreamConsumer.captureOutput(process);
|
InputStreamConsumer.captureOutput(process);
|
||||||
int exitCode = process.waitFor();
|
int exitCode = process.waitFor();
|
||||||
|
|||||||
@@ -69,30 +69,29 @@ public class HoodieLogFileCommand implements CommandMarker {
|
|||||||
|
|
||||||
@CliCommand(value = "show logfile metadata", help = "Read commit metadata from log files")
|
@CliCommand(value = "show logfile metadata", help = "Read commit metadata from log files")
|
||||||
public String showLogFileCommits(
|
public String showLogFileCommits(
|
||||||
@CliOption(key = "logFilePathPattern", mandatory = true, help = "Fully qualified path for the log file") final
|
@CliOption(key = "logFilePathPattern", mandatory = true,
|
||||||
String logFilePathPattern,
|
help = "Fully qualified path for the log file") final String logFilePathPattern,
|
||||||
@CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
|
@CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
|
||||||
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
|
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
|
||||||
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
|
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
|
||||||
@CliOption(key = {"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false")
|
@CliOption(key = {"headeronly"}, help = "Print Header Only",
|
||||||
final boolean headerOnly) throws IOException {
|
unspecifiedDefaultValue = "false") final boolean headerOnly)
|
||||||
|
throws IOException {
|
||||||
|
|
||||||
FileSystem fs = HoodieCLI.tableMetadata.getFs();
|
FileSystem fs = HoodieCLI.tableMetadata.getFs();
|
||||||
List<String> logFilePaths = Arrays.stream(fs.globStatus(new Path(logFilePathPattern)))
|
List<String> logFilePaths = Arrays.stream(fs.globStatus(new Path(logFilePathPattern)))
|
||||||
.map(status -> status.getPath().toString()).collect(Collectors.toList());
|
.map(status -> status.getPath().toString()).collect(Collectors.toList());
|
||||||
Map<String, List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType,
|
Map<String, List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer>>> commitCountAndMetadata =
|
||||||
String>>, Integer>>>
|
Maps.newHashMap();
|
||||||
commitCountAndMetadata = Maps.newHashMap();
|
|
||||||
int totalEntries = 0;
|
int totalEntries = 0;
|
||||||
int numCorruptBlocks = 0;
|
int numCorruptBlocks = 0;
|
||||||
int dummyInstantTimeCount = 0;
|
int dummyInstantTimeCount = 0;
|
||||||
|
|
||||||
for (String logFilePath : logFilePaths) {
|
for (String logFilePath : logFilePaths) {
|
||||||
FileStatus[] fsStatus = fs.listStatus(new Path(logFilePath));
|
FileStatus[] fsStatus = fs.listStatus(new Path(logFilePath));
|
||||||
Schema writerSchema = new AvroSchemaConverter().convert(
|
Schema writerSchema = new AvroSchemaConverter()
|
||||||
SchemaUtil.readSchemaFromLogFile(HoodieCLI.tableMetadata.getFs(), new Path(logFilePath)));
|
.convert(SchemaUtil.readSchemaFromLogFile(HoodieCLI.tableMetadata.getFs(), new Path(logFilePath)));
|
||||||
Reader reader = HoodieLogFormat
|
Reader reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(fsStatus[0].getPath()), writerSchema);
|
||||||
.newReader(fs, new HoodieLogFile(fsStatus[0].getPath()), writerSchema);
|
|
||||||
|
|
||||||
// read the avro blocks
|
// read the avro blocks
|
||||||
while (reader.hasNext()) {
|
while (reader.hasNext()) {
|
||||||
@@ -126,8 +125,8 @@ public class HoodieLogFileCommand implements CommandMarker {
|
|||||||
new Tuple3<>(n.getBlockType(), new Tuple2<>(n.getLogBlockHeader(), n.getLogBlockFooter()), recordCount));
|
new Tuple3<>(n.getBlockType(), new Tuple2<>(n.getLogBlockHeader(), n.getLogBlockFooter()), recordCount));
|
||||||
totalEntries++;
|
totalEntries++;
|
||||||
} else {
|
} else {
|
||||||
List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>,
|
List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer>> list =
|
||||||
Integer>> list = new ArrayList<>();
|
new ArrayList<>();
|
||||||
list.add(
|
list.add(
|
||||||
new Tuple3<>(n.getBlockType(), new Tuple2<>(n.getLogBlockHeader(), n.getLogBlockFooter()), recordCount));
|
new Tuple3<>(n.getBlockType(), new Tuple2<>(n.getLogBlockHeader(), n.getLogBlockFooter()), recordCount));
|
||||||
commitCountAndMetadata.put(instantTime, list);
|
commitCountAndMetadata.put(instantTime, list);
|
||||||
@@ -139,12 +138,11 @@ public class HoodieLogFileCommand implements CommandMarker {
|
|||||||
List<Comparable[]> rows = new ArrayList<>();
|
List<Comparable[]> rows = new ArrayList<>();
|
||||||
int i = 0;
|
int i = 0;
|
||||||
ObjectMapper objectMapper = new ObjectMapper();
|
ObjectMapper objectMapper = new ObjectMapper();
|
||||||
for (Map.Entry<String, List<Tuple3<HoodieLogBlockType,
|
for (Map.Entry<String, List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer>>> entry : commitCountAndMetadata
|
||||||
Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer>>> entry
|
.entrySet()) {
|
||||||
: commitCountAndMetadata.entrySet()) {
|
|
||||||
String instantTime = entry.getKey().toString();
|
String instantTime = entry.getKey().toString();
|
||||||
for (Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>,
|
for (Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer> tuple3 : entry
|
||||||
Map<HeaderMetadataType, String>>, Integer> tuple3 : entry.getValue()) {
|
.getValue()) {
|
||||||
Comparable[] output = new Comparable[5];
|
Comparable[] output = new Comparable[5];
|
||||||
output[0] = instantTime;
|
output[0] = instantTime;
|
||||||
output[1] = tuple3._3();
|
output[1] = tuple3._3();
|
||||||
@@ -156,21 +154,18 @@ public class HoodieLogFileCommand implements CommandMarker {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
TableHeader header = new TableHeader()
|
TableHeader header = new TableHeader().addTableHeaderField("InstantTime").addTableHeaderField("RecordCount")
|
||||||
.addTableHeaderField("InstantTime")
|
.addTableHeaderField("BlockType").addTableHeaderField("HeaderMetadata").addTableHeaderField("FooterMetadata");
|
||||||
.addTableHeaderField("RecordCount")
|
|
||||||
.addTableHeaderField("BlockType")
|
|
||||||
.addTableHeaderField("HeaderMetadata")
|
|
||||||
.addTableHeaderField("FooterMetadata");
|
|
||||||
|
|
||||||
return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows);
|
return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows);
|
||||||
}
|
}
|
||||||
|
|
||||||
@CliCommand(value = "show logfile records", help = "Read records from log files")
|
@CliCommand(value = "show logfile records", help = "Read records from log files")
|
||||||
public String showLogFileRecords(@CliOption(key = {
|
public String showLogFileRecords(
|
||||||
"limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10") final Integer limit,
|
@CliOption(key = {"limit"}, mandatory = false, help = "Limit commits",
|
||||||
@CliOption(key = "logFilePathPattern", mandatory = true, help = "Fully qualified paths for the log files")
|
unspecifiedDefaultValue = "10") final Integer limit,
|
||||||
final String logFilePathPattern,
|
@CliOption(key = "logFilePathPattern", mandatory = true,
|
||||||
|
help = "Fully qualified paths for the log files") final String logFilePathPattern,
|
||||||
@CliOption(key = "mergeRecords", mandatory = false, help = "If the records in the log files should be merged",
|
@CliOption(key = "mergeRecords", mandatory = false, help = "If the records in the log files should be merged",
|
||||||
unspecifiedDefaultValue = "false") final Boolean shouldMerge)
|
unspecifiedDefaultValue = "false") final Boolean shouldMerge)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
@@ -184,22 +179,21 @@ public class HoodieLogFileCommand implements CommandMarker {
|
|||||||
// TODO : readerSchema can change across blocks/log files, fix this inside Scanner
|
// TODO : readerSchema can change across blocks/log files, fix this inside Scanner
|
||||||
AvroSchemaConverter converter = new AvroSchemaConverter();
|
AvroSchemaConverter converter = new AvroSchemaConverter();
|
||||||
// get schema from last log file
|
// get schema from last log file
|
||||||
Schema readerSchema = converter.convert(
|
Schema readerSchema =
|
||||||
SchemaUtil.readSchemaFromLogFile(fs, new Path(logFilePaths.get(logFilePaths.size() - 1))));
|
converter.convert(SchemaUtil.readSchemaFromLogFile(fs, new Path(logFilePaths.get(logFilePaths.size() - 1))));
|
||||||
|
|
||||||
List<IndexedRecord> allRecords = new ArrayList<>();
|
List<IndexedRecord> allRecords = new ArrayList<>();
|
||||||
|
|
||||||
if (shouldMerge) {
|
if (shouldMerge) {
|
||||||
System.out.println("===========================> MERGING RECORDS <===================");
|
System.out.println("===========================> MERGING RECORDS <===================");
|
||||||
HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs,
|
HoodieMergedLogRecordScanner scanner =
|
||||||
HoodieCLI.tableMetadata.getBasePath(), logFilePaths, readerSchema,
|
new HoodieMergedLogRecordScanner(fs, HoodieCLI.tableMetadata.getBasePath(), logFilePaths, readerSchema,
|
||||||
HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline().lastInstant().get()
|
HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline().lastInstant().get().getTimestamp(),
|
||||||
.getTimestamp(),
|
Long.valueOf(HoodieMemoryConfig.DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES),
|
||||||
Long.valueOf(HoodieMemoryConfig.DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES),
|
Boolean.valueOf(HoodieCompactionConfig.DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED),
|
||||||
Boolean.valueOf(HoodieCompactionConfig.DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED),
|
Boolean.valueOf(HoodieCompactionConfig.DEFAULT_COMPACTION_REVERSE_LOG_READ_ENABLED),
|
||||||
Boolean.valueOf(HoodieCompactionConfig.DEFAULT_COMPACTION_REVERSE_LOG_READ_ENABLED),
|
Integer.valueOf(HoodieMemoryConfig.DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE),
|
||||||
Integer.valueOf(HoodieMemoryConfig.DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE),
|
HoodieMemoryConfig.DEFAULT_SPILLABLE_MAP_BASE_PATH);
|
||||||
HoodieMemoryConfig.DEFAULT_SPILLABLE_MAP_BASE_PATH);
|
|
||||||
for (HoodieRecord<? extends HoodieRecordPayload> hoodieRecord : scanner) {
|
for (HoodieRecord<? extends HoodieRecordPayload> hoodieRecord : scanner) {
|
||||||
Option<IndexedRecord> record = hoodieRecord.getData().getInsertValue(readerSchema);
|
Option<IndexedRecord> record = hoodieRecord.getData().getInsertValue(readerSchema);
|
||||||
if (allRecords.size() >= limit) {
|
if (allRecords.size() >= limit) {
|
||||||
@@ -209,10 +203,10 @@ public class HoodieLogFileCommand implements CommandMarker {
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (String logFile : logFilePaths) {
|
for (String logFile : logFilePaths) {
|
||||||
Schema writerSchema = new AvroSchemaConverter().convert(
|
Schema writerSchema = new AvroSchemaConverter()
|
||||||
SchemaUtil.readSchemaFromLogFile(HoodieCLI.tableMetadata.getFs(), new Path(logFile)));
|
.convert(SchemaUtil.readSchemaFromLogFile(HoodieCLI.tableMetadata.getFs(), new Path(logFile)));
|
||||||
HoodieLogFormat.Reader reader = HoodieLogFormat
|
HoodieLogFormat.Reader reader =
|
||||||
.newReader(fs, new HoodieLogFile(new Path(logFile)), writerSchema);
|
HoodieLogFormat.newReader(fs, new HoodieLogFile(new Path(logFile)), writerSchema);
|
||||||
// read the avro blocks
|
// read the avro blocks
|
||||||
while (reader.hasNext()) {
|
while (reader.hasNext()) {
|
||||||
HoodieLogBlock n = reader.next();
|
HoodieLogBlock n = reader.next();
|
||||||
|
|||||||
@@ -44,19 +44,16 @@ public class HoodieSyncCommand implements CommandMarker {
|
|||||||
public String validateSync(
|
public String validateSync(
|
||||||
@CliOption(key = {"mode"}, unspecifiedDefaultValue = "complete", help = "Check mode") final String mode,
|
@CliOption(key = {"mode"}, unspecifiedDefaultValue = "complete", help = "Check mode") final String mode,
|
||||||
@CliOption(key = {"sourceDb"}, unspecifiedDefaultValue = "rawdata", help = "source database") final String srcDb,
|
@CliOption(key = {"sourceDb"}, unspecifiedDefaultValue = "rawdata", help = "source database") final String srcDb,
|
||||||
@CliOption(key = {
|
@CliOption(key = {"targetDb"}, unspecifiedDefaultValue = "dwh_hoodie",
|
||||||
"targetDb"}, unspecifiedDefaultValue = "dwh_hoodie", help = "target database") final String tgtDb,
|
help = "target database") final String tgtDb,
|
||||||
@CliOption(key = {
|
@CliOption(key = {"partitionCount"}, unspecifiedDefaultValue = "5",
|
||||||
"partitionCount"}, unspecifiedDefaultValue = "5", help = "total number of recent partitions to validate")
|
help = "total number of recent partitions to validate") final int partitionCount,
|
||||||
final int partitionCount,
|
@CliOption(key = {"hiveServerUrl"}, mandatory = true,
|
||||||
@CliOption(key = {
|
help = "hiveServerURL to connect to") final String hiveServerUrl,
|
||||||
"hiveServerUrl"}, mandatory = true, help = "hiveServerURL to connect to") final String hiveServerUrl,
|
@CliOption(key = {"hiveUser"}, mandatory = false, unspecifiedDefaultValue = "",
|
||||||
@CliOption(key = {
|
help = "hive username to connect to") final String hiveUser,
|
||||||
"hiveUser"}, mandatory = false, unspecifiedDefaultValue = "", help = "hive username to connect to") final
|
@CliOption(key = {"hivePass"}, mandatory = true, unspecifiedDefaultValue = "",
|
||||||
String hiveUser,
|
help = "hive password to connect to") final String hivePass)
|
||||||
@CliOption(key = {
|
|
||||||
"hivePass"}, mandatory = true, unspecifiedDefaultValue = "", help = "hive password to connect to") final
|
|
||||||
String hivePass)
|
|
||||||
throws Exception {
|
throws Exception {
|
||||||
HoodieTableMetaClient target = HoodieCLI.syncTableMetadata;
|
HoodieTableMetaClient target = HoodieCLI.syncTableMetadata;
|
||||||
HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsTimeline();
|
HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsTimeline();
|
||||||
@@ -77,8 +74,8 @@ public class HoodieSyncCommand implements CommandMarker {
|
|||||||
String sourceLatestCommit =
|
String sourceLatestCommit =
|
||||||
sourceTimeline.getInstants().iterator().hasNext() ? "0" : sourceTimeline.lastInstant().get().getTimestamp();
|
sourceTimeline.getInstants().iterator().hasNext() ? "0" : sourceTimeline.lastInstant().get().getTimestamp();
|
||||||
|
|
||||||
if (sourceLatestCommit != null && HoodieTimeline.compareTimestamps(targetLatestCommit, sourceLatestCommit,
|
if (sourceLatestCommit != null
|
||||||
HoodieTimeline.GREATER)) {
|
&& HoodieTimeline.compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) {
|
||||||
// source is behind the target
|
// source is behind the target
|
||||||
List<HoodieInstant> commitsToCatchup = targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE)
|
List<HoodieInstant> commitsToCatchup = targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE)
|
||||||
.getInstants().collect(Collectors.toList());
|
.getInstants().collect(Collectors.toList());
|
||||||
@@ -89,8 +86,8 @@ public class HoodieSyncCommand implements CommandMarker {
|
|||||||
long newInserts = CommitUtil.countNewRecords(target,
|
long newInserts = CommitUtil.countNewRecords(target,
|
||||||
commitsToCatchup.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList()));
|
commitsToCatchup.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList()));
|
||||||
return "Count difference now is (count(" + target.getTableConfig().getTableName() + ") - count("
|
return "Count difference now is (count(" + target.getTableConfig().getTableName() + ") - count("
|
||||||
+ source.getTableConfig().getTableName()
|
+ source.getTableConfig().getTableName() + ") == " + (targetCount - sourceCount) + ". Catch up count is "
|
||||||
+ ") == " + (targetCount - sourceCount) + ". Catch up count is " + newInserts;
|
+ newInserts;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
List<HoodieInstant> commitsToCatchup = sourceTimeline.findInstantsAfter(targetLatestCommit, Integer.MAX_VALUE)
|
List<HoodieInstant> commitsToCatchup = sourceTimeline.findInstantsAfter(targetLatestCommit, Integer.MAX_VALUE)
|
||||||
@@ -102,8 +99,8 @@ public class HoodieSyncCommand implements CommandMarker {
|
|||||||
long newInserts = CommitUtil.countNewRecords(source,
|
long newInserts = CommitUtil.countNewRecords(source,
|
||||||
commitsToCatchup.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList()));
|
commitsToCatchup.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList()));
|
||||||
return "Count difference now is (count(" + source.getTableConfig().getTableName() + ") - count("
|
return "Count difference now is (count(" + source.getTableConfig().getTableName() + ") - count("
|
||||||
+ target.getTableConfig().getTableName()
|
+ target.getTableConfig().getTableName() + ") == " + (sourceCount - targetCount) + ". Catch up count is "
|
||||||
+ ") == " + (sourceCount - targetCount) + ". Catch up count is " + newInserts;
|
+ newInserts;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -47,16 +47,15 @@ public class RepairsCommand implements CommandMarker {
|
|||||||
return HoodieCLI.tableMetadata != null;
|
return HoodieCLI.tableMetadata != null;
|
||||||
}
|
}
|
||||||
|
|
||||||
@CliCommand(value = "repair deduplicate", help = "De-duplicate a partition path contains duplicates & produce "
|
@CliCommand(value = "repair deduplicate",
|
||||||
+ "repaired files to replace with")
|
help = "De-duplicate a partition path contains duplicates & produce " + "repaired files to replace with")
|
||||||
public String deduplicate(@CliOption(key = {
|
public String deduplicate(
|
||||||
"duplicatedPartitionPath"}, help = "Partition Path containing the duplicates", mandatory = true) final String
|
@CliOption(key = {"duplicatedPartitionPath"}, help = "Partition Path containing the duplicates",
|
||||||
duplicatedPartitionPath,
|
mandatory = true) final String duplicatedPartitionPath,
|
||||||
@CliOption(key = {
|
@CliOption(key = {"repairedOutputPath"}, help = "Location to place the repaired files",
|
||||||
"repairedOutputPath"}, help = "Location to place the repaired files", mandatory = true) final String
|
mandatory = true) final String repairedOutputPath,
|
||||||
repairedOutputPath,
|
@CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path",
|
||||||
@CliOption(key = {
|
mandatory = true) final String sparkPropertiesPath)
|
||||||
"sparkProperties"}, help = "Spark Properites File Path", mandatory = true) final String sparkPropertiesPath)
|
|
||||||
throws Exception {
|
throws Exception {
|
||||||
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
||||||
sparkLauncher.addAppArgs(SparkMain.SparkCommand.DEDUPLICATE.toString(), duplicatedPartitionPath, repairedOutputPath,
|
sparkLauncher.addAppArgs(SparkMain.SparkCommand.DEDUPLICATE.toString(), duplicatedPartitionPath, repairedOutputPath,
|
||||||
@@ -73,14 +72,15 @@ public class RepairsCommand implements CommandMarker {
|
|||||||
|
|
||||||
|
|
||||||
@CliCommand(value = "repair addpartitionmeta", help = "Add partition metadata to a dataset, if not present")
|
@CliCommand(value = "repair addpartitionmeta", help = "Add partition metadata to a dataset, if not present")
|
||||||
public String addPartitionMeta(@CliOption(key = {
|
public String addPartitionMeta(
|
||||||
"dryrun"}, help = "Should we actually add or just print what would be done", unspecifiedDefaultValue = "true")
|
@CliOption(key = {"dryrun"}, help = "Should we actually add or just print what would be done",
|
||||||
final boolean dryRun) throws IOException {
|
unspecifiedDefaultValue = "true") final boolean dryRun)
|
||||||
|
throws IOException {
|
||||||
|
|
||||||
String latestCommit = HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline().lastInstant().get()
|
String latestCommit =
|
||||||
.getTimestamp();
|
HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline().lastInstant().get().getTimestamp();
|
||||||
List<String> partitionPaths = FSUtils.getAllPartitionFoldersThreeLevelsDown(HoodieCLI.fs,
|
List<String> partitionPaths =
|
||||||
HoodieCLI.tableMetadata.getBasePath());
|
FSUtils.getAllPartitionFoldersThreeLevelsDown(HoodieCLI.fs, HoodieCLI.tableMetadata.getBasePath());
|
||||||
Path basePath = new Path(HoodieCLI.tableMetadata.getBasePath());
|
Path basePath = new Path(HoodieCLI.tableMetadata.getBasePath());
|
||||||
String[][] rows = new String[partitionPaths.size() + 1][];
|
String[][] rows = new String[partitionPaths.size() + 1][];
|
||||||
|
|
||||||
@@ -94,8 +94,8 @@ public class RepairsCommand implements CommandMarker {
|
|||||||
if (!HoodiePartitionMetadata.hasPartitionMetadata(HoodieCLI.fs, partitionPath)) {
|
if (!HoodiePartitionMetadata.hasPartitionMetadata(HoodieCLI.fs, partitionPath)) {
|
||||||
row[1] = "No";
|
row[1] = "No";
|
||||||
if (!dryRun) {
|
if (!dryRun) {
|
||||||
HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(HoodieCLI.fs, latestCommit, basePath,
|
HoodiePartitionMetadata partitionMetadata =
|
||||||
partitionPath);
|
new HoodiePartitionMetadata(HoodieCLI.fs, latestCommit, basePath, partitionPath);
|
||||||
partitionMetadata.trySave(0);
|
partitionMetadata.trySave(0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -50,8 +50,8 @@ public class RollbacksCommand implements CommandMarker {
|
|||||||
@CliOption(key = {"limit"}, help = "Limit #rows to be displayed", unspecifiedDefaultValue = "10") Integer limit,
|
@CliOption(key = {"limit"}, help = "Limit #rows to be displayed", unspecifiedDefaultValue = "10") Integer limit,
|
||||||
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
|
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
|
||||||
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
|
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
|
||||||
@CliOption(key = {
|
@CliOption(key = {"headeronly"}, help = "Print Header Only",
|
||||||
"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly)
|
unspecifiedDefaultValue = "false") final boolean headerOnly)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
HoodieActiveTimeline activeTimeline = new RollbackTimeline(HoodieCLI.tableMetadata);
|
HoodieActiveTimeline activeTimeline = new RollbackTimeline(HoodieCLI.tableMetadata);
|
||||||
HoodieTimeline rollback = activeTimeline.getRollbackTimeline().filterCompletedInstants();
|
HoodieTimeline rollback = activeTimeline.getRollbackTimeline().filterCompletedInstants();
|
||||||
@@ -59,8 +59,8 @@ public class RollbacksCommand implements CommandMarker {
|
|||||||
final List<Comparable[]> rows = new ArrayList<>();
|
final List<Comparable[]> rows = new ArrayList<>();
|
||||||
rollback.getInstants().forEach(instant -> {
|
rollback.getInstants().forEach(instant -> {
|
||||||
try {
|
try {
|
||||||
HoodieRollbackMetadata metadata = AvroUtils.deserializeAvroMetadata(
|
HoodieRollbackMetadata metadata = AvroUtils
|
||||||
activeTimeline.getInstantDetails(instant).get(), HoodieRollbackMetadata.class);
|
.deserializeAvroMetadata(activeTimeline.getInstantDetails(instant).get(), HoodieRollbackMetadata.class);
|
||||||
metadata.getCommitsRollback().forEach(c -> {
|
metadata.getCommitsRollback().forEach(c -> {
|
||||||
Comparable[] row = new Comparable[5];
|
Comparable[] row = new Comparable[5];
|
||||||
row[0] = metadata.getStartRollbackTime();
|
row[0] = metadata.getStartRollbackTime();
|
||||||
@@ -74,11 +74,8 @@ public class RollbacksCommand implements CommandMarker {
|
|||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
TableHeader header = new TableHeader()
|
TableHeader header = new TableHeader().addTableHeaderField("Instant").addTableHeaderField("Rolledback Instant")
|
||||||
.addTableHeaderField("Instant")
|
.addTableHeaderField("Total Files Deleted").addTableHeaderField("Time taken in millis")
|
||||||
.addTableHeaderField("Rolledback Instant")
|
|
||||||
.addTableHeaderField("Total Files Deleted")
|
|
||||||
.addTableHeaderField("Time taken in millis")
|
|
||||||
.addTableHeaderField("Total Partitions");
|
.addTableHeaderField("Total Partitions");
|
||||||
return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows);
|
return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows);
|
||||||
}
|
}
|
||||||
@@ -89,17 +86,18 @@ public class RollbacksCommand implements CommandMarker {
|
|||||||
@CliOption(key = {"limit"}, help = "Limit #rows to be displayed", unspecifiedDefaultValue = "10") Integer limit,
|
@CliOption(key = {"limit"}, help = "Limit #rows to be displayed", unspecifiedDefaultValue = "10") Integer limit,
|
||||||
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
|
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
|
||||||
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
|
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
|
||||||
@CliOption(key = {
|
@CliOption(key = {"headeronly"}, help = "Print Header Only",
|
||||||
"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly)
|
unspecifiedDefaultValue = "false") final boolean headerOnly)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
HoodieActiveTimeline activeTimeline = new RollbackTimeline(HoodieCLI.tableMetadata);
|
HoodieActiveTimeline activeTimeline = new RollbackTimeline(HoodieCLI.tableMetadata);
|
||||||
final List<Comparable[]> rows = new ArrayList<>();
|
final List<Comparable[]> rows = new ArrayList<>();
|
||||||
HoodieRollbackMetadata metadata = AvroUtils.deserializeAvroMetadata(
|
HoodieRollbackMetadata metadata = AvroUtils.deserializeAvroMetadata(
|
||||||
activeTimeline.getInstantDetails(new HoodieInstant(State.COMPLETED, ROLLBACK_ACTION, rollbackInstant))
|
activeTimeline.getInstantDetails(new HoodieInstant(State.COMPLETED, ROLLBACK_ACTION, rollbackInstant)).get(),
|
||||||
.get(), HoodieRollbackMetadata.class);
|
HoodieRollbackMetadata.class);
|
||||||
metadata.getPartitionMetadata().entrySet().forEach(e -> {
|
metadata.getPartitionMetadata().entrySet().forEach(e -> {
|
||||||
Stream.concat(e.getValue().getSuccessDeleteFiles().stream().map(f -> Pair.of(f, true)),
|
Stream
|
||||||
e.getValue().getFailedDeleteFiles().stream().map(f -> Pair.of(f, false)))
|
.concat(e.getValue().getSuccessDeleteFiles().stream().map(f -> Pair.of(f, true)),
|
||||||
|
e.getValue().getFailedDeleteFiles().stream().map(f -> Pair.of(f, false)))
|
||||||
.forEach(fileWithDeleteStatus -> {
|
.forEach(fileWithDeleteStatus -> {
|
||||||
Comparable[] row = new Comparable[5];
|
Comparable[] row = new Comparable[5];
|
||||||
row[0] = metadata.getStartRollbackTime();
|
row[0] = metadata.getStartRollbackTime();
|
||||||
@@ -111,12 +109,8 @@ public class RollbacksCommand implements CommandMarker {
|
|||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
TableHeader header = new TableHeader()
|
TableHeader header = new TableHeader().addTableHeaderField("Instant").addTableHeaderField("Rolledback Instants")
|
||||||
.addTableHeaderField("Instant")
|
.addTableHeaderField("Partition").addTableHeaderField("Deleted File").addTableHeaderField("Succeeded");
|
||||||
.addTableHeaderField("Rolledback Instants")
|
|
||||||
.addTableHeaderField("Partition")
|
|
||||||
.addTableHeaderField("Deleted File")
|
|
||||||
.addTableHeaderField("Succeeded");
|
|
||||||
return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows);
|
return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -62,8 +62,8 @@ public class SavepointsCommand implements CommandMarker {
|
|||||||
|
|
||||||
@CliAvailabilityIndicator({"savepoint rollback"})
|
@CliAvailabilityIndicator({"savepoint rollback"})
|
||||||
public boolean isRollbackToSavepointAvailable() {
|
public boolean isRollbackToSavepointAvailable() {
|
||||||
return HoodieCLI.tableMetadata != null && !HoodieCLI.tableMetadata.getActiveTimeline().getSavePointTimeline()
|
return HoodieCLI.tableMetadata != null
|
||||||
.filterCompletedInstants().empty();
|
&& !HoodieCLI.tableMetadata.getActiveTimeline().getSavePointTimeline().filterCompletedInstants().empty();
|
||||||
}
|
}
|
||||||
|
|
||||||
@CliCommand(value = "savepoints show", help = "Show the savepoints")
|
@CliCommand(value = "savepoints show", help = "Show the savepoints")
|
||||||
@@ -137,8 +137,8 @@ public class SavepointsCommand implements CommandMarker {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath) throws Exception {
|
private static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath) throws Exception {
|
||||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withIndexConfig(
|
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath)
|
||||||
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build();
|
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build();
|
||||||
return new HoodieWriteClient(jsc, config, false);
|
return new HoodieWriteClient(jsc, config, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -43,8 +43,7 @@ public class SparkMain {
|
|||||||
* Commands
|
* Commands
|
||||||
*/
|
*/
|
||||||
enum SparkCommand {
|
enum SparkCommand {
|
||||||
ROLLBACK, DEDUPLICATE, ROLLBACK_TO_SAVEPOINT, SAVEPOINT, IMPORT, UPSERT, COMPACT_SCHEDULE, COMPACT_RUN,
|
ROLLBACK, DEDUPLICATE, ROLLBACK_TO_SAVEPOINT, SAVEPOINT, IMPORT, UPSERT, COMPACT_SCHEDULE, COMPACT_RUN, COMPACT_UNSCHEDULE_PLAN, COMPACT_UNSCHEDULE_FILE, COMPACT_VALIDATE, COMPACT_REPAIR
|
||||||
COMPACT_UNSCHEDULE_PLAN, COMPACT_UNSCHEDULE_FILE, COMPACT_VALIDATE, COMPACT_REPAIR
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
public static void main(String[] args) throws Exception {
|
||||||
@@ -76,13 +75,12 @@ public class SparkMain {
|
|||||||
break;
|
break;
|
||||||
case COMPACT_RUN:
|
case COMPACT_RUN:
|
||||||
assert (args.length == 8);
|
assert (args.length == 8);
|
||||||
returnCode = compact(jsc, args[1], args[2], args[3], Integer.parseInt(args[4]),
|
returnCode = compact(jsc, args[1], args[2], args[3], Integer.parseInt(args[4]), args[5], args[6],
|
||||||
args[5], args[6], Integer.parseInt(args[7]), false);
|
Integer.parseInt(args[7]), false);
|
||||||
break;
|
break;
|
||||||
case COMPACT_SCHEDULE:
|
case COMPACT_SCHEDULE:
|
||||||
assert (args.length == 5);
|
assert (args.length == 5);
|
||||||
returnCode = compact(jsc, args[1], args[2], args[3], 1,
|
returnCode = compact(jsc, args[1], args[2], args[3], 1, "", args[4], 0, true);
|
||||||
"", args[4], 0, true);
|
|
||||||
break;
|
break;
|
||||||
case COMPACT_VALIDATE:
|
case COMPACT_VALIDATE:
|
||||||
assert (args.length == 7);
|
assert (args.length == 7);
|
||||||
@@ -113,8 +111,7 @@ public class SparkMain {
|
|||||||
System.exit(returnCode);
|
System.exit(returnCode);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static int dataLoad(JavaSparkContext jsc, String command,
|
private static int dataLoad(JavaSparkContext jsc, String command, String srcPath, String targetPath, String tableName,
|
||||||
String srcPath, String targetPath, String tableName,
|
|
||||||
String tableType, String rowKey, String partitionKey, int parallelism, String schemaFile, String sparkMaster,
|
String tableType, String rowKey, String partitionKey, int parallelism, String schemaFile, String sparkMaster,
|
||||||
String sparkMemory, int retry) throws Exception {
|
String sparkMemory, int retry) throws Exception {
|
||||||
Config cfg = new Config();
|
Config cfg = new Config();
|
||||||
@@ -180,9 +177,9 @@ public class SparkMain {
|
|||||||
new HoodieCompactionAdminTool(cfg).run(jsc);
|
new HoodieCompactionAdminTool(cfg).run(jsc);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void doCompactUnscheduleFile(JavaSparkContext jsc, String basePath, String fileId,
|
private static void doCompactUnscheduleFile(JavaSparkContext jsc, String basePath, String fileId, String outputPath,
|
||||||
String outputPath, int parallelism, String sparkMaster, String sparkMemory, boolean skipValidation,
|
int parallelism, String sparkMaster, String sparkMemory, boolean skipValidation, boolean dryRun)
|
||||||
boolean dryRun) throws Exception {
|
throws Exception {
|
||||||
HoodieCompactionAdminTool.Config cfg = new HoodieCompactionAdminTool.Config();
|
HoodieCompactionAdminTool.Config cfg = new HoodieCompactionAdminTool.Config();
|
||||||
cfg.basePath = basePath;
|
cfg.basePath = basePath;
|
||||||
cfg.operation = Operation.UNSCHEDULE_FILE;
|
cfg.operation = Operation.UNSCHEDULE_FILE;
|
||||||
@@ -244,8 +241,8 @@ public class SparkMain {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath) throws Exception {
|
private static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath) throws Exception {
|
||||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withIndexConfig(
|
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath)
|
||||||
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build();
|
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build();
|
||||||
return new HoodieWriteClient(jsc, config);
|
return new HoodieWriteClient(jsc, config);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -63,8 +63,9 @@ public class StatsCommand implements CommandMarker {
|
|||||||
@CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
|
@CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
|
||||||
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
|
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
|
||||||
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
|
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
|
||||||
@CliOption(key = {"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false")
|
@CliOption(key = {"headeronly"}, help = "Print Header Only",
|
||||||
final boolean headerOnly) throws IOException {
|
unspecifiedDefaultValue = "false") final boolean headerOnly)
|
||||||
|
throws IOException {
|
||||||
|
|
||||||
long totalRecordsUpserted = 0;
|
long totalRecordsUpserted = 0;
|
||||||
long totalRecordsWritten = 0;
|
long totalRecordsWritten = 0;
|
||||||
@@ -82,7 +83,7 @@ public class StatsCommand implements CommandMarker {
|
|||||||
if (commit.fetchTotalUpdateRecordsWritten() > 0) {
|
if (commit.fetchTotalUpdateRecordsWritten() > 0) {
|
||||||
waf = df.format((float) commit.fetchTotalRecordsWritten() / commit.fetchTotalUpdateRecordsWritten());
|
waf = df.format((float) commit.fetchTotalRecordsWritten() / commit.fetchTotalUpdateRecordsWritten());
|
||||||
}
|
}
|
||||||
rows.add(new Comparable[]{commitTime.getTimestamp(), commit.fetchTotalUpdateRecordsWritten(),
|
rows.add(new Comparable[] {commitTime.getTimestamp(), commit.fetchTotalUpdateRecordsWritten(),
|
||||||
commit.fetchTotalRecordsWritten(), waf});
|
commit.fetchTotalRecordsWritten(), waf});
|
||||||
totalRecordsUpserted += commit.fetchTotalUpdateRecordsWritten();
|
totalRecordsUpserted += commit.fetchTotalUpdateRecordsWritten();
|
||||||
totalRecordsWritten += commit.fetchTotalRecordsWritten();
|
totalRecordsWritten += commit.fetchTotalRecordsWritten();
|
||||||
@@ -91,33 +92,28 @@ public class StatsCommand implements CommandMarker {
|
|||||||
if (totalRecordsUpserted > 0) {
|
if (totalRecordsUpserted > 0) {
|
||||||
waf = df.format((float) totalRecordsWritten / totalRecordsUpserted);
|
waf = df.format((float) totalRecordsWritten / totalRecordsUpserted);
|
||||||
}
|
}
|
||||||
rows.add(new Comparable[]{"Total", totalRecordsUpserted, totalRecordsWritten, waf});
|
rows.add(new Comparable[] {"Total", totalRecordsUpserted, totalRecordsWritten, waf});
|
||||||
|
|
||||||
TableHeader header = new TableHeader()
|
TableHeader header = new TableHeader().addTableHeaderField("CommitTime").addTableHeaderField("Total Upserted")
|
||||||
.addTableHeaderField("CommitTime")
|
.addTableHeaderField("Total Written").addTableHeaderField("Write Amplifiation Factor");
|
||||||
.addTableHeaderField("Total Upserted")
|
|
||||||
.addTableHeaderField("Total Written")
|
|
||||||
.addTableHeaderField("Write Amplifiation Factor");
|
|
||||||
return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows);
|
return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows);
|
||||||
}
|
}
|
||||||
|
|
||||||
private Comparable[] printFileSizeHistogram(String commitTime, Snapshot s) {
|
private Comparable[] printFileSizeHistogram(String commitTime, Snapshot s) {
|
||||||
return new Comparable[]{commitTime, s.getMin(),
|
return new Comparable[] {commitTime, s.getMin(), s.getValue(0.1), s.getMedian(), s.getMean(), s.get95thPercentile(),
|
||||||
s.getValue(0.1), s.getMedian(),
|
s.getMax(), s.size(), s.getStdDev()};
|
||||||
s.getMean(), s.get95thPercentile(),
|
|
||||||
s.getMax(), s.size(),
|
|
||||||
s.getStdDev()};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@CliCommand(value = "stats filesizes", help = "File Sizes. Display summary stats on sizes of files")
|
@CliCommand(value = "stats filesizes", help = "File Sizes. Display summary stats on sizes of files")
|
||||||
public String fileSizeStats(
|
public String fileSizeStats(
|
||||||
@CliOption(key = {"partitionPath"},
|
@CliOption(key = {"partitionPath"}, help = "regex to select files, eg: 2016/08/02",
|
||||||
help = "regex to select files, eg: 2016/08/02", unspecifiedDefaultValue = "*/*/*") final String globRegex,
|
unspecifiedDefaultValue = "*/*/*") final String globRegex,
|
||||||
@CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
|
@CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
|
||||||
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
|
@CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
|
||||||
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
|
@CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
|
||||||
@CliOption(key = {"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false")
|
@CliOption(key = {"headeronly"}, help = "Print Header Only",
|
||||||
final boolean headerOnly) throws IOException {
|
unspecifiedDefaultValue = "false") final boolean headerOnly)
|
||||||
|
throws IOException {
|
||||||
|
|
||||||
FileSystem fs = HoodieCLI.fs;
|
FileSystem fs = HoodieCLI.fs;
|
||||||
String globPath = String.format("%s/%s/*", HoodieCLI.tableMetadata.getBasePath(), globRegex);
|
String globPath = String.format("%s/%s/*", HoodieCLI.tableMetadata.getBasePath(), globRegex);
|
||||||
@@ -145,8 +141,8 @@ public class StatsCommand implements CommandMarker {
|
|||||||
Snapshot s = globalHistogram.getSnapshot();
|
Snapshot s = globalHistogram.getSnapshot();
|
||||||
rows.add(printFileSizeHistogram("ALL", s));
|
rows.add(printFileSizeHistogram("ALL", s));
|
||||||
|
|
||||||
Function<Object, String> converterFunction = entry ->
|
Function<Object, String> converterFunction =
|
||||||
NumericUtils.humanReadableByteCount((Double.valueOf(entry.toString())));
|
entry -> NumericUtils.humanReadableByteCount((Double.valueOf(entry.toString())));
|
||||||
Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();
|
Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();
|
||||||
fieldNameToConverterMap.put("Min", converterFunction);
|
fieldNameToConverterMap.put("Min", converterFunction);
|
||||||
fieldNameToConverterMap.put("10th", converterFunction);
|
fieldNameToConverterMap.put("10th", converterFunction);
|
||||||
@@ -156,16 +152,9 @@ public class StatsCommand implements CommandMarker {
|
|||||||
fieldNameToConverterMap.put("Max", converterFunction);
|
fieldNameToConverterMap.put("Max", converterFunction);
|
||||||
fieldNameToConverterMap.put("StdDev", converterFunction);
|
fieldNameToConverterMap.put("StdDev", converterFunction);
|
||||||
|
|
||||||
TableHeader header = new TableHeader()
|
TableHeader header = new TableHeader().addTableHeaderField("CommitTime").addTableHeaderField("Min")
|
||||||
.addTableHeaderField("CommitTime")
|
.addTableHeaderField("10th").addTableHeaderField("50th").addTableHeaderField("avg").addTableHeaderField("95th")
|
||||||
.addTableHeaderField("Min")
|
.addTableHeaderField("Max").addTableHeaderField("NumFiles").addTableHeaderField("StdDev");
|
||||||
.addTableHeaderField("10th")
|
|
||||||
.addTableHeaderField("50th")
|
|
||||||
.addTableHeaderField("avg")
|
|
||||||
.addTableHeaderField("95th")
|
|
||||||
.addTableHeaderField("Max")
|
|
||||||
.addTableHeaderField("NumFiles")
|
|
||||||
.addTableHeaderField("StdDev");
|
|
||||||
return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows);
|
return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -48,12 +48,11 @@ public class HiveUtil {
|
|||||||
ResultSet rs = null;
|
ResultSet rs = null;
|
||||||
Statement stmt = conn.createStatement();
|
Statement stmt = conn.createStatement();
|
||||||
try {
|
try {
|
||||||
//stmt.execute("set mapred.job.queue.name=<queue_name>");
|
// stmt.execute("set mapred.job.queue.name=<queue_name>");
|
||||||
stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat");
|
stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat");
|
||||||
stmt.execute("set hive.stats.autogather=false");
|
stmt.execute("set hive.stats.autogather=false");
|
||||||
rs = stmt.executeQuery(
|
rs = stmt.executeQuery(
|
||||||
"select count(`_hoodie_commit_time`) as cnt from " + dbName + "."
|
"select count(`_hoodie_commit_time`) as cnt from " + dbName + "." + source.getTableConfig().getTableName());
|
||||||
+ source.getTableConfig().getTableName());
|
|
||||||
long count = -1;
|
long count = -1;
|
||||||
if (rs.next()) {
|
if (rs.next()) {
|
||||||
count = rs.getLong("cnt");
|
count = rs.getLong("cnt");
|
||||||
@@ -88,7 +87,7 @@ public class HiveUtil {
|
|||||||
ResultSet rs = null;
|
ResultSet rs = null;
|
||||||
Statement stmt = conn.createStatement();
|
Statement stmt = conn.createStatement();
|
||||||
try {
|
try {
|
||||||
//stmt.execute("set mapred.job.queue.name=<queue_name>");
|
// stmt.execute("set mapred.job.queue.name=<queue_name>");
|
||||||
stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat");
|
stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat");
|
||||||
stmt.execute("set hive.stats.autogather=false");
|
stmt.execute("set hive.stats.autogather=false");
|
||||||
rs = stmt.executeQuery(
|
rs = stmt.executeQuery(
|
||||||
|
|||||||
@@ -40,8 +40,8 @@ public class SparkUtil {
|
|||||||
public static SparkLauncher initLauncher(String propertiesFile) throws URISyntaxException {
|
public static SparkLauncher initLauncher(String propertiesFile) throws URISyntaxException {
|
||||||
String currentJar = new File(SparkUtil.class.getProtectionDomain().getCodeSource().getLocation().toURI().getPath())
|
String currentJar = new File(SparkUtil.class.getProtectionDomain().getCodeSource().getLocation().toURI().getPath())
|
||||||
.getAbsolutePath();
|
.getAbsolutePath();
|
||||||
SparkLauncher sparkLauncher = new SparkLauncher().setAppResource(currentJar)
|
SparkLauncher sparkLauncher =
|
||||||
.setMainClass(SparkMain.class.getName());
|
new SparkLauncher().setAppResource(currentJar).setMainClass(SparkMain.class.getName());
|
||||||
|
|
||||||
if (!StringUtils.isNullOrEmpty(propertiesFile)) {
|
if (!StringUtils.isNullOrEmpty(propertiesFile)) {
|
||||||
sparkLauncher.setPropertiesFile(propertiesFile);
|
sparkLauncher.setPropertiesFile(propertiesFile);
|
||||||
|
|||||||
@@ -26,6 +26,10 @@
|
|||||||
<artifactId>hudi-client</artifactId>
|
<artifactId>hudi-client</artifactId>
|
||||||
<packaging>jar</packaging>
|
<packaging>jar</packaging>
|
||||||
|
|
||||||
|
<properties>
|
||||||
|
<main.basedir>${project.parent.basedir}</main.basedir>
|
||||||
|
</properties>
|
||||||
|
|
||||||
<build>
|
<build>
|
||||||
<plugins>
|
<plugins>
|
||||||
<plugin>
|
<plugin>
|
||||||
|
|||||||
@@ -32,8 +32,8 @@ import org.apache.log4j.Logger;
|
|||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Abstract class taking care of holding common member variables (FileSystem, SparkContext, HoodieConfigs)
|
* Abstract class taking care of holding common member variables (FileSystem, SparkContext, HoodieConfigs) Also, manages
|
||||||
* Also, manages embedded timeline-server if enabled.
|
* embedded timeline-server if enabled.
|
||||||
*/
|
*/
|
||||||
public abstract class AbstractHoodieClient implements Serializable, AutoCloseable {
|
public abstract class AbstractHoodieClient implements Serializable, AutoCloseable {
|
||||||
|
|
||||||
@@ -45,10 +45,9 @@ public abstract class AbstractHoodieClient implements Serializable, AutoCloseabl
|
|||||||
protected final String basePath;
|
protected final String basePath;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Timeline Server has the same lifetime as that of Client.
|
* Timeline Server has the same lifetime as that of Client. Any operations done on the same timeline service will be
|
||||||
* Any operations done on the same timeline service will be able to take advantage
|
* able to take advantage of the cached file-system view. New completed actions will be synced automatically in an
|
||||||
* of the cached file-system view. New completed actions will be synced automatically
|
* incremental fashion.
|
||||||
* in an incremental fashion.
|
|
||||||
*/
|
*/
|
||||||
private transient Option<EmbeddedTimelineService> timelineServer;
|
private transient Option<EmbeddedTimelineService> timelineServer;
|
||||||
private final boolean shouldStopTimelineServer;
|
private final boolean shouldStopTimelineServer;
|
||||||
|
|||||||
@@ -69,8 +69,7 @@ public class CompactionAdminClient extends AbstractHoodieClient {
|
|||||||
super(jsc, HoodieWriteConfig.newBuilder().withPath(basePath).build());
|
super(jsc, HoodieWriteConfig.newBuilder().withPath(basePath).build());
|
||||||
}
|
}
|
||||||
|
|
||||||
public CompactionAdminClient(JavaSparkContext jsc, String basePath,
|
public CompactionAdminClient(JavaSparkContext jsc, String basePath, Option<EmbeddedTimelineService> timelineServer) {
|
||||||
Option<EmbeddedTimelineService> timelineServer) {
|
|
||||||
super(jsc, HoodieWriteConfig.newBuilder().withPath(basePath).build(), timelineServer);
|
super(jsc, HoodieWriteConfig.newBuilder().withPath(basePath).build(), timelineServer);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -78,11 +77,11 @@ public class CompactionAdminClient extends AbstractHoodieClient {
|
|||||||
* Validate all compaction operations in a compaction plan. Verifies the file-slices are consistent with corresponding
|
* Validate all compaction operations in a compaction plan. Verifies the file-slices are consistent with corresponding
|
||||||
* compaction operations.
|
* compaction operations.
|
||||||
*
|
*
|
||||||
* @param metaClient Hoodie Table Meta Client
|
* @param metaClient Hoodie Table Meta Client
|
||||||
* @param compactionInstant Compaction Instant
|
* @param compactionInstant Compaction Instant
|
||||||
*/
|
*/
|
||||||
public List<ValidationOpResult> validateCompactionPlan(HoodieTableMetaClient metaClient,
|
public List<ValidationOpResult> validateCompactionPlan(HoodieTableMetaClient metaClient, String compactionInstant,
|
||||||
String compactionInstant, int parallelism) throws IOException {
|
int parallelism) throws IOException {
|
||||||
HoodieCompactionPlan plan = getCompactionPlan(metaClient, compactionInstant);
|
HoodieCompactionPlan plan = getCompactionPlan(metaClient, compactionInstant);
|
||||||
HoodieTableFileSystemView fsView =
|
HoodieTableFileSystemView fsView =
|
||||||
new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
|
new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
|
||||||
@@ -108,19 +107,17 @@ public class CompactionAdminClient extends AbstractHoodieClient {
|
|||||||
* This operation MUST be executed with compactions and writer turned OFF.
|
* This operation MUST be executed with compactions and writer turned OFF.
|
||||||
*
|
*
|
||||||
* @param compactionInstant Compaction Instant
|
* @param compactionInstant Compaction Instant
|
||||||
* @param skipValidation Skip validation step
|
* @param skipValidation Skip validation step
|
||||||
* @param parallelism Parallelism
|
* @param parallelism Parallelism
|
||||||
* @param dryRun Dry Run
|
* @param dryRun Dry Run
|
||||||
*/
|
*/
|
||||||
public List<RenameOpResult> unscheduleCompactionPlan(
|
public List<RenameOpResult> unscheduleCompactionPlan(String compactionInstant, boolean skipValidation,
|
||||||
String compactionInstant, boolean skipValidation, int parallelism, boolean dryRun) throws Exception {
|
int parallelism, boolean dryRun) throws Exception {
|
||||||
HoodieTableMetaClient metaClient = createMetaClient(false);
|
HoodieTableMetaClient metaClient = createMetaClient(false);
|
||||||
List<Pair<HoodieLogFile, HoodieLogFile>> renameActions =
|
List<Pair<HoodieLogFile, HoodieLogFile>> renameActions = getRenamingActionsForUnschedulingCompactionPlan(metaClient,
|
||||||
getRenamingActionsForUnschedulingCompactionPlan(metaClient, compactionInstant, parallelism,
|
compactionInstant, parallelism, Option.empty(), skipValidation);
|
||||||
Option.empty(), skipValidation);
|
|
||||||
|
|
||||||
List<RenameOpResult> res =
|
List<RenameOpResult> res = runRenamingOps(metaClient, renameActions, parallelism, dryRun);
|
||||||
runRenamingOps(metaClient, renameActions, parallelism, dryRun);
|
|
||||||
|
|
||||||
Option<Boolean> success =
|
Option<Boolean> success =
|
||||||
Option.fromJavaOptional(res.stream().map(r -> (r.isExecuted() && r.isSuccess())).reduce(Boolean::logicalAnd));
|
Option.fromJavaOptional(res.stream().map(r -> (r.isExecuted() && r.isSuccess())).reduce(Boolean::logicalAnd));
|
||||||
@@ -145,21 +142,20 @@ public class CompactionAdminClient extends AbstractHoodieClient {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Remove a fileId from pending compaction. Removes the associated compaction operation and rename delta-files
|
* Remove a fileId from pending compaction. Removes the associated compaction operation and rename delta-files that
|
||||||
* that were generated for that file-id after the compaction operation was scheduled.
|
* were generated for that file-id after the compaction operation was scheduled.
|
||||||
*
|
*
|
||||||
* This operation MUST be executed with compactions and writer turned OFF.
|
* This operation MUST be executed with compactions and writer turned OFF.
|
||||||
*
|
*
|
||||||
* @param fgId FileGroupId to be unscheduled
|
* @param fgId FileGroupId to be unscheduled
|
||||||
* @param skipValidation Skip validation
|
* @param skipValidation Skip validation
|
||||||
* @param dryRun Dry Run Mode
|
* @param dryRun Dry Run Mode
|
||||||
*/
|
*/
|
||||||
public List<RenameOpResult> unscheduleCompactionFileId(HoodieFileGroupId fgId,
|
public List<RenameOpResult> unscheduleCompactionFileId(HoodieFileGroupId fgId, boolean skipValidation, boolean dryRun)
|
||||||
boolean skipValidation, boolean dryRun) throws Exception {
|
throws Exception {
|
||||||
HoodieTableMetaClient metaClient = createMetaClient(false);
|
HoodieTableMetaClient metaClient = createMetaClient(false);
|
||||||
List<Pair<HoodieLogFile, HoodieLogFile>> renameActions =
|
List<Pair<HoodieLogFile, HoodieLogFile>> renameActions =
|
||||||
getRenamingActionsForUnschedulingCompactionForFileId(metaClient, fgId,
|
getRenamingActionsForUnschedulingCompactionForFileId(metaClient, fgId, Option.empty(), skipValidation);
|
||||||
Option.empty(), skipValidation);
|
|
||||||
|
|
||||||
List<RenameOpResult> res = runRenamingOps(metaClient, renameActions, 1, dryRun);
|
List<RenameOpResult> res = runRenamingOps(metaClient, renameActions, 1, dryRun);
|
||||||
|
|
||||||
@@ -167,15 +163,15 @@ public class CompactionAdminClient extends AbstractHoodieClient {
|
|||||||
// Ready to remove this file-Id from compaction request
|
// Ready to remove this file-Id from compaction request
|
||||||
Pair<String, HoodieCompactionOperation> compactionOperationWithInstant =
|
Pair<String, HoodieCompactionOperation> compactionOperationWithInstant =
|
||||||
CompactionUtils.getAllPendingCompactionOperations(metaClient).get(fgId);
|
CompactionUtils.getAllPendingCompactionOperations(metaClient).get(fgId);
|
||||||
HoodieCompactionPlan plan = CompactionUtils
|
HoodieCompactionPlan plan =
|
||||||
.getCompactionPlan(metaClient, compactionOperationWithInstant.getKey());
|
CompactionUtils.getCompactionPlan(metaClient, compactionOperationWithInstant.getKey());
|
||||||
List<HoodieCompactionOperation> newOps = plan.getOperations().stream()
|
List<HoodieCompactionOperation> newOps = plan.getOperations().stream().filter(
|
||||||
.filter(op -> (!op.getFileId().equals(fgId.getFileId()))
|
op -> (!op.getFileId().equals(fgId.getFileId())) && (!op.getPartitionPath().equals(fgId.getPartitionPath())))
|
||||||
&& (!op.getPartitionPath().equals(fgId.getPartitionPath()))).collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
HoodieCompactionPlan newPlan =
|
HoodieCompactionPlan newPlan =
|
||||||
HoodieCompactionPlan.newBuilder().setOperations(newOps).setExtraMetadata(plan.getExtraMetadata()).build();
|
HoodieCompactionPlan.newBuilder().setOperations(newOps).setExtraMetadata(plan.getExtraMetadata()).build();
|
||||||
HoodieInstant inflight = new HoodieInstant(State.INFLIGHT, COMPACTION_ACTION,
|
HoodieInstant inflight =
|
||||||
compactionOperationWithInstant.getLeft());
|
new HoodieInstant(State.INFLIGHT, COMPACTION_ACTION, compactionOperationWithInstant.getLeft());
|
||||||
Path inflightPath = new Path(metaClient.getMetaPath(), inflight.getFileName());
|
Path inflightPath = new Path(metaClient.getMetaPath(), inflight.getFileName());
|
||||||
if (metaClient.getFs().exists(inflightPath)) {
|
if (metaClient.getFs().exists(inflightPath)) {
|
||||||
// revert if in inflight state
|
// revert if in inflight state
|
||||||
@@ -189,28 +185,28 @@ public class CompactionAdminClient extends AbstractHoodieClient {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Renames delta files to make file-slices consistent with the timeline as dictated by Hoodie metadata.
|
* Renames delta files to make file-slices consistent with the timeline as dictated by Hoodie metadata. Use when
|
||||||
* Use when compaction unschedule fails partially.
|
* compaction unschedule fails partially.
|
||||||
*
|
*
|
||||||
* This operation MUST be executed with compactions and writer turned OFF.
|
* This operation MUST be executed with compactions and writer turned OFF.
|
||||||
|
*
|
||||||
* @param compactionInstant Compaction Instant to be repaired
|
* @param compactionInstant Compaction Instant to be repaired
|
||||||
* @param dryRun Dry Run Mode
|
* @param dryRun Dry Run Mode
|
||||||
*/
|
*/
|
||||||
public List<RenameOpResult> repairCompaction(String compactionInstant,
|
public List<RenameOpResult> repairCompaction(String compactionInstant, int parallelism, boolean dryRun)
|
||||||
int parallelism, boolean dryRun) throws Exception {
|
throws Exception {
|
||||||
HoodieTableMetaClient metaClient = createMetaClient(false);
|
HoodieTableMetaClient metaClient = createMetaClient(false);
|
||||||
List<ValidationOpResult> validationResults =
|
List<ValidationOpResult> validationResults = validateCompactionPlan(metaClient, compactionInstant, parallelism);
|
||||||
validateCompactionPlan(metaClient, compactionInstant, parallelism);
|
List<ValidationOpResult> failed =
|
||||||
List<ValidationOpResult> failed = validationResults.stream()
|
validationResults.stream().filter(v -> !v.isSuccess()).collect(Collectors.toList());
|
||||||
.filter(v -> !v.isSuccess()).collect(Collectors.toList());
|
|
||||||
if (failed.isEmpty()) {
|
if (failed.isEmpty()) {
|
||||||
return new ArrayList<>();
|
return new ArrayList<>();
|
||||||
}
|
}
|
||||||
|
|
||||||
final HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient,
|
final HoodieTableFileSystemView fsView =
|
||||||
metaClient.getCommitsAndCompactionTimeline());
|
new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
|
||||||
List<Pair<HoodieLogFile, HoodieLogFile>> renameActions = failed.stream().flatMap(v ->
|
List<Pair<HoodieLogFile, HoodieLogFile>> renameActions =
|
||||||
getRenamingActionsToAlignWithCompactionOperation(metaClient, compactionInstant,
|
failed.stream().flatMap(v -> getRenamingActionsToAlignWithCompactionOperation(metaClient, compactionInstant,
|
||||||
v.getOperation(), Option.of(fsView)).stream()).collect(Collectors.toList());
|
v.getOperation(), Option.of(fsView)).stream()).collect(Collectors.toList());
|
||||||
return runRenamingOps(metaClient, renameActions, parallelism, dryRun);
|
return runRenamingOps(metaClient, renameActions, parallelism, dryRun);
|
||||||
}
|
}
|
||||||
@@ -218,11 +214,10 @@ public class CompactionAdminClient extends AbstractHoodieClient {
|
|||||||
/**
|
/**
|
||||||
* Construction Compaction Plan from compaction instant
|
* Construction Compaction Plan from compaction instant
|
||||||
*/
|
*/
|
||||||
private static HoodieCompactionPlan getCompactionPlan(HoodieTableMetaClient metaClient,
|
private static HoodieCompactionPlan getCompactionPlan(HoodieTableMetaClient metaClient, String compactionInstant)
|
||||||
String compactionInstant) throws IOException {
|
throws IOException {
|
||||||
HoodieCompactionPlan compactionPlan = AvroUtils.deserializeCompactionPlan(
|
HoodieCompactionPlan compactionPlan = AvroUtils.deserializeCompactionPlan(metaClient.getActiveTimeline()
|
||||||
metaClient.getActiveTimeline().getInstantAuxiliaryDetails(
|
.getInstantAuxiliaryDetails(HoodieTimeline.getCompactionRequestedInstant(compactionInstant)).get());
|
||||||
HoodieTimeline.getCompactionRequestedInstant(compactionInstant)).get());
|
|
||||||
return compactionPlan;
|
return compactionPlan;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -230,28 +225,26 @@ public class CompactionAdminClient extends AbstractHoodieClient {
|
|||||||
* Get Renaming actions to ensure the log-files of merged file-slices is aligned with compaction operation. This
|
* Get Renaming actions to ensure the log-files of merged file-slices is aligned with compaction operation. This
|
||||||
* method is used to recover from failures during unschedule compaction operations.
|
* method is used to recover from failures during unschedule compaction operations.
|
||||||
*
|
*
|
||||||
* @param metaClient Hoodie Table Meta Client
|
* @param metaClient Hoodie Table Meta Client
|
||||||
* @param compactionInstant Compaction Instant
|
* @param compactionInstant Compaction Instant
|
||||||
* @param op Compaction Operation
|
* @param op Compaction Operation
|
||||||
* @param fsViewOpt File System View
|
* @param fsViewOpt File System View
|
||||||
*/
|
*/
|
||||||
protected static List<Pair<HoodieLogFile, HoodieLogFile>> getRenamingActionsToAlignWithCompactionOperation(
|
protected static List<Pair<HoodieLogFile, HoodieLogFile>> getRenamingActionsToAlignWithCompactionOperation(
|
||||||
HoodieTableMetaClient metaClient, String compactionInstant, CompactionOperation op,
|
HoodieTableMetaClient metaClient, String compactionInstant, CompactionOperation op,
|
||||||
Option<HoodieTableFileSystemView> fsViewOpt) {
|
Option<HoodieTableFileSystemView> fsViewOpt) {
|
||||||
HoodieTableFileSystemView fileSystemView = fsViewOpt.isPresent() ? fsViewOpt.get() :
|
HoodieTableFileSystemView fileSystemView = fsViewOpt.isPresent() ? fsViewOpt.get()
|
||||||
new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
|
: new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
|
||||||
HoodieInstant lastInstant = metaClient.getCommitsAndCompactionTimeline().lastInstant().get();
|
HoodieInstant lastInstant = metaClient.getCommitsAndCompactionTimeline().lastInstant().get();
|
||||||
FileSlice merged =
|
FileSlice merged =
|
||||||
fileSystemView.getLatestMergedFileSlicesBeforeOrOn(op.getPartitionPath(), lastInstant.getTimestamp())
|
fileSystemView.getLatestMergedFileSlicesBeforeOrOn(op.getPartitionPath(), lastInstant.getTimestamp())
|
||||||
.filter(fs -> fs.getFileId().equals(op.getFileId())).findFirst().get();
|
.filter(fs -> fs.getFileId().equals(op.getFileId())).findFirst().get();
|
||||||
final int maxVersion =
|
final int maxVersion = op.getDeltaFilePaths().stream().map(lf -> FSUtils.getFileVersionFromLog(new Path(lf)))
|
||||||
op.getDeltaFilePaths().stream().map(lf -> FSUtils.getFileVersionFromLog(new Path(lf)))
|
.reduce((x, y) -> x > y ? x : y).orElse(0);
|
||||||
.reduce((x, y) -> x > y ? x : y).orElse(0);
|
|
||||||
List<HoodieLogFile> logFilesToBeMoved =
|
List<HoodieLogFile> logFilesToBeMoved =
|
||||||
merged.getLogFiles().filter(lf -> lf.getLogVersion() > maxVersion).collect(Collectors.toList());
|
merged.getLogFiles().filter(lf -> lf.getLogVersion() > maxVersion).collect(Collectors.toList());
|
||||||
return logFilesToBeMoved.stream().map(lf -> {
|
return logFilesToBeMoved.stream().map(lf -> {
|
||||||
Preconditions.checkArgument(lf.getLogVersion() - maxVersion > 0,
|
Preconditions.checkArgument(lf.getLogVersion() - maxVersion > 0, "Expect new log version to be sane");
|
||||||
"Expect new log version to be sane");
|
|
||||||
HoodieLogFile newLogFile = new HoodieLogFile(new Path(lf.getPath().getParent(),
|
HoodieLogFile newLogFile = new HoodieLogFile(new Path(lf.getPath().getParent(),
|
||||||
FSUtils.makeLogFileName(lf.getFileId(), "." + FSUtils.getFileExtensionFromLog(lf.getPath()),
|
FSUtils.makeLogFileName(lf.getFileId(), "." + FSUtils.getFileExtensionFromLog(lf.getPath()),
|
||||||
compactionInstant, lf.getLogVersion() - maxVersion, HoodieLogFormat.UNKNOWN_WRITE_TOKEN)));
|
compactionInstant, lf.getLogVersion() - maxVersion, HoodieLogFormat.UNKNOWN_WRITE_TOKEN)));
|
||||||
@@ -280,16 +273,15 @@ public class CompactionAdminClient extends AbstractHoodieClient {
|
|||||||
/**
|
/**
|
||||||
* Check if a compaction operation is valid
|
* Check if a compaction operation is valid
|
||||||
*
|
*
|
||||||
* @param metaClient Hoodie Table Meta client
|
* @param metaClient Hoodie Table Meta client
|
||||||
* @param compactionInstant Compaction Instant
|
* @param compactionInstant Compaction Instant
|
||||||
* @param operation Compaction Operation
|
* @param operation Compaction Operation
|
||||||
* @param fsViewOpt File System View
|
* @param fsViewOpt File System View
|
||||||
*/
|
*/
|
||||||
private ValidationOpResult validateCompactionOperation(HoodieTableMetaClient metaClient,
|
private ValidationOpResult validateCompactionOperation(HoodieTableMetaClient metaClient, String compactionInstant,
|
||||||
String compactionInstant, CompactionOperation operation, Option<HoodieTableFileSystemView> fsViewOpt)
|
CompactionOperation operation, Option<HoodieTableFileSystemView> fsViewOpt) throws IOException {
|
||||||
throws IOException {
|
HoodieTableFileSystemView fileSystemView = fsViewOpt.isPresent() ? fsViewOpt.get()
|
||||||
HoodieTableFileSystemView fileSystemView = fsViewOpt.isPresent() ? fsViewOpt.get() :
|
: new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
|
||||||
new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
|
|
||||||
Option<HoodieInstant> lastInstant = metaClient.getCommitsAndCompactionTimeline().lastInstant();
|
Option<HoodieInstant> lastInstant = metaClient.getCommitsAndCompactionTimeline().lastInstant();
|
||||||
try {
|
try {
|
||||||
if (lastInstant.isPresent()) {
|
if (lastInstant.isPresent()) {
|
||||||
@@ -300,45 +292,42 @@ public class CompactionAdminClient extends AbstractHoodieClient {
|
|||||||
FileSlice fs = fileSliceOptional.get();
|
FileSlice fs = fileSliceOptional.get();
|
||||||
Option<HoodieDataFile> df = fs.getDataFile();
|
Option<HoodieDataFile> df = fs.getDataFile();
|
||||||
if (operation.getDataFilePath().isPresent()) {
|
if (operation.getDataFilePath().isPresent()) {
|
||||||
String expPath = metaClient.getFs().getFileStatus(new Path(operation.getDataFilePath().get())).getPath()
|
String expPath =
|
||||||
.toString();
|
metaClient.getFs().getFileStatus(new Path(operation.getDataFilePath().get())).getPath().toString();
|
||||||
Preconditions.checkArgument(df.isPresent(), "Data File must be present. File Slice was : "
|
Preconditions.checkArgument(df.isPresent(),
|
||||||
+ fs + ", operation :" + operation);
|
"Data File must be present. File Slice was : " + fs + ", operation :" + operation);
|
||||||
Preconditions.checkArgument(df.get().getPath().equals(expPath),
|
Preconditions.checkArgument(df.get().getPath().equals(expPath),
|
||||||
"Base Path in operation is specified as " + expPath + " but got path " + df.get().getPath());
|
"Base Path in operation is specified as " + expPath + " but got path " + df.get().getPath());
|
||||||
}
|
}
|
||||||
Set<HoodieLogFile> logFilesInFileSlice = fs.getLogFiles().collect(Collectors.toSet());
|
Set<HoodieLogFile> logFilesInFileSlice = fs.getLogFiles().collect(Collectors.toSet());
|
||||||
Set<HoodieLogFile> logFilesInCompactionOp = operation.getDeltaFilePaths().stream()
|
Set<HoodieLogFile> logFilesInCompactionOp = operation.getDeltaFilePaths().stream().map(dp -> {
|
||||||
.map(dp -> {
|
try {
|
||||||
try {
|
FileStatus[] fileStatuses = metaClient.getFs().listStatus(new Path(dp));
|
||||||
FileStatus[] fileStatuses = metaClient.getFs().listStatus(new Path(dp));
|
Preconditions.checkArgument(fileStatuses.length == 1, "Expect only 1 file-status");
|
||||||
Preconditions.checkArgument(fileStatuses.length == 1, "Expect only 1 file-status");
|
return new HoodieLogFile(fileStatuses[0]);
|
||||||
return new HoodieLogFile(fileStatuses[0]);
|
} catch (FileNotFoundException fe) {
|
||||||
} catch (FileNotFoundException fe) {
|
throw new CompactionValidationException(fe.getMessage());
|
||||||
throw new CompactionValidationException(fe.getMessage());
|
} catch (IOException ioe) {
|
||||||
} catch (IOException ioe) {
|
throw new HoodieIOException(ioe.getMessage(), ioe);
|
||||||
throw new HoodieIOException(ioe.getMessage(), ioe);
|
}
|
||||||
}
|
}).collect(Collectors.toSet());
|
||||||
}).collect(Collectors.toSet());
|
Set<HoodieLogFile> missing = logFilesInCompactionOp.stream().filter(lf -> !logFilesInFileSlice.contains(lf))
|
||||||
Set<HoodieLogFile> missing =
|
.collect(Collectors.toSet());
|
||||||
logFilesInCompactionOp.stream().filter(lf -> !logFilesInFileSlice.contains(lf))
|
|
||||||
.collect(Collectors.toSet());
|
|
||||||
Preconditions.checkArgument(missing.isEmpty(),
|
Preconditions.checkArgument(missing.isEmpty(),
|
||||||
"All log files specified in compaction operation is not present. Missing :" + missing
|
"All log files specified in compaction operation is not present. Missing :" + missing + ", Exp :"
|
||||||
+ ", Exp :" + logFilesInCompactionOp + ", Got :" + logFilesInFileSlice);
|
+ logFilesInCompactionOp + ", Got :" + logFilesInFileSlice);
|
||||||
Set<HoodieLogFile> diff =
|
Set<HoodieLogFile> diff = logFilesInFileSlice.stream().filter(lf -> !logFilesInCompactionOp.contains(lf))
|
||||||
logFilesInFileSlice.stream().filter(lf -> !logFilesInCompactionOp.contains(lf))
|
.collect(Collectors.toSet());
|
||||||
.collect(Collectors.toSet());
|
|
||||||
Preconditions.checkArgument(diff.stream().allMatch(lf -> lf.getBaseCommitTime().equals(compactionInstant)),
|
Preconditions.checkArgument(diff.stream().allMatch(lf -> lf.getBaseCommitTime().equals(compactionInstant)),
|
||||||
"There are some log-files which are neither specified in compaction plan "
|
"There are some log-files which are neither specified in compaction plan "
|
||||||
+ "nor present after compaction request instant. Some of these :" + diff);
|
+ "nor present after compaction request instant. Some of these :" + diff);
|
||||||
} else {
|
} else {
|
||||||
throw new CompactionValidationException("Unable to find file-slice for file-id (" + operation.getFileId()
|
throw new CompactionValidationException(
|
||||||
+ " Compaction operation is invalid.");
|
"Unable to find file-slice for file-id (" + operation.getFileId() + " Compaction operation is invalid.");
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
throw new CompactionValidationException("Unable to find any committed instant. Compaction Operation may "
|
throw new CompactionValidationException(
|
||||||
+ "be pointing to stale file-slices");
|
"Unable to find any committed instant. Compaction Operation may " + "be pointing to stale file-slices");
|
||||||
}
|
}
|
||||||
} catch (CompactionValidationException | IllegalArgumentException e) {
|
} catch (CompactionValidationException | IllegalArgumentException e) {
|
||||||
return new ValidationOpResult(operation, false, Option.of(e));
|
return new ValidationOpResult(operation, false, Option.of(e));
|
||||||
@@ -349,7 +338,7 @@ public class CompactionAdminClient extends AbstractHoodieClient {
|
|||||||
/**
|
/**
|
||||||
* Execute Renaming operation
|
* Execute Renaming operation
|
||||||
*
|
*
|
||||||
* @param metaClient HoodieTable MetaClient
|
* @param metaClient HoodieTable MetaClient
|
||||||
* @param renameActions List of rename operations
|
* @param renameActions List of rename operations
|
||||||
*/
|
*/
|
||||||
private List<RenameOpResult> runRenamingOps(HoodieTableMetaClient metaClient,
|
private List<RenameOpResult> runRenamingOps(HoodieTableMetaClient metaClient,
|
||||||
@@ -374,8 +363,7 @@ public class CompactionAdminClient extends AbstractHoodieClient {
|
|||||||
}).collect();
|
}).collect();
|
||||||
} else {
|
} else {
|
||||||
log.info("Dry-Run Mode activated for rename operations");
|
log.info("Dry-Run Mode activated for rename operations");
|
||||||
return renameActions.parallelStream()
|
return renameActions.parallelStream().map(lfPair -> new RenameOpResult(lfPair, false, false, Option.empty()))
|
||||||
.map(lfPair -> new RenameOpResult(lfPair, false, false, Option.empty()))
|
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -385,28 +373,28 @@ public class CompactionAdminClient extends AbstractHoodieClient {
|
|||||||
* Generate renaming actions for unscheduling a pending compaction plan. NOTE: Can only be used safely when no writer
|
* Generate renaming actions for unscheduling a pending compaction plan. NOTE: Can only be used safely when no writer
|
||||||
* (ingestion/compaction) is running.
|
* (ingestion/compaction) is running.
|
||||||
*
|
*
|
||||||
* @param metaClient Hoodie Table MetaClient
|
* @param metaClient Hoodie Table MetaClient
|
||||||
* @param compactionInstant Compaction Instant to be unscheduled
|
* @param compactionInstant Compaction Instant to be unscheduled
|
||||||
* @param fsViewOpt Cached File System View
|
* @param fsViewOpt Cached File System View
|
||||||
* @param skipValidation Skip Validation
|
* @param skipValidation Skip Validation
|
||||||
* @return list of pairs of log-files (old, new) and for each pair, rename must be done to successfully unschedule
|
* @return list of pairs of log-files (old, new) and for each pair, rename must be done to successfully unschedule
|
||||||
* compaction.
|
* compaction.
|
||||||
*/
|
*/
|
||||||
protected List<Pair<HoodieLogFile, HoodieLogFile>> getRenamingActionsForUnschedulingCompactionPlan(
|
protected List<Pair<HoodieLogFile, HoodieLogFile>> getRenamingActionsForUnschedulingCompactionPlan(
|
||||||
HoodieTableMetaClient metaClient, String compactionInstant, int parallelism,
|
HoodieTableMetaClient metaClient, String compactionInstant, int parallelism,
|
||||||
Option<HoodieTableFileSystemView> fsViewOpt, boolean skipValidation) throws IOException {
|
Option<HoodieTableFileSystemView> fsViewOpt, boolean skipValidation) throws IOException {
|
||||||
HoodieTableFileSystemView fsView = fsViewOpt.isPresent() ? fsViewOpt.get() :
|
HoodieTableFileSystemView fsView = fsViewOpt.isPresent() ? fsViewOpt.get()
|
||||||
new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
|
: new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
|
||||||
HoodieCompactionPlan plan = getCompactionPlan(metaClient, compactionInstant);
|
HoodieCompactionPlan plan = getCompactionPlan(metaClient, compactionInstant);
|
||||||
if (plan.getOperations() != null) {
|
if (plan.getOperations() != null) {
|
||||||
log.info("Number of Compaction Operations :" + plan.getOperations().size()
|
log.info(
|
||||||
+ " for instant :" + compactionInstant);
|
"Number of Compaction Operations :" + plan.getOperations().size() + " for instant :" + compactionInstant);
|
||||||
List<CompactionOperation> ops = plan.getOperations().stream()
|
List<CompactionOperation> ops = plan.getOperations().stream()
|
||||||
.map(CompactionOperation::convertFromAvroRecordInstance).collect(Collectors.toList());
|
.map(CompactionOperation::convertFromAvroRecordInstance).collect(Collectors.toList());
|
||||||
return jsc.parallelize(ops, parallelism).flatMap(op -> {
|
return jsc.parallelize(ops, parallelism).flatMap(op -> {
|
||||||
try {
|
try {
|
||||||
return getRenamingActionsForUnschedulingCompactionOperation(metaClient, compactionInstant,
|
return getRenamingActionsForUnschedulingCompactionOperation(metaClient, compactionInstant, op,
|
||||||
op, Option.of(fsView), skipValidation).iterator();
|
Option.of(fsView), skipValidation).iterator();
|
||||||
} catch (IOException ioe) {
|
} catch (IOException ioe) {
|
||||||
throw new HoodieIOException(ioe.getMessage(), ioe);
|
throw new HoodieIOException(ioe.getMessage(), ioe);
|
||||||
} catch (CompactionValidationException ve) {
|
} catch (CompactionValidationException ve) {
|
||||||
@@ -422,20 +410,20 @@ public class CompactionAdminClient extends AbstractHoodieClient {
|
|||||||
* Generate renaming actions for unscheduling a compaction operation NOTE: Can only be used safely when no writer
|
* Generate renaming actions for unscheduling a compaction operation NOTE: Can only be used safely when no writer
|
||||||
* (ingestion/compaction) is running.
|
* (ingestion/compaction) is running.
|
||||||
*
|
*
|
||||||
* @param metaClient Hoodie Table MetaClient
|
* @param metaClient Hoodie Table MetaClient
|
||||||
* @param compactionInstant Compaction Instant
|
* @param compactionInstant Compaction Instant
|
||||||
* @param operation Compaction Operation
|
* @param operation Compaction Operation
|
||||||
* @param fsViewOpt Cached File System View
|
* @param fsViewOpt Cached File System View
|
||||||
* @param skipValidation Skip Validation
|
* @param skipValidation Skip Validation
|
||||||
* @return list of pairs of log-files (old, new) and for each pair, rename must be done to successfully unschedule
|
* @return list of pairs of log-files (old, new) and for each pair, rename must be done to successfully unschedule
|
||||||
* compaction.
|
* compaction.
|
||||||
*/
|
*/
|
||||||
public List<Pair<HoodieLogFile, HoodieLogFile>> getRenamingActionsForUnschedulingCompactionOperation(
|
public List<Pair<HoodieLogFile, HoodieLogFile>> getRenamingActionsForUnschedulingCompactionOperation(
|
||||||
HoodieTableMetaClient metaClient, String compactionInstant, CompactionOperation operation,
|
HoodieTableMetaClient metaClient, String compactionInstant, CompactionOperation operation,
|
||||||
Option<HoodieTableFileSystemView> fsViewOpt, boolean skipValidation) throws IOException {
|
Option<HoodieTableFileSystemView> fsViewOpt, boolean skipValidation) throws IOException {
|
||||||
List<Pair<HoodieLogFile, HoodieLogFile>> result = new ArrayList<>();
|
List<Pair<HoodieLogFile, HoodieLogFile>> result = new ArrayList<>();
|
||||||
HoodieTableFileSystemView fileSystemView = fsViewOpt.isPresent() ? fsViewOpt.get() :
|
HoodieTableFileSystemView fileSystemView = fsViewOpt.isPresent() ? fsViewOpt.get()
|
||||||
new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
|
: new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
|
||||||
if (!skipValidation) {
|
if (!skipValidation) {
|
||||||
validateCompactionOperation(metaClient, compactionInstant, operation, Option.of(fileSystemView));
|
validateCompactionOperation(metaClient, compactionInstant, operation, Option.of(fileSystemView));
|
||||||
}
|
}
|
||||||
@@ -444,15 +432,13 @@ public class CompactionAdminClient extends AbstractHoodieClient {
|
|||||||
fileSystemView.getLatestMergedFileSlicesBeforeOrOn(operation.getPartitionPath(), lastInstant.getTimestamp())
|
fileSystemView.getLatestMergedFileSlicesBeforeOrOn(operation.getPartitionPath(), lastInstant.getTimestamp())
|
||||||
.filter(fs -> fs.getFileId().equals(operation.getFileId())).findFirst().get();
|
.filter(fs -> fs.getFileId().equals(operation.getFileId())).findFirst().get();
|
||||||
List<HoodieLogFile> logFilesToRepair =
|
List<HoodieLogFile> logFilesToRepair =
|
||||||
merged.getLogFiles().filter(lf -> lf.getBaseCommitTime().equals(compactionInstant))
|
merged.getLogFiles().filter(lf -> lf.getBaseCommitTime().equals(compactionInstant))
|
||||||
.sorted(HoodieLogFile.getLogFileComparator())
|
.sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList());
|
||||||
.collect(Collectors.toList());
|
|
||||||
FileSlice fileSliceForCompaction =
|
FileSlice fileSliceForCompaction =
|
||||||
fileSystemView.getLatestFileSlicesBeforeOrOn(operation.getPartitionPath(), operation.getBaseInstantTime(), true)
|
fileSystemView.getLatestFileSlicesBeforeOrOn(operation.getPartitionPath(), operation.getBaseInstantTime(), true)
|
||||||
.filter(fs -> fs.getFileId().equals(operation.getFileId())).findFirst().get();
|
.filter(fs -> fs.getFileId().equals(operation.getFileId())).findFirst().get();
|
||||||
int maxUsedVersion =
|
int maxUsedVersion = fileSliceForCompaction.getLogFiles().findFirst().map(HoodieLogFile::getLogVersion)
|
||||||
fileSliceForCompaction.getLogFiles().findFirst().map(HoodieLogFile::getLogVersion)
|
.orElse(HoodieLogFile.LOGFILE_BASE_VERSION - 1);
|
||||||
.orElse(HoodieLogFile.LOGFILE_BASE_VERSION - 1);
|
|
||||||
String logExtn = fileSliceForCompaction.getLogFiles().findFirst().map(lf -> "." + lf.getFileExtension())
|
String logExtn = fileSliceForCompaction.getLogFiles().findFirst().map(lf -> "." + lf.getFileExtension())
|
||||||
.orElse(HoodieLogFile.DELTA_EXTENSION);
|
.orElse(HoodieLogFile.DELTA_EXTENSION);
|
||||||
String parentPath = fileSliceForCompaction.getDataFile().map(df -> new Path(df.getPath()).getParent().toString())
|
String parentPath = fileSliceForCompaction.getDataFile().map(df -> new Path(df.getPath()).getParent().toString())
|
||||||
@@ -471,16 +457,16 @@ public class CompactionAdminClient extends AbstractHoodieClient {
|
|||||||
* Generate renaming actions for unscheduling a fileId from pending compaction. NOTE: Can only be used safely when no
|
* Generate renaming actions for unscheduling a fileId from pending compaction. NOTE: Can only be used safely when no
|
||||||
* writer (ingestion/compaction) is running.
|
* writer (ingestion/compaction) is running.
|
||||||
*
|
*
|
||||||
* @param metaClient Hoodie Table MetaClient
|
* @param metaClient Hoodie Table MetaClient
|
||||||
* @param fgId FileGroupId to remove compaction
|
* @param fgId FileGroupId to remove compaction
|
||||||
* @param fsViewOpt Cached File System View
|
* @param fsViewOpt Cached File System View
|
||||||
* @param skipValidation Skip Validation
|
* @param skipValidation Skip Validation
|
||||||
* @return list of pairs of log-files (old, new) and for each pair, rename must be done to successfully unschedule
|
* @return list of pairs of log-files (old, new) and for each pair, rename must be done to successfully unschedule
|
||||||
* compaction.
|
* compaction.
|
||||||
*/
|
*/
|
||||||
public List<Pair<HoodieLogFile, HoodieLogFile>> getRenamingActionsForUnschedulingCompactionForFileId(
|
public List<Pair<HoodieLogFile, HoodieLogFile>> getRenamingActionsForUnschedulingCompactionForFileId(
|
||||||
HoodieTableMetaClient metaClient, HoodieFileGroupId fgId,
|
HoodieTableMetaClient metaClient, HoodieFileGroupId fgId, Option<HoodieTableFileSystemView> fsViewOpt,
|
||||||
Option<HoodieTableFileSystemView> fsViewOpt, boolean skipValidation) throws IOException {
|
boolean skipValidation) throws IOException {
|
||||||
Map<HoodieFileGroupId, Pair<String, HoodieCompactionOperation>> allPendingCompactions =
|
Map<HoodieFileGroupId, Pair<String, HoodieCompactionOperation>> allPendingCompactions =
|
||||||
CompactionUtils.getAllPendingCompactionOperations(metaClient);
|
CompactionUtils.getAllPendingCompactionOperations(metaClient);
|
||||||
if (allPendingCompactions.containsKey(fgId)) {
|
if (allPendingCompactions.containsKey(fgId)) {
|
||||||
@@ -496,20 +482,19 @@ public class CompactionAdminClient extends AbstractHoodieClient {
|
|||||||
*/
|
*/
|
||||||
public static class RenameOpResult extends OperationResult<RenameInfo> {
|
public static class RenameOpResult extends OperationResult<RenameInfo> {
|
||||||
|
|
||||||
public RenameOpResult() {
|
public RenameOpResult() {}
|
||||||
|
|
||||||
|
public RenameOpResult(Pair<HoodieLogFile, HoodieLogFile> op, boolean success, Option<Exception> exception) {
|
||||||
|
super(
|
||||||
|
new RenameInfo(op.getKey().getFileId(), op.getKey().getPath().toString(), op.getRight().getPath().toString()),
|
||||||
|
success, exception);
|
||||||
}
|
}
|
||||||
|
|
||||||
public RenameOpResult(Pair<HoodieLogFile, HoodieLogFile> op, boolean success,
|
public RenameOpResult(Pair<HoodieLogFile, HoodieLogFile> op, boolean executed, boolean success,
|
||||||
Option<Exception> exception) {
|
Option<Exception> exception) {
|
||||||
super(new RenameInfo(op.getKey().getFileId(), op.getKey().getPath().toString(),
|
super(
|
||||||
op.getRight().getPath().toString()), success, exception);
|
new RenameInfo(op.getKey().getFileId(), op.getKey().getPath().toString(), op.getRight().getPath().toString()),
|
||||||
}
|
executed, success, exception);
|
||||||
|
|
||||||
public RenameOpResult(
|
|
||||||
Pair<HoodieLogFile, HoodieLogFile> op, boolean executed, boolean success,
|
|
||||||
Option<Exception> exception) {
|
|
||||||
super(new RenameInfo(op.getKey().getFileId(), op.getKey().getPath().toString(),
|
|
||||||
op.getRight().getPath().toString()), executed, success, exception);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -518,11 +503,9 @@ public class CompactionAdminClient extends AbstractHoodieClient {
|
|||||||
*/
|
*/
|
||||||
public static class ValidationOpResult extends OperationResult<CompactionOperation> {
|
public static class ValidationOpResult extends OperationResult<CompactionOperation> {
|
||||||
|
|
||||||
public ValidationOpResult() {
|
public ValidationOpResult() {}
|
||||||
}
|
|
||||||
|
|
||||||
public ValidationOpResult(
|
public ValidationOpResult(CompactionOperation operation, boolean success, Option<Exception> exception) {
|
||||||
CompactionOperation operation, boolean success, Option<Exception> exception) {
|
|
||||||
super(operation, success, exception);
|
super(operation, success, exception);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -533,8 +516,7 @@ public class CompactionAdminClient extends AbstractHoodieClient {
|
|||||||
public String srcPath;
|
public String srcPath;
|
||||||
public String destPath;
|
public String destPath;
|
||||||
|
|
||||||
public RenameInfo() {
|
public RenameInfo() {}
|
||||||
}
|
|
||||||
|
|
||||||
public RenameInfo(String fileId, String srcPath, String destPath) {
|
public RenameInfo(String fileId, String srcPath, String destPath) {
|
||||||
this.fileId = fileId;
|
this.fileId = fileId;
|
||||||
|
|||||||
@@ -58,9 +58,8 @@ public class HoodieReadClient<T extends HoodieRecordPayload> extends AbstractHoo
|
|||||||
private static final Logger logger = LogManager.getLogger(HoodieReadClient.class);
|
private static final Logger logger = LogManager.getLogger(HoodieReadClient.class);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* TODO: We need to persist the index type into hoodie.properties and be able to access the index
|
* TODO: We need to persist the index type into hoodie.properties and be able to access the index just with a simple
|
||||||
* just with a simple basepath pointing to the dataset. Until, then just always assume a
|
* basepath pointing to the dataset. Until, then just always assume a BloomIndex
|
||||||
* BloomIndex
|
|
||||||
*/
|
*/
|
||||||
private final transient HoodieIndex<T> index;
|
private final transient HoodieIndex<T> index;
|
||||||
private final HoodieTimeline commitTimeline;
|
private final HoodieTimeline commitTimeline;
|
||||||
@@ -70,13 +69,11 @@ public class HoodieReadClient<T extends HoodieRecordPayload> extends AbstractHoo
|
|||||||
/**
|
/**
|
||||||
* @param basePath path to Hoodie dataset
|
* @param basePath path to Hoodie dataset
|
||||||
*/
|
*/
|
||||||
public HoodieReadClient(JavaSparkContext jsc, String basePath,
|
public HoodieReadClient(JavaSparkContext jsc, String basePath, Option<EmbeddedTimelineService> timelineService) {
|
||||||
Option<EmbeddedTimelineService> timelineService) {
|
|
||||||
this(jsc, HoodieWriteConfig.newBuilder().withPath(basePath)
|
this(jsc, HoodieWriteConfig.newBuilder().withPath(basePath)
|
||||||
// by default we use HoodieBloomIndex
|
// by default we use HoodieBloomIndex
|
||||||
.withIndexConfig(
|
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build(),
|
||||||
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
|
timelineService);
|
||||||
.build(), timelineService);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -130,8 +127,7 @@ public class HoodieReadClient<T extends HoodieRecordPayload> extends AbstractHoo
|
|||||||
|
|
||||||
private void assertSqlContext() {
|
private void assertSqlContext() {
|
||||||
if (!sqlContextOpt.isPresent()) {
|
if (!sqlContextOpt.isPresent()) {
|
||||||
throw new IllegalStateException(
|
throw new IllegalStateException("SQLContext must be set, when performing dataframe operations");
|
||||||
"SQLContext must be set, when performing dataframe operations");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -152,17 +148,16 @@ public class HoodieReadClient<T extends HoodieRecordPayload> extends AbstractHoo
|
|||||||
*/
|
*/
|
||||||
public Dataset<Row> readROView(JavaRDD<HoodieKey> hoodieKeys, int parallelism) {
|
public Dataset<Row> readROView(JavaRDD<HoodieKey> hoodieKeys, int parallelism) {
|
||||||
assertSqlContext();
|
assertSqlContext();
|
||||||
JavaPairRDD<HoodieKey, Option<Pair<String, String>>> lookupResultRDD = index
|
JavaPairRDD<HoodieKey, Option<Pair<String, String>>> lookupResultRDD =
|
||||||
.fetchRecordLocation(hoodieKeys, jsc, hoodieTable);
|
index.fetchRecordLocation(hoodieKeys, jsc, hoodieTable);
|
||||||
JavaPairRDD<HoodieKey, Option<String>> keyToFileRDD = lookupResultRDD
|
JavaPairRDD<HoodieKey, Option<String>> keyToFileRDD =
|
||||||
.mapToPair(r -> new Tuple2<>(r._1, convertToDataFilePath(r._2)));
|
lookupResultRDD.mapToPair(r -> new Tuple2<>(r._1, convertToDataFilePath(r._2)));
|
||||||
List<String> paths = keyToFileRDD.filter(keyFileTuple -> keyFileTuple._2().isPresent())
|
List<String> paths = keyToFileRDD.filter(keyFileTuple -> keyFileTuple._2().isPresent())
|
||||||
.map(keyFileTuple -> keyFileTuple._2().get()).collect();
|
.map(keyFileTuple -> keyFileTuple._2().get()).collect();
|
||||||
|
|
||||||
// record locations might be same for multiple keys, so need a unique list
|
// record locations might be same for multiple keys, so need a unique list
|
||||||
Set<String> uniquePaths = new HashSet<>(paths);
|
Set<String> uniquePaths = new HashSet<>(paths);
|
||||||
Dataset<Row> originalDF = sqlContextOpt.get().read()
|
Dataset<Row> originalDF = sqlContextOpt.get().read().parquet(uniquePaths.toArray(new String[uniquePaths.size()]));
|
||||||
.parquet(uniquePaths.toArray(new String[uniquePaths.size()]));
|
|
||||||
StructType schema = originalDF.schema();
|
StructType schema = originalDF.schema();
|
||||||
JavaPairRDD<HoodieKey, Row> keyRowRDD = originalDF.javaRDD().mapToPair(row -> {
|
JavaPairRDD<HoodieKey, Row> keyRowRDD = originalDF.javaRDD().mapToPair(row -> {
|
||||||
HoodieKey key = new HoodieKey(row.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD),
|
HoodieKey key = new HoodieKey(row.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD),
|
||||||
@@ -176,18 +171,16 @@ public class HoodieReadClient<T extends HoodieRecordPayload> extends AbstractHoo
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Checks if the given [Keys] exists in the hoodie table and returns [Key, Option[FullFilePath]]
|
* Checks if the given [Keys] exists in the hoodie table and returns [Key, Option[FullFilePath]] If the optional
|
||||||
* If the optional FullFilePath value is not present, then the key is not found. If the
|
* FullFilePath value is not present, then the key is not found. If the FullFilePath value is present, it is the path
|
||||||
* FullFilePath value is present, it is the path component (without scheme) of the URI underlying
|
* component (without scheme) of the URI underlying file
|
||||||
* file
|
|
||||||
*/
|
*/
|
||||||
public JavaPairRDD<HoodieKey, Option<String>> checkExists(JavaRDD<HoodieKey> hoodieKeys) {
|
public JavaPairRDD<HoodieKey, Option<String>> checkExists(JavaRDD<HoodieKey> hoodieKeys) {
|
||||||
return index.fetchRecordLocation(hoodieKeys, jsc, hoodieTable);
|
return index.fetchRecordLocation(hoodieKeys, jsc, hoodieTable);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Filter out HoodieRecords that already exists in the output folder. This is useful in
|
* Filter out HoodieRecords that already exists in the output folder. This is useful in deduplication.
|
||||||
* deduplication.
|
|
||||||
*
|
*
|
||||||
* @param hoodieRecords Input RDD of Hoodie records.
|
* @param hoodieRecords Input RDD of Hoodie records.
|
||||||
* @return A subset of hoodieRecords RDD, with existing records filtered out.
|
* @return A subset of hoodieRecords RDD, with existing records filtered out.
|
||||||
@@ -198,27 +191,27 @@ public class HoodieReadClient<T extends HoodieRecordPayload> extends AbstractHoo
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Looks up the index and tags each incoming record with a location of a file that contains the
|
* Looks up the index and tags each incoming record with a location of a file that contains the row (if it is actually
|
||||||
* row (if it is actually present). Input RDD should contain no duplicates if needed.
|
* present). Input RDD should contain no duplicates if needed.
|
||||||
*
|
*
|
||||||
* @param hoodieRecords Input RDD of Hoodie records
|
* @param hoodieRecords Input RDD of Hoodie records
|
||||||
* @return Tagged RDD of Hoodie records
|
* @return Tagged RDD of Hoodie records
|
||||||
*/
|
*/
|
||||||
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> hoodieRecords)
|
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> hoodieRecords) throws HoodieIndexException {
|
||||||
throws HoodieIndexException {
|
|
||||||
return index.tagLocation(hoodieRecords, jsc, hoodieTable);
|
return index.tagLocation(hoodieRecords, jsc, hoodieTable);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return all pending compactions with instant time for clients to decide what to compact next.
|
* Return all pending compactions with instant time for clients to decide what to compact next.
|
||||||
|
*
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public List<Pair<String, HoodieCompactionPlan>> getPendingCompactions() {
|
public List<Pair<String, HoodieCompactionPlan>> getPendingCompactions() {
|
||||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
|
HoodieTableMetaClient metaClient =
|
||||||
hoodieTable.getMetaClient().getBasePath(), true);
|
new HoodieTableMetaClient(jsc.hadoopConfiguration(), hoodieTable.getMetaClient().getBasePath(), true);
|
||||||
return CompactionUtils.getAllPendingCompactionPlans(metaClient).stream()
|
return CompactionUtils.getAllPendingCompactionPlans(metaClient).stream()
|
||||||
.map(instantWorkloadPair ->
|
.map(
|
||||||
Pair.of(instantWorkloadPair.getKey().getTimestamp(), instantWorkloadPair.getValue()))
|
instantWorkloadPair -> Pair.of(instantWorkloadPair.getKey().getTimestamp(), instantWorkloadPair.getValue()))
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -64,14 +64,11 @@ public class WriteStatus implements Serializable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Mark write as success, optionally using given parameters for the purpose of calculating some
|
* Mark write as success, optionally using given parameters for the purpose of calculating some aggregate metrics.
|
||||||
* aggregate metrics. This method is not meant to cache passed arguments, since WriteStatus
|
* This method is not meant to cache passed arguments, since WriteStatus objects are collected in Spark Driver.
|
||||||
* objects are collected in Spark Driver.
|
|
||||||
*
|
*
|
||||||
* @param record deflated {@code HoodieRecord} containing information that uniquely identifies
|
* @param record deflated {@code HoodieRecord} containing information that uniquely identifies it.
|
||||||
* it.
|
* @param optionalRecordMetadata optional metadata related to data contained in {@link HoodieRecord} before deflation.
|
||||||
* @param optionalRecordMetadata optional metadata related to data contained in {@link
|
|
||||||
* HoodieRecord} before deflation.
|
|
||||||
*/
|
*/
|
||||||
public void markSuccess(HoodieRecord record, Option<Map<String, String>> optionalRecordMetadata) {
|
public void markSuccess(HoodieRecord record, Option<Map<String, String>> optionalRecordMetadata) {
|
||||||
if (trackSuccessRecords) {
|
if (trackSuccessRecords) {
|
||||||
@@ -81,14 +78,11 @@ public class WriteStatus implements Serializable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Mark write as failed, optionally using given parameters for the purpose of calculating some
|
* Mark write as failed, optionally using given parameters for the purpose of calculating some aggregate metrics. This
|
||||||
* aggregate metrics. This method is not meant to cache passed arguments, since WriteStatus
|
* method is not meant to cache passed arguments, since WriteStatus objects are collected in Spark Driver.
|
||||||
* objects are collected in Spark Driver.
|
|
||||||
*
|
*
|
||||||
* @param record deflated {@code HoodieRecord} containing information that uniquely identifies
|
* @param record deflated {@code HoodieRecord} containing information that uniquely identifies it.
|
||||||
* it.
|
* @param optionalRecordMetadata optional metadata related to data contained in {@link HoodieRecord} before deflation.
|
||||||
* @param optionalRecordMetadata optional metadata related to data contained in {@link
|
|
||||||
* HoodieRecord} before deflation.
|
|
||||||
*/
|
*/
|
||||||
public void markFailure(HoodieRecord record, Throwable t, Option<Map<String, String>> optionalRecordMetadata) {
|
public void markFailure(HoodieRecord record, Throwable t, Option<Map<String, String>> optionalRecordMetadata) {
|
||||||
if (failedRecords.isEmpty() || (random.nextDouble() <= failureFraction)) {
|
if (failedRecords.isEmpty() || (random.nextDouble() <= failureFraction)) {
|
||||||
|
|||||||
@@ -40,10 +40,8 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
|||||||
// Turn on inline compaction - after fw delta commits a inline compaction will be run
|
// Turn on inline compaction - after fw delta commits a inline compaction will be run
|
||||||
public static final String INLINE_COMPACT_PROP = "hoodie.compact.inline";
|
public static final String INLINE_COMPACT_PROP = "hoodie.compact.inline";
|
||||||
// Run a compaction every N delta commits
|
// Run a compaction every N delta commits
|
||||||
public static final String INLINE_COMPACT_NUM_DELTA_COMMITS_PROP =
|
public static final String INLINE_COMPACT_NUM_DELTA_COMMITS_PROP = "hoodie.compact.inline.max" + ".delta.commits";
|
||||||
"hoodie.compact.inline.max" + ".delta.commits";
|
public static final String CLEANER_FILE_VERSIONS_RETAINED_PROP = "hoodie.cleaner.fileversions" + ".retained";
|
||||||
public static final String CLEANER_FILE_VERSIONS_RETAINED_PROP =
|
|
||||||
"hoodie.cleaner.fileversions" + ".retained";
|
|
||||||
public static final String CLEANER_COMMITS_RETAINED_PROP = "hoodie.cleaner.commits.retained";
|
public static final String CLEANER_COMMITS_RETAINED_PROP = "hoodie.cleaner.commits.retained";
|
||||||
public static final String MAX_COMMITS_TO_KEEP_PROP = "hoodie.keep.max.commits";
|
public static final String MAX_COMMITS_TO_KEEP_PROP = "hoodie.keep.max.commits";
|
||||||
public static final String MIN_COMMITS_TO_KEEP_PROP = "hoodie.keep.min.commits";
|
public static final String MIN_COMMITS_TO_KEEP_PROP = "hoodie.keep.min.commits";
|
||||||
@@ -56,25 +54,21 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
|||||||
* Configs related to specific table types
|
* Configs related to specific table types
|
||||||
**/
|
**/
|
||||||
// Number of inserts, that will be put each partition/bucket for writing
|
// Number of inserts, that will be put each partition/bucket for writing
|
||||||
public static final String COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE =
|
public static final String COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE = "hoodie.copyonwrite.insert" + ".split.size";
|
||||||
"hoodie.copyonwrite.insert" + ".split.size";
|
|
||||||
// The rationale to pick the insert parallelism is the following. Writing out 100MB files,
|
// The rationale to pick the insert parallelism is the following. Writing out 100MB files,
|
||||||
// with atleast 1kb records, means 100K records per file. we just overprovision to 500K
|
// with atleast 1kb records, means 100K records per file. we just overprovision to 500K
|
||||||
public static final String DEFAULT_COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE = String.valueOf(500000);
|
public static final String DEFAULT_COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE = String.valueOf(500000);
|
||||||
// Config to control whether we control insert split sizes automatically based on average
|
// Config to control whether we control insert split sizes automatically based on average
|
||||||
// record sizes
|
// record sizes
|
||||||
public static final String COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS =
|
public static final String COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS = "hoodie.copyonwrite.insert" + ".auto.split";
|
||||||
"hoodie.copyonwrite.insert" + ".auto.split";
|
|
||||||
// its off by default
|
// its off by default
|
||||||
public static final String DEFAULT_COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS = String.valueOf(true);
|
public static final String DEFAULT_COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS = String.valueOf(true);
|
||||||
// This value is used as a guessimate for the record size, if we can't determine this from
|
// This value is used as a guessimate for the record size, if we can't determine this from
|
||||||
// previous commits
|
// previous commits
|
||||||
public static final String COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE =
|
public static final String COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = "hoodie.copyonwrite" + ".record.size.estimate";
|
||||||
"hoodie.copyonwrite" + ".record.size.estimate";
|
|
||||||
// Used to determine how much more can be packed into a small file, before it exceeds the size
|
// Used to determine how much more can be packed into a small file, before it exceeds the size
|
||||||
// limit.
|
// limit.
|
||||||
public static final String DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = String
|
public static final String DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = String.valueOf(1024);
|
||||||
.valueOf(1024);
|
|
||||||
public static final String CLEANER_PARALLELISM = "hoodie.cleaner.parallelism";
|
public static final String CLEANER_PARALLELISM = "hoodie.cleaner.parallelism";
|
||||||
public static final String DEFAULT_CLEANER_PARALLELISM = String.valueOf(200);
|
public static final String DEFAULT_CLEANER_PARALLELISM = String.valueOf(200);
|
||||||
public static final String TARGET_IO_PER_COMPACTION_IN_MB_PROP = "hoodie.compaction.target.io";
|
public static final String TARGET_IO_PER_COMPACTION_IN_MB_PROP = "hoodie.compaction.target.io";
|
||||||
@@ -82,8 +76,7 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
|||||||
public static final String DEFAULT_TARGET_IO_PER_COMPACTION_IN_MB = String.valueOf(500 * 1024);
|
public static final String DEFAULT_TARGET_IO_PER_COMPACTION_IN_MB = String.valueOf(500 * 1024);
|
||||||
public static final String COMPACTION_STRATEGY_PROP = "hoodie.compaction.strategy";
|
public static final String COMPACTION_STRATEGY_PROP = "hoodie.compaction.strategy";
|
||||||
// 200GB of target IO per compaction
|
// 200GB of target IO per compaction
|
||||||
public static final String DEFAULT_COMPACTION_STRATEGY = LogFileSizeBasedCompactionStrategy.class
|
public static final String DEFAULT_COMPACTION_STRATEGY = LogFileSizeBasedCompactionStrategy.class.getName();
|
||||||
.getName();
|
|
||||||
// used to merge records written to log file
|
// used to merge records written to log file
|
||||||
public static final String DEFAULT_PAYLOAD_CLASS = HoodieAvroPayload.class.getName();
|
public static final String DEFAULT_PAYLOAD_CLASS = HoodieAvroPayload.class.getName();
|
||||||
public static final String PAYLOAD_CLASS_PROP = "hoodie.compaction.payload.class";
|
public static final String PAYLOAD_CLASS_PROP = "hoodie.compaction.payload.class";
|
||||||
@@ -91,15 +84,12 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
|||||||
// used to choose a trade off between IO vs Memory when performing compaction process
|
// used to choose a trade off between IO vs Memory when performing compaction process
|
||||||
// Depending on outputfile_size and memory provided, choose true to avoid OOM for large file
|
// Depending on outputfile_size and memory provided, choose true to avoid OOM for large file
|
||||||
// size + small memory
|
// size + small memory
|
||||||
public static final String COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP =
|
public static final String COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP = "hoodie.compaction.lazy" + ".block.read";
|
||||||
"hoodie.compaction.lazy" + ".block.read";
|
|
||||||
public static final String DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED = "false";
|
public static final String DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED = "false";
|
||||||
// used to choose whether to enable reverse log reading (reverse log traversal)
|
// used to choose whether to enable reverse log reading (reverse log traversal)
|
||||||
public static final String COMPACTION_REVERSE_LOG_READ_ENABLED_PROP =
|
public static final String COMPACTION_REVERSE_LOG_READ_ENABLED_PROP = "hoodie.compaction" + ".reverse.log.read";
|
||||||
"hoodie.compaction" + ".reverse.log.read";
|
|
||||||
public static final String DEFAULT_COMPACTION_REVERSE_LOG_READ_ENABLED = "false";
|
public static final String DEFAULT_COMPACTION_REVERSE_LOG_READ_ENABLED = "false";
|
||||||
private static final String DEFAULT_CLEANER_POLICY = HoodieCleaningPolicy.KEEP_LATEST_COMMITS
|
private static final String DEFAULT_CLEANER_POLICY = HoodieCleaningPolicy.KEEP_LATEST_COMMITS.name();
|
||||||
.name();
|
|
||||||
private static final String DEFAULT_AUTO_CLEAN = "true";
|
private static final String DEFAULT_AUTO_CLEAN = "true";
|
||||||
private static final String DEFAULT_INLINE_COMPACT = "false";
|
private static final String DEFAULT_INLINE_COMPACT = "false";
|
||||||
private static final String DEFAULT_INLINE_COMPACT_NUM_DELTA_COMMITS = "1";
|
private static final String DEFAULT_INLINE_COMPACT_NUM_DELTA_COMMITS = "1";
|
||||||
@@ -108,8 +98,8 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
|||||||
private static final String DEFAULT_MAX_COMMITS_TO_KEEP = "30";
|
private static final String DEFAULT_MAX_COMMITS_TO_KEEP = "30";
|
||||||
private static final String DEFAULT_MIN_COMMITS_TO_KEEP = "20";
|
private static final String DEFAULT_MIN_COMMITS_TO_KEEP = "20";
|
||||||
private static final String DEFAULT_COMMITS_ARCHIVAL_BATCH_SIZE = String.valueOf(10);
|
private static final String DEFAULT_COMMITS_ARCHIVAL_BATCH_SIZE = String.valueOf(10);
|
||||||
public static final String TARGET_PARTITIONS_PER_DAYBASED_COMPACTION_PROP = "hoodie.compaction.daybased.target"
|
public static final String TARGET_PARTITIONS_PER_DAYBASED_COMPACTION_PROP =
|
||||||
+ ".partitions";
|
"hoodie.compaction.daybased.target" + ".partitions";
|
||||||
// 500GB of target IO per compaction (both read and write)
|
// 500GB of target IO per compaction (both read and write)
|
||||||
public static final String DEFAULT_TARGET_PARTITIONS_PER_DAYBASED_COMPACTION = String.valueOf(10);
|
public static final String DEFAULT_TARGET_PARTITIONS_PER_DAYBASED_COMPACTION = String.valueOf(10);
|
||||||
|
|
||||||
@@ -188,14 +178,12 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public Builder autoTuneInsertSplits(boolean autoTuneInsertSplits) {
|
public Builder autoTuneInsertSplits(boolean autoTuneInsertSplits) {
|
||||||
props.setProperty(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS,
|
props.setProperty(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS, String.valueOf(autoTuneInsertSplits));
|
||||||
String.valueOf(autoTuneInsertSplits));
|
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Builder approxRecordSize(int recordSizeEstimate) {
|
public Builder approxRecordSize(int recordSizeEstimate) {
|
||||||
props.setProperty(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE,
|
props.setProperty(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE, String.valueOf(recordSizeEstimate));
|
||||||
String.valueOf(recordSizeEstimate));
|
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -215,32 +203,27 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public Builder withTargetIOPerCompactionInMB(long targetIOPerCompactionInMB) {
|
public Builder withTargetIOPerCompactionInMB(long targetIOPerCompactionInMB) {
|
||||||
props.setProperty(TARGET_IO_PER_COMPACTION_IN_MB_PROP,
|
props.setProperty(TARGET_IO_PER_COMPACTION_IN_MB_PROP, String.valueOf(targetIOPerCompactionInMB));
|
||||||
String.valueOf(targetIOPerCompactionInMB));
|
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Builder withMaxNumDeltaCommitsBeforeCompaction(int maxNumDeltaCommitsBeforeCompaction) {
|
public Builder withMaxNumDeltaCommitsBeforeCompaction(int maxNumDeltaCommitsBeforeCompaction) {
|
||||||
props.setProperty(INLINE_COMPACT_NUM_DELTA_COMMITS_PROP,
|
props.setProperty(INLINE_COMPACT_NUM_DELTA_COMMITS_PROP, String.valueOf(maxNumDeltaCommitsBeforeCompaction));
|
||||||
String.valueOf(maxNumDeltaCommitsBeforeCompaction));
|
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Builder withCompactionLazyBlockReadEnabled(Boolean compactionLazyBlockReadEnabled) {
|
public Builder withCompactionLazyBlockReadEnabled(Boolean compactionLazyBlockReadEnabled) {
|
||||||
props.setProperty(COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP,
|
props.setProperty(COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP, String.valueOf(compactionLazyBlockReadEnabled));
|
||||||
String.valueOf(compactionLazyBlockReadEnabled));
|
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Builder withCompactionReverseLogReadEnabled(Boolean compactionReverseLogReadEnabled) {
|
public Builder withCompactionReverseLogReadEnabled(Boolean compactionReverseLogReadEnabled) {
|
||||||
props.setProperty(COMPACTION_REVERSE_LOG_READ_ENABLED_PROP,
|
props.setProperty(COMPACTION_REVERSE_LOG_READ_ENABLED_PROP, String.valueOf(compactionReverseLogReadEnabled));
|
||||||
String.valueOf(compactionReverseLogReadEnabled));
|
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Builder withTargetPartitionsPerDayBasedCompaction(int targetPartitionsPerCompaction) {
|
public Builder withTargetPartitionsPerDayBasedCompaction(int targetPartitionsPerCompaction) {
|
||||||
props.setProperty(TARGET_PARTITIONS_PER_DAYBASED_COMPACTION_PROP,
|
props.setProperty(TARGET_PARTITIONS_PER_DAYBASED_COMPACTION_PROP, String.valueOf(targetPartitionsPerCompaction));
|
||||||
String.valueOf(targetPartitionsPerCompaction));
|
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -251,8 +234,7 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
|||||||
|
|
||||||
public HoodieCompactionConfig build() {
|
public HoodieCompactionConfig build() {
|
||||||
HoodieCompactionConfig config = new HoodieCompactionConfig(props);
|
HoodieCompactionConfig config = new HoodieCompactionConfig(props);
|
||||||
setDefaultOnCondition(props, !props.containsKey(AUTO_CLEAN_PROP), AUTO_CLEAN_PROP,
|
setDefaultOnCondition(props, !props.containsKey(AUTO_CLEAN_PROP), AUTO_CLEAN_PROP, DEFAULT_AUTO_CLEAN);
|
||||||
DEFAULT_AUTO_CLEAN);
|
|
||||||
setDefaultOnCondition(props, !props.containsKey(INLINE_COMPACT_PROP), INLINE_COMPACT_PROP,
|
setDefaultOnCondition(props, !props.containsKey(INLINE_COMPACT_PROP), INLINE_COMPACT_PROP,
|
||||||
DEFAULT_INLINE_COMPACT);
|
DEFAULT_INLINE_COMPACT);
|
||||||
setDefaultOnCondition(props, !props.containsKey(INLINE_COMPACT_NUM_DELTA_COMMITS_PROP),
|
setDefaultOnCondition(props, !props.containsKey(INLINE_COMPACT_NUM_DELTA_COMMITS_PROP),
|
||||||
@@ -261,27 +243,25 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
|||||||
DEFAULT_CLEANER_POLICY);
|
DEFAULT_CLEANER_POLICY);
|
||||||
setDefaultOnCondition(props, !props.containsKey(CLEANER_FILE_VERSIONS_RETAINED_PROP),
|
setDefaultOnCondition(props, !props.containsKey(CLEANER_FILE_VERSIONS_RETAINED_PROP),
|
||||||
CLEANER_FILE_VERSIONS_RETAINED_PROP, DEFAULT_CLEANER_FILE_VERSIONS_RETAINED);
|
CLEANER_FILE_VERSIONS_RETAINED_PROP, DEFAULT_CLEANER_FILE_VERSIONS_RETAINED);
|
||||||
setDefaultOnCondition(props, !props.containsKey(CLEANER_COMMITS_RETAINED_PROP),
|
setDefaultOnCondition(props, !props.containsKey(CLEANER_COMMITS_RETAINED_PROP), CLEANER_COMMITS_RETAINED_PROP,
|
||||||
CLEANER_COMMITS_RETAINED_PROP, DEFAULT_CLEANER_COMMITS_RETAINED);
|
DEFAULT_CLEANER_COMMITS_RETAINED);
|
||||||
setDefaultOnCondition(props, !props.containsKey(MAX_COMMITS_TO_KEEP_PROP), MAX_COMMITS_TO_KEEP_PROP,
|
setDefaultOnCondition(props, !props.containsKey(MAX_COMMITS_TO_KEEP_PROP), MAX_COMMITS_TO_KEEP_PROP,
|
||||||
DEFAULT_MAX_COMMITS_TO_KEEP);
|
DEFAULT_MAX_COMMITS_TO_KEEP);
|
||||||
setDefaultOnCondition(props, !props.containsKey(MIN_COMMITS_TO_KEEP_PROP), MIN_COMMITS_TO_KEEP_PROP,
|
setDefaultOnCondition(props, !props.containsKey(MIN_COMMITS_TO_KEEP_PROP), MIN_COMMITS_TO_KEEP_PROP,
|
||||||
DEFAULT_MIN_COMMITS_TO_KEEP);
|
DEFAULT_MIN_COMMITS_TO_KEEP);
|
||||||
setDefaultOnCondition(props, !props.containsKey(PARQUET_SMALL_FILE_LIMIT_BYTES),
|
setDefaultOnCondition(props, !props.containsKey(PARQUET_SMALL_FILE_LIMIT_BYTES), PARQUET_SMALL_FILE_LIMIT_BYTES,
|
||||||
PARQUET_SMALL_FILE_LIMIT_BYTES, DEFAULT_PARQUET_SMALL_FILE_LIMIT_BYTES);
|
DEFAULT_PARQUET_SMALL_FILE_LIMIT_BYTES);
|
||||||
setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE),
|
setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE),
|
||||||
COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE, DEFAULT_COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE);
|
COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE, DEFAULT_COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE);
|
||||||
setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS),
|
setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS),
|
||||||
COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS, DEFAULT_COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS);
|
COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS, DEFAULT_COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS);
|
||||||
setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE),
|
setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE),
|
||||||
COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE,
|
COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE, DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE);
|
||||||
DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE);
|
|
||||||
setDefaultOnCondition(props, !props.containsKey(CLEANER_PARALLELISM), CLEANER_PARALLELISM,
|
setDefaultOnCondition(props, !props.containsKey(CLEANER_PARALLELISM), CLEANER_PARALLELISM,
|
||||||
DEFAULT_CLEANER_PARALLELISM);
|
DEFAULT_CLEANER_PARALLELISM);
|
||||||
setDefaultOnCondition(props, !props.containsKey(COMPACTION_STRATEGY_PROP),
|
setDefaultOnCondition(props, !props.containsKey(COMPACTION_STRATEGY_PROP), COMPACTION_STRATEGY_PROP,
|
||||||
COMPACTION_STRATEGY_PROP, DEFAULT_COMPACTION_STRATEGY);
|
DEFAULT_COMPACTION_STRATEGY);
|
||||||
setDefaultOnCondition(props, !props.containsKey(PAYLOAD_CLASS_PROP),
|
setDefaultOnCondition(props, !props.containsKey(PAYLOAD_CLASS_PROP), PAYLOAD_CLASS_PROP, DEFAULT_PAYLOAD_CLASS);
|
||||||
PAYLOAD_CLASS_PROP, DEFAULT_PAYLOAD_CLASS);
|
|
||||||
setDefaultOnCondition(props, !props.containsKey(TARGET_IO_PER_COMPACTION_IN_MB_PROP),
|
setDefaultOnCondition(props, !props.containsKey(TARGET_IO_PER_COMPACTION_IN_MB_PROP),
|
||||||
TARGET_IO_PER_COMPACTION_IN_MB_PROP, DEFAULT_TARGET_IO_PER_COMPACTION_IN_MB);
|
TARGET_IO_PER_COMPACTION_IN_MB_PROP, DEFAULT_TARGET_IO_PER_COMPACTION_IN_MB);
|
||||||
setDefaultOnCondition(props, !props.containsKey(COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP),
|
setDefaultOnCondition(props, !props.containsKey(COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP),
|
||||||
@@ -299,13 +279,15 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
|||||||
// commit instant on timeline, that still has not been cleaned. Could miss some data via incr pull
|
// commit instant on timeline, that still has not been cleaned. Could miss some data via incr pull
|
||||||
int minInstantsToKeep = Integer.parseInt(props.getProperty(HoodieCompactionConfig.MIN_COMMITS_TO_KEEP_PROP));
|
int minInstantsToKeep = Integer.parseInt(props.getProperty(HoodieCompactionConfig.MIN_COMMITS_TO_KEEP_PROP));
|
||||||
int maxInstantsToKeep = Integer.parseInt(props.getProperty(HoodieCompactionConfig.MAX_COMMITS_TO_KEEP_PROP));
|
int maxInstantsToKeep = Integer.parseInt(props.getProperty(HoodieCompactionConfig.MAX_COMMITS_TO_KEEP_PROP));
|
||||||
int cleanerCommitsRetained = Integer
|
int cleanerCommitsRetained =
|
||||||
.parseInt(props.getProperty(HoodieCompactionConfig.CLEANER_COMMITS_RETAINED_PROP));
|
Integer.parseInt(props.getProperty(HoodieCompactionConfig.CLEANER_COMMITS_RETAINED_PROP));
|
||||||
Preconditions.checkArgument(maxInstantsToKeep > minInstantsToKeep);
|
Preconditions.checkArgument(maxInstantsToKeep > minInstantsToKeep);
|
||||||
Preconditions.checkArgument(minInstantsToKeep > cleanerCommitsRetained,
|
Preconditions.checkArgument(minInstantsToKeep > cleanerCommitsRetained,
|
||||||
String.format("Increase %s=%d to be greater than %s=%d. Otherwise, there is risk of incremental pull "
|
String.format(
|
||||||
+ "missing data from few instants.", HoodieCompactionConfig.MIN_COMMITS_TO_KEEP_PROP,
|
"Increase %s=%d to be greater than %s=%d. Otherwise, there is risk of incremental pull "
|
||||||
minInstantsToKeep, HoodieCompactionConfig.CLEANER_COMMITS_RETAINED_PROP, cleanerCommitsRetained));
|
+ "missing data from few instants.",
|
||||||
|
HoodieCompactionConfig.MIN_COMMITS_TO_KEEP_PROP, minInstantsToKeep,
|
||||||
|
HoodieCompactionConfig.CLEANER_COMMITS_RETAINED_PROP, cleanerCommitsRetained));
|
||||||
return config;
|
return config;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -32,8 +32,8 @@ public class HoodieHBaseIndexConfig extends DefaultHoodieConfig {
|
|||||||
public static final String HBASE_GET_BATCH_SIZE_PROP = "hoodie.index.hbase.get.batch.size";
|
public static final String HBASE_GET_BATCH_SIZE_PROP = "hoodie.index.hbase.get.batch.size";
|
||||||
public static final String HBASE_ZK_ZNODEPARENT = "hoodie.index.hbase.zknode.path";
|
public static final String HBASE_ZK_ZNODEPARENT = "hoodie.index.hbase.zknode.path";
|
||||||
/**
|
/**
|
||||||
* Note that if HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE_PROP is set to true, this batch size will not
|
* Note that if HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE_PROP is set to true, this batch size will not be honored for HBase
|
||||||
* be honored for HBase Puts
|
* Puts
|
||||||
*/
|
*/
|
||||||
public static final String HBASE_PUT_BATCH_SIZE_PROP = "hoodie.index.hbase.put.batch.size";
|
public static final String HBASE_PUT_BATCH_SIZE_PROP = "hoodie.index.hbase.put.batch.size";
|
||||||
|
|
||||||
@@ -48,18 +48,16 @@ public class HoodieHBaseIndexConfig extends DefaultHoodieConfig {
|
|||||||
public static final String HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE_PROP = "hoodie.index.hbase.put.batch.size.autocompute";
|
public static final String HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE_PROP = "hoodie.index.hbase.put.batch.size.autocompute";
|
||||||
public static final String DEFAULT_HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE = "false";
|
public static final String DEFAULT_HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE = "false";
|
||||||
/**
|
/**
|
||||||
* Property to set the fraction of the global share of QPS that should be allocated to this job.
|
* Property to set the fraction of the global share of QPS that should be allocated to this job. Let's say there are 3
|
||||||
* Let's say there are 3 jobs which have input size in terms of number of rows required for
|
* jobs which have input size in terms of number of rows required for HbaseIndexing as x, 2x, 3x respectively. Then
|
||||||
* HbaseIndexing as x, 2x, 3x respectively. Then this fraction for the jobs would be (0.17) 1/6,
|
* this fraction for the jobs would be (0.17) 1/6, 0.33 (2/6) and 0.5 (3/6) respectively.
|
||||||
* 0.33 (2/6) and 0.5 (3/6) respectively.
|
|
||||||
*/
|
*/
|
||||||
public static final String HBASE_QPS_FRACTION_PROP = "hoodie.index.hbase.qps.fraction";
|
public static final String HBASE_QPS_FRACTION_PROP = "hoodie.index.hbase.qps.fraction";
|
||||||
/**
|
/**
|
||||||
* Property to set maximum QPS allowed per Region Server. This should be same across various
|
* Property to set maximum QPS allowed per Region Server. This should be same across various jobs. This is intended to
|
||||||
* jobs. This is intended to limit the aggregate QPS generated across various jobs to an Hbase
|
* limit the aggregate QPS generated across various jobs to an Hbase Region Server. It is recommended to set this
|
||||||
* Region Server. It is recommended to set this value based on global indexing throughput needs
|
* value based on global indexing throughput needs and most importantly, how much the HBase installation in use is
|
||||||
* and most importantly, how much the HBase installation in use is able to tolerate without
|
* able to tolerate without Region Servers going down.
|
||||||
* Region Servers going down.
|
|
||||||
*/
|
*/
|
||||||
public static String HBASE_MAX_QPS_PER_REGION_SERVER_PROP = "hoodie.index.hbase.max.qps.per.region.server";
|
public static String HBASE_MAX_QPS_PER_REGION_SERVER_PROP = "hoodie.index.hbase.max.qps.per.region.server";
|
||||||
/**
|
/**
|
||||||
@@ -71,18 +69,17 @@ public class HoodieHBaseIndexConfig extends DefaultHoodieConfig {
|
|||||||
*/
|
*/
|
||||||
public static final int DEFAULT_HBASE_MAX_QPS_PER_REGION_SERVER = 1000;
|
public static final int DEFAULT_HBASE_MAX_QPS_PER_REGION_SERVER = 1000;
|
||||||
/**
|
/**
|
||||||
* Default is 50%, which means a total of 2 jobs can run using HbaseIndex without overwhelming
|
* Default is 50%, which means a total of 2 jobs can run using HbaseIndex without overwhelming Region Servers
|
||||||
* Region Servers
|
|
||||||
*/
|
*/
|
||||||
public static final float DEFAULT_HBASE_QPS_FRACTION = 0.5f;
|
public static final float DEFAULT_HBASE_QPS_FRACTION = 0.5f;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Property to decide if HBASE_QPS_FRACTION_PROP is dynamically calculated based on volume
|
* Property to decide if HBASE_QPS_FRACTION_PROP is dynamically calculated based on volume
|
||||||
*/
|
*/
|
||||||
public static final String HOODIE_INDEX_COMPUTE_QPS_DYNAMICALLY = "hoodie.index.hbase.dynamic_qps";
|
public static final String HOODIE_INDEX_COMPUTE_QPS_DYNAMICALLY = "hoodie.index.hbase.dynamic_qps";
|
||||||
public static final boolean DEFAULT_HOODIE_INDEX_COMPUTE_QPS_DYNAMICALLY = false;
|
public static final boolean DEFAULT_HOODIE_INDEX_COMPUTE_QPS_DYNAMICALLY = false;
|
||||||
/**
|
/**
|
||||||
* Min and Max for HBASE_QPS_FRACTION_PROP to stabilize skewed volume workloads
|
* Min and Max for HBASE_QPS_FRACTION_PROP to stabilize skewed volume workloads
|
||||||
*/
|
*/
|
||||||
public static final String HBASE_MIN_QPS_FRACTION_PROP = "hoodie.index.hbase.min.qps.fraction";
|
public static final String HBASE_MIN_QPS_FRACTION_PROP = "hoodie.index.hbase.min.qps.fraction";
|
||||||
public static final String DEFAULT_HBASE_MIN_QPS_FRACTION_PROP = "0.002";
|
public static final String DEFAULT_HBASE_MIN_QPS_FRACTION_PROP = "0.002";
|
||||||
@@ -90,7 +87,7 @@ public class HoodieHBaseIndexConfig extends DefaultHoodieConfig {
|
|||||||
public static final String HBASE_MAX_QPS_FRACTION_PROP = "hoodie.index.hbase.max.qps.fraction";
|
public static final String HBASE_MAX_QPS_FRACTION_PROP = "hoodie.index.hbase.max.qps.fraction";
|
||||||
public static final String DEFAULT_HBASE_MAX_QPS_FRACTION_PROP = "0.06";
|
public static final String DEFAULT_HBASE_MAX_QPS_FRACTION_PROP = "0.06";
|
||||||
/**
|
/**
|
||||||
* Hoodie index desired puts operation time in seconds
|
* Hoodie index desired puts operation time in seconds
|
||||||
*/
|
*/
|
||||||
public static final String HOODIE_INDEX_DESIRED_PUTS_TIME_IN_SECS = "hoodie.index.hbase.desired_puts_time_in_secs";
|
public static final String HOODIE_INDEX_DESIRED_PUTS_TIME_IN_SECS = "hoodie.index.hbase.desired_puts_time_in_secs";
|
||||||
public static final int DEFAULT_HOODIE_INDEX_DESIRED_PUTS_TIME_IN_SECS = 600;
|
public static final int DEFAULT_HOODIE_INDEX_DESIRED_PUTS_TIME_IN_SECS = 600;
|
||||||
@@ -105,7 +102,7 @@ public class HoodieHBaseIndexConfig extends DefaultHoodieConfig {
|
|||||||
public static final String DEFAULT_HBASE_ZK_PATH_QPS_ROOT = "/QPS_ROOT";
|
public static final String DEFAULT_HBASE_ZK_PATH_QPS_ROOT = "/QPS_ROOT";
|
||||||
|
|
||||||
public HoodieHBaseIndexConfig(final Properties props) {
|
public HoodieHBaseIndexConfig(final Properties props) {
|
||||||
super(props);
|
super(props);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static HoodieHBaseIndexConfig.Builder newBuilder() {
|
public static HoodieHBaseIndexConfig.Builder newBuilder() {
|
||||||
@@ -218,18 +215,15 @@ public class HoodieHBaseIndexConfig extends DefaultHoodieConfig {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* <p>
|
* <p>
|
||||||
* Method to set maximum QPS allowed per Region Server. This should be same across various
|
* Method to set maximum QPS allowed per Region Server. This should be same across various jobs. This is intended to
|
||||||
* jobs. This is intended to limit the aggregate QPS generated across various jobs to an
|
* limit the aggregate QPS generated across various jobs to an Hbase Region Server.
|
||||||
* Hbase Region Server.
|
|
||||||
* </p>
|
* </p>
|
||||||
* <p>
|
* <p>
|
||||||
* It is recommended to set this value based on your global indexing throughput needs and
|
* It is recommended to set this value based on your global indexing throughput needs and most importantly, how much
|
||||||
* most importantly, how much your HBase installation is able to tolerate without Region
|
* your HBase installation is able to tolerate without Region Servers going down.
|
||||||
* Servers going down.
|
|
||||||
* </p>
|
* </p>
|
||||||
*/
|
*/
|
||||||
public HoodieHBaseIndexConfig.Builder hbaseIndexMaxQPSPerRegionServer(
|
public HoodieHBaseIndexConfig.Builder hbaseIndexMaxQPSPerRegionServer(int maxQPSPerRegionServer) {
|
||||||
int maxQPSPerRegionServer) {
|
|
||||||
// This should be same across various jobs
|
// This should be same across various jobs
|
||||||
props.setProperty(HoodieHBaseIndexConfig.HBASE_MAX_QPS_PER_REGION_SERVER_PROP,
|
props.setProperty(HoodieHBaseIndexConfig.HBASE_MAX_QPS_PER_REGION_SERVER_PROP,
|
||||||
String.valueOf(maxQPSPerRegionServer));
|
String.valueOf(maxQPSPerRegionServer));
|
||||||
@@ -238,30 +232,30 @@ public class HoodieHBaseIndexConfig extends DefaultHoodieConfig {
|
|||||||
|
|
||||||
public HoodieHBaseIndexConfig build() {
|
public HoodieHBaseIndexConfig build() {
|
||||||
HoodieHBaseIndexConfig config = new HoodieHBaseIndexConfig(props);
|
HoodieHBaseIndexConfig config = new HoodieHBaseIndexConfig(props);
|
||||||
setDefaultOnCondition(props, !props.containsKey(HBASE_GET_BATCH_SIZE_PROP),
|
setDefaultOnCondition(props, !props.containsKey(HBASE_GET_BATCH_SIZE_PROP), HBASE_GET_BATCH_SIZE_PROP,
|
||||||
HBASE_GET_BATCH_SIZE_PROP, String.valueOf(DEFAULT_HBASE_BATCH_SIZE));
|
String.valueOf(DEFAULT_HBASE_BATCH_SIZE));
|
||||||
setDefaultOnCondition(props, !props.containsKey(HBASE_PUT_BATCH_SIZE_PROP),
|
setDefaultOnCondition(props, !props.containsKey(HBASE_PUT_BATCH_SIZE_PROP), HBASE_PUT_BATCH_SIZE_PROP,
|
||||||
HBASE_PUT_BATCH_SIZE_PROP, String.valueOf(DEFAULT_HBASE_BATCH_SIZE));
|
String.valueOf(DEFAULT_HBASE_BATCH_SIZE));
|
||||||
setDefaultOnCondition(props, !props.containsKey(HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE_PROP),
|
setDefaultOnCondition(props, !props.containsKey(HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE_PROP),
|
||||||
HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE_PROP, String.valueOf(DEFAULT_HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE));
|
HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE_PROP, String.valueOf(DEFAULT_HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE));
|
||||||
setDefaultOnCondition(props, !props.containsKey(HBASE_QPS_FRACTION_PROP),
|
setDefaultOnCondition(props, !props.containsKey(HBASE_QPS_FRACTION_PROP), HBASE_QPS_FRACTION_PROP,
|
||||||
HBASE_QPS_FRACTION_PROP, String.valueOf(DEFAULT_HBASE_QPS_FRACTION));
|
String.valueOf(DEFAULT_HBASE_QPS_FRACTION));
|
||||||
setDefaultOnCondition(props, !props.containsKey(HBASE_MAX_QPS_PER_REGION_SERVER_PROP),
|
setDefaultOnCondition(props, !props.containsKey(HBASE_MAX_QPS_PER_REGION_SERVER_PROP),
|
||||||
HBASE_MAX_QPS_PER_REGION_SERVER_PROP, String.valueOf(DEFAULT_HBASE_MAX_QPS_PER_REGION_SERVER));
|
HBASE_MAX_QPS_PER_REGION_SERVER_PROP, String.valueOf(DEFAULT_HBASE_MAX_QPS_PER_REGION_SERVER));
|
||||||
setDefaultOnCondition(props, !props.containsKey(HOODIE_INDEX_COMPUTE_QPS_DYNAMICALLY),
|
setDefaultOnCondition(props, !props.containsKey(HOODIE_INDEX_COMPUTE_QPS_DYNAMICALLY),
|
||||||
HOODIE_INDEX_COMPUTE_QPS_DYNAMICALLY, String.valueOf(DEFAULT_HOODIE_INDEX_COMPUTE_QPS_DYNAMICALLY));
|
HOODIE_INDEX_COMPUTE_QPS_DYNAMICALLY, String.valueOf(DEFAULT_HOODIE_INDEX_COMPUTE_QPS_DYNAMICALLY));
|
||||||
setDefaultOnCondition(props, !props.containsKey(HBASE_INDEX_QPS_ALLOCATOR_CLASS),
|
setDefaultOnCondition(props, !props.containsKey(HBASE_INDEX_QPS_ALLOCATOR_CLASS), HBASE_INDEX_QPS_ALLOCATOR_CLASS,
|
||||||
HBASE_INDEX_QPS_ALLOCATOR_CLASS, String.valueOf(DEFAULT_HBASE_INDEX_QPS_ALLOCATOR_CLASS));
|
String.valueOf(DEFAULT_HBASE_INDEX_QPS_ALLOCATOR_CLASS));
|
||||||
setDefaultOnCondition(props, !props.containsKey(HOODIE_INDEX_DESIRED_PUTS_TIME_IN_SECS),
|
setDefaultOnCondition(props, !props.containsKey(HOODIE_INDEX_DESIRED_PUTS_TIME_IN_SECS),
|
||||||
HOODIE_INDEX_DESIRED_PUTS_TIME_IN_SECS, String.valueOf(DEFAULT_HOODIE_INDEX_DESIRED_PUTS_TIME_IN_SECS));
|
HOODIE_INDEX_DESIRED_PUTS_TIME_IN_SECS, String.valueOf(DEFAULT_HOODIE_INDEX_DESIRED_PUTS_TIME_IN_SECS));
|
||||||
setDefaultOnCondition(props, !props.containsKey(HBASE_ZK_PATH_QPS_ROOT),
|
setDefaultOnCondition(props, !props.containsKey(HBASE_ZK_PATH_QPS_ROOT), HBASE_ZK_PATH_QPS_ROOT,
|
||||||
HBASE_ZK_PATH_QPS_ROOT, String.valueOf(DEFAULT_HBASE_ZK_PATH_QPS_ROOT));
|
String.valueOf(DEFAULT_HBASE_ZK_PATH_QPS_ROOT));
|
||||||
setDefaultOnCondition(props, !props.containsKey(HOODIE_INDEX_HBASE_ZK_SESSION_TIMEOUT_MS),
|
setDefaultOnCondition(props, !props.containsKey(HOODIE_INDEX_HBASE_ZK_SESSION_TIMEOUT_MS),
|
||||||
HOODIE_INDEX_HBASE_ZK_SESSION_TIMEOUT_MS, String.valueOf(DEFAULT_ZK_SESSION_TIMEOUT_MS));
|
HOODIE_INDEX_HBASE_ZK_SESSION_TIMEOUT_MS, String.valueOf(DEFAULT_ZK_SESSION_TIMEOUT_MS));
|
||||||
setDefaultOnCondition(props, !props.containsKey(HOODIE_INDEX_HBASE_ZK_CONNECTION_TIMEOUT_MS),
|
setDefaultOnCondition(props, !props.containsKey(HOODIE_INDEX_HBASE_ZK_CONNECTION_TIMEOUT_MS),
|
||||||
HOODIE_INDEX_HBASE_ZK_CONNECTION_TIMEOUT_MS, String.valueOf(DEFAULT_ZK_CONNECTION_TIMEOUT_MS));
|
HOODIE_INDEX_HBASE_ZK_CONNECTION_TIMEOUT_MS, String.valueOf(DEFAULT_ZK_CONNECTION_TIMEOUT_MS));
|
||||||
setDefaultOnCondition(props, !props.containsKey(HBASE_INDEX_QPS_ALLOCATOR_CLASS),
|
setDefaultOnCondition(props, !props.containsKey(HBASE_INDEX_QPS_ALLOCATOR_CLASS), HBASE_INDEX_QPS_ALLOCATOR_CLASS,
|
||||||
HBASE_INDEX_QPS_ALLOCATOR_CLASS, String.valueOf(DEFAULT_HBASE_INDEX_QPS_ALLOCATOR_CLASS));
|
String.valueOf(DEFAULT_HBASE_INDEX_QPS_ALLOCATOR_CLASS));
|
||||||
return config;
|
return config;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ public class HoodieIndexConfig extends DefaultHoodieConfig {
|
|||||||
public static final String INDEX_TYPE_PROP = "hoodie.index.type";
|
public static final String INDEX_TYPE_PROP = "hoodie.index.type";
|
||||||
public static final String DEFAULT_INDEX_TYPE = HoodieIndex.IndexType.BLOOM.name();
|
public static final String DEFAULT_INDEX_TYPE = HoodieIndex.IndexType.BLOOM.name();
|
||||||
|
|
||||||
// ***** Bloom Index configs *****
|
// ***** Bloom Index configs *****
|
||||||
public static final String BLOOM_FILTER_NUM_ENTRIES = "hoodie.index.bloom.num_entries";
|
public static final String BLOOM_FILTER_NUM_ENTRIES = "hoodie.index.bloom.num_entries";
|
||||||
public static final String DEFAULT_BLOOM_FILTER_NUM_ENTRIES = "60000";
|
public static final String DEFAULT_BLOOM_FILTER_NUM_ENTRIES = "60000";
|
||||||
public static final String BLOOM_FILTER_FPP = "hoodie.index.bloom.fpp";
|
public static final String BLOOM_FILTER_FPP = "hoodie.index.bloom.fpp";
|
||||||
@@ -42,8 +42,7 @@ public class HoodieIndexConfig extends DefaultHoodieConfig {
|
|||||||
public static final String BLOOM_INDEX_PARALLELISM_PROP = "hoodie.bloom.index.parallelism";
|
public static final String BLOOM_INDEX_PARALLELISM_PROP = "hoodie.bloom.index.parallelism";
|
||||||
// Disable explicit bloom index parallelism setting by default - hoodie auto computes
|
// Disable explicit bloom index parallelism setting by default - hoodie auto computes
|
||||||
public static final String DEFAULT_BLOOM_INDEX_PARALLELISM = "0";
|
public static final String DEFAULT_BLOOM_INDEX_PARALLELISM = "0";
|
||||||
public static final String BLOOM_INDEX_PRUNE_BY_RANGES_PROP =
|
public static final String BLOOM_INDEX_PRUNE_BY_RANGES_PROP = "hoodie.bloom.index.prune.by" + ".ranges";
|
||||||
"hoodie.bloom.index.prune.by" + ".ranges";
|
|
||||||
public static final String DEFAULT_BLOOM_INDEX_PRUNE_BY_RANGES = "true";
|
public static final String DEFAULT_BLOOM_INDEX_PRUNE_BY_RANGES = "true";
|
||||||
public static final String BLOOM_INDEX_USE_CACHING_PROP = "hoodie.bloom.index.use.caching";
|
public static final String BLOOM_INDEX_USE_CACHING_PROP = "hoodie.bloom.index.use.caching";
|
||||||
public static final String DEFAULT_BLOOM_INDEX_USE_CACHING = "true";
|
public static final String DEFAULT_BLOOM_INDEX_USE_CACHING = "true";
|
||||||
@@ -67,8 +66,7 @@ public class HoodieIndexConfig extends DefaultHoodieConfig {
|
|||||||
public static final String DEFAULT_HBASE_BATCH_SIZE = "100";
|
public static final String DEFAULT_HBASE_BATCH_SIZE = "100";
|
||||||
|
|
||||||
|
|
||||||
public static final String BLOOM_INDEX_INPUT_STORAGE_LEVEL =
|
public static final String BLOOM_INDEX_INPUT_STORAGE_LEVEL = "hoodie.bloom.index.input.storage" + ".level";
|
||||||
"hoodie.bloom.index.input.storage" + ".level";
|
|
||||||
public static final String DEFAULT_BLOOM_INDEX_INPUT_STORAGE_LEVEL = "MEMORY_AND_DISK_SER";
|
public static final String DEFAULT_BLOOM_INDEX_INPUT_STORAGE_LEVEL = "MEMORY_AND_DISK_SER";
|
||||||
|
|
||||||
private HoodieIndexConfig(Properties props) {
|
private HoodieIndexConfig(Properties props) {
|
||||||
@@ -175,20 +173,18 @@ public class HoodieIndexConfig extends DefaultHoodieConfig {
|
|||||||
|
|
||||||
public HoodieIndexConfig build() {
|
public HoodieIndexConfig build() {
|
||||||
HoodieIndexConfig config = new HoodieIndexConfig(props);
|
HoodieIndexConfig config = new HoodieIndexConfig(props);
|
||||||
setDefaultOnCondition(props, !props.containsKey(INDEX_TYPE_PROP), INDEX_TYPE_PROP,
|
setDefaultOnCondition(props, !props.containsKey(INDEX_TYPE_PROP), INDEX_TYPE_PROP, DEFAULT_INDEX_TYPE);
|
||||||
DEFAULT_INDEX_TYPE);
|
setDefaultOnCondition(props, !props.containsKey(BLOOM_FILTER_NUM_ENTRIES), BLOOM_FILTER_NUM_ENTRIES,
|
||||||
setDefaultOnCondition(props, !props.containsKey(BLOOM_FILTER_NUM_ENTRIES),
|
DEFAULT_BLOOM_FILTER_NUM_ENTRIES);
|
||||||
BLOOM_FILTER_NUM_ENTRIES, DEFAULT_BLOOM_FILTER_NUM_ENTRIES);
|
setDefaultOnCondition(props, !props.containsKey(BLOOM_FILTER_FPP), BLOOM_FILTER_FPP, DEFAULT_BLOOM_FILTER_FPP);
|
||||||
setDefaultOnCondition(props, !props.containsKey(BLOOM_FILTER_FPP), BLOOM_FILTER_FPP,
|
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_PARALLELISM_PROP), BLOOM_INDEX_PARALLELISM_PROP,
|
||||||
DEFAULT_BLOOM_FILTER_FPP);
|
DEFAULT_BLOOM_INDEX_PARALLELISM);
|
||||||
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_PARALLELISM_PROP),
|
|
||||||
BLOOM_INDEX_PARALLELISM_PROP, DEFAULT_BLOOM_INDEX_PARALLELISM);
|
|
||||||
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_PRUNE_BY_RANGES_PROP),
|
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_PRUNE_BY_RANGES_PROP),
|
||||||
BLOOM_INDEX_PRUNE_BY_RANGES_PROP, DEFAULT_BLOOM_INDEX_PRUNE_BY_RANGES);
|
BLOOM_INDEX_PRUNE_BY_RANGES_PROP, DEFAULT_BLOOM_INDEX_PRUNE_BY_RANGES);
|
||||||
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_USE_CACHING_PROP),
|
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_USE_CACHING_PROP), BLOOM_INDEX_USE_CACHING_PROP,
|
||||||
BLOOM_INDEX_USE_CACHING_PROP, DEFAULT_BLOOM_INDEX_USE_CACHING);
|
DEFAULT_BLOOM_INDEX_USE_CACHING);
|
||||||
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_INPUT_STORAGE_LEVEL),
|
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_INPUT_STORAGE_LEVEL), BLOOM_INDEX_INPUT_STORAGE_LEVEL,
|
||||||
BLOOM_INDEX_INPUT_STORAGE_LEVEL, DEFAULT_BLOOM_INDEX_INPUT_STORAGE_LEVEL);
|
DEFAULT_BLOOM_INDEX_INPUT_STORAGE_LEVEL);
|
||||||
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_TREE_BASED_FILTER_PROP),
|
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_TREE_BASED_FILTER_PROP),
|
||||||
BLOOM_INDEX_TREE_BASED_FILTER_PROP, DEFAULT_BLOOM_INDEX_TREE_BASED_FILTER);
|
BLOOM_INDEX_TREE_BASED_FILTER_PROP, DEFAULT_BLOOM_INDEX_TREE_BASED_FILTER);
|
||||||
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_BUCKETIZED_CHECKING_PROP),
|
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_BUCKETIZED_CHECKING_PROP),
|
||||||
|
|||||||
@@ -41,8 +41,7 @@ public class HoodieMemoryConfig extends DefaultHoodieConfig {
|
|||||||
// Default max memory fraction during compaction, excess spills to disk
|
// Default max memory fraction during compaction, excess spills to disk
|
||||||
public static final String DEFAULT_MAX_MEMORY_FRACTION_FOR_COMPACTION = String.valueOf(0.6);
|
public static final String DEFAULT_MAX_MEMORY_FRACTION_FOR_COMPACTION = String.valueOf(0.6);
|
||||||
// Default memory size per compaction (used if SparkEnv is absent), excess spills to disk
|
// Default memory size per compaction (used if SparkEnv is absent), excess spills to disk
|
||||||
public static final long DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES =
|
public static final long DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES = 1024 * 1024 * 1024L; // 1GB
|
||||||
1024 * 1024 * 1024L; // 1GB
|
|
||||||
// Property to set the max memory for merge
|
// Property to set the max memory for merge
|
||||||
public static final String MAX_MEMORY_FOR_MERGE_PROP = "hoodie.memory.merge.max.size";
|
public static final String MAX_MEMORY_FOR_MERGE_PROP = "hoodie.memory.merge.max.size";
|
||||||
// Property to set the max memory for compaction
|
// Property to set the max memory for compaction
|
||||||
@@ -88,20 +87,17 @@ public class HoodieMemoryConfig extends DefaultHoodieConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public Builder withMaxMemoryFractionPerPartitionMerge(double maxMemoryFractionPerPartitionMerge) {
|
public Builder withMaxMemoryFractionPerPartitionMerge(double maxMemoryFractionPerPartitionMerge) {
|
||||||
props.setProperty(MAX_MEMORY_FRACTION_FOR_MERGE_PROP,
|
props.setProperty(MAX_MEMORY_FRACTION_FOR_MERGE_PROP, String.valueOf(maxMemoryFractionPerPartitionMerge));
|
||||||
String.valueOf(maxMemoryFractionPerPartitionMerge));
|
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Builder withMaxMemoryFractionPerCompaction(double maxMemoryFractionPerCompaction) {
|
public Builder withMaxMemoryFractionPerCompaction(double maxMemoryFractionPerCompaction) {
|
||||||
props.setProperty(MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP,
|
props.setProperty(MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP, String.valueOf(maxMemoryFractionPerCompaction));
|
||||||
String.valueOf(maxMemoryFractionPerCompaction));
|
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Builder withMaxDFSStreamBufferSize(int maxStreamBufferSize) {
|
public Builder withMaxDFSStreamBufferSize(int maxStreamBufferSize) {
|
||||||
props.setProperty(MAX_DFS_STREAM_BUFFER_SIZE_PROP,
|
props.setProperty(MAX_DFS_STREAM_BUFFER_SIZE_PROP, String.valueOf(maxStreamBufferSize));
|
||||||
String.valueOf(maxStreamBufferSize));
|
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -130,19 +126,16 @@ public class HoodieMemoryConfig extends DefaultHoodieConfig {
|
|||||||
|
|
||||||
if (SparkEnv.get() != null) {
|
if (SparkEnv.get() != null) {
|
||||||
// 1 GB is the default conf used by Spark, look at SparkContext.scala
|
// 1 GB is the default conf used by Spark, look at SparkContext.scala
|
||||||
long executorMemoryInBytes = Utils.memoryStringToMb(SparkEnv.get().conf().get(SPARK_EXECUTOR_MEMORY_PROP,
|
long executorMemoryInBytes = Utils.memoryStringToMb(
|
||||||
DEFAULT_SPARK_EXECUTOR_MEMORY_MB)) * 1024
|
SparkEnv.get().conf().get(SPARK_EXECUTOR_MEMORY_PROP, DEFAULT_SPARK_EXECUTOR_MEMORY_MB)) * 1024 * 1024L;
|
||||||
* 1024L;
|
|
||||||
// 0.6 is the default value used by Spark,
|
// 0.6 is the default value used by Spark,
|
||||||
// look at {@link
|
// look at {@link
|
||||||
// https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/SparkConf.scala#L507}
|
// https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/SparkConf.scala#L507}
|
||||||
double memoryFraction = Double
|
double memoryFraction = Double.valueOf(
|
||||||
.valueOf(SparkEnv.get().conf().get(SPARK_EXECUTOR_MEMORY_FRACTION_PROP,
|
SparkEnv.get().conf().get(SPARK_EXECUTOR_MEMORY_FRACTION_PROP, DEFAULT_SPARK_EXECUTOR_MEMORY_FRACTION));
|
||||||
DEFAULT_SPARK_EXECUTOR_MEMORY_FRACTION));
|
|
||||||
double maxMemoryFractionForMerge = Double.valueOf(maxMemoryFraction);
|
double maxMemoryFractionForMerge = Double.valueOf(maxMemoryFraction);
|
||||||
double userAvailableMemory = executorMemoryInBytes * (1 - memoryFraction);
|
double userAvailableMemory = executorMemoryInBytes * (1 - memoryFraction);
|
||||||
long maxMemoryForMerge = (long) Math
|
long maxMemoryForMerge = (long) Math.floor(userAvailableMemory * maxMemoryFractionForMerge);
|
||||||
.floor(userAvailableMemory * maxMemoryFractionForMerge);
|
|
||||||
return maxMemoryForMerge;
|
return maxMemoryForMerge;
|
||||||
} else {
|
} else {
|
||||||
return DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES;
|
return DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES;
|
||||||
@@ -151,29 +144,19 @@ public class HoodieMemoryConfig extends DefaultHoodieConfig {
|
|||||||
|
|
||||||
public HoodieMemoryConfig build() {
|
public HoodieMemoryConfig build() {
|
||||||
HoodieMemoryConfig config = new HoodieMemoryConfig(props);
|
HoodieMemoryConfig config = new HoodieMemoryConfig(props);
|
||||||
setDefaultOnCondition(props,
|
setDefaultOnCondition(props, !props.containsKey(MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP),
|
||||||
!props.containsKey(MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP),
|
MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP, DEFAULT_MAX_MEMORY_FRACTION_FOR_COMPACTION);
|
||||||
MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP,
|
setDefaultOnCondition(props, !props.containsKey(MAX_MEMORY_FRACTION_FOR_MERGE_PROP),
|
||||||
DEFAULT_MAX_MEMORY_FRACTION_FOR_COMPACTION);
|
|
||||||
setDefaultOnCondition(props,
|
|
||||||
!props.containsKey(MAX_MEMORY_FRACTION_FOR_MERGE_PROP),
|
|
||||||
MAX_MEMORY_FRACTION_FOR_MERGE_PROP, DEFAULT_MAX_MEMORY_FRACTION_FOR_MERGE);
|
MAX_MEMORY_FRACTION_FOR_MERGE_PROP, DEFAULT_MAX_MEMORY_FRACTION_FOR_MERGE);
|
||||||
setDefaultOnCondition(props,
|
setDefaultOnCondition(props, !props.containsKey(MAX_MEMORY_FOR_MERGE_PROP), MAX_MEMORY_FOR_MERGE_PROP,
|
||||||
!props.containsKey(MAX_MEMORY_FOR_MERGE_PROP),
|
String.valueOf(getMaxMemoryAllowedForMerge(props.getProperty(MAX_MEMORY_FRACTION_FOR_MERGE_PROP))));
|
||||||
MAX_MEMORY_FOR_MERGE_PROP, String.valueOf(
|
setDefaultOnCondition(props, !props.containsKey(MAX_MEMORY_FOR_COMPACTION_PROP), MAX_MEMORY_FOR_COMPACTION_PROP,
|
||||||
getMaxMemoryAllowedForMerge(props.getProperty(MAX_MEMORY_FRACTION_FOR_MERGE_PROP))));
|
String.valueOf(getMaxMemoryAllowedForMerge(props.getProperty(MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP))));
|
||||||
setDefaultOnCondition(props,
|
setDefaultOnCondition(props, !props.containsKey(MAX_DFS_STREAM_BUFFER_SIZE_PROP), MAX_DFS_STREAM_BUFFER_SIZE_PROP,
|
||||||
!props.containsKey(MAX_MEMORY_FOR_COMPACTION_PROP),
|
String.valueOf(DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE));
|
||||||
MAX_MEMORY_FOR_COMPACTION_PROP, String.valueOf(
|
setDefaultOnCondition(props, !props.containsKey(SPILLABLE_MAP_BASE_PATH_PROP), SPILLABLE_MAP_BASE_PATH_PROP,
|
||||||
getMaxMemoryAllowedForMerge(props.getProperty(MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP))));
|
DEFAULT_SPILLABLE_MAP_BASE_PATH);
|
||||||
setDefaultOnCondition(props,
|
setDefaultOnCondition(props, !props.containsKey(WRITESTATUS_FAILURE_FRACTION_PROP),
|
||||||
!props.containsKey(MAX_DFS_STREAM_BUFFER_SIZE_PROP),
|
|
||||||
MAX_DFS_STREAM_BUFFER_SIZE_PROP, String.valueOf(DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE));
|
|
||||||
setDefaultOnCondition(props,
|
|
||||||
!props.containsKey(SPILLABLE_MAP_BASE_PATH_PROP),
|
|
||||||
SPILLABLE_MAP_BASE_PATH_PROP, DEFAULT_SPILLABLE_MAP_BASE_PATH);
|
|
||||||
setDefaultOnCondition(props,
|
|
||||||
!props.containsKey(WRITESTATUS_FAILURE_FRACTION_PROP),
|
|
||||||
WRITESTATUS_FAILURE_FRACTION_PROP, String.valueOf(DEFAULT_WRITESTATUS_FAILURE_FRACTION));
|
WRITESTATUS_FAILURE_FRACTION_PROP, String.valueOf(DEFAULT_WRITESTATUS_FAILURE_FRACTION));
|
||||||
return config;
|
return config;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -35,8 +35,7 @@ public class HoodieMetricsConfig extends DefaultHoodieConfig {
|
|||||||
public static final String METRICS_ON = METRIC_PREFIX + ".on";
|
public static final String METRICS_ON = METRIC_PREFIX + ".on";
|
||||||
public static final boolean DEFAULT_METRICS_ON = false;
|
public static final boolean DEFAULT_METRICS_ON = false;
|
||||||
public static final String METRICS_REPORTER_TYPE = METRIC_PREFIX + ".reporter.type";
|
public static final String METRICS_REPORTER_TYPE = METRIC_PREFIX + ".reporter.type";
|
||||||
public static final MetricsReporterType DEFAULT_METRICS_REPORTER_TYPE = MetricsReporterType
|
public static final MetricsReporterType DEFAULT_METRICS_REPORTER_TYPE = MetricsReporterType.GRAPHITE;
|
||||||
.GRAPHITE;
|
|
||||||
|
|
||||||
// Graphite
|
// Graphite
|
||||||
public static final String GRAPHITE_PREFIX = METRIC_PREFIX + ".graphite";
|
public static final String GRAPHITE_PREFIX = METRIC_PREFIX + ".graphite";
|
||||||
@@ -103,8 +102,7 @@ public class HoodieMetricsConfig extends DefaultHoodieConfig {
|
|||||||
|
|
||||||
public HoodieMetricsConfig build() {
|
public HoodieMetricsConfig build() {
|
||||||
HoodieMetricsConfig config = new HoodieMetricsConfig(props);
|
HoodieMetricsConfig config = new HoodieMetricsConfig(props);
|
||||||
setDefaultOnCondition(props, !props.containsKey(METRICS_ON), METRICS_ON,
|
setDefaultOnCondition(props, !props.containsKey(METRICS_ON), METRICS_ON, String.valueOf(DEFAULT_METRICS_ON));
|
||||||
String.valueOf(DEFAULT_METRICS_ON));
|
|
||||||
setDefaultOnCondition(props, !props.containsKey(METRICS_REPORTER_TYPE), METRICS_REPORTER_TYPE,
|
setDefaultOnCondition(props, !props.containsKey(METRICS_REPORTER_TYPE), METRICS_REPORTER_TYPE,
|
||||||
DEFAULT_METRICS_REPORTER_TYPE.name());
|
DEFAULT_METRICS_REPORTER_TYPE.name());
|
||||||
setDefaultOnCondition(props, !props.containsKey(GRAPHITE_SERVER_HOST), GRAPHITE_SERVER_HOST,
|
setDefaultOnCondition(props, !props.containsKey(GRAPHITE_SERVER_HOST), GRAPHITE_SERVER_HOST,
|
||||||
|
|||||||
@@ -38,8 +38,7 @@ public class HoodieStorageConfig extends DefaultHoodieConfig {
|
|||||||
public static final String DEFAULT_PARQUET_PAGE_SIZE_BYTES = String.valueOf(1 * 1024 * 1024);
|
public static final String DEFAULT_PARQUET_PAGE_SIZE_BYTES = String.valueOf(1 * 1024 * 1024);
|
||||||
// used to size log files
|
// used to size log files
|
||||||
public static final String LOGFILE_SIZE_MAX_BYTES = "hoodie.logfile.max.size";
|
public static final String LOGFILE_SIZE_MAX_BYTES = "hoodie.logfile.max.size";
|
||||||
public static final String DEFAULT_LOGFILE_SIZE_MAX_BYTES = String
|
public static final String DEFAULT_LOGFILE_SIZE_MAX_BYTES = String.valueOf(1024 * 1024 * 1024); // 1 GB
|
||||||
.valueOf(1024 * 1024 * 1024); // 1 GB
|
|
||||||
// used to size data blocks in log file
|
// used to size data blocks in log file
|
||||||
public static final String LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES = "hoodie.logfile.data.block.max.size";
|
public static final String LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES = "hoodie.logfile.data.block.max.size";
|
||||||
public static final String DEFAULT_LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES = String.valueOf(256 * 1024 * 1024); // 256 MB
|
public static final String DEFAULT_LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES = String.valueOf(256 * 1024 * 1024); // 256 MB
|
||||||
@@ -122,20 +121,20 @@ public class HoodieStorageConfig extends DefaultHoodieConfig {
|
|||||||
|
|
||||||
public HoodieStorageConfig build() {
|
public HoodieStorageConfig build() {
|
||||||
HoodieStorageConfig config = new HoodieStorageConfig(props);
|
HoodieStorageConfig config = new HoodieStorageConfig(props);
|
||||||
setDefaultOnCondition(props, !props.containsKey(PARQUET_FILE_MAX_BYTES),
|
setDefaultOnCondition(props, !props.containsKey(PARQUET_FILE_MAX_BYTES), PARQUET_FILE_MAX_BYTES,
|
||||||
PARQUET_FILE_MAX_BYTES, DEFAULT_PARQUET_FILE_MAX_BYTES);
|
DEFAULT_PARQUET_FILE_MAX_BYTES);
|
||||||
setDefaultOnCondition(props, !props.containsKey(PARQUET_BLOCK_SIZE_BYTES),
|
setDefaultOnCondition(props, !props.containsKey(PARQUET_BLOCK_SIZE_BYTES), PARQUET_BLOCK_SIZE_BYTES,
|
||||||
PARQUET_BLOCK_SIZE_BYTES, DEFAULT_PARQUET_BLOCK_SIZE_BYTES);
|
DEFAULT_PARQUET_BLOCK_SIZE_BYTES);
|
||||||
setDefaultOnCondition(props, !props.containsKey(PARQUET_PAGE_SIZE_BYTES),
|
setDefaultOnCondition(props, !props.containsKey(PARQUET_PAGE_SIZE_BYTES), PARQUET_PAGE_SIZE_BYTES,
|
||||||
PARQUET_PAGE_SIZE_BYTES, DEFAULT_PARQUET_PAGE_SIZE_BYTES);
|
DEFAULT_PARQUET_PAGE_SIZE_BYTES);
|
||||||
setDefaultOnCondition(props, !props.containsKey(LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES),
|
setDefaultOnCondition(props, !props.containsKey(LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES),
|
||||||
LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES, DEFAULT_LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES);
|
LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES, DEFAULT_LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES);
|
||||||
setDefaultOnCondition(props, !props.containsKey(LOGFILE_SIZE_MAX_BYTES),
|
setDefaultOnCondition(props, !props.containsKey(LOGFILE_SIZE_MAX_BYTES), LOGFILE_SIZE_MAX_BYTES,
|
||||||
LOGFILE_SIZE_MAX_BYTES, DEFAULT_LOGFILE_SIZE_MAX_BYTES);
|
DEFAULT_LOGFILE_SIZE_MAX_BYTES);
|
||||||
setDefaultOnCondition(props, !props.containsKey(PARQUET_COMPRESSION_RATIO),
|
setDefaultOnCondition(props, !props.containsKey(PARQUET_COMPRESSION_RATIO), PARQUET_COMPRESSION_RATIO,
|
||||||
PARQUET_COMPRESSION_RATIO, DEFAULT_STREAM_COMPRESSION_RATIO);
|
DEFAULT_STREAM_COMPRESSION_RATIO);
|
||||||
setDefaultOnCondition(props, !props.containsKey(PARQUET_COMPRESSION_CODEC),
|
setDefaultOnCondition(props, !props.containsKey(PARQUET_COMPRESSION_CODEC), PARQUET_COMPRESSION_CODEC,
|
||||||
PARQUET_COMPRESSION_CODEC, DEFAULT_PARQUET_COMPRESSION_CODEC);
|
DEFAULT_PARQUET_COMPRESSION_CODEC);
|
||||||
setDefaultOnCondition(props, !props.containsKey(LOGFILE_TO_PARQUET_COMPRESSION_RATIO),
|
setDefaultOnCondition(props, !props.containsKey(LOGFILE_TO_PARQUET_COMPRESSION_RATIO),
|
||||||
LOGFILE_TO_PARQUET_COMPRESSION_RATIO, DEFAULT_LOGFILE_TO_PARQUET_COMPRESSION_RATIO);
|
LOGFILE_TO_PARQUET_COMPRESSION_RATIO, DEFAULT_LOGFILE_TO_PARQUET_COMPRESSION_RATIO);
|
||||||
return config;
|
return config;
|
||||||
|
|||||||
@@ -61,8 +61,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
|||||||
private static final String DEFAULT_WRITE_STATUS_STORAGE_LEVEL = "MEMORY_AND_DISK_SER";
|
private static final String DEFAULT_WRITE_STATUS_STORAGE_LEVEL = "MEMORY_AND_DISK_SER";
|
||||||
private static final String HOODIE_AUTO_COMMIT_PROP = "hoodie.auto.commit";
|
private static final String HOODIE_AUTO_COMMIT_PROP = "hoodie.auto.commit";
|
||||||
private static final String DEFAULT_HOODIE_AUTO_COMMIT = "true";
|
private static final String DEFAULT_HOODIE_AUTO_COMMIT = "true";
|
||||||
private static final String HOODIE_ASSUME_DATE_PARTITIONING_PROP =
|
private static final String HOODIE_ASSUME_DATE_PARTITIONING_PROP = "hoodie.assume.date" + ".partitioning";
|
||||||
"hoodie.assume.date" + ".partitioning";
|
|
||||||
private static final String DEFAULT_ASSUME_DATE_PARTITIONING = "false";
|
private static final String DEFAULT_ASSUME_DATE_PARTITIONING = "false";
|
||||||
private static final String HOODIE_WRITE_STATUS_CLASS_PROP = "hoodie.writestatus.class";
|
private static final String HOODIE_WRITE_STATUS_CLASS_PROP = "hoodie.writestatus.class";
|
||||||
private static final String DEFAULT_HOODIE_WRITE_STATUS_CLASS = WriteStatus.class.getName();
|
private static final String DEFAULT_HOODIE_WRITE_STATUS_CLASS = WriteStatus.class.getName();
|
||||||
@@ -143,8 +142,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public int getWriteBufferLimitBytes() {
|
public int getWriteBufferLimitBytes() {
|
||||||
return Integer
|
return Integer.parseInt(props.getProperty(WRITE_BUFFER_LIMIT_BYTES, DEFAULT_WRITE_BUFFER_LIMIT_BYTES));
|
||||||
.parseInt(props.getProperty(WRITE_BUFFER_LIMIT_BYTES, DEFAULT_WRITE_BUFFER_LIMIT_BYTES));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean shouldCombineBeforeInsert() {
|
public boolean shouldCombineBeforeInsert() {
|
||||||
@@ -191,18 +189,15 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
|||||||
* compaction properties
|
* compaction properties
|
||||||
**/
|
**/
|
||||||
public HoodieCleaningPolicy getCleanerPolicy() {
|
public HoodieCleaningPolicy getCleanerPolicy() {
|
||||||
return HoodieCleaningPolicy
|
return HoodieCleaningPolicy.valueOf(props.getProperty(HoodieCompactionConfig.CLEANER_POLICY_PROP));
|
||||||
.valueOf(props.getProperty(HoodieCompactionConfig.CLEANER_POLICY_PROP));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getCleanerFileVersionsRetained() {
|
public int getCleanerFileVersionsRetained() {
|
||||||
return Integer
|
return Integer.parseInt(props.getProperty(HoodieCompactionConfig.CLEANER_FILE_VERSIONS_RETAINED_PROP));
|
||||||
.parseInt(props.getProperty(HoodieCompactionConfig.CLEANER_FILE_VERSIONS_RETAINED_PROP));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getCleanerCommitsRetained() {
|
public int getCleanerCommitsRetained() {
|
||||||
return Integer
|
return Integer.parseInt(props.getProperty(HoodieCompactionConfig.CLEANER_COMMITS_RETAINED_PROP));
|
||||||
.parseInt(props.getProperty(HoodieCompactionConfig.CLEANER_COMMITS_RETAINED_PROP));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getMaxCommitsToKeep() {
|
public int getMaxCommitsToKeep() {
|
||||||
@@ -214,23 +209,19 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public int getParquetSmallFileLimit() {
|
public int getParquetSmallFileLimit() {
|
||||||
return Integer
|
return Integer.parseInt(props.getProperty(HoodieCompactionConfig.PARQUET_SMALL_FILE_LIMIT_BYTES));
|
||||||
.parseInt(props.getProperty(HoodieCompactionConfig.PARQUET_SMALL_FILE_LIMIT_BYTES));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getCopyOnWriteInsertSplitSize() {
|
public int getCopyOnWriteInsertSplitSize() {
|
||||||
return Integer
|
return Integer.parseInt(props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE));
|
||||||
.parseInt(props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getCopyOnWriteRecordSizeEstimate() {
|
public int getCopyOnWriteRecordSizeEstimate() {
|
||||||
return Integer.parseInt(
|
return Integer.parseInt(props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE));
|
||||||
props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean shouldAutoTuneInsertSplits() {
|
public boolean shouldAutoTuneInsertSplits() {
|
||||||
return Boolean.parseBoolean(
|
return Boolean.parseBoolean(props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS));
|
||||||
props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getCleanerParallelism() {
|
public int getCleanerParallelism() {
|
||||||
@@ -246,28 +237,23 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public int getInlineCompactDeltaCommitMax() {
|
public int getInlineCompactDeltaCommitMax() {
|
||||||
return Integer
|
return Integer.parseInt(props.getProperty(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS_PROP));
|
||||||
.parseInt(props.getProperty(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS_PROP));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public CompactionStrategy getCompactionStrategy() {
|
public CompactionStrategy getCompactionStrategy() {
|
||||||
return ReflectionUtils
|
return ReflectionUtils.loadClass(props.getProperty(HoodieCompactionConfig.COMPACTION_STRATEGY_PROP));
|
||||||
.loadClass(props.getProperty(HoodieCompactionConfig.COMPACTION_STRATEGY_PROP));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public Long getTargetIOPerCompactionInMB() {
|
public Long getTargetIOPerCompactionInMB() {
|
||||||
return Long
|
return Long.parseLong(props.getProperty(HoodieCompactionConfig.TARGET_IO_PER_COMPACTION_IN_MB_PROP));
|
||||||
.parseLong(props.getProperty(HoodieCompactionConfig.TARGET_IO_PER_COMPACTION_IN_MB_PROP));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public Boolean getCompactionLazyBlockReadEnabled() {
|
public Boolean getCompactionLazyBlockReadEnabled() {
|
||||||
return Boolean
|
return Boolean.valueOf(props.getProperty(HoodieCompactionConfig.COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP));
|
||||||
.valueOf(props.getProperty(HoodieCompactionConfig.COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public Boolean getCompactionReverseLogReadEnabled() {
|
public Boolean getCompactionReverseLogReadEnabled() {
|
||||||
return Boolean.valueOf(
|
return Boolean.valueOf(props.getProperty(HoodieCompactionConfig.COMPACTION_REVERSE_LOG_READ_ENABLED_PROP));
|
||||||
props.getProperty(HoodieCompactionConfig.COMPACTION_REVERSE_LOG_READ_ENABLED_PROP));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getPayloadClass() {
|
public String getPayloadClass() {
|
||||||
@@ -275,13 +261,11 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public int getTargetPartitionsPerDayBasedCompaction() {
|
public int getTargetPartitionsPerDayBasedCompaction() {
|
||||||
return Integer
|
return Integer.parseInt(props.getProperty(HoodieCompactionConfig.TARGET_PARTITIONS_PER_DAYBASED_COMPACTION_PROP));
|
||||||
.parseInt(props.getProperty(HoodieCompactionConfig.TARGET_PARTITIONS_PER_DAYBASED_COMPACTION_PROP));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getCommitArchivalBatchSize() {
|
public int getCommitArchivalBatchSize() {
|
||||||
return Integer
|
return Integer.parseInt(props.getProperty(HoodieCompactionConfig.COMMITS_ARCHIVAL_BATCH_SIZE_PROP));
|
||||||
.parseInt(props.getProperty(HoodieCompactionConfig.COMMITS_ARCHIVAL_BATCH_SIZE_PROP));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -352,9 +336,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Fraction of the global share of QPS that should be allocated to this job.
|
* Fraction of the global share of QPS that should be allocated to this job. Let's say there are 3 jobs which have
|
||||||
* Let's say there are 3 jobs which have input size in terms of number of rows
|
* input size in terms of number of rows required for HbaseIndexing as x, 2x, 3x respectively. Then this fraction for
|
||||||
* required for HbaseIndexing as x, 2x, 3x respectively. Then this fraction for
|
|
||||||
* the jobs would be (0.17) 1/6, 0.33 (2/6) and 0.5 (3/6) respectively.
|
* the jobs would be (0.17) 1/6, 0.33 (2/6) and 0.5 (3/6) respectively.
|
||||||
*/
|
*/
|
||||||
public float getHbaseIndexQPSFraction() {
|
public float getHbaseIndexQPSFraction() {
|
||||||
@@ -370,8 +353,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This should be same across various jobs. This is intended to limit the aggregate
|
* This should be same across various jobs. This is intended to limit the aggregate QPS generated across various
|
||||||
* QPS generated across various Hoodie jobs to an Hbase Region Server
|
* Hoodie jobs to an Hbase Region Server
|
||||||
*/
|
*/
|
||||||
public int getHbaseIndexMaxQPSPerRegionServer() {
|
public int getHbaseIndexMaxQPSPerRegionServer() {
|
||||||
return Integer.parseInt(props.getProperty(HoodieHBaseIndexConfig.HBASE_MAX_QPS_PER_REGION_SERVER_PROP));
|
return Integer.parseInt(props.getProperty(HoodieHBaseIndexConfig.HBASE_MAX_QPS_PER_REGION_SERVER_PROP));
|
||||||
@@ -382,8 +365,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public boolean getBloomIndexPruneByRanges() {
|
public boolean getBloomIndexPruneByRanges() {
|
||||||
return Boolean
|
return Boolean.parseBoolean(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_PRUNE_BY_RANGES_PROP));
|
||||||
.parseBoolean(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_PRUNE_BY_RANGES_PROP));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean getBloomIndexUseCaching() {
|
public boolean getBloomIndexUseCaching() {
|
||||||
@@ -403,8 +385,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public StorageLevel getBloomIndexInputStorageLevel() {
|
public StorageLevel getBloomIndexInputStorageLevel() {
|
||||||
return StorageLevel
|
return StorageLevel.fromString(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_INPUT_STORAGE_LEVEL));
|
||||||
.fromString(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_INPUT_STORAGE_LEVEL));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -423,8 +404,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public int getLogFileDataBlockMaxSize() {
|
public int getLogFileDataBlockMaxSize() {
|
||||||
return Integer
|
return Integer.parseInt(props.getProperty(HoodieStorageConfig.LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES));
|
||||||
.parseInt(props.getProperty(HoodieStorageConfig.LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getLogFileMaxSize() {
|
public int getLogFileMaxSize() {
|
||||||
@@ -451,8 +431,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public MetricsReporterType getMetricsReporterType() {
|
public MetricsReporterType getMetricsReporterType() {
|
||||||
return MetricsReporterType
|
return MetricsReporterType.valueOf(props.getProperty(HoodieMetricsConfig.METRICS_REPORTER_TYPE));
|
||||||
.valueOf(props.getProperty(HoodieMetricsConfig.METRICS_REPORTER_TYPE));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getGraphiteServerHost() {
|
public String getGraphiteServerHost() {
|
||||||
@@ -475,9 +454,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public Double getMaxMemoryFractionPerCompaction() {
|
public Double getMaxMemoryFractionPerCompaction() {
|
||||||
return Double
|
return Double.valueOf(props.getProperty(HoodieMemoryConfig.MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP));
|
||||||
.valueOf(
|
|
||||||
props.getProperty(HoodieMemoryConfig.MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public Long getMaxMemoryPerPartitionMerge() {
|
public Long getMaxMemoryPerPartitionMerge() {
|
||||||
@@ -637,8 +614,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public Builder withAssumeDatePartitioning(boolean assumeDatePartitioning) {
|
public Builder withAssumeDatePartitioning(boolean assumeDatePartitioning) {
|
||||||
props.setProperty(HOODIE_ASSUME_DATE_PARTITIONING_PROP,
|
props.setProperty(HOODIE_ASSUME_DATE_PARTITIONING_PROP, String.valueOf(assumeDatePartitioning));
|
||||||
String.valueOf(assumeDatePartitioning));
|
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -671,48 +647,42 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
|||||||
|
|
||||||
public HoodieWriteConfig build() {
|
public HoodieWriteConfig build() {
|
||||||
// Check for mandatory properties
|
// Check for mandatory properties
|
||||||
setDefaultOnCondition(props, !props.containsKey(INSERT_PARALLELISM), INSERT_PARALLELISM,
|
setDefaultOnCondition(props, !props.containsKey(INSERT_PARALLELISM), INSERT_PARALLELISM, DEFAULT_PARALLELISM);
|
||||||
|
setDefaultOnCondition(props, !props.containsKey(BULKINSERT_PARALLELISM), BULKINSERT_PARALLELISM,
|
||||||
DEFAULT_PARALLELISM);
|
DEFAULT_PARALLELISM);
|
||||||
setDefaultOnCondition(props, !props.containsKey(BULKINSERT_PARALLELISM),
|
setDefaultOnCondition(props, !props.containsKey(UPSERT_PARALLELISM), UPSERT_PARALLELISM, DEFAULT_PARALLELISM);
|
||||||
BULKINSERT_PARALLELISM, DEFAULT_PARALLELISM);
|
setDefaultOnCondition(props, !props.containsKey(COMBINE_BEFORE_INSERT_PROP), COMBINE_BEFORE_INSERT_PROP,
|
||||||
setDefaultOnCondition(props, !props.containsKey(UPSERT_PARALLELISM), UPSERT_PARALLELISM,
|
DEFAULT_COMBINE_BEFORE_INSERT);
|
||||||
DEFAULT_PARALLELISM);
|
setDefaultOnCondition(props, !props.containsKey(COMBINE_BEFORE_UPSERT_PROP), COMBINE_BEFORE_UPSERT_PROP,
|
||||||
setDefaultOnCondition(props, !props.containsKey(COMBINE_BEFORE_INSERT_PROP),
|
DEFAULT_COMBINE_BEFORE_UPSERT);
|
||||||
COMBINE_BEFORE_INSERT_PROP, DEFAULT_COMBINE_BEFORE_INSERT);
|
setDefaultOnCondition(props, !props.containsKey(WRITE_STATUS_STORAGE_LEVEL), WRITE_STATUS_STORAGE_LEVEL,
|
||||||
setDefaultOnCondition(props, !props.containsKey(COMBINE_BEFORE_UPSERT_PROP),
|
DEFAULT_WRITE_STATUS_STORAGE_LEVEL);
|
||||||
COMBINE_BEFORE_UPSERT_PROP, DEFAULT_COMBINE_BEFORE_UPSERT);
|
setDefaultOnCondition(props, !props.containsKey(HOODIE_AUTO_COMMIT_PROP), HOODIE_AUTO_COMMIT_PROP,
|
||||||
setDefaultOnCondition(props, !props.containsKey(WRITE_STATUS_STORAGE_LEVEL),
|
DEFAULT_HOODIE_AUTO_COMMIT);
|
||||||
WRITE_STATUS_STORAGE_LEVEL, DEFAULT_WRITE_STATUS_STORAGE_LEVEL);
|
|
||||||
setDefaultOnCondition(props, !props.containsKey(HOODIE_AUTO_COMMIT_PROP),
|
|
||||||
HOODIE_AUTO_COMMIT_PROP, DEFAULT_HOODIE_AUTO_COMMIT);
|
|
||||||
setDefaultOnCondition(props, !props.containsKey(HOODIE_ASSUME_DATE_PARTITIONING_PROP),
|
setDefaultOnCondition(props, !props.containsKey(HOODIE_ASSUME_DATE_PARTITIONING_PROP),
|
||||||
HOODIE_ASSUME_DATE_PARTITIONING_PROP, DEFAULT_ASSUME_DATE_PARTITIONING);
|
HOODIE_ASSUME_DATE_PARTITIONING_PROP, DEFAULT_ASSUME_DATE_PARTITIONING);
|
||||||
setDefaultOnCondition(props, !props.containsKey(HOODIE_WRITE_STATUS_CLASS_PROP),
|
setDefaultOnCondition(props, !props.containsKey(HOODIE_WRITE_STATUS_CLASS_PROP), HOODIE_WRITE_STATUS_CLASS_PROP,
|
||||||
HOODIE_WRITE_STATUS_CLASS_PROP, DEFAULT_HOODIE_WRITE_STATUS_CLASS);
|
DEFAULT_HOODIE_WRITE_STATUS_CLASS);
|
||||||
setDefaultOnCondition(props, !props.containsKey(FINALIZE_WRITE_PARALLELISM),
|
setDefaultOnCondition(props, !props.containsKey(FINALIZE_WRITE_PARALLELISM), FINALIZE_WRITE_PARALLELISM,
|
||||||
FINALIZE_WRITE_PARALLELISM, DEFAULT_FINALIZE_WRITE_PARALLELISM);
|
DEFAULT_FINALIZE_WRITE_PARALLELISM);
|
||||||
setDefaultOnCondition(props, !props.containsKey(EMBEDDED_TIMELINE_SERVER_ENABLED),
|
setDefaultOnCondition(props, !props.containsKey(EMBEDDED_TIMELINE_SERVER_ENABLED),
|
||||||
EMBEDDED_TIMELINE_SERVER_ENABLED, DEFAULT_EMBEDDED_TIMELINE_SERVER_ENABLED);
|
EMBEDDED_TIMELINE_SERVER_ENABLED, DEFAULT_EMBEDDED_TIMELINE_SERVER_ENABLED);
|
||||||
setDefaultOnCondition(props, !props.containsKey(INITIAL_CONSISTENCY_CHECK_INTERVAL_MS_PROP),
|
setDefaultOnCondition(props, !props.containsKey(INITIAL_CONSISTENCY_CHECK_INTERVAL_MS_PROP),
|
||||||
INITIAL_CONSISTENCY_CHECK_INTERVAL_MS_PROP, String.valueOf(DEFAULT_INITIAL_CONSISTENCY_CHECK_INTERVAL_MS));
|
INITIAL_CONSISTENCY_CHECK_INTERVAL_MS_PROP, String.valueOf(DEFAULT_INITIAL_CONSISTENCY_CHECK_INTERVAL_MS));
|
||||||
setDefaultOnCondition(props, !props.containsKey(MAX_CONSISTENCY_CHECK_INTERVAL_MS_PROP),
|
setDefaultOnCondition(props, !props.containsKey(MAX_CONSISTENCY_CHECK_INTERVAL_MS_PROP),
|
||||||
MAX_CONSISTENCY_CHECK_INTERVAL_MS_PROP, String.valueOf(DEFAULT_MAX_CONSISTENCY_CHECK_INTERVAL_MS));
|
MAX_CONSISTENCY_CHECK_INTERVAL_MS_PROP, String.valueOf(DEFAULT_MAX_CONSISTENCY_CHECK_INTERVAL_MS));
|
||||||
setDefaultOnCondition(props, !props.containsKey(MAX_CONSISTENCY_CHECKS_PROP),
|
setDefaultOnCondition(props, !props.containsKey(MAX_CONSISTENCY_CHECKS_PROP), MAX_CONSISTENCY_CHECKS_PROP,
|
||||||
MAX_CONSISTENCY_CHECKS_PROP, String.valueOf(DEFAULT_MAX_CONSISTENCY_CHECKS));
|
String.valueOf(DEFAULT_MAX_CONSISTENCY_CHECKS));
|
||||||
setDefaultOnCondition(props, !props.containsKey(FAIL_ON_TIMELINE_ARCHIVING_ENABLED_PROP),
|
setDefaultOnCondition(props, !props.containsKey(FAIL_ON_TIMELINE_ARCHIVING_ENABLED_PROP),
|
||||||
FAIL_ON_TIMELINE_ARCHIVING_ENABLED_PROP, DEFAULT_FAIL_ON_TIMELINE_ARCHIVING_ENABLED);
|
FAIL_ON_TIMELINE_ARCHIVING_ENABLED_PROP, DEFAULT_FAIL_ON_TIMELINE_ARCHIVING_ENABLED);
|
||||||
|
|
||||||
// Make sure the props is propagated
|
// Make sure the props is propagated
|
||||||
setDefaultOnCondition(props, !isIndexConfigSet,
|
setDefaultOnCondition(props, !isIndexConfigSet, HoodieIndexConfig.newBuilder().fromProperties(props).build());
|
||||||
HoodieIndexConfig.newBuilder().fromProperties(props).build());
|
setDefaultOnCondition(props, !isStorageConfigSet, HoodieStorageConfig.newBuilder().fromProperties(props).build());
|
||||||
setDefaultOnCondition(props, !isStorageConfigSet,
|
|
||||||
HoodieStorageConfig.newBuilder().fromProperties(props).build());
|
|
||||||
setDefaultOnCondition(props, !isCompactionConfigSet,
|
setDefaultOnCondition(props, !isCompactionConfigSet,
|
||||||
HoodieCompactionConfig.newBuilder().fromProperties(props).build());
|
HoodieCompactionConfig.newBuilder().fromProperties(props).build());
|
||||||
setDefaultOnCondition(props, !isMetricsConfigSet,
|
setDefaultOnCondition(props, !isMetricsConfigSet, HoodieMetricsConfig.newBuilder().fromProperties(props).build());
|
||||||
HoodieMetricsConfig.newBuilder().fromProperties(props).build());
|
setDefaultOnCondition(props, !isMemoryConfigSet, HoodieMemoryConfig.newBuilder().fromProperties(props).build());
|
||||||
setDefaultOnCondition(props, !isMemoryConfigSet,
|
|
||||||
HoodieMemoryConfig.newBuilder().fromProperties(props).build());
|
|
||||||
setDefaultOnCondition(props, !isViewConfigSet,
|
setDefaultOnCondition(props, !isViewConfigSet,
|
||||||
FileSystemViewStorageConfig.newBuilder().fromProperties(props).build());
|
FileSystemViewStorageConfig.newBuilder().fromProperties(props).build());
|
||||||
setDefaultOnCondition(props, !isConsistencyGuardSet,
|
setDefaultOnCondition(props, !isConsistencyGuardSet,
|
||||||
|
|||||||
@@ -19,8 +19,9 @@
|
|||||||
package org.apache.hudi.exception;
|
package org.apache.hudi.exception;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* <p> Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a delta
|
* <p>
|
||||||
* commit </p>
|
* Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a delta commit
|
||||||
|
* </p>
|
||||||
*/
|
*/
|
||||||
public class HoodieAppendException extends HoodieException {
|
public class HoodieAppendException extends HoodieException {
|
||||||
|
|
||||||
|
|||||||
@@ -19,7 +19,8 @@
|
|||||||
package org.apache.hudi.exception;
|
package org.apache.hudi.exception;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* <p> Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a Commit
|
* <p>
|
||||||
|
* Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a Commit
|
||||||
* </p>
|
* </p>
|
||||||
*/
|
*/
|
||||||
public class HoodieCommitException extends HoodieException {
|
public class HoodieCommitException extends HoodieException {
|
||||||
|
|||||||
@@ -20,7 +20,9 @@ package org.apache.hudi.exception;
|
|||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* <p> Exception thrown when dependent system is not available </p>
|
* <p>
|
||||||
|
* Exception thrown when dependent system is not available
|
||||||
|
* </p>
|
||||||
*/
|
*/
|
||||||
public class HoodieDependentSystemUnavailableException extends HoodieException {
|
public class HoodieDependentSystemUnavailableException extends HoodieException {
|
||||||
|
|
||||||
|
|||||||
@@ -19,8 +19,9 @@
|
|||||||
package org.apache.hudi.exception;
|
package org.apache.hudi.exception;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* <p> Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a bulk
|
* <p>
|
||||||
* insert </p>
|
* Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a bulk insert
|
||||||
|
* </p>
|
||||||
*/
|
*/
|
||||||
public class HoodieInsertException extends HoodieException {
|
public class HoodieInsertException extends HoodieException {
|
||||||
|
|
||||||
|
|||||||
@@ -19,8 +19,9 @@
|
|||||||
package org.apache.hudi.exception;
|
package org.apache.hudi.exception;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* <p> Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a
|
* <p>
|
||||||
* incremental upsert </p>
|
* Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a incremental upsert
|
||||||
|
* </p>
|
||||||
*/
|
*/
|
||||||
public class HoodieUpsertException extends HoodieException {
|
public class HoodieUpsertException extends HoodieException {
|
||||||
|
|
||||||
|
|||||||
@@ -31,16 +31,16 @@ import org.apache.spark.api.java.function.Function2;
|
|||||||
/**
|
/**
|
||||||
* Map function that handles a sorted stream of HoodieRecords
|
* Map function that handles a sorted stream of HoodieRecords
|
||||||
*/
|
*/
|
||||||
public class BulkInsertMapFunction<T extends HoodieRecordPayload> implements
|
public class BulkInsertMapFunction<T extends HoodieRecordPayload>
|
||||||
Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<List<WriteStatus>>> {
|
implements Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<List<WriteStatus>>> {
|
||||||
|
|
||||||
private String commitTime;
|
private String commitTime;
|
||||||
private HoodieWriteConfig config;
|
private HoodieWriteConfig config;
|
||||||
private HoodieTable<T> hoodieTable;
|
private HoodieTable<T> hoodieTable;
|
||||||
private List<String> fileIDPrefixes;
|
private List<String> fileIDPrefixes;
|
||||||
|
|
||||||
public BulkInsertMapFunction(String commitTime, HoodieWriteConfig config,
|
public BulkInsertMapFunction(String commitTime, HoodieWriteConfig config, HoodieTable<T> hoodieTable,
|
||||||
HoodieTable<T> hoodieTable, List<String> fileIDPrefixes) {
|
List<String> fileIDPrefixes) {
|
||||||
this.commitTime = commitTime;
|
this.commitTime = commitTime;
|
||||||
this.config = config;
|
this.config = config;
|
||||||
this.hoodieTable = hoodieTable;
|
this.hoodieTable = hoodieTable;
|
||||||
|
|||||||
@@ -37,11 +37,10 @@ import org.apache.hudi.io.HoodieWriteHandle;
|
|||||||
import org.apache.hudi.table.HoodieTable;
|
import org.apache.hudi.table.HoodieTable;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Lazy Iterable, that writes a stream of HoodieRecords sorted by the partitionPath, into new
|
* Lazy Iterable, that writes a stream of HoodieRecords sorted by the partitionPath, into new files.
|
||||||
* files.
|
|
||||||
*/
|
*/
|
||||||
public class CopyOnWriteLazyInsertIterable<T extends HoodieRecordPayload> extends
|
public class CopyOnWriteLazyInsertIterable<T extends HoodieRecordPayload>
|
||||||
LazyIterableIterator<HoodieRecord<T>, List<WriteStatus>> {
|
extends LazyIterableIterator<HoodieRecord<T>, List<WriteStatus>> {
|
||||||
|
|
||||||
protected final HoodieWriteConfig hoodieConfig;
|
protected final HoodieWriteConfig hoodieConfig;
|
||||||
protected final String commitTime;
|
protected final String commitTime;
|
||||||
@@ -80,25 +79,23 @@ public class CopyOnWriteLazyInsertIterable<T extends HoodieRecordPayload> extend
|
|||||||
* Transformer function to help transform a HoodieRecord. This transformer is used by BufferedIterator to offload some
|
* Transformer function to help transform a HoodieRecord. This transformer is used by BufferedIterator to offload some
|
||||||
* expensive operations of transformation to the reader thread.
|
* expensive operations of transformation to the reader thread.
|
||||||
*/
|
*/
|
||||||
static <T extends HoodieRecordPayload> Function<HoodieRecord<T>,
|
static <T extends HoodieRecordPayload> Function<HoodieRecord<T>, HoodieInsertValueGenResult<HoodieRecord>> getTransformFunction(
|
||||||
HoodieInsertValueGenResult<HoodieRecord>> getTransformFunction(Schema schema) {
|
Schema schema) {
|
||||||
return hoodieRecord -> new HoodieInsertValueGenResult(hoodieRecord, schema);
|
return hoodieRecord -> new HoodieInsertValueGenResult(hoodieRecord, schema);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void start() {
|
protected void start() {}
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected List<WriteStatus> computeNext() {
|
protected List<WriteStatus> computeNext() {
|
||||||
// Executor service used for launching writer thread.
|
// Executor service used for launching writer thread.
|
||||||
BoundedInMemoryExecutor<HoodieRecord<T>,
|
BoundedInMemoryExecutor<HoodieRecord<T>, HoodieInsertValueGenResult<HoodieRecord>, List<WriteStatus>> bufferedIteratorExecutor =
|
||||||
HoodieInsertValueGenResult<HoodieRecord>, List<WriteStatus>> bufferedIteratorExecutor = null;
|
null;
|
||||||
try {
|
try {
|
||||||
final Schema schema = new Schema.Parser().parse(hoodieConfig.getSchema());
|
final Schema schema = new Schema.Parser().parse(hoodieConfig.getSchema());
|
||||||
bufferedIteratorExecutor =
|
bufferedIteratorExecutor =
|
||||||
new SparkBoundedInMemoryExecutor<>(hoodieConfig, inputItr,
|
new SparkBoundedInMemoryExecutor<>(hoodieConfig, inputItr, getInsertHandler(), getTransformFunction(schema));
|
||||||
getInsertHandler(), getTransformFunction(schema));
|
|
||||||
final List<WriteStatus> result = bufferedIteratorExecutor.execute();
|
final List<WriteStatus> result = bufferedIteratorExecutor.execute();
|
||||||
assert result != null && !result.isEmpty() && !bufferedIteratorExecutor.isRemaining();
|
assert result != null && !result.isEmpty() && !bufferedIteratorExecutor.isRemaining();
|
||||||
return result;
|
return result;
|
||||||
@@ -112,8 +109,7 @@ public class CopyOnWriteLazyInsertIterable<T extends HoodieRecordPayload> extend
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void end() {
|
protected void end() {}
|
||||||
}
|
|
||||||
|
|
||||||
protected String getNextFileId(String idPfx) {
|
protected String getNextFileId(String idPfx) {
|
||||||
return String.format("%s-%d", idPfx, numFilesWritten++);
|
return String.format("%s-%d", idPfx, numFilesWritten++);
|
||||||
@@ -124,11 +120,10 @@ public class CopyOnWriteLazyInsertIterable<T extends HoodieRecordPayload> extend
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Consumes stream of hoodie records from in-memory queue and
|
* Consumes stream of hoodie records from in-memory queue and writes to one or more create-handles
|
||||||
* writes to one or more create-handles
|
|
||||||
*/
|
*/
|
||||||
protected class CopyOnWriteInsertHandler extends
|
protected class CopyOnWriteInsertHandler
|
||||||
BoundedInMemoryQueueConsumer<HoodieInsertValueGenResult<HoodieRecord>, List<WriteStatus>> {
|
extends BoundedInMemoryQueueConsumer<HoodieInsertValueGenResult<HoodieRecord>, List<WriteStatus>> {
|
||||||
|
|
||||||
protected final List<WriteStatus> statuses = new ArrayList<>();
|
protected final List<WriteStatus> statuses = new ArrayList<>();
|
||||||
protected HoodieWriteHandle handle;
|
protected HoodieWriteHandle handle;
|
||||||
|
|||||||
@@ -21,16 +21,15 @@ package org.apache.hudi.func;
|
|||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* (NOTE: Adapted from Apache SystemML) This class is a generic base class for lazy, single pass
|
* (NOTE: Adapted from Apache SystemML) This class is a generic base class for lazy, single pass inputItr classes in
|
||||||
* inputItr classes in order to simplify the implementation of lazy iterators for mapPartitions use
|
* order to simplify the implementation of lazy iterators for mapPartitions use cases. Note [SPARK-3369], which gives
|
||||||
* cases. Note [SPARK-3369], which gives the reasons for backwards compatibility with regard to the
|
* the reasons for backwards compatibility with regard to the iterable API despite Spark's single pass nature.
|
||||||
* iterable API despite Spark's single pass nature.
|
|
||||||
* <p>
|
* <p>
|
||||||
* Provide a way to obtain a inputItr of type O (output), out of an inputItr of type I (input)
|
* Provide a way to obtain a inputItr of type O (output), out of an inputItr of type I (input)
|
||||||
* <p>
|
* <p>
|
||||||
* Things to remember: - Assumes Spark calls hasNext() to check for elements, before calling next()
|
* Things to remember: - Assumes Spark calls hasNext() to check for elements, before calling next() to obtain them -
|
||||||
* to obtain them - Assumes hasNext() gets called atleast once. - Concrete Implementation is
|
* Assumes hasNext() gets called atleast once. - Concrete Implementation is responsible for calling inputIterator.next()
|
||||||
* responsible for calling inputIterator.next() and doing the processing in computeNext()
|
* and doing the processing in computeNext()
|
||||||
*/
|
*/
|
||||||
public abstract class LazyIterableIterator<I, O> implements Iterable<O>, Iterator<O> {
|
public abstract class LazyIterableIterator<I, O> implements Iterable<O>, Iterator<O> {
|
||||||
|
|
||||||
@@ -88,13 +87,13 @@ public abstract class LazyIterableIterator<I, O> implements Iterable<O>, Iterato
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Iterator<O> iterator() {
|
public Iterator<O> iterator() {
|
||||||
//check for consumed inputItr
|
// check for consumed inputItr
|
||||||
if (consumed) {
|
if (consumed) {
|
||||||
throw new RuntimeException("Invalid repeated inputItr consumption.");
|
throw new RuntimeException("Invalid repeated inputItr consumption.");
|
||||||
}
|
}
|
||||||
|
|
||||||
//hand out self as inputItr exactly once (note: do not hand out the input
|
// hand out self as inputItr exactly once (note: do not hand out the input
|
||||||
//inputItr since it is consumed by the self inputItr implementation)
|
// inputItr since it is consumed by the self inputItr implementation)
|
||||||
consumed = true;
|
consumed = true;
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -29,11 +29,9 @@ import org.apache.hudi.io.HoodieAppendHandle;
|
|||||||
import org.apache.hudi.table.HoodieTable;
|
import org.apache.hudi.table.HoodieTable;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Lazy Iterable, that writes a stream of HoodieRecords sorted by the partitionPath, into new
|
* Lazy Iterable, that writes a stream of HoodieRecords sorted by the partitionPath, into new log files.
|
||||||
* log files.
|
|
||||||
*/
|
*/
|
||||||
public class MergeOnReadLazyInsertIterable<T extends HoodieRecordPayload> extends
|
public class MergeOnReadLazyInsertIterable<T extends HoodieRecordPayload> extends CopyOnWriteLazyInsertIterable<T> {
|
||||||
CopyOnWriteLazyInsertIterable<T> {
|
|
||||||
|
|
||||||
public MergeOnReadLazyInsertIterable(Iterator<HoodieRecord<T>> sortedRecordItr, HoodieWriteConfig config,
|
public MergeOnReadLazyInsertIterable(Iterator<HoodieRecord<T>> sortedRecordItr, HoodieWriteConfig config,
|
||||||
String commitTime, HoodieTable<T> hoodieTable, String idPfx) {
|
String commitTime, HoodieTable<T> hoodieTable, String idPfx) {
|
||||||
|
|||||||
@@ -32,8 +32,7 @@ public class OperationResult<T> implements Serializable {
|
|||||||
private boolean success;
|
private boolean success;
|
||||||
private Option<Exception> exception;
|
private Option<Exception> exception;
|
||||||
|
|
||||||
public OperationResult() {
|
public OperationResult() {}
|
||||||
}
|
|
||||||
|
|
||||||
public OperationResult(T operation, boolean success, Option<Exception> exception) {
|
public OperationResult(T operation, boolean success, Option<Exception> exception) {
|
||||||
this.operation = operation;
|
this.operation = operation;
|
||||||
@@ -67,11 +66,7 @@ public class OperationResult<T> implements Serializable {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return "OperationResult{"
|
return "OperationResult{" + "operation=" + operation + ", executed=" + executed + ", success=" + success
|
||||||
+ "operation=" + operation
|
+ ", exception=" + exception + '}';
|
||||||
+ ", executed=" + executed
|
|
||||||
+ ", success=" + success
|
|
||||||
+ ", exception=" + exception
|
|
||||||
+ '}';
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -25,8 +25,8 @@ import org.apache.hudi.exception.HoodieIOException;
|
|||||||
import org.apache.parquet.hadoop.ParquetReader;
|
import org.apache.parquet.hadoop.ParquetReader;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This class wraps a parquet reader and provides an iterator based api to
|
* This class wraps a parquet reader and provides an iterator based api to read from a parquet file. This is used in
|
||||||
* read from a parquet file. This is used in {@link BoundedInMemoryQueue}
|
* {@link BoundedInMemoryQueue}
|
||||||
*/
|
*/
|
||||||
public class ParquetReaderIterator<T> implements Iterator<T> {
|
public class ParquetReaderIterator<T> implements Iterator<T> {
|
||||||
|
|
||||||
|
|||||||
@@ -36,17 +36,13 @@ public class SparkBoundedInMemoryExecutor<I, O, E> extends BoundedInMemoryExecut
|
|||||||
final TaskContext sparkThreadTaskContext;
|
final TaskContext sparkThreadTaskContext;
|
||||||
|
|
||||||
public SparkBoundedInMemoryExecutor(final HoodieWriteConfig hoodieConfig, final Iterator<I> inputItr,
|
public SparkBoundedInMemoryExecutor(final HoodieWriteConfig hoodieConfig, final Iterator<I> inputItr,
|
||||||
BoundedInMemoryQueueConsumer<O, E> consumer,
|
BoundedInMemoryQueueConsumer<O, E> consumer, Function<I, O> bufferedIteratorTransform) {
|
||||||
Function<I, O> bufferedIteratorTransform) {
|
|
||||||
this(hoodieConfig, new IteratorBasedQueueProducer<>(inputItr), consumer, bufferedIteratorTransform);
|
this(hoodieConfig, new IteratorBasedQueueProducer<>(inputItr), consumer, bufferedIteratorTransform);
|
||||||
}
|
}
|
||||||
|
|
||||||
public SparkBoundedInMemoryExecutor(final HoodieWriteConfig hoodieConfig,
|
public SparkBoundedInMemoryExecutor(final HoodieWriteConfig hoodieConfig, BoundedInMemoryQueueProducer<I> producer,
|
||||||
BoundedInMemoryQueueProducer<I> producer,
|
BoundedInMemoryQueueConsumer<O, E> consumer, Function<I, O> bufferedIteratorTransform) {
|
||||||
BoundedInMemoryQueueConsumer<O, E> consumer,
|
super(hoodieConfig.getWriteBufferLimitBytes(), producer, Option.of(consumer), bufferedIteratorTransform);
|
||||||
Function<I, O> bufferedIteratorTransform) {
|
|
||||||
super(hoodieConfig.getWriteBufferLimitBytes(), producer,
|
|
||||||
Option.of(consumer), bufferedIteratorTransform);
|
|
||||||
this.sparkThreadTaskContext = TaskContext.get();
|
this.sparkThreadTaskContext = TaskContext.get();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -65,18 +65,18 @@ public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Seri
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Checks if the given [Keys] exists in the hoodie table and returns [Key, Option[partitionPath, fileID]]
|
* Checks if the given [Keys] exists in the hoodie table and returns [Key, Option[partitionPath, fileID]] If the
|
||||||
* If the optional is empty, then the key is not found.
|
* optional is empty, then the key is not found.
|
||||||
*/
|
*/
|
||||||
public abstract JavaPairRDD<HoodieKey, Option<Pair<String, String>>> fetchRecordLocation(
|
public abstract JavaPairRDD<HoodieKey, Option<Pair<String, String>>> fetchRecordLocation(
|
||||||
JavaRDD<HoodieKey> hoodieKeys, final JavaSparkContext jsc, HoodieTable<T> hoodieTable);
|
JavaRDD<HoodieKey> hoodieKeys, final JavaSparkContext jsc, HoodieTable<T> hoodieTable);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Looks up the index and tags each incoming record with a location of a file that contains the
|
* Looks up the index and tags each incoming record with a location of a file that contains the row (if it is actually
|
||||||
* row (if it is actually present)
|
* present)
|
||||||
*/
|
*/
|
||||||
public abstract JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD,
|
public abstract JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, JavaSparkContext jsc,
|
||||||
JavaSparkContext jsc, HoodieTable<T> hoodieTable) throws HoodieIndexException;
|
HoodieTable<T> hoodieTable) throws HoodieIndexException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extracts the location of written records, and updates the index.
|
* Extracts the location of written records, and updates the index.
|
||||||
@@ -84,8 +84,7 @@ public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Seri
|
|||||||
* TODO(vc): We may need to propagate the record as well in a WriteStatus class
|
* TODO(vc): We may need to propagate the record as well in a WriteStatus class
|
||||||
*/
|
*/
|
||||||
public abstract JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD, JavaSparkContext jsc,
|
public abstract JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD, JavaSparkContext jsc,
|
||||||
HoodieTable<T> hoodieTable)
|
HoodieTable<T> hoodieTable) throws HoodieIndexException;
|
||||||
throws HoodieIndexException;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Rollback the efffects of the commit made at commitTime.
|
* Rollback the efffects of the commit made at commitTime.
|
||||||
@@ -93,17 +92,17 @@ public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Seri
|
|||||||
public abstract boolean rollbackCommit(String commitTime);
|
public abstract boolean rollbackCommit(String commitTime);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* An index is `global` if {@link HoodieKey} to fileID mapping, does not depend on the
|
* An index is `global` if {@link HoodieKey} to fileID mapping, does not depend on the `partitionPath`. Such an
|
||||||
* `partitionPath`. Such an implementation is able to obtain the same mapping, for two hoodie keys
|
* implementation is able to obtain the same mapping, for two hoodie keys with same `recordKey` but different
|
||||||
* with same `recordKey` but different `partitionPath`
|
* `partitionPath`
|
||||||
*
|
*
|
||||||
* @return whether or not, the index implementation is global in nature
|
* @return whether or not, the index implementation is global in nature
|
||||||
*/
|
*/
|
||||||
public abstract boolean isGlobal();
|
public abstract boolean isGlobal();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This is used by storage to determine, if its safe to send inserts, straight to the log, i.e
|
* This is used by storage to determine, if its safe to send inserts, straight to the log, i.e having a
|
||||||
* having a {@link FileSlice}, with no data file.
|
* {@link FileSlice}, with no data file.
|
||||||
*
|
*
|
||||||
* @return Returns true/false depending on whether the impl has this capability
|
* @return Returns true/false depending on whether the impl has this capability
|
||||||
*/
|
*/
|
||||||
@@ -111,8 +110,8 @@ public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Seri
|
|||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* An index is "implicit" with respect to storage, if just writing new data to a file slice,
|
* An index is "implicit" with respect to storage, if just writing new data to a file slice, updates the index as
|
||||||
* updates the index as well. This is used by storage, to save memory footprint in certain cases.
|
* well. This is used by storage, to save memory footprint in certain cases.
|
||||||
*/
|
*/
|
||||||
public abstract boolean isImplicitWithStorage();
|
public abstract boolean isImplicitWithStorage();
|
||||||
|
|
||||||
|
|||||||
@@ -40,7 +40,9 @@ import org.apache.spark.api.java.function.Function2;
|
|||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Hoodie Index implementation backed by an in-memory Hash map. <p> ONLY USE FOR LOCAL TESTING
|
* Hoodie Index implementation backed by an in-memory Hash map.
|
||||||
|
* <p>
|
||||||
|
* ONLY USE FOR LOCAL TESTING
|
||||||
*/
|
*/
|
||||||
public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
||||||
|
|
||||||
@@ -80,7 +82,7 @@ public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieInde
|
|||||||
if (newLocation.isPresent()) {
|
if (newLocation.isPresent()) {
|
||||||
recordLocationMap.put(key, newLocation.get());
|
recordLocationMap.put(key, newLocation.get());
|
||||||
} else {
|
} else {
|
||||||
//Delete existing index for a deleted record
|
// Delete existing index for a deleted record
|
||||||
recordLocationMap.remove(key);
|
recordLocationMap.remove(key);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -122,12 +124,10 @@ public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieInde
|
|||||||
/**
|
/**
|
||||||
* Function that tags each HoodieRecord with an existing location, if known.
|
* Function that tags each HoodieRecord with an existing location, if known.
|
||||||
*/
|
*/
|
||||||
class LocationTagFunction implements
|
class LocationTagFunction implements Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<HoodieRecord<T>>> {
|
||||||
Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<HoodieRecord<T>>> {
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Iterator<HoodieRecord<T>> call(Integer partitionNum,
|
public Iterator<HoodieRecord<T>> call(Integer partitionNum, Iterator<HoodieRecord<T>> hoodieRecordIterator) {
|
||||||
Iterator<HoodieRecord<T>> hoodieRecordIterator) {
|
|
||||||
List<HoodieRecord<T>> taggedRecords = new ArrayList<>();
|
List<HoodieRecord<T>> taggedRecords = new ArrayList<>();
|
||||||
while (hoodieRecordIterator.hasNext()) {
|
while (hoodieRecordIterator.hasNext()) {
|
||||||
HoodieRecord<T> rec = hoodieRecordIterator.next();
|
HoodieRecord<T> rec = hoodieRecordIterator.next();
|
||||||
|
|||||||
@@ -35,6 +35,7 @@ import org.apache.spark.Partitioner;
|
|||||||
* Partitions bloom filter checks by spreading out comparisons across buckets of work.
|
* Partitions bloom filter checks by spreading out comparisons across buckets of work.
|
||||||
*
|
*
|
||||||
* Each bucket incurs the following cost
|
* Each bucket incurs the following cost
|
||||||
|
*
|
||||||
* <pre>
|
* <pre>
|
||||||
* 1) Read bloom filter from file footer
|
* 1) Read bloom filter from file footer
|
||||||
* 2) Check keys against bloom filter
|
* 2) Check keys against bloom filter
|
||||||
@@ -47,6 +48,7 @@ import org.apache.spark.Partitioner;
|
|||||||
* could bound the amount of skew to std_dev(numberOfBucketsPerPartition) * cost of (3), lower than sort partitioning.
|
* could bound the amount of skew to std_dev(numberOfBucketsPerPartition) * cost of (3), lower than sort partitioning.
|
||||||
*
|
*
|
||||||
* Approach has two goals :
|
* Approach has two goals :
|
||||||
|
*
|
||||||
* <pre>
|
* <pre>
|
||||||
* 1) Pack as many buckets from same file group into same partition, to amortize cost of (1) and (2) further
|
* 1) Pack as many buckets from same file group into same partition, to amortize cost of (1) and (2) further
|
||||||
* 2) Spread buckets across partitions evenly to achieve skew reduction
|
* 2) Spread buckets across partitions evenly to achieve skew reduction
|
||||||
@@ -76,8 +78,7 @@ public class BucketizedBloomCheckPartitioner extends Partitioner {
|
|||||||
|
|
||||||
Map<String, Integer> bucketsPerFileGroup = new HashMap<>();
|
Map<String, Integer> bucketsPerFileGroup = new HashMap<>();
|
||||||
// Compute the buckets needed per file group, using simple uniform distribution
|
// Compute the buckets needed per file group, using simple uniform distribution
|
||||||
fileGroupToComparisons.forEach((f, c) ->
|
fileGroupToComparisons.forEach((f, c) -> bucketsPerFileGroup.put(f, (int) Math.ceil((c * 1.0) / keysPerBucket)));
|
||||||
bucketsPerFileGroup.put(f, (int) Math.ceil((c * 1.0) / keysPerBucket)));
|
|
||||||
int totalBuckets = bucketsPerFileGroup.values().stream().mapToInt(i -> i).sum();
|
int totalBuckets = bucketsPerFileGroup.values().stream().mapToInt(i -> i).sum();
|
||||||
// If totalBuckets > targetPartitions, no need to have extra partitions
|
// If totalBuckets > targetPartitions, no need to have extra partitions
|
||||||
this.partitions = Math.min(targetPartitions, totalBuckets);
|
this.partitions = Math.min(targetPartitions, totalBuckets);
|
||||||
|
|||||||
@@ -78,12 +78,12 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Step 1: Extract out thinner JavaPairRDD of (partitionPath, recordKey)
|
// Step 1: Extract out thinner JavaPairRDD of (partitionPath, recordKey)
|
||||||
JavaPairRDD<String, String> partitionRecordKeyPairRDD = recordRDD
|
JavaPairRDD<String, String> partitionRecordKeyPairRDD =
|
||||||
.mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey()));
|
recordRDD.mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey()));
|
||||||
|
|
||||||
// Lookup indexes for all the partition/recordkey pair
|
// Lookup indexes for all the partition/recordkey pair
|
||||||
JavaPairRDD<HoodieKey, HoodieRecordLocation> keyFilenamePairRDD = lookupIndex(partitionRecordKeyPairRDD, jsc,
|
JavaPairRDD<HoodieKey, HoodieRecordLocation> keyFilenamePairRDD =
|
||||||
hoodieTable);
|
lookupIndex(partitionRecordKeyPairRDD, jsc, hoodieTable);
|
||||||
|
|
||||||
// Cache the result, for subsequent stages.
|
// Cache the result, for subsequent stages.
|
||||||
if (config.getBloomIndexUseCaching()) {
|
if (config.getBloomIndexUseCaching()) {
|
||||||
@@ -96,8 +96,7 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
|
|
||||||
// Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys
|
// Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys
|
||||||
// Cost: 4 sec.
|
// Cost: 4 sec.
|
||||||
JavaRDD<HoodieRecord<T>> taggedRecordRDD = tagLocationBacktoRecords(keyFilenamePairRDD,
|
JavaRDD<HoodieRecord<T>> taggedRecordRDD = tagLocationBacktoRecords(keyFilenamePairRDD, recordRDD);
|
||||||
recordRDD);
|
|
||||||
|
|
||||||
if (config.getBloomIndexUseCaching()) {
|
if (config.getBloomIndexUseCaching()) {
|
||||||
recordRDD.unpersist(); // unpersist the input Record RDD
|
recordRDD.unpersist(); // unpersist the input Record RDD
|
||||||
@@ -108,8 +107,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns an RDD mapping each HoodieKey with a partitionPath/fileID which contains it. Option.Empty if the key is
|
* Returns an RDD mapping each HoodieKey with a partitionPath/fileID which contains it. Option.Empty if the key is not
|
||||||
* not found.
|
* found.
|
||||||
*
|
*
|
||||||
* @param hoodieKeys keys to lookup
|
* @param hoodieKeys keys to lookup
|
||||||
* @param jsc spark context
|
* @param jsc spark context
|
||||||
@@ -118,12 +117,12 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
@Override
|
@Override
|
||||||
public JavaPairRDD<HoodieKey, Option<Pair<String, String>>> fetchRecordLocation(JavaRDD<HoodieKey> hoodieKeys,
|
public JavaPairRDD<HoodieKey, Option<Pair<String, String>>> fetchRecordLocation(JavaRDD<HoodieKey> hoodieKeys,
|
||||||
JavaSparkContext jsc, HoodieTable<T> hoodieTable) {
|
JavaSparkContext jsc, HoodieTable<T> hoodieTable) {
|
||||||
JavaPairRDD<String, String> partitionRecordKeyPairRDD = hoodieKeys
|
JavaPairRDD<String, String> partitionRecordKeyPairRDD =
|
||||||
.mapToPair(key -> new Tuple2<>(key.getPartitionPath(), key.getRecordKey()));
|
hoodieKeys.mapToPair(key -> new Tuple2<>(key.getPartitionPath(), key.getRecordKey()));
|
||||||
|
|
||||||
// Lookup indexes for all the partition/recordkey pair
|
// Lookup indexes for all the partition/recordkey pair
|
||||||
JavaPairRDD<HoodieKey, HoodieRecordLocation> recordKeyLocationRDD = lookupIndex(partitionRecordKeyPairRDD, jsc,
|
JavaPairRDD<HoodieKey, HoodieRecordLocation> recordKeyLocationRDD =
|
||||||
hoodieTable);
|
lookupIndex(partitionRecordKeyPairRDD, jsc, hoodieTable);
|
||||||
JavaPairRDD<HoodieKey, String> keyHoodieKeyPairRDD = hoodieKeys.mapToPair(key -> new Tuple2<>(key, null));
|
JavaPairRDD<HoodieKey, String> keyHoodieKeyPairRDD = hoodieKeys.mapToPair(key -> new Tuple2<>(key, null));
|
||||||
|
|
||||||
return keyHoodieKeyPairRDD.leftOuterJoin(recordKeyLocationRDD).mapToPair(keyLoc -> {
|
return keyHoodieKeyPairRDD.leftOuterJoin(recordKeyLocationRDD).mapToPair(keyLoc -> {
|
||||||
@@ -149,19 +148,19 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
List<String> affectedPartitionPathList = new ArrayList<>(recordsPerPartition.keySet());
|
List<String> affectedPartitionPathList = new ArrayList<>(recordsPerPartition.keySet());
|
||||||
|
|
||||||
// Step 2: Load all involved files as <Partition, filename> pairs
|
// Step 2: Load all involved files as <Partition, filename> pairs
|
||||||
List<Tuple2<String, BloomIndexFileInfo>> fileInfoList = loadInvolvedFiles(affectedPartitionPathList, jsc,
|
List<Tuple2<String, BloomIndexFileInfo>> fileInfoList =
|
||||||
hoodieTable);
|
loadInvolvedFiles(affectedPartitionPathList, jsc, hoodieTable);
|
||||||
final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo = fileInfoList.stream()
|
final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo =
|
||||||
.collect(groupingBy(Tuple2::_1, mapping(Tuple2::_2, toList())));
|
fileInfoList.stream().collect(groupingBy(Tuple2::_1, mapping(Tuple2::_2, toList())));
|
||||||
|
|
||||||
// Step 3: Obtain a RDD, for each incoming record, that already exists, with the file id,
|
// Step 3: Obtain a RDD, for each incoming record, that already exists, with the file id,
|
||||||
// that contains it.
|
// that contains it.
|
||||||
Map<String, Long> comparisonsPerFileGroup = computeComparisonsPerFileGroup(recordsPerPartition, partitionToFileInfo,
|
Map<String, Long> comparisonsPerFileGroup =
|
||||||
partitionRecordKeyPairRDD);
|
computeComparisonsPerFileGroup(recordsPerPartition, partitionToFileInfo, partitionRecordKeyPairRDD);
|
||||||
int safeParallelism = computeSafeParallelism(recordsPerPartition, comparisonsPerFileGroup);
|
int safeParallelism = computeSafeParallelism(recordsPerPartition, comparisonsPerFileGroup);
|
||||||
int joinParallelism = determineParallelism(partitionRecordKeyPairRDD.partitions().size(), safeParallelism);
|
int joinParallelism = determineParallelism(partitionRecordKeyPairRDD.partitions().size(), safeParallelism);
|
||||||
return findMatchingFilesForRecordKeys(partitionToFileInfo, partitionRecordKeyPairRDD, joinParallelism,
|
return findMatchingFilesForRecordKeys(partitionToFileInfo, partitionRecordKeyPairRDD, joinParallelism, hoodieTable,
|
||||||
hoodieTable, comparisonsPerFileGroup);
|
comparisonsPerFileGroup);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -175,13 +174,13 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
if (config.getBloomIndexPruneByRanges()) {
|
if (config.getBloomIndexPruneByRanges()) {
|
||||||
// we will just try exploding the input and then count to determine comparisons
|
// we will just try exploding the input and then count to determine comparisons
|
||||||
// FIX(vc): Only do sampling here and extrapolate?
|
// FIX(vc): Only do sampling here and extrapolate?
|
||||||
fileToComparisons = explodeRecordRDDWithFileComparisons(partitionToFileInfo,
|
fileToComparisons = explodeRecordRDDWithFileComparisons(partitionToFileInfo, partitionRecordKeyPairRDD)
|
||||||
partitionRecordKeyPairRDD).mapToPair(t -> t).countByKey();
|
.mapToPair(t -> t).countByKey();
|
||||||
} else {
|
} else {
|
||||||
fileToComparisons = new HashMap<>();
|
fileToComparisons = new HashMap<>();
|
||||||
partitionToFileInfo.entrySet().stream().forEach(e -> {
|
partitionToFileInfo.entrySet().stream().forEach(e -> {
|
||||||
for (BloomIndexFileInfo fileInfo : e.getValue()) {
|
for (BloomIndexFileInfo fileInfo : e.getValue()) {
|
||||||
//each file needs to be compared against all the records coming into the partition
|
// each file needs to be compared against all the records coming into the partition
|
||||||
fileToComparisons.put(fileInfo.getFileId(), recordsPerPartition.get(e.getKey()));
|
fileToComparisons.put(fileInfo.getFileId(), recordsPerPartition.get(e.getKey()));
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@@ -191,34 +190,41 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Compute the minimum parallelism needed to play well with the spark 2GB limitation.. The index lookup can be skewed
|
* Compute the minimum parallelism needed to play well with the spark 2GB limitation.. The index lookup can be skewed
|
||||||
* in three dimensions : #files, #partitions, #records <p> To be able to smoothly handle skews, we need to compute how
|
* in three dimensions : #files, #partitions, #records
|
||||||
* to split each partitions into subpartitions. We do it here, in a way that keeps the amount of each Spark join
|
* <p>
|
||||||
* partition to < 2GB. <p> If {@link HoodieIndexConfig#BLOOM_INDEX_PARALLELISM_PROP} is
|
* To be able to smoothly handle skews, we need to compute how to split each partitions into subpartitions. We do it
|
||||||
* specified as a NON-zero number, then that is used explicitly.
|
* here, in a way that keeps the amount of each Spark join partition to < 2GB.
|
||||||
|
* <p>
|
||||||
|
* If {@link HoodieIndexConfig#BLOOM_INDEX_PARALLELISM_PROP} is specified as a NON-zero number, then that is used
|
||||||
|
* explicitly.
|
||||||
*/
|
*/
|
||||||
int computeSafeParallelism(Map<String, Long> recordsPerPartition, Map<String, Long> comparisonsPerFileGroup) {
|
int computeSafeParallelism(Map<String, Long> recordsPerPartition, Map<String, Long> comparisonsPerFileGroup) {
|
||||||
long totalComparisons = comparisonsPerFileGroup.values().stream().mapToLong(Long::longValue).sum();
|
long totalComparisons = comparisonsPerFileGroup.values().stream().mapToLong(Long::longValue).sum();
|
||||||
long totalFiles = comparisonsPerFileGroup.size();
|
long totalFiles = comparisonsPerFileGroup.size();
|
||||||
long totalRecords = recordsPerPartition.values().stream().mapToLong(Long::longValue).sum();
|
long totalRecords = recordsPerPartition.values().stream().mapToLong(Long::longValue).sum();
|
||||||
int parallelism = (int) (totalComparisons / MAX_ITEMS_PER_SHUFFLE_PARTITION + 1);
|
int parallelism = (int) (totalComparisons / MAX_ITEMS_PER_SHUFFLE_PARTITION + 1);
|
||||||
logger.info(String.format("TotalRecords %d, TotalFiles %d, TotalAffectedPartitions %d, TotalComparisons %d, "
|
logger.info(String.format(
|
||||||
+ "SafeParallelism %d", totalRecords, totalFiles, recordsPerPartition.size(), totalComparisons, parallelism));
|
"TotalRecords %d, TotalFiles %d, TotalAffectedPartitions %d, TotalComparisons %d, " + "SafeParallelism %d",
|
||||||
|
totalRecords, totalFiles, recordsPerPartition.size(), totalComparisons, parallelism));
|
||||||
return parallelism;
|
return parallelism;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Its crucial to pick the right parallelism. <p> totalSubPartitions : this is deemed safe limit, to be nice with
|
* Its crucial to pick the right parallelism.
|
||||||
* Spark. inputParallelism : typically number of input file splits <p> We pick the max such that, we are always safe,
|
* <p>
|
||||||
* but go higher if say a there are a lot of input files. (otherwise, we will fallback to number of partitions in
|
* totalSubPartitions : this is deemed safe limit, to be nice with Spark. inputParallelism : typically number of input
|
||||||
* input and end up with slow performance)
|
* file splits
|
||||||
|
* <p>
|
||||||
|
* We pick the max such that, we are always safe, but go higher if say a there are a lot of input files. (otherwise,
|
||||||
|
* we will fallback to number of partitions in input and end up with slow performance)
|
||||||
*/
|
*/
|
||||||
private int determineParallelism(int inputParallelism, int totalSubPartitions) {
|
private int determineParallelism(int inputParallelism, int totalSubPartitions) {
|
||||||
// If bloom index parallelism is set, use it to to check against the input parallelism and
|
// If bloom index parallelism is set, use it to to check against the input parallelism and
|
||||||
// take the max
|
// take the max
|
||||||
int indexParallelism = Math.max(inputParallelism, config.getBloomIndexParallelism());
|
int indexParallelism = Math.max(inputParallelism, config.getBloomIndexParallelism());
|
||||||
int joinParallelism = Math.max(totalSubPartitions, indexParallelism);
|
int joinParallelism = Math.max(totalSubPartitions, indexParallelism);
|
||||||
logger.info("InputParallelism: ${" + inputParallelism + "}, " + "IndexParallelism: ${" + config
|
logger.info("InputParallelism: ${" + inputParallelism + "}, " + "IndexParallelism: ${"
|
||||||
.getBloomIndexParallelism() + "}, " + "TotalSubParts: ${" + totalSubPartitions + "}, "
|
+ config.getBloomIndexParallelism() + "}, " + "TotalSubParts: ${" + totalSubPartitions + "}, "
|
||||||
+ "Join Parallelism set to : " + joinParallelism);
|
+ "Join Parallelism set to : " + joinParallelism);
|
||||||
return joinParallelism;
|
return joinParallelism;
|
||||||
}
|
}
|
||||||
@@ -231,11 +237,10 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
final HoodieTable hoodieTable) {
|
final HoodieTable hoodieTable) {
|
||||||
|
|
||||||
// Obtain the latest data files from all the partitions.
|
// Obtain the latest data files from all the partitions.
|
||||||
List<Pair<String, String>> partitionPathFileIDList = jsc
|
List<Pair<String, String>> partitionPathFileIDList =
|
||||||
.parallelize(partitions, Math.max(partitions.size(), 1))
|
jsc.parallelize(partitions, Math.max(partitions.size(), 1)).flatMap(partitionPath -> {
|
||||||
.flatMap(partitionPath -> {
|
Option<HoodieInstant> latestCommitTime =
|
||||||
Option<HoodieInstant> latestCommitTime = hoodieTable.getMetaClient().getCommitsTimeline()
|
hoodieTable.getMetaClient().getCommitsTimeline().filterCompletedInstants().lastInstant();
|
||||||
.filterCompletedInstants().lastInstant();
|
|
||||||
List<Pair<String, String>> filteredFiles = new ArrayList<>();
|
List<Pair<String, String>> filteredFiles = new ArrayList<>();
|
||||||
if (latestCommitTime.isPresent()) {
|
if (latestCommitTime.isPresent()) {
|
||||||
filteredFiles = hoodieTable.getROFileSystemView()
|
filteredFiles = hoodieTable.getROFileSystemView()
|
||||||
@@ -259,8 +264,7 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
}).collect();
|
}).collect();
|
||||||
} else {
|
} else {
|
||||||
return partitionPathFileIDList.stream()
|
return partitionPathFileIDList.stream()
|
||||||
.map(pf -> new Tuple2<>(pf.getKey(), new BloomIndexFileInfo(pf.getValue())))
|
.map(pf -> new Tuple2<>(pf.getKey(), new BloomIndexFileInfo(pf.getValue()))).collect(toList());
|
||||||
.collect(toList());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -307,9 +311,9 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
JavaRDD<Tuple2<String, HoodieKey>> explodeRecordRDDWithFileComparisons(
|
JavaRDD<Tuple2<String, HoodieKey>> explodeRecordRDDWithFileComparisons(
|
||||||
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
|
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
|
||||||
JavaPairRDD<String, String> partitionRecordKeyPairRDD) {
|
JavaPairRDD<String, String> partitionRecordKeyPairRDD) {
|
||||||
IndexFileFilter indexFileFilter = config.useBloomIndexTreebasedFilter()
|
IndexFileFilter indexFileFilter =
|
||||||
? new IntervalTreeBasedIndexFileFilter(partitionToFileIndexInfo)
|
config.useBloomIndexTreebasedFilter() ? new IntervalTreeBasedIndexFileFilter(partitionToFileIndexInfo)
|
||||||
: new ListBasedIndexFileFilter(partitionToFileIndexInfo);
|
: new ListBasedIndexFileFilter(partitionToFileIndexInfo);
|
||||||
|
|
||||||
return partitionRecordKeyPairRDD.map(partitionRecordKeyPair -> {
|
return partitionRecordKeyPairRDD.map(partitionRecordKeyPair -> {
|
||||||
String recordKey = partitionRecordKeyPair._2();
|
String recordKey = partitionRecordKeyPair._2();
|
||||||
@@ -322,10 +326,12 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Find out <RowKey, filename> pair. All workload grouped by file-level. <p> Join PairRDD(PartitionPath, RecordKey)
|
* Find out <RowKey, filename> pair. All workload grouped by file-level.
|
||||||
* and PairRDD(PartitionPath, File) & then repartition such that each RDD partition is a file, then for each file, we
|
* <p>
|
||||||
* do (1) load bloom filter, (2) load rowKeys, (3) Tag rowKey <p> Make sure the parallelism is atleast the groupby
|
* Join PairRDD(PartitionPath, RecordKey) and PairRDD(PartitionPath, File) & then repartition such that each RDD
|
||||||
* parallelism for tagging location
|
* partition is a file, then for each file, we do (1) load bloom filter, (2) load rowKeys, (3) Tag rowKey
|
||||||
|
* <p>
|
||||||
|
* Make sure the parallelism is atleast the groupby parallelism for tagging location
|
||||||
*/
|
*/
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
JavaPairRDD<HoodieKey, HoodieRecordLocation> findMatchingFilesForRecordKeys(
|
JavaPairRDD<HoodieKey, HoodieRecordLocation> findMatchingFilesForRecordKeys(
|
||||||
@@ -336,33 +342,24 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD);
|
explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD);
|
||||||
|
|
||||||
if (config.useBloomIndexBucketizedChecking()) {
|
if (config.useBloomIndexBucketizedChecking()) {
|
||||||
Partitioner partitioner = new BucketizedBloomCheckPartitioner(
|
Partitioner partitioner = new BucketizedBloomCheckPartitioner(shuffleParallelism, fileGroupToComparisons,
|
||||||
shuffleParallelism,
|
config.getBloomIndexKeysPerBucket());
|
||||||
fileGroupToComparisons,
|
|
||||||
config.getBloomIndexKeysPerBucket()
|
|
||||||
);
|
|
||||||
|
|
||||||
fileComparisonsRDD = fileComparisonsRDD
|
fileComparisonsRDD = fileComparisonsRDD.mapToPair(t -> new Tuple2<>(Pair.of(t._1, t._2.getRecordKey()), t))
|
||||||
.mapToPair(t -> new Tuple2<>(Pair.of(t._1, t._2.getRecordKey()), t))
|
.repartitionAndSortWithinPartitions(partitioner).map(Tuple2::_2);
|
||||||
.repartitionAndSortWithinPartitions(partitioner)
|
|
||||||
.map(Tuple2::_2);
|
|
||||||
} else {
|
} else {
|
||||||
fileComparisonsRDD = fileComparisonsRDD.sortBy(Tuple2::_1, true, shuffleParallelism);
|
fileComparisonsRDD = fileComparisonsRDD.sortBy(Tuple2::_1, true, shuffleParallelism);
|
||||||
}
|
}
|
||||||
|
|
||||||
return fileComparisonsRDD
|
return fileComparisonsRDD.mapPartitionsWithIndex(new HoodieBloomIndexCheckFunction(hoodieTable, config), true)
|
||||||
.mapPartitionsWithIndex(new HoodieBloomIndexCheckFunction(hoodieTable, config), true)
|
.flatMap(List::iterator).filter(lr -> lr.getMatchingRecordKeys().size() > 0)
|
||||||
.flatMap(List::iterator)
|
|
||||||
.filter(lr -> lr.getMatchingRecordKeys().size() > 0)
|
|
||||||
.flatMapToPair(lookupResult -> lookupResult.getMatchingRecordKeys().stream()
|
.flatMapToPair(lookupResult -> lookupResult.getMatchingRecordKeys().stream()
|
||||||
.map(recordKey -> new Tuple2<>(new HoodieKey(recordKey, lookupResult.getPartitionPath()),
|
.map(recordKey -> new Tuple2<>(new HoodieKey(recordKey, lookupResult.getPartitionPath()),
|
||||||
new HoodieRecordLocation(lookupResult.getBaseInstantTime(), lookupResult.getFileId())))
|
new HoodieRecordLocation(lookupResult.getBaseInstantTime(), lookupResult.getFileId())))
|
||||||
.collect(Collectors.toList())
|
.collect(Collectors.toList()).iterator());
|
||||||
.iterator());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
HoodieRecord<T> getTaggedRecord(HoodieRecord<T> inputRecord,
|
HoodieRecord<T> getTaggedRecord(HoodieRecord<T> inputRecord, Option<HoodieRecordLocation> location) {
|
||||||
Option<HoodieRecordLocation> location) {
|
|
||||||
HoodieRecord<T> record = inputRecord;
|
HoodieRecord<T> record = inputRecord;
|
||||||
if (location.isPresent()) {
|
if (location.isPresent()) {
|
||||||
// When you have a record in multiple files in the same partition, then rowKeyRecordPairRDD
|
// When you have a record in multiple files in the same partition, then rowKeyRecordPairRDD
|
||||||
@@ -383,12 +380,12 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
*/
|
*/
|
||||||
protected JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(
|
protected JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(
|
||||||
JavaPairRDD<HoodieKey, HoodieRecordLocation> keyFilenamePairRDD, JavaRDD<HoodieRecord<T>> recordRDD) {
|
JavaPairRDD<HoodieKey, HoodieRecordLocation> keyFilenamePairRDD, JavaRDD<HoodieRecord<T>> recordRDD) {
|
||||||
JavaPairRDD<HoodieKey, HoodieRecord<T>> keyRecordPairRDD = recordRDD
|
JavaPairRDD<HoodieKey, HoodieRecord<T>> keyRecordPairRDD =
|
||||||
.mapToPair(record -> new Tuple2<>(record.getKey(), record));
|
recordRDD.mapToPair(record -> new Tuple2<>(record.getKey(), record));
|
||||||
// Here as the recordRDD might have more data than rowKeyRDD (some rowKeys' fileId is null),
|
// Here as the recordRDD might have more data than rowKeyRDD (some rowKeys' fileId is null),
|
||||||
// so we do left outer join.
|
// so we do left outer join.
|
||||||
return keyRecordPairRDD.leftOuterJoin(keyFilenamePairRDD).values().map(
|
return keyRecordPairRDD.leftOuterJoin(keyFilenamePairRDD).values()
|
||||||
v1 -> getTaggedRecord(v1._1, Option.ofNullable(v1._2.orNull())));
|
.map(v1 -> getTaggedRecord(v1._1, Option.ofNullable(v1._2.orNull())));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|||||||
@@ -34,11 +34,10 @@ import org.apache.spark.api.java.function.Function2;
|
|||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Function performing actual checking of RDD partition containing (fileId, hoodieKeys) against the
|
* Function performing actual checking of RDD partition containing (fileId, hoodieKeys) against the actual files
|
||||||
* actual files
|
|
||||||
*/
|
*/
|
||||||
public class HoodieBloomIndexCheckFunction implements
|
public class HoodieBloomIndexCheckFunction
|
||||||
Function2<Integer, Iterator<Tuple2<String, HoodieKey>>, Iterator<List<KeyLookupResult>>> {
|
implements Function2<Integer, Iterator<Tuple2<String, HoodieKey>>, Iterator<List<KeyLookupResult>>> {
|
||||||
|
|
||||||
private final HoodieTable hoodieTable;
|
private final HoodieTable hoodieTable;
|
||||||
|
|
||||||
@@ -59,14 +58,12 @@ public class HoodieBloomIndexCheckFunction implements
|
|||||||
|
|
||||||
private HoodieKeyLookupHandle keyLookupHandle;
|
private HoodieKeyLookupHandle keyLookupHandle;
|
||||||
|
|
||||||
LazyKeyCheckIterator(
|
LazyKeyCheckIterator(Iterator<Tuple2<String, HoodieKey>> filePartitionRecordKeyTripletItr) {
|
||||||
Iterator<Tuple2<String, HoodieKey>> filePartitionRecordKeyTripletItr) {
|
|
||||||
super(filePartitionRecordKeyTripletItr);
|
super(filePartitionRecordKeyTripletItr);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void start() {
|
protected void start() {}
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected List<HoodieKeyLookupHandle.KeyLookupResult> computeNext() {
|
protected List<HoodieKeyLookupHandle.KeyLookupResult> computeNext() {
|
||||||
@@ -113,7 +110,6 @@ public class HoodieBloomIndexCheckFunction implements
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void end() {
|
protected void end() {}
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -59,9 +59,8 @@ public class HoodieGlobalBloomIndex<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
final HoodieTable hoodieTable) {
|
final HoodieTable hoodieTable) {
|
||||||
HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
|
HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
|
||||||
try {
|
try {
|
||||||
List<String> allPartitionPaths = FSUtils
|
List<String> allPartitionPaths = FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(),
|
||||||
.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(),
|
config.shouldAssumeDatePartitioning());
|
||||||
config.shouldAssumeDatePartitioning());
|
|
||||||
return super.loadInvolvedFiles(allPartitionPaths, jsc, hoodieTable);
|
return super.loadInvolvedFiles(allPartitionPaths, jsc, hoodieTable);
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
throw new HoodieIOException("Failed to load all partitions", e);
|
throw new HoodieIOException("Failed to load all partitions", e);
|
||||||
@@ -88,9 +87,9 @@ public class HoodieGlobalBloomIndex<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
entry.getValue().forEach(indexFile -> indexToPartitionMap.put(indexFile.getFileId(), entry.getKey()));
|
entry.getValue().forEach(indexFile -> indexToPartitionMap.put(indexFile.getFileId(), entry.getKey()));
|
||||||
}
|
}
|
||||||
|
|
||||||
IndexFileFilter indexFileFilter = config.getBloomIndexPruneByRanges()
|
IndexFileFilter indexFileFilter =
|
||||||
? new IntervalTreeBasedGlobalIndexFileFilter(partitionToFileIndexInfo)
|
config.getBloomIndexPruneByRanges() ? new IntervalTreeBasedGlobalIndexFileFilter(partitionToFileIndexInfo)
|
||||||
: new ListBasedGlobalIndexFileFilter(partitionToFileIndexInfo);
|
: new ListBasedGlobalIndexFileFilter(partitionToFileIndexInfo);
|
||||||
|
|
||||||
return partitionRecordKeyPairRDD.map(partitionRecordKeyPair -> {
|
return partitionRecordKeyPairRDD.map(partitionRecordKeyPair -> {
|
||||||
String recordKey = partitionRecordKeyPair._2();
|
String recordKey = partitionRecordKeyPair._2();
|
||||||
@@ -109,8 +108,8 @@ public class HoodieGlobalBloomIndex<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
@Override
|
@Override
|
||||||
protected JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(
|
protected JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(
|
||||||
JavaPairRDD<HoodieKey, HoodieRecordLocation> keyFilenamePairRDD, JavaRDD<HoodieRecord<T>> recordRDD) {
|
JavaPairRDD<HoodieKey, HoodieRecordLocation> keyFilenamePairRDD, JavaRDD<HoodieRecord<T>> recordRDD) {
|
||||||
JavaPairRDD<String, HoodieRecord<T>> rowKeyRecordPairRDD = recordRDD
|
JavaPairRDD<String, HoodieRecord<T>> rowKeyRecordPairRDD =
|
||||||
.mapToPair(record -> new Tuple2<>(record.getRecordKey(), record));
|
recordRDD.mapToPair(record -> new Tuple2<>(record.getRecordKey(), record));
|
||||||
|
|
||||||
// Here as the recordRDD might have more data than rowKeyRDD (some rowKeys' fileId is null),
|
// Here as the recordRDD might have more data than rowKeyRDD (some rowKeys' fileId is null),
|
||||||
// so we do left outer join.
|
// so we do left outer join.
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ import java.util.stream.Collectors;
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Interval Tree based index look up for Global Index. Builds an {@link KeyRangeLookupTree} for all index files (across
|
* Interval Tree based index look up for Global Index. Builds an {@link KeyRangeLookupTree} for all index files (across
|
||||||
* all partitions) and uses it to search for matching index files for any given recordKey that needs to be looked up.
|
* all partitions) and uses it to search for matching index files for any given recordKey that needs to be looked up.
|
||||||
*/
|
*/
|
||||||
class IntervalTreeBasedGlobalIndexFileFilter implements IndexFileFilter {
|
class IntervalTreeBasedGlobalIndexFileFilter implements IndexFileFilter {
|
||||||
|
|
||||||
@@ -41,16 +41,16 @@ class IntervalTreeBasedGlobalIndexFileFilter implements IndexFileFilter {
|
|||||||
* @param partitionToFileIndexInfo Map of partition to List of {@link BloomIndexFileInfo}s
|
* @param partitionToFileIndexInfo Map of partition to List of {@link BloomIndexFileInfo}s
|
||||||
*/
|
*/
|
||||||
IntervalTreeBasedGlobalIndexFileFilter(final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo) {
|
IntervalTreeBasedGlobalIndexFileFilter(final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo) {
|
||||||
List<BloomIndexFileInfo> allIndexFiles = partitionToFileIndexInfo.values().stream().flatMap(Collection::stream)
|
List<BloomIndexFileInfo> allIndexFiles =
|
||||||
.collect(Collectors.toList());
|
partitionToFileIndexInfo.values().stream().flatMap(Collection::stream).collect(Collectors.toList());
|
||||||
// Note that the interval tree implementation doesn't have auto-balancing to ensure logN search time.
|
// Note that the interval tree implementation doesn't have auto-balancing to ensure logN search time.
|
||||||
// So, we are shuffling the input here hoping the tree will not have any skewness. If not, the tree could be skewed
|
// So, we are shuffling the input here hoping the tree will not have any skewness. If not, the tree could be skewed
|
||||||
// which could result in N search time instead of NlogN.
|
// which could result in N search time instead of NlogN.
|
||||||
Collections.shuffle(allIndexFiles);
|
Collections.shuffle(allIndexFiles);
|
||||||
allIndexFiles.forEach(indexFile -> {
|
allIndexFiles.forEach(indexFile -> {
|
||||||
if (indexFile.hasKeyRanges()) {
|
if (indexFile.hasKeyRanges()) {
|
||||||
indexLookUpTree.insert(new KeyRangeNode(indexFile.getMinRecordKey(),
|
indexLookUpTree
|
||||||
indexFile.getMaxRecordKey(), indexFile.getFileId()));
|
.insert(new KeyRangeNode(indexFile.getMinRecordKey(), indexFile.getMaxRecordKey(), indexFile.getFileId()));
|
||||||
} else {
|
} else {
|
||||||
filesWithNoRanges.add(indexFile.getFileId());
|
filesWithNoRanges.add(indexFile.getFileId());
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -48,8 +48,8 @@ class IntervalTreeBasedIndexFileFilter implements IndexFileFilter {
|
|||||||
KeyRangeLookupTree lookUpTree = new KeyRangeLookupTree();
|
KeyRangeLookupTree lookUpTree = new KeyRangeLookupTree();
|
||||||
bloomIndexFiles.forEach(indexFileInfo -> {
|
bloomIndexFiles.forEach(indexFileInfo -> {
|
||||||
if (indexFileInfo.hasKeyRanges()) {
|
if (indexFileInfo.hasKeyRanges()) {
|
||||||
lookUpTree.insert(new KeyRangeNode(indexFileInfo.getMinRecordKey(),
|
lookUpTree.insert(new KeyRangeNode(indexFileInfo.getMinRecordKey(), indexFileInfo.getMaxRecordKey(),
|
||||||
indexFileInfo.getMaxRecordKey(), indexFileInfo.getFileId()));
|
indexFileInfo.getFileId()));
|
||||||
} else {
|
} else {
|
||||||
if (!partitionToFilesWithNoRanges.containsKey(partition)) {
|
if (!partitionToFilesWithNoRanges.containsKey(partition)) {
|
||||||
partitionToFilesWithNoRanges.put(partition, new HashSet<>());
|
partitionToFilesWithNoRanges.put(partition, new HashSet<>());
|
||||||
|
|||||||
@@ -50,25 +50,16 @@ class KeyRangeLookupTree implements Serializable {
|
|||||||
*
|
*
|
||||||
* If no root exists, make {@code newNode} as the root and return the new root.
|
* If no root exists, make {@code newNode} as the root and return the new root.
|
||||||
*
|
*
|
||||||
* If current root and newNode matches with min record key and max record key,
|
* If current root and newNode matches with min record key and max record key, merge two nodes. In other words, add
|
||||||
* merge two nodes. In other words, add files from {@code newNode} to current root.
|
* files from {@code newNode} to current root. Return current root.
|
||||||
* Return current root.
|
|
||||||
*
|
*
|
||||||
* If current root is < newNode
|
* If current root is < newNode if current root has no right sub tree update current root's right sub tree max and min
|
||||||
* if current root has no right sub tree
|
* set newNode as right sub tree else update root's right sub tree min and max with newNode's min and max record key
|
||||||
* update current root's right sub tree max and min
|
* as applicable recursively call insert() with root's right subtree as new root
|
||||||
* set newNode as right sub tree
|
|
||||||
* else
|
|
||||||
* update root's right sub tree min and max with newNode's min and max record key as applicable
|
|
||||||
* recursively call insert() with root's right subtree as new root
|
|
||||||
*
|
*
|
||||||
* else // current root is >= newNode
|
* else // current root is >= newNode if current root has no left sub tree update current root's left sub tree max and
|
||||||
* if current root has no left sub tree
|
* min set newNode as left sub tree else update root's left sub tree min and max with newNode's min and max record key
|
||||||
* update current root's left sub tree max and min
|
* as applicable recursively call insert() with root's left subtree as new root
|
||||||
* set newNode as left sub tree
|
|
||||||
* else
|
|
||||||
* update root's left sub tree min and max with newNode's min and max record key as applicable
|
|
||||||
* recursively call insert() with root's left subtree as new root
|
|
||||||
*
|
*
|
||||||
* @param root refers to the current root of the look up tree
|
* @param root refers to the current root of the look up tree
|
||||||
* @param newNode newNode the new {@link KeyRangeNode} to be inserted
|
* @param newNode newNode the new {@link KeyRangeNode} to be inserted
|
||||||
|
|||||||
@@ -62,15 +62,10 @@ class KeyRangeNode implements Comparable<KeyRangeNode>, Serializable {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return "KeyRangeNode{"
|
return "KeyRangeNode{" + "minRecordKey='" + minRecordKey + '\'' + ", maxRecordKey='" + maxRecordKey + '\''
|
||||||
+ "minRecordKey='" + minRecordKey + '\''
|
+ ", fileNameList=" + fileNameList + ", rightSubTreeMax='" + rightSubTreeMax + '\'' + ", leftSubTreeMax='"
|
||||||
+ ", maxRecordKey='" + maxRecordKey + '\''
|
+ leftSubTreeMax + '\'' + ", rightSubTreeMin='" + rightSubTreeMin + '\'' + ", leftSubTreeMin='" + leftSubTreeMin
|
||||||
+ ", fileNameList=" + fileNameList
|
+ '\'' + '}';
|
||||||
+ ", rightSubTreeMax='" + rightSubTreeMax + '\''
|
|
||||||
+ ", leftSubTreeMax='" + leftSubTreeMax + '\''
|
|
||||||
+ ", rightSubTreeMin='" + rightSubTreeMin + '\''
|
|
||||||
+ ", leftSubTreeMin='" + leftSubTreeMin + '\''
|
|
||||||
+ '}';
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -78,8 +73,8 @@ class KeyRangeNode implements Comparable<KeyRangeNode>, Serializable {
|
|||||||
*
|
*
|
||||||
* @param that the {@link KeyRangeNode} to be compared with
|
* @param that the {@link KeyRangeNode} to be compared with
|
||||||
* @return the result of comparison. 0 if both min and max are equal in both. 1 if this {@link KeyRangeNode} is
|
* @return the result of comparison. 0 if both min and max are equal in both. 1 if this {@link KeyRangeNode} is
|
||||||
* greater than the {@code that} keyRangeNode. -1 if {@code that} keyRangeNode is greater than this {@link
|
* greater than the {@code that} keyRangeNode. -1 if {@code that} keyRangeNode is greater than this
|
||||||
* KeyRangeNode}
|
* {@link KeyRangeNode}
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public int compareTo(KeyRangeNode that) {
|
public int compareTo(KeyRangeNode that) {
|
||||||
|
|||||||
@@ -30,8 +30,7 @@ class ListBasedGlobalIndexFileFilter extends ListBasedIndexFileFilter {
|
|||||||
*
|
*
|
||||||
* @param partitionToFileIndexInfo Map of partition to List of {@link BloomIndexFileInfo}
|
* @param partitionToFileIndexInfo Map of partition to List of {@link BloomIndexFileInfo}
|
||||||
*/
|
*/
|
||||||
ListBasedGlobalIndexFileFilter(
|
ListBasedGlobalIndexFileFilter(Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo) {
|
||||||
Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo) {
|
|
||||||
super(partitionToFileIndexInfo);
|
super(partitionToFileIndexInfo);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -68,10 +68,8 @@ import scala.Tuple2;
|
|||||||
*/
|
*/
|
||||||
public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
||||||
|
|
||||||
public static final String DEFAULT_SPARK_EXECUTOR_INSTANCES_CONFIG_NAME =
|
public static final String DEFAULT_SPARK_EXECUTOR_INSTANCES_CONFIG_NAME = "spark.executor.instances";
|
||||||
"spark.executor.instances";
|
public static final String DEFAULT_SPARK_DYNAMIC_ALLOCATION_ENABLED_CONFIG_NAME = "spark.dynamicAllocation.enabled";
|
||||||
public static final String DEFAULT_SPARK_DYNAMIC_ALLOCATION_ENABLED_CONFIG_NAME =
|
|
||||||
"spark.dynamicAllocation.enabled";
|
|
||||||
public static final String DEFAULT_SPARK_DYNAMIC_ALLOCATION_MAX_EXECUTORS_CONFIG_NAME =
|
public static final String DEFAULT_SPARK_DYNAMIC_ALLOCATION_MAX_EXECUTORS_CONFIG_NAME =
|
||||||
"spark.dynamicAllocation.maxExecutors";
|
"spark.dynamicAllocation.maxExecutors";
|
||||||
|
|
||||||
@@ -114,9 +112,8 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
|||||||
public HBaseIndexQPSResourceAllocator createQPSResourceAllocator(HoodieWriteConfig config) {
|
public HBaseIndexQPSResourceAllocator createQPSResourceAllocator(HoodieWriteConfig config) {
|
||||||
try {
|
try {
|
||||||
logger.info("createQPSResourceAllocator :" + config.getHBaseQPSResourceAllocatorClass());
|
logger.info("createQPSResourceAllocator :" + config.getHBaseQPSResourceAllocatorClass());
|
||||||
final HBaseIndexQPSResourceAllocator resourceAllocator =
|
final HBaseIndexQPSResourceAllocator resourceAllocator = (HBaseIndexQPSResourceAllocator) ReflectionUtils
|
||||||
(HBaseIndexQPSResourceAllocator) ReflectionUtils.loadClass(
|
.loadClass(config.getHBaseQPSResourceAllocatorClass(), config);
|
||||||
config.getHBaseQPSResourceAllocatorClass(), config);
|
|
||||||
return resourceAllocator;
|
return resourceAllocator;
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
logger.warn("error while instantiating HBaseIndexQPSResourceAllocator", e);
|
logger.warn("error while instantiating HBaseIndexQPSResourceAllocator", e);
|
||||||
@@ -143,14 +140,14 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
|||||||
try {
|
try {
|
||||||
return ConnectionFactory.createConnection(hbaseConfig);
|
return ConnectionFactory.createConnection(hbaseConfig);
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
throw new HoodieDependentSystemUnavailableException(
|
throw new HoodieDependentSystemUnavailableException(HoodieDependentSystemUnavailableException.HBASE,
|
||||||
HoodieDependentSystemUnavailableException.HBASE, quorum + ":" + port);
|
quorum + ":" + port);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Since we are sharing the HbaseConnection across tasks in a JVM, make sure the HbaseConnectio is
|
* Since we are sharing the HbaseConnection across tasks in a JVM, make sure the HbaseConnectio is closed when JVM
|
||||||
* closed when JVM exits
|
* exits
|
||||||
*/
|
*/
|
||||||
private void addShutDownHook() {
|
private void addShutDownHook() {
|
||||||
Runtime.getRuntime().addShutdownHook(new Thread() {
|
Runtime.getRuntime().addShutdownHook(new Thread() {
|
||||||
@@ -172,103 +169,95 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private Get generateStatement(String key) throws IOException {
|
private Get generateStatement(String key) throws IOException {
|
||||||
return new Get(Bytes.toBytes(key)).setMaxVersions(1)
|
return new Get(Bytes.toBytes(key)).setMaxVersions(1).addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN)
|
||||||
.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN)
|
.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN).addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN);
|
||||||
.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN)
|
|
||||||
.addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean checkIfValidCommit(HoodieTableMetaClient metaClient, String commitTs) {
|
private boolean checkIfValidCommit(HoodieTableMetaClient metaClient, String commitTs) {
|
||||||
HoodieTimeline commitTimeline = metaClient.getActiveTimeline().filterCompletedInstants();
|
HoodieTimeline commitTimeline = metaClient.getActiveTimeline().filterCompletedInstants();
|
||||||
// Check if the last commit ts for this row is 1) present in the timeline or
|
// Check if the last commit ts for this row is 1) present in the timeline or
|
||||||
// 2) is less than the first commit ts in the timeline
|
// 2) is less than the first commit ts in the timeline
|
||||||
return !commitTimeline.empty() && (commitTimeline
|
return !commitTimeline.empty()
|
||||||
.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTs))
|
&& (commitTimeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTs))
|
||||||
|| HoodieTimeline
|
|| HoodieTimeline.compareTimestamps(commitTimeline.firstInstant().get().getTimestamp(), commitTs,
|
||||||
.compareTimestamps(commitTimeline.firstInstant().get().getTimestamp(), commitTs,
|
HoodieTimeline.GREATER));
|
||||||
HoodieTimeline.GREATER));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Function that tags each HoodieRecord with an existing location, if known.
|
* Function that tags each HoodieRecord with an existing location, if known.
|
||||||
*/
|
*/
|
||||||
private Function2<Integer, Iterator<HoodieRecord<T>>,
|
private Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<HoodieRecord<T>>> locationTagFunction(
|
||||||
Iterator<HoodieRecord<T>>> locationTagFunction(HoodieTableMetaClient metaClient) {
|
HoodieTableMetaClient metaClient) {
|
||||||
|
|
||||||
return (Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<HoodieRecord<T>>>)
|
return (Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<HoodieRecord<T>>>) (partitionNum,
|
||||||
(partitionNum, hoodieRecordIterator) -> {
|
hoodieRecordIterator) -> {
|
||||||
|
|
||||||
Integer multiGetBatchSize = config.getHbaseIndexGetBatchSize();
|
Integer multiGetBatchSize = config.getHbaseIndexGetBatchSize();
|
||||||
|
|
||||||
// Grab the global HBase connection
|
// Grab the global HBase connection
|
||||||
synchronized (HBaseIndex.class) {
|
synchronized (HBaseIndex.class) {
|
||||||
if (hbaseConnection == null || hbaseConnection.isClosed()) {
|
if (hbaseConnection == null || hbaseConnection.isClosed()) {
|
||||||
hbaseConnection = getHBaseConnection();
|
hbaseConnection = getHBaseConnection();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
List<HoodieRecord<T>> taggedRecords = new ArrayList<>();
|
List<HoodieRecord<T>> taggedRecords = new ArrayList<>();
|
||||||
HTable hTable = null;
|
HTable hTable = null;
|
||||||
try {
|
try {
|
||||||
hTable = (HTable) hbaseConnection.getTable(TableName.valueOf(tableName));
|
hTable = (HTable) hbaseConnection.getTable(TableName.valueOf(tableName));
|
||||||
List<Get> statements = new ArrayList<>();
|
List<Get> statements = new ArrayList<>();
|
||||||
List<HoodieRecord> currentBatchOfRecords = new LinkedList<>();
|
List<HoodieRecord> currentBatchOfRecords = new LinkedList<>();
|
||||||
// Do the tagging.
|
// Do the tagging.
|
||||||
while (hoodieRecordIterator.hasNext()) {
|
while (hoodieRecordIterator.hasNext()) {
|
||||||
HoodieRecord rec = hoodieRecordIterator.next();
|
HoodieRecord rec = hoodieRecordIterator.next();
|
||||||
statements.add(generateStatement(rec.getRecordKey()));
|
statements.add(generateStatement(rec.getRecordKey()));
|
||||||
currentBatchOfRecords.add(rec);
|
currentBatchOfRecords.add(rec);
|
||||||
// iterator till we reach batch size
|
// iterator till we reach batch size
|
||||||
if (statements.size() >= multiGetBatchSize || !hoodieRecordIterator.hasNext()) {
|
if (statements.size() >= multiGetBatchSize || !hoodieRecordIterator.hasNext()) {
|
||||||
// get results for batch from Hbase
|
// get results for batch from Hbase
|
||||||
Result[] results = doGet(hTable, statements);
|
Result[] results = doGet(hTable, statements);
|
||||||
// clear statements to be GC'd
|
// clear statements to be GC'd
|
||||||
statements.clear();
|
statements.clear();
|
||||||
for (Result result : results) {
|
for (Result result : results) {
|
||||||
// first, attempt to grab location from HBase
|
// first, attempt to grab location from HBase
|
||||||
HoodieRecord currentRecord = currentBatchOfRecords.remove(0);
|
HoodieRecord currentRecord = currentBatchOfRecords.remove(0);
|
||||||
if (result.getRow() != null) {
|
if (result.getRow() != null) {
|
||||||
String keyFromResult = Bytes.toString(result.getRow());
|
String keyFromResult = Bytes.toString(result.getRow());
|
||||||
String commitTs = Bytes
|
String commitTs = Bytes.toString(result.getValue(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN));
|
||||||
.toString(result.getValue(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN));
|
String fileId = Bytes.toString(result.getValue(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN));
|
||||||
String fileId = Bytes
|
String partitionPath = Bytes.toString(result.getValue(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN));
|
||||||
.toString(result.getValue(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN));
|
|
||||||
String partitionPath = Bytes
|
|
||||||
.toString(result.getValue(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN));
|
|
||||||
|
|
||||||
if (checkIfValidCommit(metaClient, commitTs)) {
|
if (checkIfValidCommit(metaClient, commitTs)) {
|
||||||
currentRecord = new HoodieRecord(
|
currentRecord = new HoodieRecord(new HoodieKey(currentRecord.getRecordKey(), partitionPath),
|
||||||
new HoodieKey(currentRecord.getRecordKey(), partitionPath),
|
currentRecord.getData());
|
||||||
currentRecord.getData());
|
currentRecord.unseal();
|
||||||
currentRecord.unseal();
|
currentRecord.setCurrentLocation(new HoodieRecordLocation(commitTs, fileId));
|
||||||
currentRecord.setCurrentLocation(new HoodieRecordLocation(commitTs, fileId));
|
currentRecord.seal();
|
||||||
currentRecord.seal();
|
taggedRecords.add(currentRecord);
|
||||||
taggedRecords.add(currentRecord);
|
// the key from Result and the key being processed should be same
|
||||||
// the key from Result and the key being processed should be same
|
assert (currentRecord.getRecordKey().contentEquals(keyFromResult));
|
||||||
assert (currentRecord.getRecordKey().contentEquals(keyFromResult));
|
} else { // if commit is invalid, treat this as a new taggedRecord
|
||||||
} else { //if commit is invalid, treat this as a new taggedRecord
|
taggedRecords.add(currentRecord);
|
||||||
taggedRecords.add(currentRecord);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
taggedRecords.add(currentRecord);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
taggedRecords.add(currentRecord);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (IOException e) {
|
|
||||||
throw new HoodieIndexException(
|
|
||||||
"Failed to Tag indexed locations because of exception with HBase Client", e);
|
|
||||||
} finally {
|
|
||||||
if (hTable != null) {
|
|
||||||
try {
|
|
||||||
hTable.close();
|
|
||||||
} catch (IOException e) {
|
|
||||||
// Ignore
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
return taggedRecords.iterator();
|
}
|
||||||
};
|
} catch (IOException e) {
|
||||||
|
throw new HoodieIndexException("Failed to Tag indexed locations because of exception with HBase Client", e);
|
||||||
|
} finally {
|
||||||
|
if (hTable != null) {
|
||||||
|
try {
|
||||||
|
hTable.close();
|
||||||
|
} catch (IOException e) {
|
||||||
|
// Ignore
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
return taggedRecords.iterator();
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
private Result[] doGet(HTable hTable, List<Get> keys) throws IOException {
|
private Result[] doGet(HTable hTable, List<Get> keys) throws IOException {
|
||||||
@@ -310,15 +299,12 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
Put put = new Put(Bytes.toBytes(rec.getRecordKey()));
|
Put put = new Put(Bytes.toBytes(rec.getRecordKey()));
|
||||||
put.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN,
|
put.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN, Bytes.toBytes(loc.get().getInstantTime()));
|
||||||
Bytes.toBytes(loc.get().getInstantTime()));
|
put.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN, Bytes.toBytes(loc.get().getFileId()));
|
||||||
put.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN,
|
put.addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN, Bytes.toBytes(rec.getPartitionPath()));
|
||||||
Bytes.toBytes(loc.get().getFileId()));
|
|
||||||
put.addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN,
|
|
||||||
Bytes.toBytes(rec.getPartitionPath()));
|
|
||||||
puts.add(put);
|
puts.add(put);
|
||||||
} else {
|
} else {
|
||||||
//Delete existing index for a deleted record
|
// Delete existing index for a deleted record
|
||||||
Delete delete = new Delete(Bytes.toBytes(rec.getRecordKey()));
|
Delete delete = new Delete(Bytes.toBytes(rec.getRecordKey()));
|
||||||
deletes.add(delete);
|
deletes.add(delete);
|
||||||
}
|
}
|
||||||
@@ -328,7 +314,7 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
|||||||
}
|
}
|
||||||
doPutsAndDeletes(hTable, puts, deletes);
|
doPutsAndDeletes(hTable, puts, deletes);
|
||||||
}
|
}
|
||||||
//process remaining puts and deletes, if any
|
// process remaining puts and deletes, if any
|
||||||
doPutsAndDeletes(hTable, puts, deletes);
|
doPutsAndDeletes(hTable, puts, deletes);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
Exception we = new Exception("Error updating index for " + writeStatus, e);
|
Exception we = new Exception("Error updating index for " + writeStatus, e);
|
||||||
@@ -338,8 +324,7 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
|||||||
writeStatusList.add(writeStatus);
|
writeStatusList.add(writeStatus);
|
||||||
}
|
}
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
throw new HoodieIndexException(
|
throw new HoodieIndexException("Failed to Update Index locations because of exception with HBase Client", e);
|
||||||
"Failed to Update Index locations because of exception with HBase Client", e);
|
|
||||||
} finally {
|
} finally {
|
||||||
if (hTable != null) {
|
if (hTable != null) {
|
||||||
try {
|
try {
|
||||||
@@ -356,8 +341,7 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
|||||||
/**
|
/**
|
||||||
* Helper method to facilitate performing puts and deletes in Hbase
|
* Helper method to facilitate performing puts and deletes in Hbase
|
||||||
*/
|
*/
|
||||||
private void doPutsAndDeletes(HTable hTable, List<Put> puts, List<Delete> deletes)
|
private void doPutsAndDeletes(HTable hTable, List<Put> puts, List<Delete> deletes) throws IOException {
|
||||||
throws IOException {
|
|
||||||
if (puts.size() > 0) {
|
if (puts.size() > 0) {
|
||||||
hTable.put(puts);
|
hTable.put(puts);
|
||||||
}
|
}
|
||||||
@@ -385,58 +369,49 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
|||||||
final HBaseIndexQPSResourceAllocator hBaseIndexQPSResourceAllocator = createQPSResourceAllocator(this.config);
|
final HBaseIndexQPSResourceAllocator hBaseIndexQPSResourceAllocator = createQPSResourceAllocator(this.config);
|
||||||
setPutBatchSize(writeStatusRDD, hBaseIndexQPSResourceAllocator, jsc);
|
setPutBatchSize(writeStatusRDD, hBaseIndexQPSResourceAllocator, jsc);
|
||||||
logger.info("multiPutBatchSize: before hbase puts" + multiPutBatchSize);
|
logger.info("multiPutBatchSize: before hbase puts" + multiPutBatchSize);
|
||||||
JavaRDD<WriteStatus> writeStatusJavaRDD = writeStatusRDD.mapPartitionsWithIndex(
|
JavaRDD<WriteStatus> writeStatusJavaRDD = writeStatusRDD.mapPartitionsWithIndex(updateLocationFunction(), true);
|
||||||
updateLocationFunction(), true);
|
|
||||||
// caching the index updated status RDD
|
// caching the index updated status RDD
|
||||||
writeStatusJavaRDD = writeStatusJavaRDD.persist(config.getWriteStatusStorageLevel());
|
writeStatusJavaRDD = writeStatusJavaRDD.persist(config.getWriteStatusStorageLevel());
|
||||||
return writeStatusJavaRDD;
|
return writeStatusJavaRDD;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void setPutBatchSize(JavaRDD<WriteStatus> writeStatusRDD,
|
private void setPutBatchSize(JavaRDD<WriteStatus> writeStatusRDD,
|
||||||
HBaseIndexQPSResourceAllocator hBaseIndexQPSResourceAllocator,
|
HBaseIndexQPSResourceAllocator hBaseIndexQPSResourceAllocator, final JavaSparkContext jsc) {
|
||||||
final JavaSparkContext jsc) {
|
|
||||||
if (config.getHbaseIndexPutBatchSizeAutoCompute()) {
|
if (config.getHbaseIndexPutBatchSizeAutoCompute()) {
|
||||||
SparkConf conf = jsc.getConf();
|
SparkConf conf = jsc.getConf();
|
||||||
int maxExecutors = conf.getInt(DEFAULT_SPARK_EXECUTOR_INSTANCES_CONFIG_NAME, 1);
|
int maxExecutors = conf.getInt(DEFAULT_SPARK_EXECUTOR_INSTANCES_CONFIG_NAME, 1);
|
||||||
if (conf.getBoolean(DEFAULT_SPARK_DYNAMIC_ALLOCATION_ENABLED_CONFIG_NAME, false)) {
|
if (conf.getBoolean(DEFAULT_SPARK_DYNAMIC_ALLOCATION_ENABLED_CONFIG_NAME, false)) {
|
||||||
maxExecutors = Math.max(maxExecutors, conf.getInt(
|
maxExecutors =
|
||||||
DEFAULT_SPARK_DYNAMIC_ALLOCATION_MAX_EXECUTORS_CONFIG_NAME, 1));
|
Math.max(maxExecutors, conf.getInt(DEFAULT_SPARK_DYNAMIC_ALLOCATION_MAX_EXECUTORS_CONFIG_NAME, 1));
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Each writeStatus represents status information from a write done in one of the IOHandles.
|
* Each writeStatus represents status information from a write done in one of the IOHandles. If a writeStatus has
|
||||||
If a writeStatus has any insert, it implies that the corresponding task contacts HBase for
|
* any insert, it implies that the corresponding task contacts HBase for doing puts, since we only do puts for
|
||||||
doing puts, since we only do puts for inserts from HBaseIndex.
|
* inserts from HBaseIndex.
|
||||||
*/
|
*/
|
||||||
final Tuple2<Long, Integer> numPutsParallelismTuple = getHBasePutAccessParallelism(writeStatusRDD);
|
final Tuple2<Long, Integer> numPutsParallelismTuple = getHBasePutAccessParallelism(writeStatusRDD);
|
||||||
final long numPuts = numPutsParallelismTuple._1;
|
final long numPuts = numPutsParallelismTuple._1;
|
||||||
final int hbasePutsParallelism = numPutsParallelismTuple._2;
|
final int hbasePutsParallelism = numPutsParallelismTuple._2;
|
||||||
this.numRegionServersForTable = getNumRegionServersAliveForTable();
|
this.numRegionServersForTable = getNumRegionServersAliveForTable();
|
||||||
final float desiredQPSFraction = hBaseIndexQPSResourceAllocator
|
final float desiredQPSFraction =
|
||||||
.calculateQPSFractionForPutsTime(numPuts, this.numRegionServersForTable);
|
hBaseIndexQPSResourceAllocator.calculateQPSFractionForPutsTime(numPuts, this.numRegionServersForTable);
|
||||||
logger.info("Desired QPSFraction :" + desiredQPSFraction);
|
logger.info("Desired QPSFraction :" + desiredQPSFraction);
|
||||||
logger.info("Number HBase puts :" + numPuts);
|
logger.info("Number HBase puts :" + numPuts);
|
||||||
logger.info("Hbase Puts Parallelism :" + hbasePutsParallelism);
|
logger.info("Hbase Puts Parallelism :" + hbasePutsParallelism);
|
||||||
final float availableQpsFraction = hBaseIndexQPSResourceAllocator
|
final float availableQpsFraction =
|
||||||
.acquireQPSResources(desiredQPSFraction, numPuts);
|
hBaseIndexQPSResourceAllocator.acquireQPSResources(desiredQPSFraction, numPuts);
|
||||||
logger.info("Allocated QPS Fraction :" + availableQpsFraction);
|
logger.info("Allocated QPS Fraction :" + availableQpsFraction);
|
||||||
multiPutBatchSize = putBatchSizeCalculator
|
multiPutBatchSize = putBatchSizeCalculator.getBatchSize(numRegionServersForTable, maxQpsPerRegionServer,
|
||||||
.getBatchSize(
|
hbasePutsParallelism, maxExecutors, SLEEP_TIME_MILLISECONDS, availableQpsFraction);
|
||||||
numRegionServersForTable,
|
|
||||||
maxQpsPerRegionServer,
|
|
||||||
hbasePutsParallelism,
|
|
||||||
maxExecutors,
|
|
||||||
SLEEP_TIME_MILLISECONDS,
|
|
||||||
availableQpsFraction);
|
|
||||||
logger.info("multiPutBatchSize :" + multiPutBatchSize);
|
logger.info("multiPutBatchSize :" + multiPutBatchSize);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
public Tuple2<Long, Integer> getHBasePutAccessParallelism(final JavaRDD<WriteStatus> writeStatusRDD) {
|
public Tuple2<Long, Integer> getHBasePutAccessParallelism(final JavaRDD<WriteStatus> writeStatusRDD) {
|
||||||
final JavaPairRDD<Long, Integer> insertOnlyWriteStatusRDD =
|
final JavaPairRDD<Long, Integer> insertOnlyWriteStatusRDD = writeStatusRDD
|
||||||
writeStatusRDD.filter(w -> w.getStat().getNumInserts() > 0)
|
.filter(w -> w.getStat().getNumInserts() > 0).mapToPair(w -> new Tuple2<>(w.getStat().getNumInserts(), 1));
|
||||||
.mapToPair(w -> new Tuple2<>(w.getStat().getNumInserts(), 1));
|
|
||||||
return insertOnlyWriteStatusRDD.fold(new Tuple2<>(0L, 0), (w, c) -> new Tuple2<>(w._1 + c._1, w._2 + c._2));
|
return insertOnlyWriteStatusRDD.fold(new Tuple2<>(0L, 0), (w, c) -> new Tuple2<>(w._1 + c._1, w._2 + c._2));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -460,21 +435,25 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
|||||||
* 16000. We assume requests get distributed to Region Servers uniformly, so each RS gets 1600 requests which
|
* 16000. We assume requests get distributed to Region Servers uniformly, so each RS gets 1600 requests which
|
||||||
* happens to be 10% of 16667 (maxQPSPerRegionServer), as expected.
|
* happens to be 10% of 16667 (maxQPSPerRegionServer), as expected.
|
||||||
* </p>
|
* </p>
|
||||||
* <p> Assumptions made here <li> In a batch, writes get evenly distributed to each RS for that
|
* <p>
|
||||||
* table. Since we do writes only in the case of inserts and not updates, for this assumption to fail, inserts would
|
* Assumptions made here
|
||||||
* have to be skewed towards few RS, likelihood of which is less if Hbase table is pre-split and rowKeys are UUIDs
|
* <li>In a batch, writes get evenly distributed to each RS for that table. Since we do writes only in the case of
|
||||||
* (random strings). If this assumption fails, then it is possible for some RS to receive more than
|
* inserts and not updates, for this assumption to fail, inserts would have to be skewed towards few RS, likelihood
|
||||||
* maxQpsPerRegionServer QPS, but for simplicity, we are going ahead with this model, since this is meant to be a
|
* of which is less if Hbase table is pre-split and rowKeys are UUIDs (random strings). If this assumption fails,
|
||||||
* lightweight distributed throttling mechanism without maintaining a global context. So if this assumption breaks,
|
* then it is possible for some RS to receive more than maxQpsPerRegionServer QPS, but for simplicity, we are going
|
||||||
* we are hoping the HBase Master relocates hot-spot regions to new Region Servers.
|
* ahead with this model, since this is meant to be a lightweight distributed throttling mechanism without
|
||||||
|
* maintaining a global context. So if this assumption breaks, we are hoping the HBase Master relocates hot-spot
|
||||||
|
* regions to new Region Servers.
|
||||||
*
|
*
|
||||||
* </li> <li> For Region Server stability, throttling at a second level granularity is fine.
|
* </li>
|
||||||
* Although, within a second, the sum of queries might be within maxQpsPerRegionServer, there could be peaks at some
|
* <li>For Region Server stability, throttling at a second level granularity is fine. Although, within a second, the
|
||||||
* sub second intervals. So, the assumption is that these peaks are tolerated by the Region Server (which at max can
|
* sum of queries might be within maxQpsPerRegionServer, there could be peaks at some sub second intervals. So, the
|
||||||
* be maxQpsPerRegionServer). </li> </p>
|
* assumption is that these peaks are tolerated by the Region Server (which at max can be maxQpsPerRegionServer).
|
||||||
|
* </li>
|
||||||
|
* </p>
|
||||||
*/
|
*/
|
||||||
public int getBatchSize(int numRegionServersForTable, int maxQpsPerRegionServer,
|
public int getBatchSize(int numRegionServersForTable, int maxQpsPerRegionServer, int numTasksDuringPut,
|
||||||
int numTasksDuringPut, int maxExecutors, int sleepTimeMs, float qpsFraction) {
|
int maxExecutors, int sleepTimeMs, float qpsFraction) {
|
||||||
int numRSAlive = numRegionServersForTable;
|
int numRSAlive = numRegionServersForTable;
|
||||||
int maxReqPerSec = (int) (qpsFraction * numRSAlive * maxQpsPerRegionServer);
|
int maxReqPerSec = (int) (qpsFraction * numRSAlive * maxQpsPerRegionServer);
|
||||||
int numTasks = numTasksDuringPut;
|
int numTasks = numTasksDuringPut;
|
||||||
@@ -499,11 +478,9 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
|||||||
// from the driver, so ok to use a local connection variable.
|
// from the driver, so ok to use a local connection variable.
|
||||||
if (numRegionServersForTable == null) {
|
if (numRegionServersForTable == null) {
|
||||||
try (Connection conn = getHBaseConnection()) {
|
try (Connection conn = getHBaseConnection()) {
|
||||||
RegionLocator regionLocator = conn
|
RegionLocator regionLocator = conn.getRegionLocator(TableName.valueOf(tableName));
|
||||||
.getRegionLocator(TableName.valueOf(tableName));
|
numRegionServersForTable = Math
|
||||||
numRegionServersForTable = Math.toIntExact(
|
.toIntExact(regionLocator.getAllRegionLocations().stream().map(e -> e.getServerName()).distinct().count());
|
||||||
regionLocator.getAllRegionLocations().stream().map(e -> e.getServerName()).distinct()
|
|
||||||
.count());
|
|
||||||
return numRegionServersForTable;
|
return numRegionServersForTable;
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
logger.error(e);
|
logger.error(e);
|
||||||
|
|||||||
@@ -26,11 +26,11 @@ import java.io.Serializable;
|
|||||||
public interface HBaseIndexQPSResourceAllocator extends Serializable {
|
public interface HBaseIndexQPSResourceAllocator extends Serializable {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This method returns the QPS Fraction value that needs to be acquired such that the respective
|
* This method returns the QPS Fraction value that needs to be acquired such that the respective HBase index operation
|
||||||
* HBase index operation can be completed in desiredPutsTime.
|
* can be completed in desiredPutsTime.
|
||||||
*
|
*
|
||||||
* @param numPuts Number of inserts to be written to HBase index
|
* @param numPuts Number of inserts to be written to HBase index
|
||||||
* @param desiredPutsTimeInSecs Total expected time for the HBase inserts operation
|
* @param desiredPutsTimeInSecs Total expected time for the HBase inserts operation
|
||||||
* @return QPS fraction that needs to be acquired.
|
* @return QPS fraction that needs to be acquired.
|
||||||
*/
|
*/
|
||||||
float calculateQPSFractionForPutsTime(final long numPuts, final int desiredPutsTimeInSecs);
|
float calculateQPSFractionForPutsTime(final long numPuts, final int desiredPutsTimeInSecs);
|
||||||
@@ -38,8 +38,8 @@ public interface HBaseIndexQPSResourceAllocator extends Serializable {
|
|||||||
/**
|
/**
|
||||||
* This method acquires the requested QPS Fraction against HBase cluster for index operation.
|
* This method acquires the requested QPS Fraction against HBase cluster for index operation.
|
||||||
*
|
*
|
||||||
* @param desiredQPSFraction QPS fraction that needs to be requested and acquired
|
* @param desiredQPSFraction QPS fraction that needs to be requested and acquired
|
||||||
* @param numPuts Number of inserts to be written to HBase index
|
* @param numPuts Number of inserts to be written to HBase index
|
||||||
* @return value of the acquired QPS Fraction.
|
* @return value of the acquired QPS Fraction.
|
||||||
*/
|
*/
|
||||||
float acquireQPSResources(final float desiredQPSFraction, final long numPuts);
|
float acquireQPSResources(final float desiredQPSFraction, final long numPuts);
|
||||||
|
|||||||
@@ -96,8 +96,8 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieWri
|
|||||||
// Total number of new records inserted into the delta file
|
// Total number of new records inserted into the delta file
|
||||||
private long insertRecordsWritten = 0;
|
private long insertRecordsWritten = 0;
|
||||||
|
|
||||||
public HoodieAppendHandle(HoodieWriteConfig config, String commitTime, HoodieTable<T> hoodieTable,
|
public HoodieAppendHandle(HoodieWriteConfig config, String commitTime, HoodieTable<T> hoodieTable, String fileId,
|
||||||
String fileId, Iterator<HoodieRecord<T>> recordItr) {
|
Iterator<HoodieRecord<T>> recordItr) {
|
||||||
super(config, commitTime, fileId, hoodieTable);
|
super(config, commitTime, fileId, hoodieTable);
|
||||||
writeStatus.setStat(new HoodieDeltaWriteStat());
|
writeStatus.setStat(new HoodieDeltaWriteStat());
|
||||||
this.fileId = fileId;
|
this.fileId = fileId;
|
||||||
@@ -137,10 +137,8 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieWri
|
|||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
logger.error("Error in update task at commit " + instantTime, e);
|
logger.error("Error in update task at commit " + instantTime, e);
|
||||||
writeStatus.setGlobalError(e);
|
writeStatus.setGlobalError(e);
|
||||||
throw new HoodieUpsertException(
|
throw new HoodieUpsertException("Failed to initialize HoodieAppendHandle for FileId: " + fileId + " on commit "
|
||||||
"Failed to initialize HoodieAppendHandle for FileId: " + fileId + " on commit "
|
+ instantTime + " on HDFS path " + hoodieTable.getMetaClient().getBasePath() + partitionPath, e);
|
||||||
+ instantTime + " on HDFS path " + hoodieTable.getMetaClient().getBasePath()
|
|
||||||
+ partitionPath, e);
|
|
||||||
}
|
}
|
||||||
Path path = new Path(partitionPath, writer.getLogFile().getFileName());
|
Path path = new Path(partitionPath, writer.getLogFile().getFileName());
|
||||||
writeStatus.getStat().setPath(path.toString());
|
writeStatus.getStat().setPath(path.toString());
|
||||||
@@ -155,13 +153,11 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieWri
|
|||||||
if (avroRecord.isPresent()) {
|
if (avroRecord.isPresent()) {
|
||||||
// Convert GenericRecord to GenericRecord with hoodie commit metadata in schema
|
// Convert GenericRecord to GenericRecord with hoodie commit metadata in schema
|
||||||
avroRecord = Option.of(rewriteRecord((GenericRecord) avroRecord.get()));
|
avroRecord = Option.of(rewriteRecord((GenericRecord) avroRecord.get()));
|
||||||
String seqId = HoodieRecord.generateSequenceId(instantTime, TaskContext.getPartitionId(),
|
String seqId =
|
||||||
recordIndex.getAndIncrement());
|
HoodieRecord.generateSequenceId(instantTime, TaskContext.getPartitionId(), recordIndex.getAndIncrement());
|
||||||
HoodieAvroUtils
|
HoodieAvroUtils.addHoodieKeyToRecord((GenericRecord) avroRecord.get(), hoodieRecord.getRecordKey(),
|
||||||
.addHoodieKeyToRecord((GenericRecord) avroRecord.get(), hoodieRecord.getRecordKey(),
|
hoodieRecord.getPartitionPath(), fileId);
|
||||||
hoodieRecord.getPartitionPath(), fileId);
|
HoodieAvroUtils.addCommitMetadataToRecord((GenericRecord) avroRecord.get(), instantTime, seqId);
|
||||||
HoodieAvroUtils
|
|
||||||
.addCommitMetadataToRecord((GenericRecord) avroRecord.get(), instantTime, seqId);
|
|
||||||
// If currentLocation is present, then this is an update
|
// If currentLocation is present, then this is an update
|
||||||
if (hoodieRecord.getCurrentLocation() != null) {
|
if (hoodieRecord.getCurrentLocation() != null) {
|
||||||
updatedRecordsWritten++;
|
updatedRecordsWritten++;
|
||||||
@@ -208,20 +204,18 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieWri
|
|||||||
recordList.clear();
|
recordList.clear();
|
||||||
}
|
}
|
||||||
if (keysToDelete.size() > 0) {
|
if (keysToDelete.size() > 0) {
|
||||||
writer = writer.appendBlock(
|
writer = writer.appendBlock(new HoodieDeleteBlock(keysToDelete.stream().toArray(HoodieKey[]::new), header));
|
||||||
new HoodieDeleteBlock(keysToDelete.stream().toArray(HoodieKey[]::new), header));
|
|
||||||
keysToDelete.clear();
|
keysToDelete.clear();
|
||||||
}
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
throw new HoodieAppendException(
|
throw new HoodieAppendException("Failed while appending records to " + currentLogFile.getPath(), e);
|
||||||
"Failed while appending records to " + currentLogFile.getPath(), e);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean canWrite(HoodieRecord record) {
|
public boolean canWrite(HoodieRecord record) {
|
||||||
return config.getParquetMaxFileSize() >= estimatedNumberOfBytesWritten * config
|
return config.getParquetMaxFileSize() >= estimatedNumberOfBytesWritten
|
||||||
.getLogFileToParquetCompressionRatio();
|
* config.getLogFileToParquetCompressionRatio();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@@ -262,8 +256,8 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieWri
|
|||||||
runtimeStats.setTotalUpsertTime(timer.endTimer());
|
runtimeStats.setTotalUpsertTime(timer.endTimer());
|
||||||
stat.setRuntimeStats(runtimeStats);
|
stat.setRuntimeStats(runtimeStats);
|
||||||
|
|
||||||
logger.info(String.format("AppendHandle for partitionPath %s fileID %s, took %d ms.",
|
logger.info(String.format("AppendHandle for partitionPath %s fileID %s, took %d ms.", stat.getPartitionPath(),
|
||||||
stat.getPartitionPath(), stat.getFileId(), runtimeStats.getTotalUpsertTime()));
|
stat.getFileId(), runtimeStats.getTotalUpsertTime()));
|
||||||
|
|
||||||
return writeStatus;
|
return writeStatus;
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
@@ -282,13 +276,11 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieWri
|
|||||||
|
|
||||||
return HoodieLogFormat.newWriterBuilder()
|
return HoodieLogFormat.newWriterBuilder()
|
||||||
.onParentPath(FSUtils.getPartitionPath(hoodieTable.getMetaClient().getBasePath(), partitionPath))
|
.onParentPath(FSUtils.getPartitionPath(hoodieTable.getMetaClient().getBasePath(), partitionPath))
|
||||||
.withFileId(fileId).overBaseCommit(baseCommitTime).withLogVersion(
|
.withFileId(fileId).overBaseCommit(baseCommitTime)
|
||||||
latestLogFile.map(HoodieLogFile::getLogVersion).orElse(HoodieLogFile.LOGFILE_BASE_VERSION))
|
.withLogVersion(latestLogFile.map(HoodieLogFile::getLogVersion).orElse(HoodieLogFile.LOGFILE_BASE_VERSION))
|
||||||
.withSizeThreshold(config.getLogFileMaxSize()).withFs(fs)
|
.withSizeThreshold(config.getLogFileMaxSize()).withFs(fs)
|
||||||
.withLogWriteToken(
|
.withLogWriteToken(latestLogFile.map(x -> FSUtils.getWriteTokenFromLogPath(x.getPath())).orElse(writeToken))
|
||||||
latestLogFile.map(x -> FSUtils.getWriteTokenFromLogPath(x.getPath())).orElse(writeToken))
|
.withRolloverLogWriteToken(writeToken).withFileExtension(HoodieLogFile.DELTA_EXTENSION).build();
|
||||||
.withRolloverLogWriteToken(writeToken)
|
|
||||||
.withFileExtension(HoodieLogFile.DELTA_EXTENSION).build();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void writeToBuffer(HoodieRecord<T> record) {
|
private void writeToBuffer(HoodieRecord<T> record) {
|
||||||
|
|||||||
@@ -45,9 +45,12 @@ import org.apache.log4j.Logger;
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Cleaner is responsible for garbage collecting older files in a given partition path, such that
|
* Cleaner is responsible for garbage collecting older files in a given partition path, such that
|
||||||
* <p> 1) It provides sufficient time for existing queries running on older versions, to close <p>
|
* <p>
|
||||||
* 2) It bounds the growth of the files in the file system <p> TODO: Should all cleaning be done
|
* 1) It provides sufficient time for existing queries running on older versions, to close
|
||||||
* based on {@link HoodieCommitMetadata}
|
* <p>
|
||||||
|
* 2) It bounds the growth of the files in the file system
|
||||||
|
* <p>
|
||||||
|
* TODO: Should all cleaning be done based on {@link HoodieCommitMetadata}
|
||||||
*/
|
*/
|
||||||
public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
|
public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
|
||||||
|
|
||||||
@@ -65,23 +68,22 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
|
|||||||
this.commitTimeline = hoodieTable.getCompletedCommitTimeline();
|
this.commitTimeline = hoodieTable.getCompletedCommitTimeline();
|
||||||
this.config = config;
|
this.config = config;
|
||||||
this.fgIdToPendingCompactionOperations =
|
this.fgIdToPendingCompactionOperations =
|
||||||
((SyncableFileSystemView)hoodieTable.getRTFileSystemView()).getPendingCompactionOperations()
|
((SyncableFileSystemView) hoodieTable.getRTFileSystemView()).getPendingCompactionOperations()
|
||||||
.map(entry -> Pair.of(new HoodieFileGroupId(entry.getValue().getPartitionPath(),
|
.map(entry -> Pair.of(
|
||||||
entry.getValue().getFileId()), entry.getValue()))
|
new HoodieFileGroupId(entry.getValue().getPartitionPath(), entry.getValue().getFileId()),
|
||||||
|
entry.getValue()))
|
||||||
.collect(Collectors.toMap(Pair::getKey, Pair::getValue));
|
.collect(Collectors.toMap(Pair::getKey, Pair::getValue));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Selects the older versions of files for cleaning, such that it bounds the number of versions of
|
* Selects the older versions of files for cleaning, such that it bounds the number of versions of each file. This
|
||||||
* each file. This policy is useful, if you are simply interested in querying the table, and you
|
* policy is useful, if you are simply interested in querying the table, and you don't want too many versions for a
|
||||||
* don't want too many versions for a single file (i.e run it with versionsRetained = 1)
|
* single file (i.e run it with versionsRetained = 1)
|
||||||
*/
|
*/
|
||||||
private List<String> getFilesToCleanKeepingLatestVersions(String partitionPath)
|
private List<String> getFilesToCleanKeepingLatestVersions(String partitionPath) throws IOException {
|
||||||
throws IOException {
|
logger.info("Cleaning " + partitionPath + ", retaining latest " + config.getCleanerFileVersionsRetained()
|
||||||
logger.info("Cleaning " + partitionPath + ", retaining latest " + config
|
+ " file versions. ");
|
||||||
.getCleanerFileVersionsRetained() + " file versions. ");
|
List<HoodieFileGroup> fileGroups = fileSystemView.getAllFileGroups(partitionPath).collect(Collectors.toList());
|
||||||
List<HoodieFileGroup> fileGroups = fileSystemView.getAllFileGroups(partitionPath)
|
|
||||||
.collect(Collectors.toList());
|
|
||||||
List<String> deletePaths = new ArrayList<>();
|
List<String> deletePaths = new ArrayList<>();
|
||||||
// Collect all the datafiles savepointed by all the savepoints
|
// Collect all the datafiles savepointed by all the savepoints
|
||||||
List<String> savepointedFiles = hoodieTable.getSavepoints().stream()
|
List<String> savepointedFiles = hoodieTable.getSavepoints().stream()
|
||||||
@@ -90,8 +92,8 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
|
|||||||
for (HoodieFileGroup fileGroup : fileGroups) {
|
for (HoodieFileGroup fileGroup : fileGroups) {
|
||||||
int keepVersions = config.getCleanerFileVersionsRetained();
|
int keepVersions = config.getCleanerFileVersionsRetained();
|
||||||
// do not cleanup slice required for pending compaction
|
// do not cleanup slice required for pending compaction
|
||||||
Iterator<FileSlice> fileSliceIterator = fileGroup.getAllFileSlices()
|
Iterator<FileSlice> fileSliceIterator =
|
||||||
.filter(fs -> !isFileSliceNeededForPendingCompaction(fs)).iterator();
|
fileGroup.getAllFileSlices().filter(fs -> !isFileSliceNeededForPendingCompaction(fs)).iterator();
|
||||||
if (isFileGroupInPendingCompaction(fileGroup)) {
|
if (isFileGroupInPendingCompaction(fileGroup)) {
|
||||||
// We have already saved the last version of file-groups for pending compaction Id
|
// We have already saved the last version of file-groups for pending compaction Id
|
||||||
keepVersions--;
|
keepVersions--;
|
||||||
@@ -116,8 +118,8 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
|
|||||||
}
|
}
|
||||||
if (hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) {
|
if (hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) {
|
||||||
// If merge on read, then clean the log files for the commits as well
|
// If merge on read, then clean the log files for the commits as well
|
||||||
deletePaths.addAll(nextSlice.getLogFiles().map(file -> file.getPath().toString())
|
deletePaths
|
||||||
.collect(Collectors.toList()));
|
.addAll(nextSlice.getLogFiles().map(file -> file.getPath().toString()).collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -126,21 +128,21 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
|
|||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Selects the versions for file for cleaning, such that it <p> - Leaves the latest version of the
|
* Selects the versions for file for cleaning, such that it
|
||||||
* file untouched - For older versions, - It leaves all the commits untouched which has occured in
|
* <p>
|
||||||
* last <code>config.getCleanerCommitsRetained()</code> commits - It leaves ONE commit before this
|
* - Leaves the latest version of the file untouched - For older versions, - It leaves all the commits untouched which
|
||||||
* window. We assume that the max(query execution time) == commit_batch_time *
|
* has occured in last <code>config.getCleanerCommitsRetained()</code> commits - It leaves ONE commit before this
|
||||||
* config.getCleanerCommitsRetained(). This is 12 hours by default. This is essential to leave the
|
* window. We assume that the max(query execution time) == commit_batch_time * config.getCleanerCommitsRetained().
|
||||||
* file used by the query thats running for the max time. <p> This provides the effect of having
|
* This is 12 hours by default. This is essential to leave the file used by the query thats running for the max time.
|
||||||
* lookback into all changes that happened in the last X commits. (eg: if you retain 24 commits,
|
* <p>
|
||||||
* and commit batch time is 30 mins, then you have 12 hrs of lookback) <p> This policy is the
|
* This provides the effect of having lookback into all changes that happened in the last X commits. (eg: if you
|
||||||
* default.
|
* retain 24 commits, and commit batch time is 30 mins, then you have 12 hrs of lookback)
|
||||||
|
* <p>
|
||||||
|
* This policy is the default.
|
||||||
*/
|
*/
|
||||||
private List<String> getFilesToCleanKeepingLatestCommits(String partitionPath)
|
private List<String> getFilesToCleanKeepingLatestCommits(String partitionPath) throws IOException {
|
||||||
throws IOException {
|
|
||||||
int commitsRetained = config.getCleanerCommitsRetained();
|
int commitsRetained = config.getCleanerCommitsRetained();
|
||||||
logger
|
logger.info("Cleaning " + partitionPath + ", retaining latest " + commitsRetained + " commits. ");
|
||||||
.info("Cleaning " + partitionPath + ", retaining latest " + commitsRetained + " commits. ");
|
|
||||||
List<String> deletePaths = new ArrayList<>();
|
List<String> deletePaths = new ArrayList<>();
|
||||||
|
|
||||||
// Collect all the datafiles savepointed by all the savepoints
|
// Collect all the datafiles savepointed by all the savepoints
|
||||||
@@ -150,8 +152,7 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
|
|||||||
// determine if we have enough commits, to start cleaning.
|
// determine if we have enough commits, to start cleaning.
|
||||||
if (commitTimeline.countInstants() > commitsRetained) {
|
if (commitTimeline.countInstants() > commitsRetained) {
|
||||||
HoodieInstant earliestCommitToRetain = getEarliestCommitToRetain().get();
|
HoodieInstant earliestCommitToRetain = getEarliestCommitToRetain().get();
|
||||||
List<HoodieFileGroup> fileGroups = fileSystemView.getAllFileGroups(partitionPath)
|
List<HoodieFileGroup> fileGroups = fileSystemView.getAllFileGroups(partitionPath).collect(Collectors.toList());
|
||||||
.collect(Collectors.toList());
|
|
||||||
for (HoodieFileGroup fileGroup : fileGroups) {
|
for (HoodieFileGroup fileGroup : fileGroups) {
|
||||||
List<FileSlice> fileSliceList = fileGroup.getAllFileSlices().collect(Collectors.toList());
|
List<FileSlice> fileSliceList = fileGroup.getAllFileSlices().collect(Collectors.toList());
|
||||||
|
|
||||||
@@ -160,8 +161,8 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
String lastVersion = fileSliceList.get(0).getBaseInstantTime();
|
String lastVersion = fileSliceList.get(0).getBaseInstantTime();
|
||||||
String lastVersionBeforeEarliestCommitToRetain = getLatestVersionBeforeCommit(fileSliceList,
|
String lastVersionBeforeEarliestCommitToRetain =
|
||||||
earliestCommitToRetain);
|
getLatestVersionBeforeCommit(fileSliceList, earliestCommitToRetain);
|
||||||
|
|
||||||
// Ensure there are more than 1 version of the file (we only clean old files from updates)
|
// Ensure there are more than 1 version of the file (we only clean old files from updates)
|
||||||
// i.e always spare the last commit.
|
// i.e always spare the last commit.
|
||||||
@@ -183,16 +184,14 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Always keep the last commit
|
// Always keep the last commit
|
||||||
if (!isFileSliceNeededForPendingCompaction(aSlice)
|
if (!isFileSliceNeededForPendingCompaction(aSlice) && HoodieTimeline
|
||||||
&& HoodieTimeline
|
.compareTimestamps(earliestCommitToRetain.getTimestamp(), fileCommitTime, HoodieTimeline.GREATER)) {
|
||||||
.compareTimestamps(earliestCommitToRetain.getTimestamp(), fileCommitTime,
|
|
||||||
HoodieTimeline.GREATER)) {
|
|
||||||
// this is a commit, that should be cleaned.
|
// this is a commit, that should be cleaned.
|
||||||
aFile.ifPresent(hoodieDataFile -> deletePaths.add(hoodieDataFile.getPath()));
|
aFile.ifPresent(hoodieDataFile -> deletePaths.add(hoodieDataFile.getPath()));
|
||||||
if (hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) {
|
if (hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) {
|
||||||
// If merge on read, then clean the log files for the commits as well
|
// If merge on read, then clean the log files for the commits as well
|
||||||
deletePaths.addAll(aSlice.getLogFiles().map(file -> file.getPath().toString())
|
deletePaths
|
||||||
.collect(Collectors.toList()));
|
.addAll(aSlice.getLogFiles().map(file -> file.getPath().toString()).collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -205,12 +204,10 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
|
|||||||
/**
|
/**
|
||||||
* Gets the latest version < commitTime. This version file could still be used by queries.
|
* Gets the latest version < commitTime. This version file could still be used by queries.
|
||||||
*/
|
*/
|
||||||
private String getLatestVersionBeforeCommit(List<FileSlice> fileSliceList,
|
private String getLatestVersionBeforeCommit(List<FileSlice> fileSliceList, HoodieInstant commitTime) {
|
||||||
HoodieInstant commitTime) {
|
|
||||||
for (FileSlice file : fileSliceList) {
|
for (FileSlice file : fileSliceList) {
|
||||||
String fileCommitTime = file.getBaseInstantTime();
|
String fileCommitTime = file.getBaseInstantTime();
|
||||||
if (HoodieTimeline
|
if (HoodieTimeline.compareTimestamps(commitTime.getTimestamp(), fileCommitTime, HoodieTimeline.GREATER)) {
|
||||||
.compareTimestamps(commitTime.getTimestamp(), fileCommitTime, HoodieTimeline.GREATER)) {
|
|
||||||
// fileList is sorted on the reverse, so the first commit we find <= commitTime is the
|
// fileList is sorted on the reverse, so the first commit we find <= commitTime is the
|
||||||
// one we want
|
// one we want
|
||||||
return fileCommitTime;
|
return fileCommitTime;
|
||||||
@@ -246,14 +243,14 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
|
|||||||
int commitsRetained = config.getCleanerCommitsRetained();
|
int commitsRetained = config.getCleanerCommitsRetained();
|
||||||
if (config.getCleanerPolicy() == HoodieCleaningPolicy.KEEP_LATEST_COMMITS
|
if (config.getCleanerPolicy() == HoodieCleaningPolicy.KEEP_LATEST_COMMITS
|
||||||
&& commitTimeline.countInstants() > commitsRetained) {
|
&& commitTimeline.countInstants() > commitsRetained) {
|
||||||
earliestCommitToRetain = commitTimeline
|
earliestCommitToRetain = commitTimeline.nthInstant(commitTimeline.countInstants() - commitsRetained);
|
||||||
.nthInstant(commitTimeline.countInstants() - commitsRetained);
|
|
||||||
}
|
}
|
||||||
return earliestCommitToRetain;
|
return earliestCommitToRetain;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Determine if file slice needed to be preserved for pending compaction
|
* Determine if file slice needed to be preserved for pending compaction
|
||||||
|
*
|
||||||
* @param fileSlice File Slice
|
* @param fileSlice File Slice
|
||||||
* @return true if file slice needs to be preserved, false otherwise.
|
* @return true if file slice needs to be preserved, false otherwise.
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -83,9 +83,8 @@ public class HoodieCommitArchiveLog {
|
|||||||
try {
|
try {
|
||||||
if (this.writer == null) {
|
if (this.writer == null) {
|
||||||
return HoodieLogFormat.newWriterBuilder().onParentPath(archiveFilePath.getParent())
|
return HoodieLogFormat.newWriterBuilder().onParentPath(archiveFilePath.getParent())
|
||||||
.withFileId(archiveFilePath.getName())
|
.withFileId(archiveFilePath.getName()).withFileExtension(HoodieArchivedLogFile.ARCHIVE_EXTENSION)
|
||||||
.withFileExtension(HoodieArchivedLogFile.ARCHIVE_EXTENSION).withFs(metaClient.getFs())
|
.withFs(metaClient.getFs()).overBaseCommit("").build();
|
||||||
.overBaseCommit("").build();
|
|
||||||
} else {
|
} else {
|
||||||
return this.writer;
|
return this.writer;
|
||||||
}
|
}
|
||||||
@@ -137,8 +136,7 @@ public class HoodieCommitArchiveLog {
|
|||||||
// TODO: Handle ROLLBACK_ACTION in future
|
// TODO: Handle ROLLBACK_ACTION in future
|
||||||
// ROLLBACK_ACTION is currently not defined in HoodieActiveTimeline
|
// ROLLBACK_ACTION is currently not defined in HoodieActiveTimeline
|
||||||
HoodieTimeline cleanAndRollbackTimeline = table.getActiveTimeline()
|
HoodieTimeline cleanAndRollbackTimeline = table.getActiveTimeline()
|
||||||
.getTimelineOfActions(Sets.newHashSet(HoodieTimeline.CLEAN_ACTION))
|
.getTimelineOfActions(Sets.newHashSet(HoodieTimeline.CLEAN_ACTION)).filterCompletedInstants();
|
||||||
.filterCompletedInstants();
|
|
||||||
Stream<HoodieInstant> instants = cleanAndRollbackTimeline.getInstants()
|
Stream<HoodieInstant> instants = cleanAndRollbackTimeline.getInstants()
|
||||||
.collect(Collectors.groupingBy(s -> s.getAction())).entrySet().stream().map(i -> {
|
.collect(Collectors.groupingBy(s -> s.getAction())).entrySet().stream().map(i -> {
|
||||||
if (i.getValue().size() > maxCommitsToKeep) {
|
if (i.getValue().size() > maxCommitsToKeep) {
|
||||||
@@ -148,7 +146,7 @@ public class HoodieCommitArchiveLog {
|
|||||||
}
|
}
|
||||||
}).flatMap(i -> i.stream());
|
}).flatMap(i -> i.stream());
|
||||||
|
|
||||||
//TODO (na) : Add a way to return actions associated with a timeline and then merge/unify
|
// TODO (na) : Add a way to return actions associated with a timeline and then merge/unify
|
||||||
// with logic above to avoid Stream.concats
|
// with logic above to avoid Stream.concats
|
||||||
HoodieTimeline commitTimeline = table.getCompletedCommitsTimeline();
|
HoodieTimeline commitTimeline = table.getCompletedCommitsTimeline();
|
||||||
Option<HoodieInstant> oldestPendingCompactionInstant =
|
Option<HoodieInstant> oldestPendingCompactionInstant =
|
||||||
@@ -159,20 +157,16 @@ public class HoodieCommitArchiveLog {
|
|||||||
Option<HoodieInstant> firstSavepoint = table.getCompletedSavepointTimeline().firstInstant();
|
Option<HoodieInstant> firstSavepoint = table.getCompletedSavepointTimeline().firstInstant();
|
||||||
if (!commitTimeline.empty() && commitTimeline.countInstants() > maxCommitsToKeep) {
|
if (!commitTimeline.empty() && commitTimeline.countInstants() > maxCommitsToKeep) {
|
||||||
// Actually do the commits
|
// Actually do the commits
|
||||||
instants = Stream.concat(instants, commitTimeline.getInstants()
|
instants = Stream.concat(instants, commitTimeline.getInstants().filter(s -> {
|
||||||
.filter(s -> {
|
// if no savepoint present, then dont filter
|
||||||
// if no savepoint present, then dont filter
|
return !(firstSavepoint.isPresent() && HoodieTimeline.compareTimestamps(firstSavepoint.get().getTimestamp(),
|
||||||
return !(firstSavepoint.isPresent() && HoodieTimeline
|
s.getTimestamp(), HoodieTimeline.LESSER_OR_EQUAL));
|
||||||
.compareTimestamps(firstSavepoint.get().getTimestamp(), s.getTimestamp(),
|
}).filter(s -> {
|
||||||
HoodieTimeline.LESSER_OR_EQUAL));
|
// Ensure commits >= oldest pending compaction commit is retained
|
||||||
})
|
return oldestPendingCompactionInstant.map(instant -> {
|
||||||
.filter(s -> {
|
return HoodieTimeline.compareTimestamps(instant.getTimestamp(), s.getTimestamp(), HoodieTimeline.GREATER);
|
||||||
// Ensure commits >= oldest pending compaction commit is retained
|
}).orElse(true);
|
||||||
return oldestPendingCompactionInstant.map(instant -> {
|
}).limit(commitTimeline.countInstants() - minCommitsToKeep));
|
||||||
return HoodieTimeline.compareTimestamps(instant.getTimestamp(), s.getTimestamp(), HoodieTimeline.GREATER);
|
|
||||||
}).orElse(true);
|
|
||||||
})
|
|
||||||
.limit(commitTimeline.countInstants() - minCommitsToKeep));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return instants;
|
return instants;
|
||||||
@@ -194,13 +188,10 @@ public class HoodieCommitArchiveLog {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Remove older meta-data from auxiliary path too
|
// Remove older meta-data from auxiliary path too
|
||||||
Option<HoodieInstant> latestCommitted =
|
Option<HoodieInstant> latestCommitted = Option.fromJavaOptional(archivedInstants.stream().filter(i -> {
|
||||||
Option.fromJavaOptional(archivedInstants.stream()
|
return i.isCompleted() && (i.getAction().equals(HoodieTimeline.COMMIT_ACTION)
|
||||||
.filter(i -> {
|
|| (i.getAction().equals(HoodieTimeline.DELTA_COMMIT_ACTION)));
|
||||||
return i.isCompleted()
|
}).max(Comparator.comparing(HoodieInstant::getTimestamp)));
|
||||||
&& (i.getAction().equals(HoodieTimeline.COMMIT_ACTION) || (i.getAction().equals(
|
|
||||||
HoodieTimeline.DELTA_COMMIT_ACTION)));
|
|
||||||
}).max(Comparator.comparing(HoodieInstant::getTimestamp)));
|
|
||||||
if (latestCommitted.isPresent()) {
|
if (latestCommitted.isPresent()) {
|
||||||
success &= deleteAllInstantsOlderorEqualsInAuxMetaFolder(latestCommitted.get());
|
success &= deleteAllInstantsOlderorEqualsInAuxMetaFolder(latestCommitted.get());
|
||||||
}
|
}
|
||||||
@@ -214,12 +205,9 @@ public class HoodieCommitArchiveLog {
|
|||||||
* @return success if all eligible file deleted successfully
|
* @return success if all eligible file deleted successfully
|
||||||
* @throws IOException in case of error
|
* @throws IOException in case of error
|
||||||
*/
|
*/
|
||||||
private boolean deleteAllInstantsOlderorEqualsInAuxMetaFolder(HoodieInstant thresholdInstant)
|
private boolean deleteAllInstantsOlderorEqualsInAuxMetaFolder(HoodieInstant thresholdInstant) throws IOException {
|
||||||
throws IOException {
|
List<HoodieInstant> instants = HoodieTableMetaClient.scanHoodieInstantsFromFileSystem(metaClient.getFs(),
|
||||||
List<HoodieInstant> instants =
|
new Path(metaClient.getMetaAuxiliaryPath()), HoodieActiveTimeline.VALID_EXTENSIONS_IN_ACTIVE_TIMELINE);
|
||||||
HoodieTableMetaClient.scanHoodieInstantsFromFileSystem(metaClient.getFs(),
|
|
||||||
new Path(metaClient.getMetaAuxiliaryPath()),
|
|
||||||
HoodieActiveTimeline.VALID_EXTENSIONS_IN_ACTIVE_TIMELINE);
|
|
||||||
|
|
||||||
List<HoodieInstant> instantsToBeDeleted =
|
List<HoodieInstant> instantsToBeDeleted =
|
||||||
instants.stream().filter(instant1 -> HoodieTimeline.compareTimestamps(instant1.getTimestamp(),
|
instants.stream().filter(instant1 -> HoodieTimeline.compareTimestamps(instant1.getTimestamp(),
|
||||||
@@ -239,8 +227,7 @@ public class HoodieCommitArchiveLog {
|
|||||||
|
|
||||||
public void archive(List<HoodieInstant> instants) throws HoodieCommitException {
|
public void archive(List<HoodieInstant> instants) throws HoodieCommitException {
|
||||||
try {
|
try {
|
||||||
HoodieTimeline commitTimeline = metaClient.getActiveTimeline().getAllCommitsTimeline()
|
HoodieTimeline commitTimeline = metaClient.getActiveTimeline().getAllCommitsTimeline().filterCompletedInstants();
|
||||||
.filterCompletedInstants();
|
|
||||||
Schema wrapperSchema = HoodieArchivedMetaEntry.getClassSchema();
|
Schema wrapperSchema = HoodieArchivedMetaEntry.getClassSchema();
|
||||||
log.info("Wrapper schema " + wrapperSchema.toString());
|
log.info("Wrapper schema " + wrapperSchema.toString());
|
||||||
List<IndexedRecord> records = new ArrayList<>();
|
List<IndexedRecord> records = new ArrayList<>();
|
||||||
@@ -277,15 +264,14 @@ public class HoodieCommitArchiveLog {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private IndexedRecord convertToAvroRecord(HoodieTimeline commitTimeline,
|
private IndexedRecord convertToAvroRecord(HoodieTimeline commitTimeline, HoodieInstant hoodieInstant)
|
||||||
HoodieInstant hoodieInstant) throws IOException {
|
throws IOException {
|
||||||
HoodieArchivedMetaEntry archivedMetaWrapper = new HoodieArchivedMetaEntry();
|
HoodieArchivedMetaEntry archivedMetaWrapper = new HoodieArchivedMetaEntry();
|
||||||
archivedMetaWrapper.setCommitTime(hoodieInstant.getTimestamp());
|
archivedMetaWrapper.setCommitTime(hoodieInstant.getTimestamp());
|
||||||
switch (hoodieInstant.getAction()) {
|
switch (hoodieInstant.getAction()) {
|
||||||
case HoodieTimeline.CLEAN_ACTION: {
|
case HoodieTimeline.CLEAN_ACTION: {
|
||||||
archivedMetaWrapper.setHoodieCleanMetadata(AvroUtils
|
archivedMetaWrapper.setHoodieCleanMetadata(AvroUtils
|
||||||
.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(),
|
.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(), HoodieCleanMetadata.class));
|
||||||
HoodieCleanMetadata.class));
|
|
||||||
archivedMetaWrapper.setActionType(ActionType.clean.name());
|
archivedMetaWrapper.setActionType(ActionType.clean.name());
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -297,16 +283,14 @@ public class HoodieCommitArchiveLog {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case HoodieTimeline.ROLLBACK_ACTION: {
|
case HoodieTimeline.ROLLBACK_ACTION: {
|
||||||
archivedMetaWrapper.setHoodieRollbackMetadata(AvroUtils
|
archivedMetaWrapper.setHoodieRollbackMetadata(AvroUtils.deserializeAvroMetadata(
|
||||||
.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(),
|
commitTimeline.getInstantDetails(hoodieInstant).get(), HoodieRollbackMetadata.class));
|
||||||
HoodieRollbackMetadata.class));
|
|
||||||
archivedMetaWrapper.setActionType(ActionType.rollback.name());
|
archivedMetaWrapper.setActionType(ActionType.rollback.name());
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case HoodieTimeline.SAVEPOINT_ACTION: {
|
case HoodieTimeline.SAVEPOINT_ACTION: {
|
||||||
archivedMetaWrapper.setHoodieSavePointMetadata(AvroUtils
|
archivedMetaWrapper.setHoodieSavePointMetadata(AvroUtils.deserializeAvroMetadata(
|
||||||
.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(),
|
commitTimeline.getInstantDetails(hoodieInstant).get(), HoodieSavepointMetadata.class));
|
||||||
HoodieSavepointMetadata.class));
|
|
||||||
archivedMetaWrapper.setActionType(ActionType.savepoint.name());
|
archivedMetaWrapper.setActionType(ActionType.savepoint.name());
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -326,10 +310,10 @@ public class HoodieCommitArchiveLog {
|
|||||||
private org.apache.hudi.avro.model.HoodieCommitMetadata commitMetadataConverter(
|
private org.apache.hudi.avro.model.HoodieCommitMetadata commitMetadataConverter(
|
||||||
HoodieCommitMetadata hoodieCommitMetadata) {
|
HoodieCommitMetadata hoodieCommitMetadata) {
|
||||||
ObjectMapper mapper = new ObjectMapper();
|
ObjectMapper mapper = new ObjectMapper();
|
||||||
//Need this to ignore other public get() methods
|
// Need this to ignore other public get() methods
|
||||||
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||||
org.apache.hudi.avro.model.HoodieCommitMetadata avroMetaData = mapper
|
org.apache.hudi.avro.model.HoodieCommitMetadata avroMetaData =
|
||||||
.convertValue(hoodieCommitMetadata, org.apache.hudi.avro.model.HoodieCommitMetadata.class);
|
mapper.convertValue(hoodieCommitMetadata, org.apache.hudi.avro.model.HoodieCommitMetadata.class);
|
||||||
// Do not archive Rolling Stats, cannot set to null since AVRO will throw null pointer
|
// Do not archive Rolling Stats, cannot set to null since AVRO will throw null pointer
|
||||||
avroMetaData.getExtraMetadata().put(HoodieRollingStatMetadata.ROLLING_STAT_METADATA_KEY, "");
|
avroMetaData.getExtraMetadata().put(HoodieRollingStatMetadata.ROLLING_STAT_METADATA_KEY, "");
|
||||||
return avroMetaData;
|
return avroMetaData;
|
||||||
|
|||||||
@@ -66,11 +66,10 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieWri
|
|||||||
new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath));
|
new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath));
|
||||||
partitionMetadata.trySave(TaskContext.getPartitionId());
|
partitionMetadata.trySave(TaskContext.getPartitionId());
|
||||||
createMarkerFile(partitionPath);
|
createMarkerFile(partitionPath);
|
||||||
this.storageWriter = HoodieStorageWriterFactory
|
this.storageWriter =
|
||||||
.getStorageWriter(commitTime, path, hoodieTable, config, writerSchema);
|
HoodieStorageWriterFactory.getStorageWriter(commitTime, path, hoodieTable, config, writerSchema);
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
throw new HoodieInsertException(
|
throw new HoodieInsertException("Failed to initialize HoodieStorageWriter for path " + path, e);
|
||||||
"Failed to initialize HoodieStorageWriter for path " + path, e);
|
|
||||||
}
|
}
|
||||||
logger.info("New CreateHandle for partition :" + partitionPath + " with fileId " + fileId);
|
logger.info("New CreateHandle for partition :" + partitionPath + " with fileId " + fileId);
|
||||||
}
|
}
|
||||||
@@ -136,8 +135,7 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieWri
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (IOException io) {
|
} catch (IOException io) {
|
||||||
throw new HoodieInsertException(
|
throw new HoodieInsertException("Failed to insert records for path " + path, io);
|
||||||
"Failed to insert records for path " + path, io);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -151,8 +149,8 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieWri
|
|||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public WriteStatus close() {
|
public WriteStatus close() {
|
||||||
logger.info("Closing the file " + writeStatus.getFileId() + " as we are done with all the records "
|
logger
|
||||||
+ recordsWritten);
|
.info("Closing the file " + writeStatus.getFileId() + " as we are done with all the records " + recordsWritten);
|
||||||
try {
|
try {
|
||||||
|
|
||||||
storageWriter.close();
|
storageWriter.close();
|
||||||
@@ -174,8 +172,8 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieWri
|
|||||||
stat.setRuntimeStats(runtimeStats);
|
stat.setRuntimeStats(runtimeStats);
|
||||||
writeStatus.setStat(stat);
|
writeStatus.setStat(stat);
|
||||||
|
|
||||||
logger.info(String.format("CreateHandle for partitionPath %s fileID %s, took %d ms.",
|
logger.info(String.format("CreateHandle for partitionPath %s fileID %s, took %d ms.", stat.getPartitionPath(),
|
||||||
stat.getPartitionPath(), stat.getFileId(), runtimeStats.getTotalCreateTime()));
|
stat.getFileId(), runtimeStats.getTotalCreateTime()));
|
||||||
|
|
||||||
return writeStatus;
|
return writeStatus;
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
|
|||||||
@@ -67,15 +67,15 @@ public class HoodieKeyLookupHandle<T extends HoodieRecordPayload> extends Hoodie
|
|||||||
/**
|
/**
|
||||||
* Given a list of row keys and one file, return only row keys existing in that file.
|
* Given a list of row keys and one file, return only row keys existing in that file.
|
||||||
*/
|
*/
|
||||||
public static List<String> checkCandidatesAgainstFile(Configuration configuration,
|
public static List<String> checkCandidatesAgainstFile(Configuration configuration, List<String> candidateRecordKeys,
|
||||||
List<String> candidateRecordKeys, Path filePath) throws HoodieIndexException {
|
Path filePath) throws HoodieIndexException {
|
||||||
List<String> foundRecordKeys = new ArrayList<>();
|
List<String> foundRecordKeys = new ArrayList<>();
|
||||||
try {
|
try {
|
||||||
// Load all rowKeys from the file, to double-confirm
|
// Load all rowKeys from the file, to double-confirm
|
||||||
if (!candidateRecordKeys.isEmpty()) {
|
if (!candidateRecordKeys.isEmpty()) {
|
||||||
HoodieTimer timer = new HoodieTimer().startTimer();
|
HoodieTimer timer = new HoodieTimer().startTimer();
|
||||||
Set<String> fileRowKeys = ParquetUtils.filterParquetRowKeys(configuration, filePath,
|
Set<String> fileRowKeys =
|
||||||
new HashSet<>(candidateRecordKeys));
|
ParquetUtils.filterParquetRowKeys(configuration, filePath, new HashSet<>(candidateRecordKeys));
|
||||||
foundRecordKeys.addAll(fileRowKeys);
|
foundRecordKeys.addAll(fileRowKeys);
|
||||||
logger.info(String.format("Checked keys against file %s, in %d ms. #candidates (%d) #found (%d)", filePath,
|
logger.info(String.format("Checked keys against file %s, in %d ms. #candidates (%d) #found (%d)", filePath,
|
||||||
timer.endTimer(), candidateRecordKeys.size(), foundRecordKeys.size()));
|
timer.endTimer(), candidateRecordKeys.size(), foundRecordKeys.size()));
|
||||||
@@ -112,11 +112,11 @@ public class HoodieKeyLookupHandle<T extends HoodieRecordPayload> extends Hoodie
|
|||||||
}
|
}
|
||||||
|
|
||||||
HoodieDataFile dataFile = getLatestDataFile();
|
HoodieDataFile dataFile = getLatestDataFile();
|
||||||
List<String> matchingKeys = checkCandidatesAgainstFile(hoodieTable.getHadoopConf(), candidateRecordKeys,
|
List<String> matchingKeys =
|
||||||
new Path(dataFile.getPath()));
|
checkCandidatesAgainstFile(hoodieTable.getHadoopConf(), candidateRecordKeys, new Path(dataFile.getPath()));
|
||||||
logger.info(String.format("Total records (%d), bloom filter candidates (%d)/fp(%d), actual matches (%d)",
|
logger.info(
|
||||||
totalKeysChecked, candidateRecordKeys.size(), candidateRecordKeys.size() - matchingKeys.size(),
|
String.format("Total records (%d), bloom filter candidates (%d)/fp(%d), actual matches (%d)", totalKeysChecked,
|
||||||
matchingKeys.size()));
|
candidateRecordKeys.size(), candidateRecordKeys.size() - matchingKeys.size(), matchingKeys.size()));
|
||||||
return new KeyLookupResult(partitionPathFilePair.getRight(), partitionPathFilePair.getLeft(),
|
return new KeyLookupResult(partitionPathFilePair.getRight(), partitionPathFilePair.getLeft(),
|
||||||
dataFile.getCommitTime(), matchingKeys);
|
dataFile.getCommitTime(), matchingKeys);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -71,8 +71,7 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
|
|||||||
Iterator<HoodieRecord<T>> recordItr, String fileId) {
|
Iterator<HoodieRecord<T>> recordItr, String fileId) {
|
||||||
super(config, commitTime, fileId, hoodieTable);
|
super(config, commitTime, fileId, hoodieTable);
|
||||||
String partitionPath = init(fileId, recordItr);
|
String partitionPath = init(fileId, recordItr);
|
||||||
init(fileId, partitionPath,
|
init(fileId, partitionPath, hoodieTable.getROFileSystemView().getLatestDataFile(partitionPath, fileId).get());
|
||||||
hoodieTable.getROFileSystemView().getLatestDataFile(partitionPath, fileId).get());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -83,8 +82,8 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
|
|||||||
super(config, commitTime, fileId, hoodieTable);
|
super(config, commitTime, fileId, hoodieTable);
|
||||||
this.keyToNewRecords = keyToNewRecords;
|
this.keyToNewRecords = keyToNewRecords;
|
||||||
this.useWriterSchema = true;
|
this.useWriterSchema = true;
|
||||||
init(fileId, keyToNewRecords.get(keyToNewRecords.keySet().stream().findFirst().get())
|
init(fileId, keyToNewRecords.get(keyToNewRecords.keySet().stream().findFirst().get()).getPartitionPath(),
|
||||||
.getPartitionPath(), dataFileToBeMerged);
|
dataFileToBeMerged);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -160,15 +159,13 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
|
|||||||
new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath));
|
new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath));
|
||||||
partitionMetadata.trySave(TaskContext.getPartitionId());
|
partitionMetadata.trySave(TaskContext.getPartitionId());
|
||||||
|
|
||||||
oldFilePath = new Path(
|
oldFilePath = new Path(config.getBasePath() + "/" + partitionPath + "/" + latestValidFilePath);
|
||||||
config.getBasePath() + "/" + partitionPath + "/" + latestValidFilePath);
|
|
||||||
String relativePath = new Path((partitionPath.isEmpty() ? "" : partitionPath + "/")
|
String relativePath = new Path((partitionPath.isEmpty() ? "" : partitionPath + "/")
|
||||||
+ FSUtils.makeDataFileName(instantTime, writeToken, fileId)).toString();
|
+ FSUtils.makeDataFileName(instantTime, writeToken, fileId)).toString();
|
||||||
newFilePath = new Path(config.getBasePath(), relativePath);
|
newFilePath = new Path(config.getBasePath(), relativePath);
|
||||||
|
|
||||||
logger.info(String
|
logger.info(String.format("Merging new data into oldPath %s, as newPath %s", oldFilePath.toString(),
|
||||||
.format("Merging new data into oldPath %s, as newPath %s", oldFilePath.toString(),
|
newFilePath.toString()));
|
||||||
newFilePath.toString()));
|
|
||||||
// file name is same for all records, in this bunch
|
// file name is same for all records, in this bunch
|
||||||
writeStatus.setFileId(fileId);
|
writeStatus.setFileId(fileId);
|
||||||
writeStatus.setPartitionPath(partitionPath);
|
writeStatus.setPartitionPath(partitionPath);
|
||||||
@@ -180,14 +177,13 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
|
|||||||
createMarkerFile(partitionPath);
|
createMarkerFile(partitionPath);
|
||||||
|
|
||||||
// Create the writer for writing the new version file
|
// Create the writer for writing the new version file
|
||||||
storageWriter = HoodieStorageWriterFactory
|
storageWriter =
|
||||||
.getStorageWriter(instantTime, newFilePath, hoodieTable, config, writerSchema);
|
HoodieStorageWriterFactory.getStorageWriter(instantTime, newFilePath, hoodieTable, config, writerSchema);
|
||||||
} catch (IOException io) {
|
} catch (IOException io) {
|
||||||
logger.error("Error in update task at commit " + instantTime, io);
|
logger.error("Error in update task at commit " + instantTime, io);
|
||||||
writeStatus.setGlobalError(io);
|
writeStatus.setGlobalError(io);
|
||||||
throw new HoodieUpsertException(
|
throw new HoodieUpsertException("Failed to initialize HoodieUpdateHandle for FileId: " + fileId + " on commit "
|
||||||
"Failed to initialize HoodieUpdateHandle for FileId: " + fileId + " on commit "
|
+ instantTime + " on path " + hoodieTable.getMetaClient().getBasePath(), io);
|
||||||
+ instantTime + " on path " + hoodieTable.getMetaClient().getBasePath(), io);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -211,16 +207,14 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
|
|||||||
record.unseal();
|
record.unseal();
|
||||||
record.setNewLocation(new HoodieRecordLocation(instantTime, fileId));
|
record.setNewLocation(new HoodieRecordLocation(instantTime, fileId));
|
||||||
record.seal();
|
record.seal();
|
||||||
//NOTE: Once Records are added to map (spillable-map), DO NOT change it as they won't persist
|
// NOTE: Once Records are added to map (spillable-map), DO NOT change it as they won't persist
|
||||||
keyToNewRecords.put(record.getRecordKey(), record);
|
keyToNewRecords.put(record.getRecordKey(), record);
|
||||||
}
|
}
|
||||||
logger.info("Number of entries in MemoryBasedMap => "
|
logger.info("Number of entries in MemoryBasedMap => "
|
||||||
+ ((ExternalSpillableMap) keyToNewRecords).getInMemoryMapNumEntries()
|
+ ((ExternalSpillableMap) keyToNewRecords).getInMemoryMapNumEntries()
|
||||||
+ "Total size in bytes of MemoryBasedMap => "
|
+ "Total size in bytes of MemoryBasedMap => "
|
||||||
+ ((ExternalSpillableMap) keyToNewRecords).getCurrentInMemoryMapSize()
|
+ ((ExternalSpillableMap) keyToNewRecords).getCurrentInMemoryMapSize() + "Number of entries in DiskBasedMap => "
|
||||||
+ "Number of entries in DiskBasedMap => "
|
+ ((ExternalSpillableMap) keyToNewRecords).getDiskBasedMapNumEntries() + "Size of file spilled to disk => "
|
||||||
+ ((ExternalSpillableMap) keyToNewRecords).getDiskBasedMapNumEntries()
|
|
||||||
+ "Size of file spilled to disk => "
|
|
||||||
+ ((ExternalSpillableMap) keyToNewRecords).getSizeOfFileOnDiskInBytes());
|
+ ((ExternalSpillableMap) keyToNewRecords).getSizeOfFileOnDiskInBytes());
|
||||||
return partitionPath;
|
return partitionPath;
|
||||||
}
|
}
|
||||||
@@ -258,8 +252,7 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Go through an old record. Here if we detect a newer version shows up, we write the new one to
|
* Go through an old record. Here if we detect a newer version shows up, we write the new one to the file.
|
||||||
* the file.
|
|
||||||
*/
|
*/
|
||||||
public void write(GenericRecord oldRecord) {
|
public void write(GenericRecord oldRecord) {
|
||||||
String key = oldRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
|
String key = oldRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
|
||||||
@@ -269,12 +262,12 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
|
|||||||
// writing the first record. So make a copy of the record to be merged
|
// writing the first record. So make a copy of the record to be merged
|
||||||
HoodieRecord<T> hoodieRecord = new HoodieRecord<>(keyToNewRecords.get(key));
|
HoodieRecord<T> hoodieRecord = new HoodieRecord<>(keyToNewRecords.get(key));
|
||||||
try {
|
try {
|
||||||
Option<IndexedRecord> combinedAvroRecord = hoodieRecord.getData()
|
Option<IndexedRecord> combinedAvroRecord =
|
||||||
.combineAndGetUpdateValue(oldRecord, useWriterSchema ? writerSchema : originalSchema);
|
hoodieRecord.getData().combineAndGetUpdateValue(oldRecord, useWriterSchema ? writerSchema : originalSchema);
|
||||||
if (writeUpdateRecord(hoodieRecord, combinedAvroRecord)) {
|
if (writeUpdateRecord(hoodieRecord, combinedAvroRecord)) {
|
||||||
/* ONLY WHEN
|
/*
|
||||||
* 1) we have an update for this key AND
|
* ONLY WHEN 1) we have an update for this key AND 2) We are able to successfully write the the combined new
|
||||||
* 2) We are able to successfully write the the combined new value
|
* value
|
||||||
*
|
*
|
||||||
* We no longer need to copy the old record over.
|
* We no longer need to copy the old record over.
|
||||||
*/
|
*/
|
||||||
@@ -282,26 +275,24 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
|
|||||||
}
|
}
|
||||||
writtenRecordKeys.add(key);
|
writtenRecordKeys.add(key);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
throw new HoodieUpsertException(
|
throw new HoodieUpsertException("Failed to combine/merge new record with old value in storage, for new record {"
|
||||||
"Failed to combine/merge new record with old value in storage, for new record {"
|
+ keyToNewRecords.get(key) + "}, old value {" + oldRecord + "}", e);
|
||||||
+ keyToNewRecords.get(key) + "}, old value {" + oldRecord + "}", e);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (copyOldRecord) {
|
if (copyOldRecord) {
|
||||||
// this should work as it is, since this is an existing record
|
// this should work as it is, since this is an existing record
|
||||||
String errMsg = "Failed to merge old record into new file for key " + key + " from old file "
|
String errMsg = "Failed to merge old record into new file for key " + key + " from old file " + getOldFilePath()
|
||||||
+ getOldFilePath() + " to new file " + newFilePath;
|
+ " to new file " + newFilePath;
|
||||||
try {
|
try {
|
||||||
storageWriter.writeAvro(key, oldRecord);
|
storageWriter.writeAvro(key, oldRecord);
|
||||||
} catch (ClassCastException e) {
|
} catch (ClassCastException e) {
|
||||||
logger.error("Schema mismatch when rewriting old record " + oldRecord + " from file "
|
logger.error("Schema mismatch when rewriting old record " + oldRecord + " from file " + getOldFilePath()
|
||||||
+ getOldFilePath() + " to file " + newFilePath + " with writerSchema " + writerSchema
|
+ " to file " + newFilePath + " with writerSchema " + writerSchema.toString(true));
|
||||||
.toString(true));
|
|
||||||
throw new HoodieUpsertException(errMsg, e);
|
throw new HoodieUpsertException(errMsg, e);
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
logger.error("Failed to merge old record into new file for key " + key + " from old file "
|
logger.error("Failed to merge old record into new file for key " + key + " from old file " + getOldFilePath()
|
||||||
+ getOldFilePath() + " to new file " + newFilePath, e);
|
+ " to new file " + newFilePath, e);
|
||||||
throw new HoodieUpsertException(errMsg, e);
|
throw new HoodieUpsertException(errMsg, e);
|
||||||
}
|
}
|
||||||
recordsWritten++;
|
recordsWritten++;
|
||||||
@@ -344,8 +335,8 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
|
|||||||
runtimeStats.setTotalUpsertTime(timer.endTimer());
|
runtimeStats.setTotalUpsertTime(timer.endTimer());
|
||||||
stat.setRuntimeStats(runtimeStats);
|
stat.setRuntimeStats(runtimeStats);
|
||||||
|
|
||||||
logger.info(String.format("MergeHandle for partitionPath %s fileID %s, took %d ms.",
|
logger.info(String.format("MergeHandle for partitionPath %s fileID %s, took %d ms.", stat.getPartitionPath(),
|
||||||
stat.getPartitionPath(), stat.getFileId(), runtimeStats.getTotalUpsertTime()));
|
stat.getFileId(), runtimeStats.getTotalUpsertTime()));
|
||||||
|
|
||||||
return writeStatus;
|
return writeStatus;
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
|
|||||||
@@ -61,8 +61,7 @@ public abstract class HoodieWriteHandle<T extends HoodieRecordPayload> extends H
|
|||||||
this.writerSchema = createHoodieWriteSchema(originalSchema);
|
this.writerSchema = createHoodieWriteSchema(originalSchema);
|
||||||
this.timer = new HoodieTimer().startTimer();
|
this.timer = new HoodieTimer().startTimer();
|
||||||
this.writeStatus = (WriteStatus) ReflectionUtils.loadClass(config.getWriteStatusClassName(),
|
this.writeStatus = (WriteStatus) ReflectionUtils.loadClass(config.getWriteStatusClassName(),
|
||||||
!hoodieTable.getIndex().isImplicitWithStorage(),
|
!hoodieTable.getIndex().isImplicitWithStorage(), config.getWriteStatusFailureFraction());
|
||||||
config.getWriteStatusFailureFraction());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -104,7 +103,7 @@ public abstract class HoodieWriteHandle<T extends HoodieRecordPayload> extends H
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The marker path will be <base-path>/.hoodie/.temp/<instant_ts>/2019/04/25/filename
|
* The marker path will be <base-path>/.hoodie/.temp/<instant_ts>/2019/04/25/filename
|
||||||
*/
|
*/
|
||||||
private Path makeNewMarkerPath(String partitionPath) {
|
private Path makeNewMarkerPath(String partitionPath) {
|
||||||
Path markerRootPath = new Path(hoodieTable.getMetaClient().getMarkerFolderPath(instantTime));
|
Path markerRootPath = new Path(hoodieTable.getMetaClient().getMarkerFolderPath(instantTime));
|
||||||
|
|||||||
@@ -37,23 +37,20 @@ public interface HoodieCompactor extends Serializable {
|
|||||||
/**
|
/**
|
||||||
* Generate a new compaction plan for scheduling
|
* Generate a new compaction plan for scheduling
|
||||||
*
|
*
|
||||||
* @param jsc Spark Context
|
* @param jsc Spark Context
|
||||||
* @param hoodieTable Hoodie Table
|
* @param hoodieTable Hoodie Table
|
||||||
* @param config Hoodie Write Configuration
|
* @param config Hoodie Write Configuration
|
||||||
* @param compactionCommitTime scheduled compaction commit time
|
* @param compactionCommitTime scheduled compaction commit time
|
||||||
* @param fgIdsInPendingCompactions partition-fileId pairs for which compaction is pending
|
* @param fgIdsInPendingCompactions partition-fileId pairs for which compaction is pending
|
||||||
* @return Compaction Plan
|
* @return Compaction Plan
|
||||||
* @throws IOException when encountering errors
|
* @throws IOException when encountering errors
|
||||||
*/
|
*/
|
||||||
HoodieCompactionPlan generateCompactionPlan(JavaSparkContext jsc,
|
HoodieCompactionPlan generateCompactionPlan(JavaSparkContext jsc, HoodieTable hoodieTable, HoodieWriteConfig config,
|
||||||
HoodieTable hoodieTable, HoodieWriteConfig config, String compactionCommitTime,
|
String compactionCommitTime, Set<HoodieFileGroupId> fgIdsInPendingCompactions) throws IOException;
|
||||||
Set<HoodieFileGroupId> fgIdsInPendingCompactions)
|
|
||||||
throws IOException;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Execute compaction operations and report back status
|
* Execute compaction operations and report back status
|
||||||
*/
|
*/
|
||||||
JavaRDD<WriteStatus> compact(JavaSparkContext jsc,
|
JavaRDD<WriteStatus> compact(JavaSparkContext jsc, HoodieCompactionPlan compactionPlan, HoodieTable hoodieTable,
|
||||||
HoodieCompactionPlan compactionPlan, HoodieTable hoodieTable, HoodieWriteConfig config,
|
HoodieWriteConfig config, String compactionInstantTime) throws IOException;
|
||||||
String compactionInstantTime) throws IOException;
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -63,9 +63,9 @@ import org.apache.spark.util.AccumulatorV2;
|
|||||||
import org.apache.spark.util.LongAccumulator;
|
import org.apache.spark.util.LongAccumulator;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* HoodieRealtimeTableCompactor compacts a hoodie table with merge on read storage. Computes all
|
* HoodieRealtimeTableCompactor compacts a hoodie table with merge on read storage. Computes all possible compactions,
|
||||||
* possible compactions, passes it through a CompactionFilter and executes all the compactions and
|
* passes it through a CompactionFilter and executes all the compactions and writes a new version of base files and make
|
||||||
* writes a new version of base files and make a normal commit
|
* a normal commit
|
||||||
*
|
*
|
||||||
* @see HoodieCompactor
|
* @see HoodieCompactor
|
||||||
*/
|
*/
|
||||||
@@ -78,9 +78,8 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
|
|||||||
private AccumulatorV2<Long, Long> totalFileSlices;
|
private AccumulatorV2<Long, Long> totalFileSlices;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public JavaRDD<WriteStatus> compact(JavaSparkContext jsc,
|
public JavaRDD<WriteStatus> compact(JavaSparkContext jsc, HoodieCompactionPlan compactionPlan,
|
||||||
HoodieCompactionPlan compactionPlan, HoodieTable hoodieTable, HoodieWriteConfig config,
|
HoodieTable hoodieTable, HoodieWriteConfig config, String compactionInstantTime) throws IOException {
|
||||||
String compactionInstantTime) throws IOException {
|
|
||||||
if (compactionPlan == null || (compactionPlan.getOperations() == null)
|
if (compactionPlan == null || (compactionPlan.getOperations() == null)
|
||||||
|| (compactionPlan.getOperations().isEmpty())) {
|
|| (compactionPlan.getOperations().isEmpty())) {
|
||||||
return jsc.emptyRDD();
|
return jsc.emptyRDD();
|
||||||
@@ -88,41 +87,36 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
|
|||||||
HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
|
HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
|
||||||
// Compacting is very similar to applying updates to existing file
|
// Compacting is very similar to applying updates to existing file
|
||||||
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc);
|
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc);
|
||||||
List<CompactionOperation> operations = compactionPlan.getOperations().stream().map(
|
List<CompactionOperation> operations = compactionPlan.getOperations().stream()
|
||||||
CompactionOperation::convertFromAvroRecordInstance).collect(toList());
|
.map(CompactionOperation::convertFromAvroRecordInstance).collect(toList());
|
||||||
log.info("Compactor compacting " + operations + " files");
|
log.info("Compactor compacting " + operations + " files");
|
||||||
|
|
||||||
return jsc.parallelize(operations, operations.size())
|
return jsc.parallelize(operations, operations.size())
|
||||||
.map(s -> compact(table, metaClient, config, s, compactionInstantTime))
|
.map(s -> compact(table, metaClient, config, s, compactionInstantTime)).flatMap(List::iterator);
|
||||||
.flatMap(List::iterator);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<WriteStatus> compact(HoodieCopyOnWriteTable hoodieCopyOnWriteTable, HoodieTableMetaClient metaClient,
|
private List<WriteStatus> compact(HoodieCopyOnWriteTable hoodieCopyOnWriteTable, HoodieTableMetaClient metaClient,
|
||||||
HoodieWriteConfig config,
|
HoodieWriteConfig config, CompactionOperation operation, String commitTime) throws IOException {
|
||||||
CompactionOperation operation, String commitTime) throws IOException {
|
|
||||||
FileSystem fs = metaClient.getFs();
|
FileSystem fs = metaClient.getFs();
|
||||||
Schema readerSchema = HoodieAvroUtils
|
Schema readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()));
|
||||||
.addMetadataFields(new Schema.Parser().parse(config.getSchema()));
|
|
||||||
|
|
||||||
log.info("Compacting base " + operation.getDataFilePath() + " with delta files " + operation
|
log.info("Compacting base " + operation.getDataFilePath() + " with delta files " + operation.getDeltaFilePaths()
|
||||||
.getDeltaFilePaths() + " for commit " + commitTime);
|
+ " for commit " + commitTime);
|
||||||
// TODO - FIX THIS
|
// TODO - FIX THIS
|
||||||
// Reads the entire avro file. Always only specific blocks should be read from the avro file
|
// Reads the entire avro file. Always only specific blocks should be read from the avro file
|
||||||
// (failure recover).
|
// (failure recover).
|
||||||
// Load all the delta commits since the last compaction commit and get all the blocks to be
|
// Load all the delta commits since the last compaction commit and get all the blocks to be
|
||||||
// loaded and load it using CompositeAvroLogReader
|
// loaded and load it using CompositeAvroLogReader
|
||||||
// Since a DeltaCommit is not defined yet, reading all the records. revisit this soon.
|
// Since a DeltaCommit is not defined yet, reading all the records. revisit this soon.
|
||||||
String maxInstantTime = metaClient.getActiveTimeline()
|
String maxInstantTime = metaClient
|
||||||
.getTimelineOfActions(
|
.getActiveTimeline().getTimelineOfActions(Sets.newHashSet(HoodieTimeline.COMMIT_ACTION,
|
||||||
Sets.newHashSet(HoodieTimeline.COMMIT_ACTION, HoodieTimeline.ROLLBACK_ACTION,
|
HoodieTimeline.ROLLBACK_ACTION, HoodieTimeline.DELTA_COMMIT_ACTION))
|
||||||
HoodieTimeline.DELTA_COMMIT_ACTION))
|
|
||||||
.filterCompletedInstants().lastInstant().get().getTimestamp();
|
.filterCompletedInstants().lastInstant().get().getTimestamp();
|
||||||
log.info("MaxMemoryPerCompaction => " + config.getMaxMemoryPerCompaction());
|
log.info("MaxMemoryPerCompaction => " + config.getMaxMemoryPerCompaction());
|
||||||
HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs,
|
HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, metaClient.getBasePath(),
|
||||||
metaClient.getBasePath(), operation.getDeltaFilePaths(), readerSchema, maxInstantTime,
|
operation.getDeltaFilePaths(), readerSchema, maxInstantTime, config.getMaxMemoryPerCompaction(),
|
||||||
config.getMaxMemoryPerCompaction(), config.getCompactionLazyBlockReadEnabled(),
|
config.getCompactionLazyBlockReadEnabled(), config.getCompactionReverseLogReadEnabled(),
|
||||||
config.getCompactionReverseLogReadEnabled(), config.getMaxDFSStreamBufferSize(),
|
config.getMaxDFSStreamBufferSize(), config.getSpillableMapBasePath());
|
||||||
config.getSpillableMapBasePath());
|
|
||||||
if (!scanner.iterator().hasNext()) {
|
if (!scanner.iterator().hasNext()) {
|
||||||
return Lists.<WriteStatus>newArrayList();
|
return Lists.<WriteStatus>newArrayList();
|
||||||
}
|
}
|
||||||
@@ -134,53 +128,49 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
|
|||||||
// If the dataFile is present, there is a base parquet file present, perform updates else perform inserts into a
|
// If the dataFile is present, there is a base parquet file present, perform updates else perform inserts into a
|
||||||
// new base parquet file.
|
// new base parquet file.
|
||||||
if (oldDataFileOpt.isPresent()) {
|
if (oldDataFileOpt.isPresent()) {
|
||||||
result = hoodieCopyOnWriteTable
|
result = hoodieCopyOnWriteTable.handleUpdate(commitTime, operation.getFileId(), scanner.getRecords(),
|
||||||
.handleUpdate(commitTime, operation.getFileId(), scanner.getRecords(), oldDataFileOpt.get());
|
oldDataFileOpt.get());
|
||||||
} else {
|
} else {
|
||||||
result = hoodieCopyOnWriteTable
|
result = hoodieCopyOnWriteTable.handleInsert(commitTime, operation.getPartitionPath(), operation.getFileId(),
|
||||||
.handleInsert(commitTime, operation.getPartitionPath(), operation.getFileId(), scanner.iterator());
|
scanner.iterator());
|
||||||
}
|
}
|
||||||
Iterable<List<WriteStatus>> resultIterable = () -> result;
|
Iterable<List<WriteStatus>> resultIterable = () -> result;
|
||||||
return StreamSupport.stream(resultIterable.spliterator(), false).flatMap(Collection::stream)
|
return StreamSupport.stream(resultIterable.spliterator(), false).flatMap(Collection::stream).peek(s -> {
|
||||||
.peek(s -> {
|
s.getStat().setTotalUpdatedRecordsCompacted(scanner.getNumMergedRecordsInLog());
|
||||||
s.getStat().setTotalUpdatedRecordsCompacted(scanner.getNumMergedRecordsInLog());
|
s.getStat().setTotalLogFilesCompacted(scanner.getTotalLogFiles());
|
||||||
s.getStat().setTotalLogFilesCompacted(scanner.getTotalLogFiles());
|
s.getStat().setTotalLogRecords(scanner.getTotalLogRecords());
|
||||||
s.getStat().setTotalLogRecords(scanner.getTotalLogRecords());
|
s.getStat().setPartitionPath(operation.getPartitionPath());
|
||||||
s.getStat().setPartitionPath(operation.getPartitionPath());
|
s.getStat()
|
||||||
s.getStat().setTotalLogSizeCompacted(operation.getMetrics().get(
|
.setTotalLogSizeCompacted(operation.getMetrics().get(CompactionStrategy.TOTAL_LOG_FILE_SIZE).longValue());
|
||||||
CompactionStrategy.TOTAL_LOG_FILE_SIZE).longValue());
|
s.getStat().setTotalLogBlocks(scanner.getTotalLogBlocks());
|
||||||
s.getStat().setTotalLogBlocks(scanner.getTotalLogBlocks());
|
s.getStat().setTotalCorruptLogBlock(scanner.getTotalCorruptBlocks());
|
||||||
s.getStat().setTotalCorruptLogBlock(scanner.getTotalCorruptBlocks());
|
s.getStat().setTotalRollbackBlocks(scanner.getTotalRollbacks());
|
||||||
s.getStat().setTotalRollbackBlocks(scanner.getTotalRollbacks());
|
RuntimeStats runtimeStats = new RuntimeStats();
|
||||||
RuntimeStats runtimeStats = new RuntimeStats();
|
runtimeStats.setTotalScanTime(scanner.getTotalTimeTakenToReadAndMergeBlocks());
|
||||||
runtimeStats.setTotalScanTime(scanner.getTotalTimeTakenToReadAndMergeBlocks());
|
s.getStat().setRuntimeStats(runtimeStats);
|
||||||
s.getStat().setRuntimeStats(runtimeStats);
|
}).collect(toList());
|
||||||
}).collect(toList());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public HoodieCompactionPlan generateCompactionPlan(JavaSparkContext jsc,
|
public HoodieCompactionPlan generateCompactionPlan(JavaSparkContext jsc, HoodieTable hoodieTable,
|
||||||
HoodieTable hoodieTable, HoodieWriteConfig config, String compactionCommitTime,
|
HoodieWriteConfig config, String compactionCommitTime, Set<HoodieFileGroupId> fgIdsInPendingCompactions)
|
||||||
Set<HoodieFileGroupId> fgIdsInPendingCompactions) throws IOException {
|
throws IOException {
|
||||||
|
|
||||||
totalLogFiles = new LongAccumulator();
|
totalLogFiles = new LongAccumulator();
|
||||||
totalFileSlices = new LongAccumulator();
|
totalFileSlices = new LongAccumulator();
|
||||||
jsc.sc().register(totalLogFiles);
|
jsc.sc().register(totalLogFiles);
|
||||||
jsc.sc().register(totalFileSlices);
|
jsc.sc().register(totalFileSlices);
|
||||||
|
|
||||||
Preconditions
|
Preconditions.checkArgument(hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ,
|
||||||
.checkArgument(hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ,
|
"HoodieRealtimeTableCompactor can only compact table of type " + HoodieTableType.MERGE_ON_READ + " and not "
|
||||||
"HoodieRealtimeTableCompactor can only compact table of type "
|
+ hoodieTable.getMetaClient().getTableType().name());
|
||||||
+ HoodieTableType.MERGE_ON_READ + " and not " + hoodieTable.getMetaClient()
|
|
||||||
.getTableType().name());
|
|
||||||
|
|
||||||
//TODO : check if maxMemory is not greater than JVM or spark.executor memory
|
// TODO : check if maxMemory is not greater than JVM or spark.executor memory
|
||||||
// TODO - rollback any compactions in flight
|
// TODO - rollback any compactions in flight
|
||||||
HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
|
HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
|
||||||
log.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommitTime);
|
log.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommitTime);
|
||||||
List<String> partitionPaths = FSUtils
|
List<String> partitionPaths = FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(),
|
||||||
.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(),
|
config.shouldAssumeDatePartitioning());
|
||||||
config.shouldAssumeDatePartitioning());
|
|
||||||
|
|
||||||
// filter the partition paths if needed to reduce list status
|
// filter the partition paths if needed to reduce list status
|
||||||
partitionPaths = config.getCompactionStrategy().filterPartitionPaths(config, partitionPaths);
|
partitionPaths = config.getCompactionStrategy().filterPartitionPaths(config, partitionPaths);
|
||||||
@@ -192,28 +182,22 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
|
|||||||
|
|
||||||
RealtimeView fileSystemView = hoodieTable.getRTFileSystemView();
|
RealtimeView fileSystemView = hoodieTable.getRTFileSystemView();
|
||||||
log.info("Compaction looking for files to compact in " + partitionPaths + " partitions");
|
log.info("Compaction looking for files to compact in " + partitionPaths + " partitions");
|
||||||
List<HoodieCompactionOperation> operations =
|
List<HoodieCompactionOperation> operations = jsc.parallelize(partitionPaths, partitionPaths.size())
|
||||||
jsc.parallelize(partitionPaths, partitionPaths.size())
|
.flatMap((FlatMapFunction<String, CompactionOperation>) partitionPath -> fileSystemView
|
||||||
.flatMap((FlatMapFunction<String, CompactionOperation>) partitionPath -> fileSystemView
|
.getLatestFileSlices(partitionPath)
|
||||||
.getLatestFileSlices(partitionPath)
|
.filter(slice -> !fgIdsInPendingCompactions.contains(slice.getFileGroupId())).map(s -> {
|
||||||
.filter(slice ->
|
List<HoodieLogFile> logFiles =
|
||||||
!fgIdsInPendingCompactions.contains(slice.getFileGroupId()))
|
s.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList());
|
||||||
.map(
|
totalLogFiles.add((long) logFiles.size());
|
||||||
s -> {
|
totalFileSlices.add(1L);
|
||||||
List<HoodieLogFile> logFiles = s.getLogFiles().sorted(HoodieLogFile
|
// Avro generated classes are not inheriting Serializable. Using CompactionOperation POJO
|
||||||
.getLogFileComparator()).collect(Collectors.toList());
|
// for spark Map operations and collecting them finally in Avro generated classes for storing
|
||||||
totalLogFiles.add((long) logFiles.size());
|
// into meta files.
|
||||||
totalFileSlices.add(1L);
|
Option<HoodieDataFile> dataFile = s.getDataFile();
|
||||||
// Avro generated classes are not inheriting Serializable. Using CompactionOperation POJO
|
return new CompactionOperation(dataFile, partitionPath, logFiles,
|
||||||
// for spark Map operations and collecting them finally in Avro generated classes for storing
|
config.getCompactionStrategy().captureMetrics(config, dataFile, partitionPath, logFiles));
|
||||||
// into meta files.
|
}).filter(c -> !c.getDeltaFilePaths().isEmpty()).collect(toList()).iterator())
|
||||||
Option<HoodieDataFile> dataFile = s.getDataFile();
|
.collect().stream().map(CompactionUtils::buildHoodieCompactionOperation).collect(toList());
|
||||||
return new CompactionOperation(dataFile, partitionPath, logFiles,
|
|
||||||
config.getCompactionStrategy().captureMetrics(config, dataFile, partitionPath, logFiles));
|
|
||||||
})
|
|
||||||
.filter(c -> !c.getDeltaFilePaths().isEmpty())
|
|
||||||
.collect(toList()).iterator()).collect().stream().map(CompactionUtils::buildHoodieCompactionOperation)
|
|
||||||
.collect(toList());
|
|
||||||
log.info("Total of " + operations.size() + " compactions are retrieved");
|
log.info("Total of " + operations.size() + " compactions are retrieved");
|
||||||
log.info("Total number of latest files slices " + totalFileSlices.value());
|
log.info("Total number of latest files slices " + totalFileSlices.value());
|
||||||
log.info("Total number of log files " + totalLogFiles.value());
|
log.info("Total number of log files " + totalLogFiles.value());
|
||||||
@@ -222,11 +206,11 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
|
|||||||
// compactions only
|
// compactions only
|
||||||
HoodieCompactionPlan compactionPlan = config.getCompactionStrategy().generateCompactionPlan(config, operations,
|
HoodieCompactionPlan compactionPlan = config.getCompactionStrategy().generateCompactionPlan(config, operations,
|
||||||
CompactionUtils.getAllPendingCompactionPlans(metaClient).stream().map(Pair::getValue).collect(toList()));
|
CompactionUtils.getAllPendingCompactionPlans(metaClient).stream().map(Pair::getValue).collect(toList()));
|
||||||
Preconditions.checkArgument(compactionPlan.getOperations().stream().noneMatch(
|
Preconditions.checkArgument(
|
||||||
op -> fgIdsInPendingCompactions.contains(new HoodieFileGroupId(op.getPartitionPath(), op.getFileId()))),
|
compactionPlan.getOperations().stream().noneMatch(
|
||||||
|
op -> fgIdsInPendingCompactions.contains(new HoodieFileGroupId(op.getPartitionPath(), op.getFileId()))),
|
||||||
"Bad Compaction Plan. FileId MUST NOT have multiple pending compactions. "
|
"Bad Compaction Plan. FileId MUST NOT have multiple pending compactions. "
|
||||||
+ "Please fix your strategy implementation."
|
+ "Please fix your strategy implementation." + "FileIdsWithPendingCompactions :" + fgIdsInPendingCompactions
|
||||||
+ "FileIdsWithPendingCompactions :" + fgIdsInPendingCompactions
|
|
||||||
+ ", Selected workload :" + compactionPlan);
|
+ ", Selected workload :" + compactionPlan);
|
||||||
if (compactionPlan.getOperations().isEmpty()) {
|
if (compactionPlan.getOperations().isEmpty()) {
|
||||||
log.warn("After filtering, Nothing to compact for " + metaClient.getBasePath());
|
log.warn("After filtering, Nothing to compact for " + metaClient.getBasePath());
|
||||||
|
|||||||
@@ -25,8 +25,8 @@ import org.apache.hudi.avro.model.HoodieCompactionPlan;
|
|||||||
import org.apache.hudi.config.HoodieWriteConfig;
|
import org.apache.hudi.config.HoodieWriteConfig;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* CompactionStrategy which looks at total IO to be done for the compaction (read + write) and
|
* CompactionStrategy which looks at total IO to be done for the compaction (read + write) and limits the list of
|
||||||
* limits the list of compactions to be under a configured limit on the IO
|
* compactions to be under a configured limit on the IO
|
||||||
*
|
*
|
||||||
* @see CompactionStrategy
|
* @see CompactionStrategy
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -32,11 +32,10 @@ import org.apache.hudi.config.HoodieWriteConfig;
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* This strategy ensures that the last N partitions are picked up even if there are later partitions created for the
|
* This strategy ensures that the last N partitions are picked up even if there are later partitions created for the
|
||||||
* dataset. lastNPartitions is defined as the N partitions before the currentDate.
|
* dataset. lastNPartitions is defined as the N partitions before the currentDate. currentDay = 2018/01/01 The dataset
|
||||||
* currentDay = 2018/01/01
|
* has partitions for 2018/02/02 and 2018/03/03 beyond the currentDay This strategy will pick up the following
|
||||||
* The dataset has partitions for 2018/02/02 and 2018/03/03 beyond the currentDay
|
* partitions for compaction : (2018/01/01, allPartitionsInRange[(2018/01/01 - lastNPartitions) to 2018/01/01),
|
||||||
* This strategy will pick up the following partitions for compaction :
|
* 2018/02/02, 2018/03/03)
|
||||||
* (2018/01/01, allPartitionsInRange[(2018/01/01 - lastNPartitions) to 2018/01/01), 2018/02/02, 2018/03/03)
|
|
||||||
*/
|
*/
|
||||||
public class BoundedPartitionAwareCompactionStrategy extends DayBasedCompactionStrategy {
|
public class BoundedPartitionAwareCompactionStrategy extends DayBasedCompactionStrategy {
|
||||||
|
|
||||||
@@ -46,15 +45,14 @@ public class BoundedPartitionAwareCompactionStrategy extends DayBasedCompactionS
|
|||||||
public List<HoodieCompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig,
|
public List<HoodieCompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig,
|
||||||
List<HoodieCompactionOperation> operations, List<HoodieCompactionPlan> pendingCompactionPlans) {
|
List<HoodieCompactionOperation> operations, List<HoodieCompactionPlan> pendingCompactionPlans) {
|
||||||
// The earliest partition to compact - current day minus the target partitions limit
|
// The earliest partition to compact - current day minus the target partitions limit
|
||||||
String earliestPartitionPathToCompact = dateFormat.format(
|
String earliestPartitionPathToCompact =
|
||||||
getDateAtOffsetFromToday(-1 * writeConfig.getTargetPartitionsPerDayBasedCompaction()));
|
dateFormat.format(getDateAtOffsetFromToday(-1 * writeConfig.getTargetPartitionsPerDayBasedCompaction()));
|
||||||
// Filter out all partitions greater than earliestPartitionPathToCompact
|
// Filter out all partitions greater than earliestPartitionPathToCompact
|
||||||
List<HoodieCompactionOperation> eligibleCompactionOperations = operations.stream()
|
List<HoodieCompactionOperation> eligibleCompactionOperations =
|
||||||
.collect(Collectors.groupingBy(HoodieCompactionOperation::getPartitionPath)).entrySet().stream()
|
operations.stream().collect(Collectors.groupingBy(HoodieCompactionOperation::getPartitionPath)).entrySet()
|
||||||
.sorted(Map.Entry.comparingByKey(comparator))
|
.stream().sorted(Map.Entry.comparingByKey(comparator))
|
||||||
.filter(e -> comparator.compare(earliestPartitionPathToCompact, e.getKey()) >= 0)
|
.filter(e -> comparator.compare(earliestPartitionPathToCompact, e.getKey()) >= 0)
|
||||||
.flatMap(e -> e.getValue().stream())
|
.flatMap(e -> e.getValue().stream()).collect(Collectors.toList());
|
||||||
.collect(Collectors.toList());
|
|
||||||
|
|
||||||
return eligibleCompactionOperations;
|
return eligibleCompactionOperations;
|
||||||
}
|
}
|
||||||
@@ -62,13 +60,12 @@ public class BoundedPartitionAwareCompactionStrategy extends DayBasedCompactionS
|
|||||||
@Override
|
@Override
|
||||||
public List<String> filterPartitionPaths(HoodieWriteConfig writeConfig, List<String> partitionPaths) {
|
public List<String> filterPartitionPaths(HoodieWriteConfig writeConfig, List<String> partitionPaths) {
|
||||||
// The earliest partition to compact - current day minus the target partitions limit
|
// The earliest partition to compact - current day minus the target partitions limit
|
||||||
String earliestPartitionPathToCompact = dateFormat.format(
|
String earliestPartitionPathToCompact =
|
||||||
getDateAtOffsetFromToday(-1 * writeConfig.getTargetPartitionsPerDayBasedCompaction()));
|
dateFormat.format(getDateAtOffsetFromToday(-1 * writeConfig.getTargetPartitionsPerDayBasedCompaction()));
|
||||||
// Get all partitions and sort them
|
// Get all partitions and sort them
|
||||||
List<String> filteredPartitionPaths = partitionPaths.stream().map(partition -> partition.replace("/", "-"))
|
List<String> filteredPartitionPaths = partitionPaths.stream().map(partition -> partition.replace("/", "-"))
|
||||||
.sorted(Comparator.reverseOrder()).map(partitionPath -> partitionPath.replace("-", "/"))
|
.sorted(Comparator.reverseOrder()).map(partitionPath -> partitionPath.replace("-", "/"))
|
||||||
.filter(e -> comparator.compare(earliestPartitionPathToCompact, e) >= 0)
|
.filter(e -> comparator.compare(earliestPartitionPathToCompact, e) >= 0).collect(Collectors.toList());
|
||||||
.collect(Collectors.toList());
|
|
||||||
return filteredPartitionPaths;
|
return filteredPartitionPaths;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -32,11 +32,10 @@ import org.apache.hudi.config.HoodieWriteConfig;
|
|||||||
import org.apache.hudi.io.compact.HoodieRealtimeTableCompactor;
|
import org.apache.hudi.io.compact.HoodieRealtimeTableCompactor;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Strategy for compaction. Pluggable implementation to define how compaction should be done. The
|
* Strategy for compaction. Pluggable implementation to define how compaction should be done. The over-ridden
|
||||||
* over-ridden implementations of this abstract class can capture the relevant metrics to order
|
* implementations of this abstract class can capture the relevant metrics to order and filter the final list of
|
||||||
* and filter the final list of compaction operation to run in a single compaction.
|
* compaction operation to run in a single compaction. Implementation of CompactionStrategy cannot hold any state.
|
||||||
* Implementation of CompactionStrategy cannot hold any state. Different instantiations can be
|
* Different instantiations can be passed in every time
|
||||||
* passed in every time
|
|
||||||
*
|
*
|
||||||
* @see HoodieRealtimeTableCompactor
|
* @see HoodieRealtimeTableCompactor
|
||||||
*/
|
*/
|
||||||
@@ -49,8 +48,8 @@ public abstract class CompactionStrategy implements Serializable {
|
|||||||
public static final String TOTAL_LOG_FILES = "TOTAL_LOG_FILES";
|
public static final String TOTAL_LOG_FILES = "TOTAL_LOG_FILES";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Callback hook when a HoodieCompactionOperation is created. Individual strategies can capture the
|
* Callback hook when a HoodieCompactionOperation is created. Individual strategies can capture the metrics they need
|
||||||
* metrics they need to decide on the priority.
|
* to decide on the priority.
|
||||||
*
|
*
|
||||||
* @param dataFile - Base file to compact
|
* @param dataFile - Base file to compact
|
||||||
* @param partitionPath - Partition path
|
* @param partitionPath - Partition path
|
||||||
@@ -58,18 +57,18 @@ public abstract class CompactionStrategy implements Serializable {
|
|||||||
* @return Map[String, Object] - metrics captured
|
* @return Map[String, Object] - metrics captured
|
||||||
*/
|
*/
|
||||||
public Map<String, Double> captureMetrics(HoodieWriteConfig writeConfig, Option<HoodieDataFile> dataFile,
|
public Map<String, Double> captureMetrics(HoodieWriteConfig writeConfig, Option<HoodieDataFile> dataFile,
|
||||||
String partitionPath, List<HoodieLogFile> logFiles) {
|
String partitionPath, List<HoodieLogFile> logFiles) {
|
||||||
Map<String, Double> metrics = Maps.newHashMap();
|
Map<String, Double> metrics = Maps.newHashMap();
|
||||||
Long defaultMaxParquetFileSize = writeConfig.getParquetMaxFileSize();
|
Long defaultMaxParquetFileSize = writeConfig.getParquetMaxFileSize();
|
||||||
// Total size of all the log files
|
// Total size of all the log files
|
||||||
Long totalLogFileSize = logFiles.stream().map(HoodieLogFile::getFileSize).filter(size -> size >= 0)
|
Long totalLogFileSize = logFiles.stream().map(HoodieLogFile::getFileSize).filter(size -> size >= 0)
|
||||||
.reduce((size1, size2) -> size1 + size2).orElse(0L);
|
.reduce((size1, size2) -> size1 + size2).orElse(0L);
|
||||||
// Total read will be the base file + all the log files
|
// Total read will be the base file + all the log files
|
||||||
Long totalIORead = FSUtils.getSizeInMB((dataFile.isPresent() ? dataFile.get().getFileSize() : 0L)
|
Long totalIORead =
|
||||||
+ totalLogFileSize);
|
FSUtils.getSizeInMB((dataFile.isPresent() ? dataFile.get().getFileSize() : 0L) + totalLogFileSize);
|
||||||
// Total write will be similar to the size of the base file
|
// Total write will be similar to the size of the base file
|
||||||
Long totalIOWrite = FSUtils
|
Long totalIOWrite =
|
||||||
.getSizeInMB(dataFile.isPresent() ? dataFile.get().getFileSize() : defaultMaxParquetFileSize);
|
FSUtils.getSizeInMB(dataFile.isPresent() ? dataFile.get().getFileSize() : defaultMaxParquetFileSize);
|
||||||
// Total IO will be the IO for read + write
|
// Total IO will be the IO for read + write
|
||||||
Long totalIO = totalIORead + totalIOWrite;
|
Long totalIO = totalIORead + totalIOWrite;
|
||||||
// Save these metrics and we will use during the filter
|
// Save these metrics and we will use during the filter
|
||||||
@@ -86,8 +85,8 @@ public abstract class CompactionStrategy implements Serializable {
|
|||||||
* implementation takes care of setting compactor Id from configuration allowing subclasses to only worry about
|
* implementation takes care of setting compactor Id from configuration allowing subclasses to only worry about
|
||||||
* ordering and filtering compaction operations
|
* ordering and filtering compaction operations
|
||||||
*
|
*
|
||||||
* @param writeConfig Hoodie Write Config
|
* @param writeConfig Hoodie Write Config
|
||||||
* @param operations Compaction Operations to be ordered and filtered
|
* @param operations Compaction Operations to be ordered and filtered
|
||||||
* @param pendingCompactionPlans Pending Compaction Plans for strategy to schedule next compaction plan
|
* @param pendingCompactionPlans Pending Compaction Plans for strategy to schedule next compaction plan
|
||||||
* @return Compaction plan to be scheduled.
|
* @return Compaction plan to be scheduled.
|
||||||
*/
|
*/
|
||||||
@@ -95,27 +94,26 @@ public abstract class CompactionStrategy implements Serializable {
|
|||||||
List<HoodieCompactionOperation> operations, List<HoodieCompactionPlan> pendingCompactionPlans) {
|
List<HoodieCompactionOperation> operations, List<HoodieCompactionPlan> pendingCompactionPlans) {
|
||||||
// Strategy implementation can overload this method to set specific compactor-id
|
// Strategy implementation can overload this method to set specific compactor-id
|
||||||
return HoodieCompactionPlan.newBuilder()
|
return HoodieCompactionPlan.newBuilder()
|
||||||
.setOperations(orderAndFilter(writeConfig, operations, pendingCompactionPlans))
|
.setOperations(orderAndFilter(writeConfig, operations, pendingCompactionPlans)).build();
|
||||||
.build();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Order and Filter the list of compactions. Use the metrics captured with the captureMetrics to order and filter out
|
* Order and Filter the list of compactions. Use the metrics captured with the captureMetrics to order and filter out
|
||||||
* compactions
|
* compactions
|
||||||
*
|
*
|
||||||
* @param writeConfig config for this compaction is passed in
|
* @param writeConfig config for this compaction is passed in
|
||||||
* @param operations list of compactions collected
|
* @param operations list of compactions collected
|
||||||
* @param pendingCompactionPlans Pending Compaction Plans for strategy to schedule next compaction plan
|
* @param pendingCompactionPlans Pending Compaction Plans for strategy to schedule next compaction plan
|
||||||
* @return list of compactions to perform in this run
|
* @return list of compactions to perform in this run
|
||||||
*/
|
*/
|
||||||
public List<HoodieCompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig,
|
public List<HoodieCompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig,
|
||||||
List<HoodieCompactionOperation> operations,
|
List<HoodieCompactionOperation> operations, List<HoodieCompactionPlan> pendingCompactionPlans) {
|
||||||
List<HoodieCompactionPlan> pendingCompactionPlans) {
|
|
||||||
return operations;
|
return operations;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Filter the partition paths based on compaction strategy
|
* Filter the partition paths based on compaction strategy
|
||||||
|
*
|
||||||
* @param writeConfig
|
* @param writeConfig
|
||||||
* @param allPartitionPaths
|
* @param allPartitionPaths
|
||||||
* @return
|
* @return
|
||||||
|
|||||||
@@ -34,21 +34,18 @@ import org.apache.hudi.config.HoodieWriteConfig;
|
|||||||
import org.apache.hudi.exception.HoodieException;
|
import org.apache.hudi.exception.HoodieException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This strategy orders compactions in reverse order of creation of Hive Partitions. It helps to
|
* This strategy orders compactions in reverse order of creation of Hive Partitions. It helps to compact data in latest
|
||||||
* compact data in latest partitions first and then older capped at the Total_IO allowed.
|
* partitions first and then older capped at the Total_IO allowed.
|
||||||
*/
|
*/
|
||||||
public class DayBasedCompactionStrategy extends CompactionStrategy {
|
public class DayBasedCompactionStrategy extends CompactionStrategy {
|
||||||
|
|
||||||
// For now, use SimpleDateFormat as default partition format
|
// For now, use SimpleDateFormat as default partition format
|
||||||
protected static String datePartitionFormat = "yyyy/MM/dd";
|
protected static String datePartitionFormat = "yyyy/MM/dd";
|
||||||
// Sorts compaction in LastInFirstCompacted order
|
// Sorts compaction in LastInFirstCompacted order
|
||||||
protected static Comparator<String> comparator = (String leftPartition,
|
protected static Comparator<String> comparator = (String leftPartition, String rightPartition) -> {
|
||||||
String rightPartition) -> {
|
|
||||||
try {
|
try {
|
||||||
Date left = new SimpleDateFormat(datePartitionFormat, Locale.ENGLISH)
|
Date left = new SimpleDateFormat(datePartitionFormat, Locale.ENGLISH).parse(leftPartition);
|
||||||
.parse(leftPartition);
|
Date right = new SimpleDateFormat(datePartitionFormat, Locale.ENGLISH).parse(rightPartition);
|
||||||
Date right = new SimpleDateFormat(datePartitionFormat, Locale.ENGLISH)
|
|
||||||
.parse(rightPartition);
|
|
||||||
return left.after(right) ? -1 : right.after(left) ? 1 : 0;
|
return left.after(right) ? -1 : right.after(left) ? 1 : 0;
|
||||||
} catch (ParseException e) {
|
} catch (ParseException e) {
|
||||||
throw new HoodieException("Invalid Partition Date Format", e);
|
throw new HoodieException("Invalid Partition Date Format", e);
|
||||||
@@ -68,8 +65,7 @@ public class DayBasedCompactionStrategy extends CompactionStrategy {
|
|||||||
List<HoodieCompactionOperation> filteredList = operations.stream()
|
List<HoodieCompactionOperation> filteredList = operations.stream()
|
||||||
.collect(Collectors.groupingBy(HoodieCompactionOperation::getPartitionPath)).entrySet().stream()
|
.collect(Collectors.groupingBy(HoodieCompactionOperation::getPartitionPath)).entrySet().stream()
|
||||||
.sorted(Map.Entry.comparingByKey(comparator)).limit(writeConfig.getTargetPartitionsPerDayBasedCompaction())
|
.sorted(Map.Entry.comparingByKey(comparator)).limit(writeConfig.getTargetPartitionsPerDayBasedCompaction())
|
||||||
.flatMap(e -> e.getValue().stream())
|
.flatMap(e -> e.getValue().stream()).collect(Collectors.toList());
|
||||||
.collect(Collectors.toList());
|
|
||||||
return filteredList;
|
return filteredList;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -30,14 +30,14 @@ import org.apache.hudi.common.util.Option;
|
|||||||
import org.apache.hudi.config.HoodieWriteConfig;
|
import org.apache.hudi.config.HoodieWriteConfig;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* LogFileSizeBasedCompactionStrategy orders the compactions based on the total log files size and
|
* LogFileSizeBasedCompactionStrategy orders the compactions based on the total log files size and limits the
|
||||||
* limits the compactions within a configured IO bound
|
* compactions within a configured IO bound
|
||||||
*
|
*
|
||||||
* @see BoundedIOCompactionStrategy
|
* @see BoundedIOCompactionStrategy
|
||||||
* @see CompactionStrategy
|
* @see CompactionStrategy
|
||||||
*/
|
*/
|
||||||
public class LogFileSizeBasedCompactionStrategy extends BoundedIOCompactionStrategy implements
|
public class LogFileSizeBasedCompactionStrategy extends BoundedIOCompactionStrategy
|
||||||
Comparator<HoodieCompactionOperation> {
|
implements Comparator<HoodieCompactionOperation> {
|
||||||
|
|
||||||
private static final String TOTAL_LOG_FILE_SIZE = "TOTAL_LOG_FILE_SIZE";
|
private static final String TOTAL_LOG_FILE_SIZE = "TOTAL_LOG_FILE_SIZE";
|
||||||
|
|
||||||
@@ -47,9 +47,8 @@ public class LogFileSizeBasedCompactionStrategy extends BoundedIOCompactionStrat
|
|||||||
Map<String, Double> metrics = super.captureMetrics(config, dataFile, partitionPath, logFiles);
|
Map<String, Double> metrics = super.captureMetrics(config, dataFile, partitionPath, logFiles);
|
||||||
|
|
||||||
// Total size of all the log files
|
// Total size of all the log files
|
||||||
Long totalLogFileSize = logFiles.stream().map(HoodieLogFile::getFileSize)
|
Long totalLogFileSize = logFiles.stream().map(HoodieLogFile::getFileSize).filter(size -> size >= 0)
|
||||||
.filter(size -> size >= 0).reduce((size1, size2) -> size1 + size2)
|
.reduce((size1, size2) -> size1 + size2).orElse(0L);
|
||||||
.orElse(0L);
|
|
||||||
// save the metrics needed during the order
|
// save the metrics needed during the order
|
||||||
metrics.put(TOTAL_LOG_FILE_SIZE, totalLogFileSize.doubleValue());
|
metrics.put(TOTAL_LOG_FILE_SIZE, totalLogFileSize.doubleValue());
|
||||||
return metrics;
|
return metrics;
|
||||||
@@ -59,9 +58,8 @@ public class LogFileSizeBasedCompactionStrategy extends BoundedIOCompactionStrat
|
|||||||
public List<HoodieCompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig,
|
public List<HoodieCompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig,
|
||||||
List<HoodieCompactionOperation> operations, List<HoodieCompactionPlan> pendingCompactionPlans) {
|
List<HoodieCompactionOperation> operations, List<HoodieCompactionPlan> pendingCompactionPlans) {
|
||||||
// Order the operations based on the reverse size of the logs and limit them by the IO
|
// Order the operations based on the reverse size of the logs and limit them by the IO
|
||||||
return super
|
return super.orderAndFilter(writeConfig, operations.stream().sorted(this).collect(Collectors.toList()),
|
||||||
.orderAndFilter(writeConfig,
|
pendingCompactionPlans);
|
||||||
operations.stream().sorted(this).collect(Collectors.toList()), pendingCompactionPlans);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|||||||
@@ -24,9 +24,8 @@ import org.apache.hudi.avro.model.HoodieCompactionPlan;
|
|||||||
import org.apache.hudi.config.HoodieWriteConfig;
|
import org.apache.hudi.config.HoodieWriteConfig;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* UnBoundedCompactionStrategy will not change ordering or filter any compaction. It is a
|
* UnBoundedCompactionStrategy will not change ordering or filter any compaction. It is a pass-through and will compact
|
||||||
* pass-through and will compact all the base files which has a log file. This usually means
|
* all the base files which has a log file. This usually means no-intelligence on compaction.
|
||||||
* no-intelligence on compaction.
|
|
||||||
*
|
*
|
||||||
* @see CompactionStrategy
|
* @see CompactionStrategy
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -27,12 +27,11 @@ import org.apache.hudi.avro.model.HoodieCompactionPlan;
|
|||||||
import org.apache.hudi.config.HoodieWriteConfig;
|
import org.apache.hudi.config.HoodieWriteConfig;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* UnBoundedPartitionAwareCompactionStrategy is a custom UnBounded Strategy.
|
* UnBoundedPartitionAwareCompactionStrategy is a custom UnBounded Strategy. This will filter all the partitions that
|
||||||
* This will filter all the partitions that are eligible to be compacted by a
|
* are eligible to be compacted by a {@link BoundedPartitionAwareCompactionStrategy} and return the result. This is done
|
||||||
* {@link BoundedPartitionAwareCompactionStrategy} and return the result.
|
* so that a long running UnBoundedPartitionAwareCompactionStrategy does not step over partitions in a shorter running
|
||||||
* This is done so that a long running UnBoundedPartitionAwareCompactionStrategy does not step over partitions
|
* BoundedPartitionAwareCompactionStrategy. Essentially, this is an inverse of the partitions chosen in
|
||||||
* in a shorter running BoundedPartitionAwareCompactionStrategy. Essentially, this is an inverse of the
|
* BoundedPartitionAwareCompactionStrategy
|
||||||
* partitions chosen in BoundedPartitionAwareCompactionStrategy
|
|
||||||
*
|
*
|
||||||
* @see CompactionStrategy
|
* @see CompactionStrategy
|
||||||
*/
|
*/
|
||||||
@@ -41,10 +40,10 @@ public class UnBoundedPartitionAwareCompactionStrategy extends CompactionStrateg
|
|||||||
@Override
|
@Override
|
||||||
public List<HoodieCompactionOperation> orderAndFilter(HoodieWriteConfig config,
|
public List<HoodieCompactionOperation> orderAndFilter(HoodieWriteConfig config,
|
||||||
final List<HoodieCompactionOperation> operations, final List<HoodieCompactionPlan> pendingCompactionWorkloads) {
|
final List<HoodieCompactionOperation> operations, final List<HoodieCompactionPlan> pendingCompactionWorkloads) {
|
||||||
BoundedPartitionAwareCompactionStrategy boundedPartitionAwareCompactionStrategy
|
BoundedPartitionAwareCompactionStrategy boundedPartitionAwareCompactionStrategy =
|
||||||
= new BoundedPartitionAwareCompactionStrategy();
|
new BoundedPartitionAwareCompactionStrategy();
|
||||||
List<HoodieCompactionOperation> operationsToExclude = boundedPartitionAwareCompactionStrategy
|
List<HoodieCompactionOperation> operationsToExclude =
|
||||||
.orderAndFilter(config, operations, pendingCompactionWorkloads);
|
boundedPartitionAwareCompactionStrategy.orderAndFilter(config, operations, pendingCompactionWorkloads);
|
||||||
List<HoodieCompactionOperation> allOperations = new ArrayList<>(operations);
|
List<HoodieCompactionOperation> allOperations = new ArrayList<>(operations);
|
||||||
allOperations.removeAll(operationsToExclude);
|
allOperations.removeAll(operationsToExclude);
|
||||||
return allOperations;
|
return allOperations;
|
||||||
@@ -52,13 +51,13 @@ public class UnBoundedPartitionAwareCompactionStrategy extends CompactionStrateg
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<String> filterPartitionPaths(HoodieWriteConfig writeConfig, List<String> partitionPaths) {
|
public List<String> filterPartitionPaths(HoodieWriteConfig writeConfig, List<String> partitionPaths) {
|
||||||
List<String> allPartitionPaths = partitionPaths.stream().map(partition -> partition.replace("/", "-"))
|
List<String> allPartitionPaths =
|
||||||
.sorted(Comparator.reverseOrder()).map(partitionPath -> partitionPath.replace("-", "/"))
|
partitionPaths.stream().map(partition -> partition.replace("/", "-")).sorted(Comparator.reverseOrder())
|
||||||
.collect(Collectors.toList());
|
.map(partitionPath -> partitionPath.replace("-", "/")).collect(Collectors.toList());
|
||||||
BoundedPartitionAwareCompactionStrategy boundedPartitionAwareCompactionStrategy
|
BoundedPartitionAwareCompactionStrategy boundedPartitionAwareCompactionStrategy =
|
||||||
= new BoundedPartitionAwareCompactionStrategy();
|
new BoundedPartitionAwareCompactionStrategy();
|
||||||
List<String> partitionsToExclude = boundedPartitionAwareCompactionStrategy.filterPartitionPaths(writeConfig,
|
List<String> partitionsToExclude =
|
||||||
partitionPaths);
|
boundedPartitionAwareCompactionStrategy.filterPartitionPaths(writeConfig, partitionPaths);
|
||||||
allPartitionPaths.removeAll(partitionsToExclude);
|
allPartitionPaths.removeAll(partitionsToExclude);
|
||||||
return allPartitionPaths;
|
return allPartitionPaths;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -32,9 +32,8 @@ public class HoodieParquetConfig {
|
|||||||
private Configuration hadoopConf;
|
private Configuration hadoopConf;
|
||||||
private double compressionRatio;
|
private double compressionRatio;
|
||||||
|
|
||||||
public HoodieParquetConfig(HoodieAvroWriteSupport writeSupport,
|
public HoodieParquetConfig(HoodieAvroWriteSupport writeSupport, CompressionCodecName compressionCodecName,
|
||||||
CompressionCodecName compressionCodecName, int blockSize, int pageSize, long maxFileSize,
|
int blockSize, int pageSize, long maxFileSize, Configuration hadoopConf, double compressionRatio) {
|
||||||
Configuration hadoopConf, double compressionRatio) {
|
|
||||||
this.writeSupport = writeSupport;
|
this.writeSupport = writeSupport;
|
||||||
this.compressionCodecName = compressionCodecName;
|
this.compressionCodecName = compressionCodecName;
|
||||||
this.blockSize = blockSize;
|
this.blockSize = blockSize;
|
||||||
|
|||||||
@@ -36,11 +36,11 @@ import org.apache.parquet.hadoop.ParquetWriter;
|
|||||||
import org.apache.spark.TaskContext;
|
import org.apache.spark.TaskContext;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* HoodieParquetWriter extends the ParquetWriter to help limit the size of underlying file. Provides
|
* HoodieParquetWriter extends the ParquetWriter to help limit the size of underlying file. Provides a way to check if
|
||||||
* a way to check if the current file can take more records with the <code>canWrite()</code>
|
* the current file can take more records with the <code>canWrite()</code>
|
||||||
*/
|
*/
|
||||||
public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends IndexedRecord> extends
|
public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends IndexedRecord>
|
||||||
ParquetWriter<IndexedRecord> implements HoodieStorageWriter<R> {
|
extends ParquetWriter<IndexedRecord> implements HoodieStorageWriter<R> {
|
||||||
|
|
||||||
private static AtomicLong recordIndex = new AtomicLong(1);
|
private static AtomicLong recordIndex = new AtomicLong(1);
|
||||||
|
|
||||||
@@ -52,24 +52,22 @@ public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends Indexe
|
|||||||
private final Schema schema;
|
private final Schema schema;
|
||||||
|
|
||||||
|
|
||||||
public HoodieParquetWriter(String commitTime, Path file, HoodieParquetConfig parquetConfig,
|
public HoodieParquetWriter(String commitTime, Path file, HoodieParquetConfig parquetConfig, Schema schema)
|
||||||
Schema schema) throws IOException {
|
throws IOException {
|
||||||
super(HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()),
|
super(HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()),
|
||||||
ParquetFileWriter.Mode.CREATE, parquetConfig.getWriteSupport(),
|
ParquetFileWriter.Mode.CREATE, parquetConfig.getWriteSupport(), parquetConfig.getCompressionCodecName(),
|
||||||
parquetConfig.getCompressionCodecName(), parquetConfig.getBlockSize(),
|
parquetConfig.getBlockSize(), parquetConfig.getPageSize(), parquetConfig.getPageSize(),
|
||||||
parquetConfig.getPageSize(), parquetConfig.getPageSize(),
|
|
||||||
ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED, ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED,
|
ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED, ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED,
|
||||||
ParquetWriter.DEFAULT_WRITER_VERSION,
|
ParquetWriter.DEFAULT_WRITER_VERSION, registerFileSystem(file, parquetConfig.getHadoopConf()));
|
||||||
registerFileSystem(file, parquetConfig.getHadoopConf()));
|
|
||||||
this.file = HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf());
|
this.file = HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf());
|
||||||
this.fs = (HoodieWrapperFileSystem) this.file
|
this.fs =
|
||||||
.getFileSystem(registerFileSystem(file, parquetConfig.getHadoopConf()));
|
(HoodieWrapperFileSystem) this.file.getFileSystem(registerFileSystem(file, parquetConfig.getHadoopConf()));
|
||||||
// We cannot accurately measure the snappy compressed output file size. We are choosing a
|
// We cannot accurately measure the snappy compressed output file size. We are choosing a
|
||||||
// conservative 10%
|
// conservative 10%
|
||||||
// TODO - compute this compression ratio dynamically by looking at the bytes written to the
|
// TODO - compute this compression ratio dynamically by looking at the bytes written to the
|
||||||
// stream and the actual file size reported by HDFS
|
// stream and the actual file size reported by HDFS
|
||||||
this.maxFileSize = parquetConfig.getMaxFileSize() + Math
|
this.maxFileSize = parquetConfig.getMaxFileSize()
|
||||||
.round(parquetConfig.getMaxFileSize() * parquetConfig.getCompressionRatio());
|
+ Math.round(parquetConfig.getMaxFileSize() * parquetConfig.getCompressionRatio());
|
||||||
this.writeSupport = parquetConfig.getWriteSupport();
|
this.writeSupport = parquetConfig.getWriteSupport();
|
||||||
this.commitTime = commitTime;
|
this.commitTime = commitTime;
|
||||||
this.schema = schema;
|
this.schema = schema;
|
||||||
@@ -85,10 +83,10 @@ public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends Indexe
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void writeAvroWithMetadata(R avroRecord, HoodieRecord record) throws IOException {
|
public void writeAvroWithMetadata(R avroRecord, HoodieRecord record) throws IOException {
|
||||||
String seqId = HoodieRecord.generateSequenceId(commitTime, TaskContext.getPartitionId(),
|
String seqId =
|
||||||
recordIndex.getAndIncrement());
|
HoodieRecord.generateSequenceId(commitTime, TaskContext.getPartitionId(), recordIndex.getAndIncrement());
|
||||||
HoodieAvroUtils.addHoodieKeyToRecord((GenericRecord) avroRecord, record.getRecordKey(),
|
HoodieAvroUtils.addHoodieKeyToRecord((GenericRecord) avroRecord, record.getRecordKey(), record.getPartitionPath(),
|
||||||
record.getPartitionPath(), file.getName());
|
file.getName());
|
||||||
HoodieAvroUtils.addCommitMetadataToRecord((GenericRecord) avroRecord, commitTime, seqId);
|
HoodieAvroUtils.addCommitMetadataToRecord((GenericRecord) avroRecord, commitTime, seqId);
|
||||||
super.write(avroRecord);
|
super.write(avroRecord);
|
||||||
writeSupport.add(record.getRecordKey());
|
writeSupport.add(record.getRecordKey());
|
||||||
|
|||||||
@@ -36,8 +36,8 @@ import org.apache.parquet.avro.AvroSchemaConverter;
|
|||||||
public class HoodieStorageWriterFactory {
|
public class HoodieStorageWriterFactory {
|
||||||
|
|
||||||
public static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieStorageWriter<R> getStorageWriter(
|
public static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieStorageWriter<R> getStorageWriter(
|
||||||
String commitTime, Path path, HoodieTable<T> hoodieTable,
|
String commitTime, Path path, HoodieTable<T> hoodieTable, HoodieWriteConfig config, Schema schema)
|
||||||
HoodieWriteConfig config, Schema schema) throws IOException {
|
throws IOException {
|
||||||
final String name = path.getName();
|
final String name = path.getName();
|
||||||
final String extension = FSUtils.isLogFile(path) ? HOODIE_LOG.getFileExtension() : FSUtils.getFileExtension(name);
|
final String extension = FSUtils.isLogFile(path) ? HOODIE_LOG.getFileExtension() : FSUtils.getFileExtension(name);
|
||||||
if (PARQUET.getFileExtension().equals(extension)) {
|
if (PARQUET.getFileExtension().equals(extension)) {
|
||||||
@@ -46,19 +46,16 @@ public class HoodieStorageWriterFactory {
|
|||||||
throw new UnsupportedOperationException(extension + " format not supported yet.");
|
throw new UnsupportedOperationException(extension + " format not supported yet.");
|
||||||
}
|
}
|
||||||
|
|
||||||
private static <T extends HoodieRecordPayload,
|
private static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieStorageWriter<R> newParquetStorageWriter(
|
||||||
R extends IndexedRecord> HoodieStorageWriter<R> newParquetStorageWriter(String commitTime, Path path,
|
String commitTime, Path path, HoodieWriteConfig config, Schema schema, HoodieTable hoodieTable)
|
||||||
HoodieWriteConfig config, Schema schema, HoodieTable hoodieTable) throws IOException {
|
throws IOException {
|
||||||
BloomFilter filter = new BloomFilter(config.getBloomFilterNumEntries(),
|
BloomFilter filter = new BloomFilter(config.getBloomFilterNumEntries(), config.getBloomFilterFPP());
|
||||||
config.getBloomFilterFPP());
|
HoodieAvroWriteSupport writeSupport =
|
||||||
HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(
|
new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter);
|
||||||
new AvroSchemaConverter().convert(schema), schema, filter);
|
|
||||||
|
|
||||||
HoodieParquetConfig parquetConfig =
|
HoodieParquetConfig parquetConfig = new HoodieParquetConfig(writeSupport, config.getParquetCompressionCodec(),
|
||||||
new HoodieParquetConfig(writeSupport, config.getParquetCompressionCodec(),
|
config.getParquetBlockSize(), config.getParquetPageSize(), config.getParquetMaxFileSize(),
|
||||||
config.getParquetBlockSize(), config.getParquetPageSize(),
|
hoodieTable.getHadoopConf(), config.getParquetCompressionRatio());
|
||||||
config.getParquetMaxFileSize(), hoodieTable.getHadoopConf(),
|
|
||||||
config.getParquetCompressionRatio());
|
|
||||||
|
|
||||||
return new HoodieParquetWriter<>(commitTime, path, parquetConfig, schema);
|
return new HoodieParquetWriter<>(commitTime, path, parquetConfig, schema);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -118,8 +118,8 @@ public class HoodieMetrics {
|
|||||||
return indexTimer == null ? null : indexTimer.time();
|
return indexTimer == null ? null : indexTimer.time();
|
||||||
}
|
}
|
||||||
|
|
||||||
public void updateCommitMetrics(long commitEpochTimeInMs, long durationInMs,
|
public void updateCommitMetrics(long commitEpochTimeInMs, long durationInMs, HoodieCommitMetadata metadata,
|
||||||
HoodieCommitMetadata metadata, String actionType) {
|
String actionType) {
|
||||||
if (config.isMetricsOn()) {
|
if (config.isMetricsOn()) {
|
||||||
long totalPartitionsWritten = metadata.fetchTotalPartitionsWritten();
|
long totalPartitionsWritten = metadata.fetchTotalPartitionsWritten();
|
||||||
long totalFilesInsert = metadata.fetchTotalFilesInsert();
|
long totalFilesInsert = metadata.fetchTotalFilesInsert();
|
||||||
@@ -154,9 +154,8 @@ public class HoodieMetrics {
|
|||||||
|
|
||||||
public void updateRollbackMetrics(long durationInMs, long numFilesDeleted) {
|
public void updateRollbackMetrics(long durationInMs, long numFilesDeleted) {
|
||||||
if (config.isMetricsOn()) {
|
if (config.isMetricsOn()) {
|
||||||
logger.info(String
|
logger.info(
|
||||||
.format("Sending rollback metrics (duration=%d, numFilesDeleted=%d)", durationInMs,
|
String.format("Sending rollback metrics (duration=%d, numFilesDeleted=%d)", durationInMs, numFilesDeleted));
|
||||||
numFilesDeleted));
|
|
||||||
Metrics.registerGauge(getMetricsName("rollback", "duration"), durationInMs);
|
Metrics.registerGauge(getMetricsName("rollback", "duration"), durationInMs);
|
||||||
Metrics.registerGauge(getMetricsName("rollback", "numFilesDeleted"), numFilesDeleted);
|
Metrics.registerGauge(getMetricsName("rollback", "numFilesDeleted"), numFilesDeleted);
|
||||||
}
|
}
|
||||||
@@ -164,9 +163,8 @@ public class HoodieMetrics {
|
|||||||
|
|
||||||
public void updateCleanMetrics(long durationInMs, int numFilesDeleted) {
|
public void updateCleanMetrics(long durationInMs, int numFilesDeleted) {
|
||||||
if (config.isMetricsOn()) {
|
if (config.isMetricsOn()) {
|
||||||
logger.info(String
|
logger.info(
|
||||||
.format("Sending clean metrics (duration=%d, numFilesDeleted=%d)", durationInMs,
|
String.format("Sending clean metrics (duration=%d, numFilesDeleted=%d)", durationInMs, numFilesDeleted));
|
||||||
numFilesDeleted));
|
|
||||||
Metrics.registerGauge(getMetricsName("clean", "duration"), durationInMs);
|
Metrics.registerGauge(getMetricsName("clean", "duration"), durationInMs);
|
||||||
Metrics.registerGauge(getMetricsName("clean", "numFilesDeleted"), numFilesDeleted);
|
Metrics.registerGauge(getMetricsName("clean", "numFilesDeleted"), numFilesDeleted);
|
||||||
}
|
}
|
||||||
@@ -174,20 +172,17 @@ public class HoodieMetrics {
|
|||||||
|
|
||||||
public void updateFinalizeWriteMetrics(long durationInMs, long numFilesFinalized) {
|
public void updateFinalizeWriteMetrics(long durationInMs, long numFilesFinalized) {
|
||||||
if (config.isMetricsOn()) {
|
if (config.isMetricsOn()) {
|
||||||
logger.info(String
|
logger.info(String.format("Sending finalize write metrics (duration=%d, numFilesFinalized=%d)", durationInMs,
|
||||||
.format("Sending finalize write metrics (duration=%d, numFilesFinalized=%d)",
|
numFilesFinalized));
|
||||||
durationInMs, numFilesFinalized));
|
|
||||||
Metrics.registerGauge(getMetricsName("finalize", "duration"), durationInMs);
|
Metrics.registerGauge(getMetricsName("finalize", "duration"), durationInMs);
|
||||||
Metrics.registerGauge(getMetricsName("finalize", "numFilesFinalized"), numFilesFinalized);
|
Metrics.registerGauge(getMetricsName("finalize", "numFilesFinalized"), numFilesFinalized);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void updateIndexMetrics(final String action,final long durationInMs) {
|
public void updateIndexMetrics(final String action, final long durationInMs) {
|
||||||
if (config.isMetricsOn()) {
|
if (config.isMetricsOn()) {
|
||||||
logger.info(String
|
logger.info(String.format("Sending index metrics (%s.duration, %d)", action, durationInMs));
|
||||||
.format("Sending index metrics (%s.duration, %d)",action, durationInMs));
|
Metrics.registerGauge(getMetricsName("index", String.format("%s.duration", action)), durationInMs);
|
||||||
Metrics.registerGauge(getMetricsName("index", String.format("%s.duration", action)),
|
|
||||||
durationInMs);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -202,4 +197,4 @@ public class HoodieMetrics {
|
|||||||
public long getDurationInMs(long ctxDuration) {
|
public long getDurationInMs(long ctxDuration) {
|
||||||
return ctxDuration / 1000000;
|
return ctxDuration / 1000000;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -26,12 +26,10 @@ import java.io.Closeable;
|
|||||||
public class InMemoryMetricsReporter extends MetricsReporter {
|
public class InMemoryMetricsReporter extends MetricsReporter {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void start() {
|
public void start() {}
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void report() {
|
public void report() {}
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Closeable getReporter() {
|
public Closeable getReporter() {
|
||||||
|
|||||||
@@ -30,8 +30,7 @@ import org.apache.log4j.LogManager;
|
|||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Implementation of Graphite reporter, which connects to the Graphite server, and send metrics to
|
* Implementation of Graphite reporter, which connects to the Graphite server, and send metrics to that server.
|
||||||
* that server.
|
|
||||||
*/
|
*/
|
||||||
public class MetricsGraphiteReporter extends MetricsReporter {
|
public class MetricsGraphiteReporter extends MetricsReporter {
|
||||||
|
|
||||||
@@ -50,9 +49,8 @@ public class MetricsGraphiteReporter extends MetricsReporter {
|
|||||||
this.serverHost = config.getGraphiteServerHost();
|
this.serverHost = config.getGraphiteServerHost();
|
||||||
this.serverPort = config.getGraphiteServerPort();
|
this.serverPort = config.getGraphiteServerPort();
|
||||||
if (serverHost == null || serverPort == 0) {
|
if (serverHost == null || serverPort == 0) {
|
||||||
throw new RuntimeException(String
|
throw new RuntimeException(String.format("Graphite cannot be initialized with serverHost[%s] and serverPort[%s].",
|
||||||
.format("Graphite cannot be initialized with serverHost[%s] and serverPort[%s].",
|
serverHost, serverPort));
|
||||||
serverHost, serverPort));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
this.graphiteReporter = createGraphiteReport();
|
this.graphiteReporter = createGraphiteReport();
|
||||||
@@ -84,8 +82,7 @@ public class MetricsGraphiteReporter extends MetricsReporter {
|
|||||||
private GraphiteReporter createGraphiteReport() {
|
private GraphiteReporter createGraphiteReport() {
|
||||||
Graphite graphite = new Graphite(new InetSocketAddress(serverHost, serverPort));
|
Graphite graphite = new Graphite(new InetSocketAddress(serverHost, serverPort));
|
||||||
String reporterPrefix = config.getGraphiteMetricPrefix();
|
String reporterPrefix = config.getGraphiteMetricPrefix();
|
||||||
return GraphiteReporter.forRegistry(registry).prefixedWith(reporterPrefix)
|
return GraphiteReporter.forRegistry(registry).prefixedWith(reporterPrefix).convertRatesTo(TimeUnit.SECONDS)
|
||||||
.convertRatesTo(TimeUnit.SECONDS).convertDurationsTo(TimeUnit.MILLISECONDS)
|
.convertDurationsTo(TimeUnit.MILLISECONDS).filter(MetricFilter.ALL).build(graphite);
|
||||||
.filter(MetricFilter.ALL).build(graphite);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -19,8 +19,7 @@
|
|||||||
package org.apache.hudi.metrics;
|
package org.apache.hudi.metrics;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Types of the reporter. Right now we only support Graphite. We can include JMX and CSV in the
|
* Types of the reporter. Right now we only support Graphite. We can include JMX and CSV in the future.
|
||||||
* future.
|
|
||||||
*/
|
*/
|
||||||
public enum MetricsReporterType {
|
public enum MetricsReporterType {
|
||||||
GRAPHITE, INMEMORY
|
GRAPHITE, INMEMORY
|
||||||
|
|||||||
@@ -82,8 +82,7 @@ import scala.Tuple2;
|
|||||||
/**
|
/**
|
||||||
* Implementation of a very heavily read-optimized Hoodie Table where
|
* Implementation of a very heavily read-optimized Hoodie Table where
|
||||||
* <p>
|
* <p>
|
||||||
* INSERTS - Produce new files, block aligned to desired size (or) Merge with the smallest existing
|
* INSERTS - Produce new files, block aligned to desired size (or) Merge with the smallest existing file, to expand it
|
||||||
* file, to expand it
|
|
||||||
* <p>
|
* <p>
|
||||||
* UPDATES - Produce a new version of the file, just replacing the updated records with new values
|
* UPDATES - Produce a new version of the file, just replacing the updated records with new values
|
||||||
*/
|
*/
|
||||||
@@ -95,31 +94,28 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
super(config, jsc);
|
super(config, jsc);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static PairFlatMapFunction<Iterator<Tuple2<String, String>>, String,
|
private static PairFlatMapFunction<Iterator<Tuple2<String, String>>, String, PartitionCleanStat> deleteFilesFunc(
|
||||||
PartitionCleanStat> deleteFilesFunc(
|
|
||||||
HoodieTable table) {
|
HoodieTable table) {
|
||||||
return (PairFlatMapFunction<Iterator<Tuple2<String, String>>, String, PartitionCleanStat>)
|
return (PairFlatMapFunction<Iterator<Tuple2<String, String>>, String, PartitionCleanStat>) iter -> {
|
||||||
iter -> {
|
Map<String, PartitionCleanStat> partitionCleanStatMap = new HashMap<>();
|
||||||
Map<String, PartitionCleanStat> partitionCleanStatMap = new HashMap<>();
|
|
||||||
|
|
||||||
FileSystem fs = table.getMetaClient().getFs();
|
FileSystem fs = table.getMetaClient().getFs();
|
||||||
while (iter.hasNext()) {
|
while (iter.hasNext()) {
|
||||||
Tuple2<String, String> partitionDelFileTuple = iter.next();
|
Tuple2<String, String> partitionDelFileTuple = iter.next();
|
||||||
String partitionPath = partitionDelFileTuple._1();
|
String partitionPath = partitionDelFileTuple._1();
|
||||||
String deletePathStr = partitionDelFileTuple._2();
|
String deletePathStr = partitionDelFileTuple._2();
|
||||||
Boolean deletedFileResult = deleteFileAndGetResult(fs, deletePathStr);
|
Boolean deletedFileResult = deleteFileAndGetResult(fs, deletePathStr);
|
||||||
if (!partitionCleanStatMap.containsKey(partitionPath)) {
|
if (!partitionCleanStatMap.containsKey(partitionPath)) {
|
||||||
partitionCleanStatMap.put(partitionPath, new PartitionCleanStat(partitionPath));
|
partitionCleanStatMap.put(partitionPath, new PartitionCleanStat(partitionPath));
|
||||||
}
|
}
|
||||||
PartitionCleanStat partitionCleanStat = partitionCleanStatMap.get(partitionPath);
|
PartitionCleanStat partitionCleanStat = partitionCleanStatMap.get(partitionPath);
|
||||||
partitionCleanStat.addDeleteFilePatterns(deletePathStr);
|
partitionCleanStat.addDeleteFilePatterns(deletePathStr);
|
||||||
partitionCleanStat.addDeletedFileResult(deletePathStr, deletedFileResult);
|
partitionCleanStat.addDeletedFileResult(deletePathStr, deletedFileResult);
|
||||||
}
|
}
|
||||||
|
|
||||||
return partitionCleanStatMap.entrySet().stream()
|
return partitionCleanStatMap.entrySet().stream().map(e -> new Tuple2<>(e.getKey(), e.getValue()))
|
||||||
.map(e -> new Tuple2<>(e.getKey(), e.getValue()))
|
.collect(Collectors.toList()).iterator();
|
||||||
.collect(Collectors.toList()).iterator();
|
};
|
||||||
};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static PairFlatMapFunction<String, String, String> getFilesToDeleteFunc(HoodieTable table,
|
private static PairFlatMapFunction<String, String, String> getFilesToDeleteFunc(HoodieTable table,
|
||||||
@@ -131,8 +127,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Boolean deleteFileAndGetResult(FileSystem fs, String deletePathStr)
|
private static Boolean deleteFileAndGetResult(FileSystem fs, String deletePathStr) throws IOException {
|
||||||
throws IOException {
|
|
||||||
Path deletePath = new Path(deletePathStr);
|
Path deletePath = new Path(deletePathStr);
|
||||||
logger.debug("Working on delete path :" + deletePath);
|
logger.debug("Working on delete path :" + deletePath);
|
||||||
boolean deleteResult = fs.delete(deletePath, false);
|
boolean deleteResult = fs.delete(deletePath, false);
|
||||||
@@ -171,8 +166,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
throw new HoodieNotSupportedException("Compaction is not supported from a CopyOnWrite table");
|
throw new HoodieNotSupportedException("Compaction is not supported from a CopyOnWrite table");
|
||||||
}
|
}
|
||||||
|
|
||||||
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileId,
|
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileId, Iterator<HoodieRecord<T>> recordItr)
|
||||||
Iterator<HoodieRecord<T>> recordItr) throws IOException {
|
throws IOException {
|
||||||
// This is needed since sometimes some buckets are never picked in getPartition() and end up with 0 records
|
// This is needed since sometimes some buckets are never picked in getPartition() and end up with 0 records
|
||||||
if (!recordItr.hasNext()) {
|
if (!recordItr.hasNext()) {
|
||||||
logger.info("Empty partition with fileId => " + fileId);
|
logger.info("Empty partition with fileId => " + fileId);
|
||||||
@@ -190,17 +185,16 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
return handleUpdateInternal(upsertHandle, commitTime, fileId);
|
return handleUpdateInternal(upsertHandle, commitTime, fileId);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Iterator<List<WriteStatus>> handleUpdateInternal(HoodieMergeHandle upsertHandle,
|
protected Iterator<List<WriteStatus>> handleUpdateInternal(HoodieMergeHandle upsertHandle, String commitTime,
|
||||||
String commitTime, String fileId)
|
String fileId) throws IOException {
|
||||||
throws IOException {
|
|
||||||
if (upsertHandle.getOldFilePath() == null) {
|
if (upsertHandle.getOldFilePath() == null) {
|
||||||
throw new HoodieUpsertException(
|
throw new HoodieUpsertException(
|
||||||
"Error in finding the old file path at commit " + commitTime + " for fileId: " + fileId);
|
"Error in finding the old file path at commit " + commitTime + " for fileId: " + fileId);
|
||||||
} else {
|
} else {
|
||||||
AvroReadSupport.setAvroReadSchema(getHadoopConf(), upsertHandle.getWriterSchema());
|
AvroReadSupport.setAvroReadSchema(getHadoopConf(), upsertHandle.getWriterSchema());
|
||||||
BoundedInMemoryExecutor<GenericRecord, GenericRecord, Void> wrapper = null;
|
BoundedInMemoryExecutor<GenericRecord, GenericRecord, Void> wrapper = null;
|
||||||
try (ParquetReader<IndexedRecord> reader = AvroParquetReader.<IndexedRecord>builder(upsertHandle.getOldFilePath())
|
try (ParquetReader<IndexedRecord> reader =
|
||||||
.withConf(getHadoopConf()).build()) {
|
AvroParquetReader.<IndexedRecord>builder(upsertHandle.getOldFilePath()).withConf(getHadoopConf()).build()) {
|
||||||
wrapper = new SparkBoundedInMemoryExecutor(config, new ParquetReaderIterator(reader),
|
wrapper = new SparkBoundedInMemoryExecutor(config, new ParquetReaderIterator(reader),
|
||||||
new UpdateHandler(upsertHandle), x -> x);
|
new UpdateHandler(upsertHandle), x -> x);
|
||||||
wrapper.execute();
|
wrapper.execute();
|
||||||
@@ -214,17 +208,15 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//TODO(vc): This needs to be revisited
|
// TODO(vc): This needs to be revisited
|
||||||
if (upsertHandle.getWriteStatus().getPartitionPath() == null) {
|
if (upsertHandle.getWriteStatus().getPartitionPath() == null) {
|
||||||
logger.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath()
|
logger.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath() + ", "
|
||||||
+ ", " + upsertHandle.getWriteStatus());
|
+ upsertHandle.getWriteStatus());
|
||||||
}
|
}
|
||||||
return Collections.singletonList(Collections.singletonList(upsertHandle.getWriteStatus()))
|
return Collections.singletonList(Collections.singletonList(upsertHandle.getWriteStatus())).iterator();
|
||||||
.iterator();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileId,
|
protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileId, Iterator<HoodieRecord<T>> recordItr) {
|
||||||
Iterator<HoodieRecord<T>> recordItr) {
|
|
||||||
return new HoodieMergeHandle<>(config, commitTime, this, recordItr, fileId);
|
return new HoodieMergeHandle<>(config, commitTime, this, recordItr, fileId);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -233,8 +225,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
return new HoodieMergeHandle<>(config, commitTime, this, keyToNewRecords, fileId, dataFileToBeMerged);
|
return new HoodieMergeHandle<>(config, commitTime, this, keyToNewRecords, fileId, dataFileToBeMerged);
|
||||||
}
|
}
|
||||||
|
|
||||||
public Iterator<List<WriteStatus>> handleInsert(String commitTime, String idPfx,
|
public Iterator<List<WriteStatus>> handleInsert(String commitTime, String idPfx, Iterator<HoodieRecord<T>> recordItr)
|
||||||
Iterator<HoodieRecord<T>> recordItr) throws Exception {
|
throws Exception {
|
||||||
// This is needed since sometimes some buckets are never picked in getPartition() and end up with 0 records
|
// This is needed since sometimes some buckets are never picked in getPartition() and end up with 0 records
|
||||||
if (!recordItr.hasNext()) {
|
if (!recordItr.hasNext()) {
|
||||||
logger.info("Empty partition");
|
logger.info("Empty partition");
|
||||||
@@ -245,16 +237,16 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
|
|
||||||
public Iterator<List<WriteStatus>> handleInsert(String commitTime, String partitionPath, String fileId,
|
public Iterator<List<WriteStatus>> handleInsert(String commitTime, String partitionPath, String fileId,
|
||||||
Iterator<HoodieRecord<T>> recordItr) {
|
Iterator<HoodieRecord<T>> recordItr) {
|
||||||
HoodieCreateHandle createHandle = new HoodieCreateHandle(config, commitTime, this, partitionPath, fileId,
|
HoodieCreateHandle createHandle =
|
||||||
recordItr);
|
new HoodieCreateHandle(config, commitTime, this, partitionPath, fileId, recordItr);
|
||||||
createHandle.write();
|
createHandle.write();
|
||||||
return Collections.singletonList(Collections.singletonList(createHandle.close())).iterator();
|
return Collections.singletonList(Collections.singletonList(createHandle.close())).iterator();
|
||||||
}
|
}
|
||||||
|
|
||||||
@SuppressWarnings("unchecked")
|
@SuppressWarnings("unchecked")
|
||||||
@Override
|
@Override
|
||||||
public Iterator<List<WriteStatus>> handleUpsertPartition(String commitTime, Integer partition,
|
public Iterator<List<WriteStatus>> handleUpsertPartition(String commitTime, Integer partition, Iterator recordItr,
|
||||||
Iterator recordItr, Partitioner partitioner) {
|
Partitioner partitioner) {
|
||||||
UpsertPartitioner upsertPartitioner = (UpsertPartitioner) partitioner;
|
UpsertPartitioner upsertPartitioner = (UpsertPartitioner) partitioner;
|
||||||
BucketInfo binfo = upsertPartitioner.getBucketInfo(partition);
|
BucketInfo binfo = upsertPartitioner.getBucketInfo(partition);
|
||||||
BucketType btype = binfo.bucketType;
|
BucketType btype = binfo.bucketType;
|
||||||
@@ -264,8 +256,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
} else if (btype.equals(BucketType.UPDATE)) {
|
} else if (btype.equals(BucketType.UPDATE)) {
|
||||||
return handleUpdate(commitTime, binfo.fileIdPrefix, recordItr);
|
return handleUpdate(commitTime, binfo.fileIdPrefix, recordItr);
|
||||||
} else {
|
} else {
|
||||||
throw new HoodieUpsertException(
|
throw new HoodieUpsertException("Unknown bucketType " + btype + " for partition :" + partition);
|
||||||
"Unknown bucketType " + btype + " for partition :" + partition);
|
|
||||||
}
|
}
|
||||||
} catch (Throwable t) {
|
} catch (Throwable t) {
|
||||||
String msg = "Error upserting bucketType " + btype + " for partition :" + partition;
|
String msg = "Error upserting bucketType " + btype + " for partition :" + partition;
|
||||||
@@ -275,15 +266,14 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Iterator<List<WriteStatus>> handleInsertPartition(String commitTime, Integer partition,
|
public Iterator<List<WriteStatus>> handleInsertPartition(String commitTime, Integer partition, Iterator recordItr,
|
||||||
Iterator recordItr, Partitioner partitioner) {
|
Partitioner partitioner) {
|
||||||
return handleUpsertPartition(commitTime, partition, recordItr, partitioner);
|
return handleUpsertPartition(commitTime, partition, recordItr, partitioner);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Performs cleaning of partition paths according to cleaning policy and returns the number of
|
* Performs cleaning of partition paths according to cleaning policy and returns the number of files cleaned. Handles
|
||||||
* files cleaned. Handles skews in partitions to clean by making files to clean as the unit of
|
* skews in partitions to clean by making files to clean as the unit of task distribution.
|
||||||
* task distribution.
|
|
||||||
*
|
*
|
||||||
* @throws IllegalArgumentException if unknown cleaning policy is provided
|
* @throws IllegalArgumentException if unknown cleaning policy is provided
|
||||||
*/
|
*/
|
||||||
@@ -291,11 +281,9 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
public List<HoodieCleanStat> clean(JavaSparkContext jsc) {
|
public List<HoodieCleanStat> clean(JavaSparkContext jsc) {
|
||||||
try {
|
try {
|
||||||
FileSystem fs = getMetaClient().getFs();
|
FileSystem fs = getMetaClient().getFs();
|
||||||
List<String> partitionsToClean = FSUtils
|
List<String> partitionsToClean =
|
||||||
.getAllPartitionPaths(fs, getMetaClient().getBasePath(),
|
FSUtils.getAllPartitionPaths(fs, getMetaClient().getBasePath(), config.shouldAssumeDatePartitioning());
|
||||||
config.shouldAssumeDatePartitioning());
|
logger.info("Partitions to clean up : " + partitionsToClean + ", with policy " + config.getCleanerPolicy());
|
||||||
logger.info("Partitions to clean up : " + partitionsToClean + ", with policy " + config
|
|
||||||
.getCleanerPolicy());
|
|
||||||
if (partitionsToClean.isEmpty()) {
|
if (partitionsToClean.isEmpty()) {
|
||||||
logger.info("Nothing to clean here mom. It is already clean");
|
logger.info("Nothing to clean here mom. It is already clean");
|
||||||
return Collections.emptyList();
|
return Collections.emptyList();
|
||||||
@@ -307,12 +295,10 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Common method used for cleaning out parquet files under a partition path during rollback of a
|
* Common method used for cleaning out parquet files under a partition path during rollback of a set of commits
|
||||||
* set of commits
|
|
||||||
*/
|
*/
|
||||||
protected Map<FileStatus, Boolean> deleteCleanedFiles(Map<FileStatus, Boolean> results, String partitionPath,
|
protected Map<FileStatus, Boolean> deleteCleanedFiles(Map<FileStatus, Boolean> results, String partitionPath,
|
||||||
PathFilter filter)
|
PathFilter filter) throws IOException {
|
||||||
throws IOException {
|
|
||||||
logger.info("Cleaning path " + partitionPath);
|
logger.info("Cleaning path " + partitionPath);
|
||||||
FileSystem fs = getMetaClient().getFs();
|
FileSystem fs = getMetaClient().getFs();
|
||||||
FileStatus[] toBeDeleted = fs.listStatus(FSUtils.getPartitionPath(config.getBasePath(), partitionPath), filter);
|
FileStatus[] toBeDeleted = fs.listStatus(FSUtils.getPartitionPath(config.getBasePath(), partitionPath), filter);
|
||||||
@@ -325,12 +311,10 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Common method used for cleaning out parquet files under a partition path during rollback of a
|
* Common method used for cleaning out parquet files under a partition path during rollback of a set of commits
|
||||||
* set of commits
|
|
||||||
*/
|
*/
|
||||||
protected Map<FileStatus, Boolean> deleteCleanedFiles(Map<FileStatus, Boolean> results, String commit, String
|
protected Map<FileStatus, Boolean> deleteCleanedFiles(Map<FileStatus, Boolean> results, String commit,
|
||||||
partitionPath)
|
String partitionPath) throws IOException {
|
||||||
throws IOException {
|
|
||||||
logger.info("Cleaning path " + partitionPath);
|
logger.info("Cleaning path " + partitionPath);
|
||||||
FileSystem fs = getMetaClient().getFs();
|
FileSystem fs = getMetaClient().getFs();
|
||||||
PathFilter filter = (path) -> {
|
PathFilter filter = (path) -> {
|
||||||
@@ -354,8 +338,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
throws IOException {
|
throws IOException {
|
||||||
String actionType = metaClient.getCommitActionType();
|
String actionType = metaClient.getCommitActionType();
|
||||||
HoodieActiveTimeline activeTimeline = this.getActiveTimeline();
|
HoodieActiveTimeline activeTimeline = this.getActiveTimeline();
|
||||||
List<String> inflights = this.getInflightCommitTimeline().getInstants()
|
List<String> inflights =
|
||||||
.map(HoodieInstant::getTimestamp).collect(Collectors.toList());
|
this.getInflightCommitTimeline().getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList());
|
||||||
// Atomically unpublish the commits
|
// Atomically unpublish the commits
|
||||||
if (!inflights.contains(commit)) {
|
if (!inflights.contains(commit)) {
|
||||||
activeTimeline.revertToInflight(new HoodieInstant(false, actionType, commit));
|
activeTimeline.revertToInflight(new HoodieInstant(false, actionType, commit));
|
||||||
@@ -364,27 +348,26 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
|
|
||||||
// delete all the data files for this commit
|
// delete all the data files for this commit
|
||||||
logger.info("Clean out all parquet files generated for commit: " + commit);
|
logger.info("Clean out all parquet files generated for commit: " + commit);
|
||||||
List<HoodieRollbackStat> stats = jsc.parallelize(FSUtils
|
List<HoodieRollbackStat> stats =
|
||||||
.getAllPartitionPaths(metaClient.getFs(), getMetaClient().getBasePath(),
|
jsc.parallelize(FSUtils.getAllPartitionPaths(metaClient.getFs(), getMetaClient().getBasePath(),
|
||||||
config.shouldAssumeDatePartitioning()))
|
config.shouldAssumeDatePartitioning())).map((Function<String, HoodieRollbackStat>) partitionPath -> {
|
||||||
.map((Function<String, HoodieRollbackStat>) partitionPath -> {
|
// Scan all partitions files with this commit time
|
||||||
// Scan all partitions files with this commit time
|
final Map<FileStatus, Boolean> filesToDeletedStatus = new HashMap<>();
|
||||||
final Map<FileStatus, Boolean> filesToDeletedStatus = new HashMap<>();
|
deleteCleanedFiles(filesToDeletedStatus, commit, partitionPath);
|
||||||
deleteCleanedFiles(filesToDeletedStatus, commit, partitionPath);
|
return HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
|
||||||
return HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
|
.withDeletedFileResults(filesToDeletedStatus).build();
|
||||||
.withDeletedFileResults(filesToDeletedStatus).build();
|
}).collect();
|
||||||
}).collect();
|
|
||||||
|
|
||||||
// Delete Inflight instant if enabled
|
// Delete Inflight instant if enabled
|
||||||
deleteInflightInstant(deleteInstants, activeTimeline,
|
deleteInflightInstant(deleteInstants, activeTimeline, new HoodieInstant(true, actionType, commit));
|
||||||
new HoodieInstant(true, actionType, commit));
|
|
||||||
return stats;
|
return stats;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Delete Inflight instant if enabled
|
* Delete Inflight instant if enabled
|
||||||
|
*
|
||||||
* @param deleteInstant Enable Deletion of Inflight instant
|
* @param deleteInstant Enable Deletion of Inflight instant
|
||||||
* @param activeTimeline Hoodie active timeline
|
* @param activeTimeline Hoodie active timeline
|
||||||
* @param instantToBeDeleted Instant to be deleted
|
* @param instantToBeDeleted Instant to be deleted
|
||||||
*/
|
*/
|
||||||
protected void deleteInflightInstant(boolean deleteInstant, HoodieActiveTimeline activeTimeline,
|
protected void deleteInflightInstant(boolean deleteInstant, HoodieActiveTimeline activeTimeline,
|
||||||
@@ -401,30 +384,27 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<HoodieCleanStat> cleanPartitionPaths(List<String> partitionsToClean,
|
private List<HoodieCleanStat> cleanPartitionPaths(List<String> partitionsToClean, JavaSparkContext jsc) {
|
||||||
JavaSparkContext jsc) {
|
|
||||||
int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism());
|
int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism());
|
||||||
logger.info("Using cleanerParallelism: " + cleanerParallelism);
|
logger.info("Using cleanerParallelism: " + cleanerParallelism);
|
||||||
List<Tuple2<String, PartitionCleanStat>> partitionCleanStats = jsc
|
List<Tuple2<String, PartitionCleanStat>> partitionCleanStats = jsc
|
||||||
.parallelize(partitionsToClean, cleanerParallelism)
|
.parallelize(partitionsToClean, cleanerParallelism).flatMapToPair(getFilesToDeleteFunc(this, config))
|
||||||
.flatMapToPair(getFilesToDeleteFunc(this, config))
|
.repartition(cleanerParallelism) // repartition to remove skews
|
||||||
.repartition(cleanerParallelism) // repartition to remove skews
|
|
||||||
.mapPartitionsToPair(deleteFilesFunc(this)).reduceByKey(
|
.mapPartitionsToPair(deleteFilesFunc(this)).reduceByKey(
|
||||||
// merge partition level clean stats below
|
// merge partition level clean stats below
|
||||||
(Function2<PartitionCleanStat, PartitionCleanStat, PartitionCleanStat>) (e1, e2) -> e1
|
(Function2<PartitionCleanStat, PartitionCleanStat, PartitionCleanStat>) (e1, e2) -> e1.merge(e2))
|
||||||
.merge(e2)).collect();
|
.collect();
|
||||||
|
|
||||||
Map<String, PartitionCleanStat> partitionCleanStatsMap = partitionCleanStats.stream()
|
Map<String, PartitionCleanStat> partitionCleanStatsMap =
|
||||||
.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2));
|
partitionCleanStats.stream().collect(Collectors.toMap(Tuple2::_1, Tuple2::_2));
|
||||||
|
|
||||||
HoodieCleanHelper cleaner = new HoodieCleanHelper(this, config);
|
HoodieCleanHelper cleaner = new HoodieCleanHelper(this, config);
|
||||||
// Return PartitionCleanStat for each partition passed.
|
// Return PartitionCleanStat for each partition passed.
|
||||||
return partitionsToClean.stream().map(partitionPath -> {
|
return partitionsToClean.stream().map(partitionPath -> {
|
||||||
PartitionCleanStat partitionCleanStat =
|
PartitionCleanStat partitionCleanStat =
|
||||||
(partitionCleanStatsMap.containsKey(partitionPath)) ? partitionCleanStatsMap
|
(partitionCleanStatsMap.containsKey(partitionPath)) ? partitionCleanStatsMap.get(partitionPath)
|
||||||
.get(partitionPath) : new PartitionCleanStat(partitionPath);
|
: new PartitionCleanStat(partitionPath);
|
||||||
return HoodieCleanStat.newBuilder().withPolicy(config.getCleanerPolicy())
|
return HoodieCleanStat.newBuilder().withPolicy(config.getCleanerPolicy()).withPartitionPath(partitionPath)
|
||||||
.withPartitionPath(partitionPath)
|
|
||||||
.withEarliestCommitRetained(cleaner.getEarliestCommitToRetain())
|
.withEarliestCommitRetained(cleaner.getEarliestCommitToRetain())
|
||||||
.withDeletePathPattern(partitionCleanStat.deletePathPatterns)
|
.withDeletePathPattern(partitionCleanStat.deletePathPatterns)
|
||||||
.withSuccessfulDeletes(partitionCleanStat.successDeleteFiles)
|
.withSuccessfulDeletes(partitionCleanStat.successDeleteFiles)
|
||||||
@@ -453,8 +433,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void finish() {
|
protected void finish() {}
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Void getResult() {
|
protected Void getResult() {
|
||||||
@@ -487,8 +466,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
|
|
||||||
private PartitionCleanStat merge(PartitionCleanStat other) {
|
private PartitionCleanStat merge(PartitionCleanStat other) {
|
||||||
if (!this.partitionPath.equals(other.partitionPath)) {
|
if (!this.partitionPath.equals(other.partitionPath)) {
|
||||||
throw new RuntimeException(String
|
throw new RuntimeException(
|
||||||
.format("partitionPath is not a match: (%s, %s)", partitionPath, other.partitionPath));
|
String.format("partitionPath is not a match: (%s, %s)", partitionPath, other.partitionPath));
|
||||||
}
|
}
|
||||||
successDeleteFiles.addAll(other.successDeleteFiles);
|
successDeleteFiles.addAll(other.successDeleteFiles);
|
||||||
deletePathPatterns.addAll(other.deletePathPatterns);
|
deletePathPatterns.addAll(other.deletePathPatterns);
|
||||||
@@ -516,8 +495,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Helper class for an insert bucket along with the weight [0.0, 0.1] that defines the amount of
|
* Helper class for an insert bucket along with the weight [0.0, 0.1] that defines the amount of incoming inserts that
|
||||||
* incoming inserts that should be allocated to the bucket
|
* should be allocated to the bucket
|
||||||
*/
|
*/
|
||||||
class InsertBucket implements Serializable {
|
class InsertBucket implements Serializable {
|
||||||
|
|
||||||
@@ -563,8 +542,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
*/
|
*/
|
||||||
List<SmallFile> smallFiles = new ArrayList<SmallFile>();
|
List<SmallFile> smallFiles = new ArrayList<SmallFile>();
|
||||||
/**
|
/**
|
||||||
* Total number of RDD partitions, is determined by total buckets we want to pack the incoming
|
* Total number of RDD partitions, is determined by total buckets we want to pack the incoming workload into
|
||||||
* workload into
|
|
||||||
*/
|
*/
|
||||||
private int totalBuckets = 0;
|
private int totalBuckets = 0;
|
||||||
/**
|
/**
|
||||||
@@ -599,17 +577,15 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
assignUpdates(profile);
|
assignUpdates(profile);
|
||||||
assignInserts(profile);
|
assignInserts(profile);
|
||||||
|
|
||||||
logger.info(
|
logger.info("Total Buckets :" + totalBuckets + ", " + "buckets info => " + bucketInfoMap + ", \n"
|
||||||
"Total Buckets :" + totalBuckets + ", " + "buckets info => " + bucketInfoMap + ", \n"
|
+ "Partition to insert buckets => " + partitionPathToInsertBuckets + ", \n"
|
||||||
+ "Partition to insert buckets => " + partitionPathToInsertBuckets + ", \n"
|
+ "UpdateLocations mapped to buckets =>" + updateLocationToBucket);
|
||||||
+ "UpdateLocations mapped to buckets =>" + updateLocationToBucket);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void assignUpdates(WorkloadProfile profile) {
|
private void assignUpdates(WorkloadProfile profile) {
|
||||||
// each update location gets a partition
|
// each update location gets a partition
|
||||||
WorkloadStat gStat = profile.getGlobalStat();
|
WorkloadStat gStat = profile.getGlobalStat();
|
||||||
for (Map.Entry<String, Pair<String, Long>> updateLocEntry : gStat.getUpdateLocationToCount()
|
for (Map.Entry<String, Pair<String, Long>> updateLocEntry : gStat.getUpdateLocationToCount().entrySet()) {
|
||||||
.entrySet()) {
|
|
||||||
addUpdateBucket(updateLocEntry.getKey());
|
addUpdateBucket(updateLocEntry.getKey());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -628,8 +604,9 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
private void assignInserts(WorkloadProfile profile) {
|
private void assignInserts(WorkloadProfile profile) {
|
||||||
// for new inserts, compute buckets depending on how many records we have for each partition
|
// for new inserts, compute buckets depending on how many records we have for each partition
|
||||||
Set<String> partitionPaths = profile.getPartitionPaths();
|
Set<String> partitionPaths = profile.getPartitionPaths();
|
||||||
long averageRecordSize = averageBytesPerRecord(metaClient.getActiveTimeline().getCommitTimeline()
|
long averageRecordSize =
|
||||||
.filterCompletedInstants(), config.getCopyOnWriteRecordSizeEstimate());
|
averageBytesPerRecord(metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(),
|
||||||
|
config.getCopyOnWriteRecordSizeEstimate());
|
||||||
logger.info("AvgRecordSize => " + averageRecordSize);
|
logger.info("AvgRecordSize => " + averageRecordSize);
|
||||||
for (String partitionPath : partitionPaths) {
|
for (String partitionPath : partitionPaths) {
|
||||||
WorkloadStat pStat = profile.getWorkloadStat(partitionPath);
|
WorkloadStat pStat = profile.getWorkloadStat(partitionPath);
|
||||||
@@ -644,20 +621,17 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
|
|
||||||
// first try packing this into one of the smallFiles
|
// first try packing this into one of the smallFiles
|
||||||
for (SmallFile smallFile : smallFiles) {
|
for (SmallFile smallFile : smallFiles) {
|
||||||
long recordsToAppend = Math
|
long recordsToAppend = Math.min((config.getParquetMaxFileSize() - smallFile.sizeBytes) / averageRecordSize,
|
||||||
.min((config.getParquetMaxFileSize() - smallFile.sizeBytes) / averageRecordSize,
|
totalUnassignedInserts);
|
||||||
totalUnassignedInserts);
|
|
||||||
if (recordsToAppend > 0 && totalUnassignedInserts > 0) {
|
if (recordsToAppend > 0 && totalUnassignedInserts > 0) {
|
||||||
// create a new bucket or re-use an existing bucket
|
// create a new bucket or re-use an existing bucket
|
||||||
int bucket;
|
int bucket;
|
||||||
if (updateLocationToBucket.containsKey(smallFile.location.getFileId())) {
|
if (updateLocationToBucket.containsKey(smallFile.location.getFileId())) {
|
||||||
bucket = updateLocationToBucket.get(smallFile.location.getFileId());
|
bucket = updateLocationToBucket.get(smallFile.location.getFileId());
|
||||||
logger.info("Assigning " + recordsToAppend + " inserts to existing update bucket "
|
logger.info("Assigning " + recordsToAppend + " inserts to existing update bucket " + bucket);
|
||||||
+ bucket);
|
|
||||||
} else {
|
} else {
|
||||||
bucket = addUpdateBucket(smallFile.location.getFileId());
|
bucket = addUpdateBucket(smallFile.location.getFileId());
|
||||||
logger.info(
|
logger.info("Assigning " + recordsToAppend + " inserts to new update bucket " + bucket);
|
||||||
"Assigning " + recordsToAppend + " inserts to new update bucket " + bucket);
|
|
||||||
}
|
}
|
||||||
bucketNumbers.add(bucket);
|
bucketNumbers.add(bucket);
|
||||||
recordsPerBucket.add(recordsToAppend);
|
recordsPerBucket.add(recordsToAppend);
|
||||||
@@ -673,10 +647,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
}
|
}
|
||||||
|
|
||||||
int insertBuckets = (int) Math.ceil((1.0 * totalUnassignedInserts) / insertRecordsPerBucket);
|
int insertBuckets = (int) Math.ceil((1.0 * totalUnassignedInserts) / insertRecordsPerBucket);
|
||||||
logger.info(
|
logger.info("After small file assignment: unassignedInserts => " + totalUnassignedInserts
|
||||||
"After small file assignment: unassignedInserts => " + totalUnassignedInserts
|
+ ", totalInsertBuckets => " + insertBuckets + ", recordsPerBucket => " + insertRecordsPerBucket);
|
||||||
+ ", totalInsertBuckets => " + insertBuckets + ", recordsPerBucket => "
|
|
||||||
+ insertRecordsPerBucket);
|
|
||||||
for (int b = 0; b < insertBuckets; b++) {
|
for (int b = 0; b < insertBuckets; b++) {
|
||||||
bucketNumbers.add(totalBuckets);
|
bucketNumbers.add(totalBuckets);
|
||||||
recordsPerBucket.add(totalUnassignedInserts / insertBuckets);
|
recordsPerBucket.add(totalUnassignedInserts / insertBuckets);
|
||||||
@@ -696,15 +668,14 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
bkt.weight = (1.0 * recordsPerBucket.get(i)) / pStat.getNumInserts();
|
bkt.weight = (1.0 * recordsPerBucket.get(i)) / pStat.getNumInserts();
|
||||||
insertBuckets.add(bkt);
|
insertBuckets.add(bkt);
|
||||||
}
|
}
|
||||||
logger.info(
|
logger.info("Total insert buckets for partition path " + partitionPath + " => " + insertBuckets);
|
||||||
"Total insert buckets for partition path " + partitionPath + " => " + insertBuckets);
|
|
||||||
partitionPathToInsertBuckets.put(partitionPath, insertBuckets);
|
partitionPathToInsertBuckets.put(partitionPath, insertBuckets);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a list of small files in the given partition path
|
* Returns a list of small files in the given partition path
|
||||||
*/
|
*/
|
||||||
protected List<SmallFile> getSmallFiles(String partitionPath) {
|
protected List<SmallFile> getSmallFiles(String partitionPath) {
|
||||||
|
|
||||||
@@ -716,15 +687,13 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
if (!commitTimeline.empty()) { // if we have some commits
|
if (!commitTimeline.empty()) { // if we have some commits
|
||||||
HoodieInstant latestCommitTime = commitTimeline.lastInstant().get();
|
HoodieInstant latestCommitTime = commitTimeline.lastInstant().get();
|
||||||
List<HoodieDataFile> allFiles = getROFileSystemView()
|
List<HoodieDataFile> allFiles = getROFileSystemView()
|
||||||
.getLatestDataFilesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp())
|
.getLatestDataFilesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp()).collect(Collectors.toList());
|
||||||
.collect(Collectors.toList());
|
|
||||||
|
|
||||||
for (HoodieDataFile file : allFiles) {
|
for (HoodieDataFile file : allFiles) {
|
||||||
if (file.getFileSize() < config.getParquetSmallFileLimit()) {
|
if (file.getFileSize() < config.getParquetSmallFileLimit()) {
|
||||||
String filename = file.getFileName();
|
String filename = file.getFileName();
|
||||||
SmallFile sf = new SmallFile();
|
SmallFile sf = new SmallFile();
|
||||||
sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename),
|
sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename));
|
||||||
FSUtils.getFileId(filename));
|
|
||||||
sf.sizeBytes = file.getFileSize();
|
sf.sizeBytes = file.getFileSize();
|
||||||
smallFileLocations.add(sf);
|
smallFileLocations.add(sf);
|
||||||
// Update the global small files list
|
// Update the global small files list
|
||||||
@@ -751,19 +720,18 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int getPartition(Object key) {
|
public int getPartition(Object key) {
|
||||||
Tuple2<HoodieKey, Option<HoodieRecordLocation>> keyLocation = (Tuple2<HoodieKey,
|
Tuple2<HoodieKey, Option<HoodieRecordLocation>> keyLocation =
|
||||||
Option<HoodieRecordLocation>>) key;
|
(Tuple2<HoodieKey, Option<HoodieRecordLocation>>) key;
|
||||||
if (keyLocation._2().isPresent()) {
|
if (keyLocation._2().isPresent()) {
|
||||||
HoodieRecordLocation location = keyLocation._2().get();
|
HoodieRecordLocation location = keyLocation._2().get();
|
||||||
return updateLocationToBucket.get(location.getFileId());
|
return updateLocationToBucket.get(location.getFileId());
|
||||||
} else {
|
} else {
|
||||||
List<InsertBucket> targetBuckets = partitionPathToInsertBuckets
|
List<InsertBucket> targetBuckets = partitionPathToInsertBuckets.get(keyLocation._1().getPartitionPath());
|
||||||
.get(keyLocation._1().getPartitionPath());
|
|
||||||
// pick the target bucket to use based on the weights.
|
// pick the target bucket to use based on the weights.
|
||||||
double totalWeight = 0.0;
|
double totalWeight = 0.0;
|
||||||
final long totalInserts = Math.max(1, globalStat.getNumInserts());
|
final long totalInserts = Math.max(1, globalStat.getNumInserts());
|
||||||
final long hashOfKey = Hashing.md5()
|
final long hashOfKey =
|
||||||
.hashString(keyLocation._1().getRecordKey(), StandardCharsets.UTF_8).asLong();
|
Hashing.md5().hashString(keyLocation._1().getRecordKey(), StandardCharsets.UTF_8).asLong();
|
||||||
final double r = 1.0 * Math.floorMod(hashOfKey, totalInserts) / totalInserts;
|
final double r = 1.0 * Math.floorMod(hashOfKey, totalInserts) / totalInserts;
|
||||||
for (InsertBucket insertBucket : targetBuckets) {
|
for (InsertBucket insertBucket : targetBuckets) {
|
||||||
totalWeight += insertBucket.weight;
|
totalWeight += insertBucket.weight;
|
||||||
@@ -782,8 +750,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Obtains the average record size based on records written during previous commits. Used for
|
* Obtains the average record size based on records written during previous commits. Used for estimating how many
|
||||||
* estimating how many records pack into one file.
|
* records pack into one file.
|
||||||
*/
|
*/
|
||||||
protected static long averageBytesPerRecord(HoodieTimeline commitTimeline, int defaultRecordSizeEstimate) {
|
protected static long averageBytesPerRecord(HoodieTimeline commitTimeline, int defaultRecordSizeEstimate) {
|
||||||
long avgSize = defaultRecordSizeEstimate;
|
long avgSize = defaultRecordSizeEstimate;
|
||||||
|
|||||||
@@ -73,15 +73,21 @@ import org.apache.spark.api.java.JavaSparkContext;
|
|||||||
import org.apache.spark.api.java.function.Function;
|
import org.apache.spark.api.java.function.Function;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Implementation of a more real-time read-optimized Hoodie Table where <p> INSERTS - Same as
|
* Implementation of a more real-time read-optimized Hoodie Table where
|
||||||
* HoodieCopyOnWriteTable - Produce new files, block aligned to desired size (or) Merge with the
|
* <p>
|
||||||
* smallest existing file, to expand it </p> <p> UPDATES - Appends the changes to a rolling log file
|
* INSERTS - Same as HoodieCopyOnWriteTable - Produce new files, block aligned to desired size (or) Merge with the
|
||||||
* maintained per file Id. Compaction merges the log file into the base file. </p> <p> WARNING - MOR
|
* smallest existing file, to expand it
|
||||||
* table type does not support nested rollbacks, every rollback must be followed by an attempted
|
* </p>
|
||||||
* commit action </p>
|
* <p>
|
||||||
|
* UPDATES - Appends the changes to a rolling log file maintained per file Id. Compaction merges the log file into the
|
||||||
|
* base file.
|
||||||
|
* </p>
|
||||||
|
* <p>
|
||||||
|
* WARNING - MOR table type does not support nested rollbacks, every rollback must be followed by an attempted commit
|
||||||
|
* action
|
||||||
|
* </p>
|
||||||
*/
|
*/
|
||||||
public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
|
public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends HoodieCopyOnWriteTable<T> {
|
||||||
HoodieCopyOnWriteTable<T> {
|
|
||||||
|
|
||||||
private static Logger logger = LogManager.getLogger(HoodieMergeOnReadTable.class);
|
private static Logger logger = LogManager.getLogger(HoodieMergeOnReadTable.class);
|
||||||
|
|
||||||
@@ -102,27 +108,24 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileId,
|
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileId, Iterator<HoodieRecord<T>> recordItr)
|
||||||
Iterator<HoodieRecord<T>> recordItr) throws IOException {
|
throws IOException {
|
||||||
logger.info("Merging updates for commit " + commitTime + " for file " + fileId);
|
logger.info("Merging updates for commit " + commitTime + " for file " + fileId);
|
||||||
|
|
||||||
if (!index.canIndexLogFiles() && mergeOnReadUpsertPartitioner.getSmallFileIds().contains(fileId)) {
|
if (!index.canIndexLogFiles() && mergeOnReadUpsertPartitioner.getSmallFileIds().contains(fileId)) {
|
||||||
logger.info(
|
logger.info("Small file corrections for updates for commit " + commitTime + " for file " + fileId);
|
||||||
"Small file corrections for updates for commit " + commitTime + " for file " + fileId);
|
|
||||||
return super.handleUpdate(commitTime, fileId, recordItr);
|
return super.handleUpdate(commitTime, fileId, recordItr);
|
||||||
} else {
|
} else {
|
||||||
HoodieAppendHandle<T> appendHandle = new HoodieAppendHandle<>(config, commitTime, this,
|
HoodieAppendHandle<T> appendHandle = new HoodieAppendHandle<>(config, commitTime, this, fileId, recordItr);
|
||||||
fileId, recordItr);
|
|
||||||
appendHandle.doAppend();
|
appendHandle.doAppend();
|
||||||
appendHandle.close();
|
appendHandle.close();
|
||||||
return Collections.singletonList(Collections.singletonList(appendHandle.getWriteStatus()))
|
return Collections.singletonList(Collections.singletonList(appendHandle.getWriteStatus())).iterator();
|
||||||
.iterator();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Iterator<List<WriteStatus>> handleInsert(String commitTime, String idPfx,
|
public Iterator<List<WriteStatus>> handleInsert(String commitTime, String idPfx, Iterator<HoodieRecord<T>> recordItr)
|
||||||
Iterator<HoodieRecord<T>> recordItr) throws Exception {
|
throws Exception {
|
||||||
// If canIndexLogFiles, write inserts to log files else write inserts to parquet files
|
// If canIndexLogFiles, write inserts to log files else write inserts to parquet files
|
||||||
if (index.canIndexLogFiles()) {
|
if (index.canIndexLogFiles()) {
|
||||||
return new MergeOnReadLazyInsertIterable<>(recordItr, config, commitTime, this, idPfx);
|
return new MergeOnReadLazyInsertIterable<>(recordItr, config, commitTime, this, idPfx);
|
||||||
@@ -134,8 +137,8 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
|
|||||||
@Override
|
@Override
|
||||||
public HoodieCompactionPlan scheduleCompaction(JavaSparkContext jsc, String instantTime) {
|
public HoodieCompactionPlan scheduleCompaction(JavaSparkContext jsc, String instantTime) {
|
||||||
logger.info("Checking if compaction needs to be run on " + config.getBasePath());
|
logger.info("Checking if compaction needs to be run on " + config.getBasePath());
|
||||||
Option<HoodieInstant> lastCompaction = getActiveTimeline().getCommitTimeline()
|
Option<HoodieInstant> lastCompaction =
|
||||||
.filterCompletedInstants().lastInstant();
|
getActiveTimeline().getCommitTimeline().filterCompletedInstants().lastInstant();
|
||||||
String deltaCommitsSinceTs = "0";
|
String deltaCommitsSinceTs = "0";
|
||||||
if (lastCompaction.isPresent()) {
|
if (lastCompaction.isPresent()) {
|
||||||
deltaCommitsSinceTs = lastCompaction.get().getTimestamp();
|
deltaCommitsSinceTs = lastCompaction.get().getTimestamp();
|
||||||
@@ -145,8 +148,8 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
|
|||||||
.findInstantsAfter(deltaCommitsSinceTs, Integer.MAX_VALUE).countInstants();
|
.findInstantsAfter(deltaCommitsSinceTs, Integer.MAX_VALUE).countInstants();
|
||||||
if (config.getInlineCompactDeltaCommitMax() > deltaCommitsSinceLastCompaction) {
|
if (config.getInlineCompactDeltaCommitMax() > deltaCommitsSinceLastCompaction) {
|
||||||
logger.info("Not running compaction as only " + deltaCommitsSinceLastCompaction
|
logger.info("Not running compaction as only " + deltaCommitsSinceLastCompaction
|
||||||
+ " delta commits was found since last compaction " + deltaCommitsSinceTs
|
+ " delta commits was found since last compaction " + deltaCommitsSinceTs + ". Waiting for "
|
||||||
+ ". Waiting for " + config.getInlineCompactDeltaCommitMax());
|
+ config.getInlineCompactDeltaCommitMax());
|
||||||
return new HoodieCompactionPlan();
|
return new HoodieCompactionPlan();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -154,7 +157,7 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
|
|||||||
HoodieRealtimeTableCompactor compactor = new HoodieRealtimeTableCompactor();
|
HoodieRealtimeTableCompactor compactor = new HoodieRealtimeTableCompactor();
|
||||||
try {
|
try {
|
||||||
return compactor.generateCompactionPlan(jsc, this, config, instantTime,
|
return compactor.generateCompactionPlan(jsc, this, config, instantTime,
|
||||||
((SyncableFileSystemView)getRTFileSystemView()).getPendingCompactionOperations()
|
((SyncableFileSystemView) getRTFileSystemView()).getPendingCompactionOperations()
|
||||||
.map(instantTimeCompactionopPair -> instantTimeCompactionopPair.getValue().getFileGroupId())
|
.map(instantTimeCompactionopPair -> instantTimeCompactionopPair.getValue().getFileGroupId())
|
||||||
.collect(Collectors.toSet()));
|
.collect(Collectors.toSet()));
|
||||||
|
|
||||||
@@ -186,9 +189,8 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
|
|||||||
// Atomically un-publish all non-inflight commits
|
// Atomically un-publish all non-inflight commits
|
||||||
Option<HoodieInstant> commitOrCompactionOption = Option.fromJavaOptional(this.getActiveTimeline()
|
Option<HoodieInstant> commitOrCompactionOption = Option.fromJavaOptional(this.getActiveTimeline()
|
||||||
.getTimelineOfActions(Sets.newHashSet(HoodieActiveTimeline.COMMIT_ACTION,
|
.getTimelineOfActions(Sets.newHashSet(HoodieActiveTimeline.COMMIT_ACTION,
|
||||||
HoodieActiveTimeline.DELTA_COMMIT_ACTION, HoodieActiveTimeline.COMPACTION_ACTION)).getInstants()
|
HoodieActiveTimeline.DELTA_COMMIT_ACTION, HoodieActiveTimeline.COMPACTION_ACTION))
|
||||||
.filter(i -> commit.equals(i.getTimestamp()))
|
.getInstants().filter(i -> commit.equals(i.getTimestamp())).findFirst());
|
||||||
.findFirst());
|
|
||||||
HoodieInstant instantToRollback = commitOrCompactionOption.get();
|
HoodieInstant instantToRollback = commitOrCompactionOption.get();
|
||||||
// Atomically un-publish all non-inflight commits
|
// Atomically un-publish all non-inflight commits
|
||||||
if (!instantToRollback.isInflight()) {
|
if (!instantToRollback.isInflight()) {
|
||||||
@@ -196,128 +198,134 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
|
|||||||
}
|
}
|
||||||
logger.info("Unpublished " + commit);
|
logger.info("Unpublished " + commit);
|
||||||
Long startTime = System.currentTimeMillis();
|
Long startTime = System.currentTimeMillis();
|
||||||
List<HoodieRollbackStat> allRollbackStats = jsc.parallelize(FSUtils
|
List<HoodieRollbackStat> allRollbackStats =
|
||||||
.getAllPartitionPaths(this.metaClient.getFs(), this.getMetaClient().getBasePath(),
|
jsc.parallelize(FSUtils.getAllPartitionPaths(this.metaClient.getFs(), this.getMetaClient().getBasePath(),
|
||||||
config.shouldAssumeDatePartitioning()))
|
config.shouldAssumeDatePartitioning())).map((Function<String, HoodieRollbackStat>) partitionPath -> {
|
||||||
.map((Function<String, HoodieRollbackStat>) partitionPath -> {
|
HoodieActiveTimeline activeTimeline = this.getActiveTimeline().reload();
|
||||||
HoodieActiveTimeline activeTimeline = this.getActiveTimeline().reload();
|
HoodieRollbackStat hoodieRollbackStats = null;
|
||||||
HoodieRollbackStat hoodieRollbackStats = null;
|
// Need to put the path filter here since Filter is not serializable
|
||||||
// Need to put the path filter here since Filter is not serializable
|
// PathFilter to get all parquet files and log files that need to be deleted
|
||||||
// PathFilter to get all parquet files and log files that need to be deleted
|
PathFilter filter = (path) -> {
|
||||||
PathFilter filter = (path) -> {
|
if (path.toString().contains(".parquet")) {
|
||||||
if (path.toString().contains(".parquet")) {
|
String fileCommitTime = FSUtils.getCommitTime(path.getName());
|
||||||
String fileCommitTime = FSUtils.getCommitTime(path.getName());
|
return commit.equals(fileCommitTime);
|
||||||
return commit.equals(fileCommitTime);
|
} else if (path.toString().contains(".log")) {
|
||||||
} else if (path.toString().contains(".log")) {
|
// Since the baseCommitTime is the only commit for new log files, it's okay here
|
||||||
// Since the baseCommitTime is the only commit for new log files, it's okay here
|
String fileCommitTime = FSUtils.getBaseCommitTimeFromLogPath(path);
|
||||||
String fileCommitTime = FSUtils.getBaseCommitTimeFromLogPath(path);
|
return commit.equals(fileCommitTime);
|
||||||
return commit.equals(fileCommitTime);
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
};
|
|
||||||
|
|
||||||
final Map<FileStatus, Boolean> filesToDeletedStatus = new HashMap<>();
|
|
||||||
|
|
||||||
switch (instantToRollback.getAction()) {
|
|
||||||
case HoodieTimeline.COMMIT_ACTION:
|
|
||||||
try {
|
|
||||||
// Rollback of a commit should delete the newly created parquet files along with any log
|
|
||||||
// files created with this as baseCommit. This is required to support multi-rollbacks in a MOR table.
|
|
||||||
super.deleteCleanedFiles(filesToDeletedStatus, partitionPath, filter);
|
|
||||||
hoodieRollbackStats = HoodieRollbackStat.newBuilder()
|
|
||||||
.withPartitionPath(partitionPath).withDeletedFileResults(filesToDeletedStatus).build();
|
|
||||||
break;
|
|
||||||
} catch (IOException io) {
|
|
||||||
throw new UncheckedIOException("Failed to rollback for commit " + commit, io);
|
|
||||||
}
|
|
||||||
case HoodieTimeline.COMPACTION_ACTION:
|
|
||||||
try {
|
|
||||||
// If there is no delta commit present after the current commit (if compaction), no action, else we
|
|
||||||
// need to make sure that a compaction commit rollback also deletes any log files written as part of the
|
|
||||||
// succeeding deltacommit.
|
|
||||||
boolean higherDeltaCommits = !activeTimeline.getDeltaCommitTimeline()
|
|
||||||
.filterCompletedInstants().findInstantsAfter(commit, 1).empty();
|
|
||||||
if (higherDeltaCommits) {
|
|
||||||
// Rollback of a compaction action with no higher deltacommit means that the compaction is scheduled
|
|
||||||
// and has not yet finished. In this scenario we should delete only the newly created parquet files
|
|
||||||
// and not corresponding base commit log files created with this as baseCommit since updates would
|
|
||||||
// have been written to the log files.
|
|
||||||
super.deleteCleanedFiles(filesToDeletedStatus, commit, partitionPath);
|
|
||||||
hoodieRollbackStats = HoodieRollbackStat.newBuilder()
|
|
||||||
.withPartitionPath(partitionPath).withDeletedFileResults(filesToDeletedStatus).build();
|
|
||||||
} else {
|
|
||||||
// No deltacommits present after this compaction commit (inflight or requested). In this case, we
|
|
||||||
// can also delete any log files that were created with this compaction commit as base
|
|
||||||
// commit.
|
|
||||||
super.deleteCleanedFiles(filesToDeletedStatus, partitionPath, filter);
|
|
||||||
hoodieRollbackStats = HoodieRollbackStat.newBuilder()
|
|
||||||
.withPartitionPath(partitionPath).withDeletedFileResults(filesToDeletedStatus).build();
|
|
||||||
}
|
}
|
||||||
break;
|
return false;
|
||||||
} catch (IOException io) {
|
};
|
||||||
throw new UncheckedIOException("Failed to rollback for commit " + commit, io);
|
|
||||||
}
|
|
||||||
case HoodieTimeline.DELTA_COMMIT_ACTION:
|
|
||||||
// --------------------------------------------------------------------------------------------------
|
|
||||||
// (A) The following cases are possible if index.canIndexLogFiles and/or index.isGlobal
|
|
||||||
// --------------------------------------------------------------------------------------------------
|
|
||||||
// (A.1) Failed first commit - Inserts were written to log files and HoodieWriteStat has no entries. In
|
|
||||||
// this scenario we would want to delete these log files.
|
|
||||||
// (A.2) Failed recurring commit - Inserts/Updates written to log files. In this scenario,
|
|
||||||
// HoodieWriteStat will have the baseCommitTime for the first log file written, add rollback blocks.
|
|
||||||
// (A.3) Rollback triggered for first commit - Inserts were written to the log files but the commit is
|
|
||||||
// being reverted. In this scenario, HoodieWriteStat will be `null` for the attribute prevCommitTime and
|
|
||||||
// and hence will end up deleting these log files. This is done so there are no orphan log files
|
|
||||||
// lying around.
|
|
||||||
// (A.4) Rollback triggered for recurring commits - Inserts/Updates are being rolled back, the actions
|
|
||||||
// taken in this scenario is a combination of (A.2) and (A.3)
|
|
||||||
// ---------------------------------------------------------------------------------------------------
|
|
||||||
// (B) The following cases are possible if !index.canIndexLogFiles and/or !index.isGlobal
|
|
||||||
// ---------------------------------------------------------------------------------------------------
|
|
||||||
// (B.1) Failed first commit - Inserts were written to parquet files and HoodieWriteStat has no entries.
|
|
||||||
// In this scenario, we delete all the parquet files written for the failed commit.
|
|
||||||
// (B.2) Failed recurring commits - Inserts were written to parquet files and updates to log files. In
|
|
||||||
// this scenario, perform (A.1) and for updates written to log files, write rollback blocks.
|
|
||||||
// (B.3) Rollback triggered for first commit - Same as (B.1)
|
|
||||||
// (B.4) Rollback triggered for recurring commits - Same as (B.2) plus we need to delete the log files
|
|
||||||
// as well if the base parquet file gets deleted.
|
|
||||||
try {
|
|
||||||
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(
|
|
||||||
metaClient.getCommitTimeline().getInstantDetails(
|
|
||||||
new HoodieInstant(true, instantToRollback.getAction(), instantToRollback.getTimestamp()))
|
|
||||||
.get(), HoodieCommitMetadata.class);
|
|
||||||
|
|
||||||
// read commit file and (either append delete blocks or delete file)
|
final Map<FileStatus, Boolean> filesToDeletedStatus = new HashMap<>();
|
||||||
Map<FileStatus, Long> filesToNumBlocksRollback = new HashMap<>();
|
|
||||||
|
|
||||||
// In case all data was inserts and the commit failed, delete the file belonging to that commit
|
switch (instantToRollback.getAction()) {
|
||||||
// We do not know fileIds for inserts (first inserts are either log files or parquet files),
|
case HoodieTimeline.COMMIT_ACTION:
|
||||||
// delete all files for the corresponding failed commit, if present (same as COW)
|
try {
|
||||||
super.deleteCleanedFiles(filesToDeletedStatus, partitionPath, filter);
|
// Rollback of a commit should delete the newly created parquet files along with any log
|
||||||
final Set<String> deletedFiles = filesToDeletedStatus.entrySet().stream()
|
// files created with this as baseCommit. This is required to support multi-rollbacks in a MOR
|
||||||
.map(entry -> {
|
// table.
|
||||||
|
super.deleteCleanedFiles(filesToDeletedStatus, partitionPath, filter);
|
||||||
|
hoodieRollbackStats = HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
|
||||||
|
.withDeletedFileResults(filesToDeletedStatus).build();
|
||||||
|
break;
|
||||||
|
} catch (IOException io) {
|
||||||
|
throw new UncheckedIOException("Failed to rollback for commit " + commit, io);
|
||||||
|
}
|
||||||
|
case HoodieTimeline.COMPACTION_ACTION:
|
||||||
|
try {
|
||||||
|
// If there is no delta commit present after the current commit (if compaction), no action, else we
|
||||||
|
// need to make sure that a compaction commit rollback also deletes any log files written as part of
|
||||||
|
// the
|
||||||
|
// succeeding deltacommit.
|
||||||
|
boolean higherDeltaCommits = !activeTimeline.getDeltaCommitTimeline().filterCompletedInstants()
|
||||||
|
.findInstantsAfter(commit, 1).empty();
|
||||||
|
if (higherDeltaCommits) {
|
||||||
|
// Rollback of a compaction action with no higher deltacommit means that the compaction is
|
||||||
|
// scheduled
|
||||||
|
// and has not yet finished. In this scenario we should delete only the newly created parquet
|
||||||
|
// files
|
||||||
|
// and not corresponding base commit log files created with this as baseCommit since updates would
|
||||||
|
// have been written to the log files.
|
||||||
|
super.deleteCleanedFiles(filesToDeletedStatus, commit, partitionPath);
|
||||||
|
hoodieRollbackStats = HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
|
||||||
|
.withDeletedFileResults(filesToDeletedStatus).build();
|
||||||
|
} else {
|
||||||
|
// No deltacommits present after this compaction commit (inflight or requested). In this case, we
|
||||||
|
// can also delete any log files that were created with this compaction commit as base
|
||||||
|
// commit.
|
||||||
|
super.deleteCleanedFiles(filesToDeletedStatus, partitionPath, filter);
|
||||||
|
hoodieRollbackStats = HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
|
||||||
|
.withDeletedFileResults(filesToDeletedStatus).build();
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
} catch (IOException io) {
|
||||||
|
throw new UncheckedIOException("Failed to rollback for commit " + commit, io);
|
||||||
|
}
|
||||||
|
case HoodieTimeline.DELTA_COMMIT_ACTION:
|
||||||
|
// --------------------------------------------------------------------------------------------------
|
||||||
|
// (A) The following cases are possible if index.canIndexLogFiles and/or index.isGlobal
|
||||||
|
// --------------------------------------------------------------------------------------------------
|
||||||
|
// (A.1) Failed first commit - Inserts were written to log files and HoodieWriteStat has no entries.
|
||||||
|
// In
|
||||||
|
// this scenario we would want to delete these log files.
|
||||||
|
// (A.2) Failed recurring commit - Inserts/Updates written to log files. In this scenario,
|
||||||
|
// HoodieWriteStat will have the baseCommitTime for the first log file written, add rollback blocks.
|
||||||
|
// (A.3) Rollback triggered for first commit - Inserts were written to the log files but the commit is
|
||||||
|
// being reverted. In this scenario, HoodieWriteStat will be `null` for the attribute prevCommitTime
|
||||||
|
// and
|
||||||
|
// and hence will end up deleting these log files. This is done so there are no orphan log files
|
||||||
|
// lying around.
|
||||||
|
// (A.4) Rollback triggered for recurring commits - Inserts/Updates are being rolled back, the actions
|
||||||
|
// taken in this scenario is a combination of (A.2) and (A.3)
|
||||||
|
// ---------------------------------------------------------------------------------------------------
|
||||||
|
// (B) The following cases are possible if !index.canIndexLogFiles and/or !index.isGlobal
|
||||||
|
// ---------------------------------------------------------------------------------------------------
|
||||||
|
// (B.1) Failed first commit - Inserts were written to parquet files and HoodieWriteStat has no
|
||||||
|
// entries.
|
||||||
|
// In this scenario, we delete all the parquet files written for the failed commit.
|
||||||
|
// (B.2) Failed recurring commits - Inserts were written to parquet files and updates to log files. In
|
||||||
|
// this scenario, perform (A.1) and for updates written to log files, write rollback blocks.
|
||||||
|
// (B.3) Rollback triggered for first commit - Same as (B.1)
|
||||||
|
// (B.4) Rollback triggered for recurring commits - Same as (B.2) plus we need to delete the log files
|
||||||
|
// as well if the base parquet file gets deleted.
|
||||||
|
try {
|
||||||
|
HoodieCommitMetadata commitMetadata =
|
||||||
|
HoodieCommitMetadata.fromBytes(
|
||||||
|
metaClient.getCommitTimeline().getInstantDetails(new HoodieInstant(true,
|
||||||
|
instantToRollback.getAction(), instantToRollback.getTimestamp())).get(),
|
||||||
|
HoodieCommitMetadata.class);
|
||||||
|
|
||||||
|
// read commit file and (either append delete blocks or delete file)
|
||||||
|
Map<FileStatus, Long> filesToNumBlocksRollback = new HashMap<>();
|
||||||
|
|
||||||
|
// In case all data was inserts and the commit failed, delete the file belonging to that commit
|
||||||
|
// We do not know fileIds for inserts (first inserts are either log files or parquet files),
|
||||||
|
// delete all files for the corresponding failed commit, if present (same as COW)
|
||||||
|
super.deleteCleanedFiles(filesToDeletedStatus, partitionPath, filter);
|
||||||
|
final Set<String> deletedFiles = filesToDeletedStatus.entrySet().stream().map(entry -> {
|
||||||
Path filePath = entry.getKey().getPath();
|
Path filePath = entry.getKey().getPath();
|
||||||
return FSUtils.getFileIdFromFilePath(filePath);
|
return FSUtils.getFileIdFromFilePath(filePath);
|
||||||
}).collect(Collectors.toSet());
|
}).collect(Collectors.toSet());
|
||||||
|
|
||||||
// append rollback blocks for updates
|
// append rollback blocks for updates
|
||||||
if (commitMetadata.getPartitionToWriteStats().containsKey(partitionPath)) {
|
if (commitMetadata.getPartitionToWriteStats().containsKey(partitionPath)) {
|
||||||
hoodieRollbackStats = rollback(index, partitionPath, commit, commitMetadata, filesToDeletedStatus,
|
hoodieRollbackStats = rollback(index, partitionPath, commit, commitMetadata, filesToDeletedStatus,
|
||||||
filesToNumBlocksRollback, deletedFiles);
|
filesToNumBlocksRollback, deletedFiles);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
} catch (IOException io) {
|
} catch (IOException io) {
|
||||||
throw new UncheckedIOException("Failed to rollback for commit " + commit, io);
|
throw new UncheckedIOException("Failed to rollback for commit " + commit, io);
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
default:
|
return hoodieRollbackStats;
|
||||||
break;
|
}).filter(Objects::nonNull).collect();
|
||||||
}
|
|
||||||
return hoodieRollbackStats;
|
|
||||||
}).filter(Objects::nonNull).collect();
|
|
||||||
|
|
||||||
// Delete Inflight instants if enabled
|
// Delete Inflight instants if enabled
|
||||||
deleteInflightInstant(deleteInstants, this.getActiveTimeline(), new HoodieInstant(true, instantToRollback
|
deleteInflightInstant(deleteInstants, this.getActiveTimeline(),
|
||||||
.getAction(), instantToRollback.getTimestamp()));
|
new HoodieInstant(true, instantToRollback.getAction(), instantToRollback.getTimestamp()));
|
||||||
|
|
||||||
logger.debug("Time(in ms) taken to finish rollback " + (System.currentTimeMillis() - startTime));
|
logger.debug("Time(in ms) taken to finish rollback " + (System.currentTimeMillis() - startTime));
|
||||||
|
|
||||||
@@ -332,8 +340,8 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* UpsertPartitioner for MergeOnRead table type, this allows auto correction of small parquet
|
* UpsertPartitioner for MergeOnRead table type, this allows auto correction of small parquet files to larger ones
|
||||||
* files to larger ones without the need for an index in the logFile.
|
* without the need for an index in the logFile.
|
||||||
*/
|
*/
|
||||||
class MergeOnReadUpsertPartitioner extends HoodieCopyOnWriteTable.UpsertPartitioner {
|
class MergeOnReadUpsertPartitioner extends HoodieCopyOnWriteTable.UpsertPartitioner {
|
||||||
|
|
||||||
@@ -361,21 +369,23 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
|
|||||||
// TODO : choose last N small files since there can be multiple small files written to a single partition
|
// TODO : choose last N small files since there can be multiple small files written to a single partition
|
||||||
// by different spark partitions in a single batch
|
// by different spark partitions in a single batch
|
||||||
Option<FileSlice> smallFileSlice = Option.fromJavaOptional(getRTFileSystemView()
|
Option<FileSlice> smallFileSlice = Option.fromJavaOptional(getRTFileSystemView()
|
||||||
.getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp(), false).filter(
|
.getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp(), false)
|
||||||
fileSlice -> fileSlice.getLogFiles().count() < 1
|
.filter(fileSlice -> fileSlice.getLogFiles().count() < 1
|
||||||
&& fileSlice.getDataFile().get().getFileSize() < config
|
&& fileSlice.getDataFile().get().getFileSize() < config.getParquetSmallFileLimit())
|
||||||
.getParquetSmallFileLimit()).sorted((FileSlice left, FileSlice right) ->
|
.sorted((FileSlice left,
|
||||||
left.getDataFile().get().getFileSize() < right.getDataFile().get().getFileSize()
|
FileSlice right) -> left.getDataFile().get().getFileSize() < right.getDataFile().get().getFileSize()
|
||||||
? -1 : 1).findFirst());
|
? -1
|
||||||
|
: 1)
|
||||||
|
.findFirst());
|
||||||
if (smallFileSlice.isPresent()) {
|
if (smallFileSlice.isPresent()) {
|
||||||
allSmallFileSlices.add(smallFileSlice.get());
|
allSmallFileSlices.add(smallFileSlice.get());
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// If we can index log files, we can add more inserts to log files for fileIds including those under
|
// If we can index log files, we can add more inserts to log files for fileIds including those under
|
||||||
// pending compaction.
|
// pending compaction.
|
||||||
List<FileSlice> allFileSlices = getRTFileSystemView()
|
List<FileSlice> allFileSlices =
|
||||||
.getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp(), true)
|
getRTFileSystemView().getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp(), true)
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
for (FileSlice fileSlice : allFileSlices) {
|
for (FileSlice fileSlice : allFileSlices) {
|
||||||
if (isSmallFile(partitionPath, fileSlice)) {
|
if (isSmallFile(partitionPath, fileSlice)) {
|
||||||
allSmallFileSlices.add(fileSlice);
|
allSmallFileSlices.add(fileSlice);
|
||||||
@@ -408,8 +418,7 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
|
|||||||
}
|
}
|
||||||
|
|
||||||
public List<String> getSmallFileIds() {
|
public List<String> getSmallFileIds() {
|
||||||
return (List<String>) smallFiles.stream()
|
return (List<String>) smallFiles.stream().map(smallFile -> ((SmallFile) smallFile).location.getFileId())
|
||||||
.map(smallFile -> ((SmallFile) smallFile).location.getFileId())
|
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -417,8 +426,8 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
|
|||||||
if (!fileSlice.getDataFile().isPresent()) {
|
if (!fileSlice.getDataFile().isPresent()) {
|
||||||
return convertLogFilesSizeToExpectedParquetSize(fileSlice.getLogFiles().collect(Collectors.toList()));
|
return convertLogFilesSizeToExpectedParquetSize(fileSlice.getLogFiles().collect(Collectors.toList()));
|
||||||
} else {
|
} else {
|
||||||
return fileSlice.getDataFile().get().getFileSize() + convertLogFilesSizeToExpectedParquetSize(fileSlice
|
return fileSlice.getDataFile().get().getFileSize()
|
||||||
.getLogFiles().collect(Collectors.toList()));
|
+ convertLogFilesSizeToExpectedParquetSize(fileSlice.getLogFiles().collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -431,13 +440,12 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
|
|||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
public long convertLogFilesSizeToExpectedParquetSize(List<HoodieLogFile> hoodieLogFiles) {
|
public long convertLogFilesSizeToExpectedParquetSize(List<HoodieLogFile> hoodieLogFiles) {
|
||||||
long totalSizeOfLogFiles = hoodieLogFiles.stream().map(hoodieLogFile -> hoodieLogFile.getFileSize())
|
long totalSizeOfLogFiles = hoodieLogFiles.stream().map(hoodieLogFile -> hoodieLogFile.getFileSize())
|
||||||
.filter(size -> size > 0)
|
.filter(size -> size > 0).reduce((a, b) -> (a + b)).orElse(0L);
|
||||||
.reduce((a, b) -> (a + b)).orElse(0L);
|
|
||||||
// Here we assume that if there is no base parquet file, all log files contain only inserts.
|
// Here we assume that if there is no base parquet file, all log files contain only inserts.
|
||||||
// We can then just get the parquet equivalent size of these log files, compare that with
|
// We can then just get the parquet equivalent size of these log files, compare that with
|
||||||
// {@link config.getParquetMaxFileSize()} and decide if there is scope to insert more rows
|
// {@link config.getParquetMaxFileSize()} and decide if there is scope to insert more rows
|
||||||
long logFilesEquivalentParquetFileSize = (long) (totalSizeOfLogFiles * config
|
long logFilesEquivalentParquetFileSize =
|
||||||
.getLogFileToParquetCompressionRatio());
|
(long) (totalSizeOfLogFiles * config.getLogFileToParquetCompressionRatio());
|
||||||
return logFilesEquivalentParquetFileSize;
|
return logFilesEquivalentParquetFileSize;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -447,8 +455,8 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
|
|||||||
Map<HeaderMetadataType, String> header = Maps.newHashMap();
|
Map<HeaderMetadataType, String> header = Maps.newHashMap();
|
||||||
header.put(HeaderMetadataType.INSTANT_TIME, metaClient.getActiveTimeline().lastInstant().get().getTimestamp());
|
header.put(HeaderMetadataType.INSTANT_TIME, metaClient.getActiveTimeline().lastInstant().get().getTimestamp());
|
||||||
header.put(HeaderMetadataType.TARGET_INSTANT_TIME, commit);
|
header.put(HeaderMetadataType.TARGET_INSTANT_TIME, commit);
|
||||||
header.put(HeaderMetadataType.COMMAND_BLOCK_TYPE, String.valueOf(HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK
|
header.put(HeaderMetadataType.COMMAND_BLOCK_TYPE,
|
||||||
.ordinal()));
|
String.valueOf(HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal()));
|
||||||
return header;
|
return header;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -461,52 +469,47 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
|
|||||||
// But the index (global) might store the baseCommit of the parquet and not the requested, hence get the
|
// But the index (global) might store the baseCommit of the parquet and not the requested, hence get the
|
||||||
// baseCommit always by listing the file slice
|
// baseCommit always by listing the file slice
|
||||||
Map<String, String> fileIdToBaseCommitTimeForLogMap = this.getRTFileSystemView().getLatestFileSlices(partitionPath)
|
Map<String, String> fileIdToBaseCommitTimeForLogMap = this.getRTFileSystemView().getLatestFileSlices(partitionPath)
|
||||||
.collect(Collectors.toMap(FileSlice::getFileId, FileSlice::getBaseInstantTime));
|
.collect(Collectors.toMap(FileSlice::getFileId, FileSlice::getBaseInstantTime));
|
||||||
commitMetadata.getPartitionToWriteStats().get(partitionPath).stream()
|
commitMetadata.getPartitionToWriteStats().get(partitionPath).stream().filter(wStat -> {
|
||||||
.filter(wStat -> {
|
// Filter out stats without prevCommit since they are all inserts
|
||||||
// Filter out stats without prevCommit since they are all inserts
|
return wStat != null && wStat.getPrevCommit() != HoodieWriteStat.NULL_COMMIT && wStat.getPrevCommit() != null
|
||||||
return wStat != null && wStat.getPrevCommit() != HoodieWriteStat.NULL_COMMIT && wStat.getPrevCommit() != null
|
&& !deletedFiles.contains(wStat.getFileId());
|
||||||
&& !deletedFiles.contains(wStat.getFileId());
|
}).forEach(wStat -> {
|
||||||
}).forEach(wStat -> {
|
Writer writer = null;
|
||||||
Writer writer = null;
|
String baseCommitTime = fileIdToBaseCommitTimeForLogMap.get(wStat.getFileId());
|
||||||
String baseCommitTime = fileIdToBaseCommitTimeForLogMap.get(wStat.getFileId());
|
if (null != baseCommitTime) {
|
||||||
if (null != baseCommitTime) {
|
boolean success = false;
|
||||||
boolean success = false;
|
try {
|
||||||
try {
|
writer = HoodieLogFormat.newWriterBuilder()
|
||||||
writer = HoodieLogFormat.newWriterBuilder().onParentPath(
|
.onParentPath(FSUtils.getPartitionPath(this.getMetaClient().getBasePath(), partitionPath))
|
||||||
FSUtils.getPartitionPath(this.getMetaClient().getBasePath(), partitionPath))
|
.withFileId(wStat.getFileId()).overBaseCommit(baseCommitTime).withFs(this.metaClient.getFs())
|
||||||
.withFileId(wStat.getFileId()).overBaseCommit(baseCommitTime)
|
.withFileExtension(HoodieLogFile.DELTA_EXTENSION).build();
|
||||||
.withFs(this.metaClient.getFs())
|
// generate metadata
|
||||||
.withFileExtension(HoodieLogFile.DELTA_EXTENSION).build();
|
Map<HeaderMetadataType, String> header = generateHeader(commit);
|
||||||
// generate metadata
|
// if update belongs to an existing log file
|
||||||
Map<HeaderMetadataType, String> header = generateHeader(commit);
|
writer = writer.appendBlock(new HoodieCommandBlock(header));
|
||||||
// if update belongs to an existing log file
|
success = true;
|
||||||
writer = writer.appendBlock(new HoodieCommandBlock(header));
|
} catch (IOException | InterruptedException io) {
|
||||||
success = true;
|
throw new HoodieRollbackException("Failed to rollback for commit " + commit, io);
|
||||||
} catch (IOException | InterruptedException io) {
|
} finally {
|
||||||
throw new HoodieRollbackException(
|
try {
|
||||||
"Failed to rollback for commit " + commit, io);
|
if (writer != null) {
|
||||||
} finally {
|
writer.close();
|
||||||
try {
|
|
||||||
if (writer != null) {
|
|
||||||
writer.close();
|
|
||||||
}
|
|
||||||
if (success) {
|
|
||||||
// This step is intentionally done after writer is closed. Guarantees that
|
|
||||||
// getFileStatus would reflect correct stats and FileNotFoundException is not thrown in
|
|
||||||
// cloud-storage : HUDI-168
|
|
||||||
filesToNumBlocksRollback.put(this.getMetaClient().getFs()
|
|
||||||
.getFileStatus(writer.getLogFile().getPath()), 1L);
|
|
||||||
}
|
|
||||||
} catch (IOException io) {
|
|
||||||
throw new UncheckedIOException(io);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
if (success) {
|
||||||
|
// This step is intentionally done after writer is closed. Guarantees that
|
||||||
|
// getFileStatus would reflect correct stats and FileNotFoundException is not thrown in
|
||||||
|
// cloud-storage : HUDI-168
|
||||||
|
filesToNumBlocksRollback.put(this.getMetaClient().getFs().getFileStatus(writer.getLogFile().getPath()),
|
||||||
|
1L);
|
||||||
|
}
|
||||||
|
} catch (IOException io) {
|
||||||
|
throw new UncheckedIOException(io);
|
||||||
}
|
}
|
||||||
});
|
}
|
||||||
return HoodieRollbackStat.newBuilder()
|
}
|
||||||
.withPartitionPath(partitionPath)
|
});
|
||||||
.withDeletedFileResults(filesToDeletedStatus)
|
return HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath).withDeletedFileResults(filesToDeletedStatus)
|
||||||
.withRollbackBlockAppendResults(filesToNumBlocksRollback).build();
|
.withRollbackBlockAppendResults(filesToNumBlocksRollback).build();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -82,22 +82,21 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
protected HoodieTable(HoodieWriteConfig config, JavaSparkContext jsc) {
|
protected HoodieTable(HoodieWriteConfig config, JavaSparkContext jsc) {
|
||||||
this.config = config;
|
this.config = config;
|
||||||
this.hadoopConfiguration = new SerializableConfiguration(jsc.hadoopConfiguration());
|
this.hadoopConfiguration = new SerializableConfiguration(jsc.hadoopConfiguration());
|
||||||
this.viewManager = FileSystemViewManager.createViewManager(
|
this.viewManager = FileSystemViewManager.createViewManager(new SerializableConfiguration(jsc.hadoopConfiguration()),
|
||||||
new SerializableConfiguration(jsc.hadoopConfiguration()), config.getViewStorageConfig());
|
config.getViewStorageConfig());
|
||||||
this.metaClient = ClientUtils.createMetaClient(jsc, config, true);
|
this.metaClient = ClientUtils.createMetaClient(jsc, config, true);
|
||||||
this.index = HoodieIndex.createIndex(config, jsc);
|
this.index = HoodieIndex.createIndex(config, jsc);
|
||||||
}
|
}
|
||||||
|
|
||||||
private synchronized FileSystemViewManager getViewManager() {
|
private synchronized FileSystemViewManager getViewManager() {
|
||||||
if (null == viewManager) {
|
if (null == viewManager) {
|
||||||
viewManager = FileSystemViewManager.createViewManager(hadoopConfiguration,
|
viewManager = FileSystemViewManager.createViewManager(hadoopConfiguration, config.getViewStorageConfig());
|
||||||
config.getViewStorageConfig());
|
|
||||||
}
|
}
|
||||||
return viewManager;
|
return viewManager;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static <T extends HoodieRecordPayload> HoodieTable<T> getHoodieTable(
|
public static <T extends HoodieRecordPayload> HoodieTable<T> getHoodieTable(HoodieTableMetaClient metaClient,
|
||||||
HoodieTableMetaClient metaClient, HoodieWriteConfig config, JavaSparkContext jsc) {
|
HoodieWriteConfig config, JavaSparkContext jsc) {
|
||||||
switch (metaClient.getTableType()) {
|
switch (metaClient.getTableType()) {
|
||||||
case COPY_ON_WRITE:
|
case COPY_ON_WRITE:
|
||||||
return new HoodieCopyOnWriteTable<>(config, jsc);
|
return new HoodieCopyOnWriteTable<>(config, jsc);
|
||||||
@@ -202,8 +201,7 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
* Get the list of savepoints in this table
|
* Get the list of savepoints in this table
|
||||||
*/
|
*/
|
||||||
public List<String> getSavepoints() {
|
public List<String> getSavepoints() {
|
||||||
return getCompletedSavepointTimeline().getInstants().map(HoodieInstant::getTimestamp)
|
return getCompletedSavepointTimeline().getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList());
|
||||||
.collect(Collectors.toList());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -214,18 +212,14 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
throw new HoodieSavepointException(
|
throw new HoodieSavepointException(
|
||||||
"Could not get data files for savepoint " + savepointTime + ". No such savepoint.");
|
"Could not get data files for savepoint " + savepointTime + ". No such savepoint.");
|
||||||
}
|
}
|
||||||
HoodieInstant instant = new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION,
|
HoodieInstant instant = new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION, savepointTime);
|
||||||
savepointTime);
|
|
||||||
HoodieSavepointMetadata metadata = null;
|
HoodieSavepointMetadata metadata = null;
|
||||||
try {
|
try {
|
||||||
metadata = AvroUtils
|
metadata = AvroUtils.deserializeHoodieSavepointMetadata(getActiveTimeline().getInstantDetails(instant).get());
|
||||||
.deserializeHoodieSavepointMetadata(getActiveTimeline().getInstantDetails(instant).get());
|
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
throw new HoodieSavepointException(
|
throw new HoodieSavepointException("Could not get savepointed data files for savepoint " + savepointTime, e);
|
||||||
"Could not get savepointed data files for savepoint " + savepointTime, e);
|
|
||||||
}
|
}
|
||||||
return metadata.getPartitionMetadata().values().stream()
|
return metadata.getPartitionMetadata().values().stream().flatMap(s -> s.getSavepointDataFile().stream());
|
||||||
.flatMap(s -> s.getSavepointDataFile().stream());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public HoodieActiveTimeline getActiveTimeline() {
|
public HoodieActiveTimeline getActiveTimeline() {
|
||||||
@@ -242,30 +236,30 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
/**
|
/**
|
||||||
* Perform the ultimate IO for a given upserted (RDD) partition
|
* Perform the ultimate IO for a given upserted (RDD) partition
|
||||||
*/
|
*/
|
||||||
public abstract Iterator<List<WriteStatus>> handleUpsertPartition(String commitTime,
|
public abstract Iterator<List<WriteStatus>> handleUpsertPartition(String commitTime, Integer partition,
|
||||||
Integer partition, Iterator<HoodieRecord<T>> recordIterator, Partitioner partitioner);
|
Iterator<HoodieRecord<T>> recordIterator, Partitioner partitioner);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Perform the ultimate IO for a given inserted (RDD) partition
|
* Perform the ultimate IO for a given inserted (RDD) partition
|
||||||
*/
|
*/
|
||||||
public abstract Iterator<List<WriteStatus>> handleInsertPartition(String commitTime,
|
public abstract Iterator<List<WriteStatus>> handleInsertPartition(String commitTime, Integer partition,
|
||||||
Integer partition, Iterator<HoodieRecord<T>> recordIterator, Partitioner partitioner);
|
Iterator<HoodieRecord<T>> recordIterator, Partitioner partitioner);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Schedule compaction for the instant time
|
* Schedule compaction for the instant time
|
||||||
* @param jsc Spark Context
|
*
|
||||||
|
* @param jsc Spark Context
|
||||||
* @param instantTime Instant Time for scheduling compaction
|
* @param instantTime Instant Time for scheduling compaction
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public abstract HoodieCompactionPlan scheduleCompaction(JavaSparkContext jsc, String instantTime);
|
public abstract HoodieCompactionPlan scheduleCompaction(JavaSparkContext jsc, String instantTime);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Run Compaction on the table. Compaction arranges the data so that it is optimized for data
|
* Run Compaction on the table. Compaction arranges the data so that it is optimized for data access
|
||||||
* access
|
|
||||||
*
|
*
|
||||||
* @param jsc Spark Context
|
* @param jsc Spark Context
|
||||||
* @param compactionInstantTime Instant Time
|
* @param compactionInstantTime Instant Time
|
||||||
* @param compactionPlan Compaction Plan
|
* @param compactionPlan Compaction Plan
|
||||||
*/
|
*/
|
||||||
public abstract JavaRDD<WriteStatus> compact(JavaSparkContext jsc, String compactionInstantTime,
|
public abstract JavaRDD<WriteStatus> compact(JavaSparkContext jsc, String compactionInstantTime,
|
||||||
HoodieCompactionPlan compactionPlan);
|
HoodieCompactionPlan compactionPlan);
|
||||||
@@ -276,9 +270,9 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
public abstract List<HoodieCleanStat> clean(JavaSparkContext jsc);
|
public abstract List<HoodieCleanStat> clean(JavaSparkContext jsc);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Rollback the (inflight/committed) record changes with the given commit time. Four steps: (1)
|
* Rollback the (inflight/committed) record changes with the given commit time. Four steps: (1) Atomically unpublish
|
||||||
* Atomically unpublish this commit (2) clean indexing data (3) clean new generated parquet files
|
* this commit (2) clean indexing data (3) clean new generated parquet files / log blocks (4) Finally, delete
|
||||||
* / log blocks (4) Finally, delete .<action>.commit or .<action>.inflight file if deleteInstants = true
|
* .<action>.commit or .<action>.inflight file if deleteInstants = true
|
||||||
*/
|
*/
|
||||||
public abstract List<HoodieRollbackStat> rollback(JavaSparkContext jsc, String commit, boolean deleteInstants)
|
public abstract List<HoodieRollbackStat> rollback(JavaSparkContext jsc, String commit, boolean deleteInstants)
|
||||||
throws IOException;
|
throws IOException;
|
||||||
@@ -297,6 +291,7 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Delete Marker directory corresponding to an instant
|
* Delete Marker directory corresponding to an instant
|
||||||
|
*
|
||||||
* @param instantTs Instant Time
|
* @param instantTs Instant Time
|
||||||
*/
|
*/
|
||||||
protected void deleteMarkerDir(String instantTs) {
|
protected void deleteMarkerDir(String instantTs) {
|
||||||
@@ -317,10 +312,10 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
* Reconciles WriteStats and marker files to detect and safely delete duplicate data files created because of Spark
|
* Reconciles WriteStats and marker files to detect and safely delete duplicate data files created because of Spark
|
||||||
* retries.
|
* retries.
|
||||||
*
|
*
|
||||||
* @param jsc Spark Context
|
* @param jsc Spark Context
|
||||||
* @param instantTs Instant Timestamp
|
* @param instantTs Instant Timestamp
|
||||||
* @param stats Hoodie Write Stat
|
* @param stats Hoodie Write Stat
|
||||||
* @param consistencyCheckEnabled Consistency Check Enabled
|
* @param consistencyCheckEnabled Consistency Check Enabled
|
||||||
* @throws HoodieIOException
|
* @throws HoodieIOException
|
||||||
*/
|
*/
|
||||||
protected void cleanFailedWrites(JavaSparkContext jsc, String instantTs, List<HoodieWriteStat> stats,
|
protected void cleanFailedWrites(JavaSparkContext jsc, String instantTs, List<HoodieWriteStat> stats,
|
||||||
@@ -343,13 +338,12 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
// Contains list of partially created files. These needs to be cleaned up.
|
// Contains list of partially created files. These needs to be cleaned up.
|
||||||
invalidDataPaths.removeAll(validDataPaths);
|
invalidDataPaths.removeAll(validDataPaths);
|
||||||
if (!invalidDataPaths.isEmpty()) {
|
if (!invalidDataPaths.isEmpty()) {
|
||||||
logger.info("Removing duplicate data files created due to spark retries before committing. Paths="
|
logger.info(
|
||||||
+ invalidDataPaths);
|
"Removing duplicate data files created due to spark retries before committing. Paths=" + invalidDataPaths);
|
||||||
}
|
}
|
||||||
|
|
||||||
Map<String, List<Pair<String, String>>> groupByPartition = invalidDataPaths.stream()
|
Map<String, List<Pair<String, String>>> groupByPartition = invalidDataPaths.stream()
|
||||||
.map(dp -> Pair.of(new Path(dp).getParent().toString(), dp))
|
.map(dp -> Pair.of(new Path(dp).getParent().toString(), dp)).collect(Collectors.groupingBy(Pair::getKey));
|
||||||
.collect(Collectors.groupingBy(Pair::getKey));
|
|
||||||
|
|
||||||
if (!groupByPartition.isEmpty()) {
|
if (!groupByPartition.isEmpty()) {
|
||||||
// Ensure all files in delete list is actually present. This is mandatory for an eventually consistent FS.
|
// Ensure all files in delete list is actually present. This is mandatory for an eventually consistent FS.
|
||||||
@@ -394,7 +388,8 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Ensures all files passed either appear or disappear
|
* Ensures all files passed either appear or disappear
|
||||||
* @param jsc JavaSparkContext
|
*
|
||||||
|
* @param jsc JavaSparkContext
|
||||||
* @param groupByPartition Files grouped by partition
|
* @param groupByPartition Files grouped by partition
|
||||||
* @param visibility Appear/Disappear
|
* @param visibility Appear/Disappear
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -23,13 +23,11 @@ import org.apache.hudi.common.model.HoodieRecordPayload;
|
|||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Repartition input records into at least expected number of output spark partitions. It should
|
* Repartition input records into at least expected number of output spark partitions. It should give below guarantees -
|
||||||
* give below guarantees - Output spark partition will have records from only one hoodie partition.
|
* Output spark partition will have records from only one hoodie partition. - Average records per output spark
|
||||||
* - Average records per output spark partitions should be almost equal to (#inputRecords /
|
* partitions should be almost equal to (#inputRecords / #outputSparkPartitions) to avoid possible skews.
|
||||||
* #outputSparkPartitions) to avoid possible skews.
|
|
||||||
*/
|
*/
|
||||||
public interface UserDefinedBulkInsertPartitioner<T extends HoodieRecordPayload> {
|
public interface UserDefinedBulkInsertPartitioner<T extends HoodieRecordPayload> {
|
||||||
|
|
||||||
JavaRDD<HoodieRecord<T>> repartitionRecords(JavaRDD<HoodieRecord<T>> records,
|
JavaRDD<HoodieRecord<T>> repartitionRecords(JavaRDD<HoodieRecord<T>> records, int outputSparkPartitions);
|
||||||
int outputSparkPartitions);
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -30,8 +30,7 @@ import org.apache.spark.api.java.JavaRDD;
|
|||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Information about incoming records for upsert/insert obtained either via sampling or
|
* Information about incoming records for upsert/insert obtained either via sampling or introspecting the data fully
|
||||||
* introspecting the data fully
|
|
||||||
* <p>
|
* <p>
|
||||||
* TODO(vc): Think about obtaining this directly from index.tagLocation
|
* TODO(vc): Think about obtaining this directly from index.tagLocation
|
||||||
*/
|
*/
|
||||||
@@ -62,11 +61,10 @@ public class WorkloadProfile<T extends HoodieRecordPayload> implements Serializa
|
|||||||
|
|
||||||
Map<Tuple2<String, Option<HoodieRecordLocation>>, Long> partitionLocationCounts = taggedRecords
|
Map<Tuple2<String, Option<HoodieRecordLocation>>, Long> partitionLocationCounts = taggedRecords
|
||||||
.mapToPair(record -> new Tuple2<>(
|
.mapToPair(record -> new Tuple2<>(
|
||||||
new Tuple2<>(record.getPartitionPath(), Option.ofNullable(record.getCurrentLocation())),
|
new Tuple2<>(record.getPartitionPath(), Option.ofNullable(record.getCurrentLocation())), record))
|
||||||
record)).countByKey();
|
.countByKey();
|
||||||
|
|
||||||
for (Map.Entry<Tuple2<String, Option<HoodieRecordLocation>>, Long> e : partitionLocationCounts
|
for (Map.Entry<Tuple2<String, Option<HoodieRecordLocation>>, Long> e : partitionLocationCounts.entrySet()) {
|
||||||
.entrySet()) {
|
|
||||||
String partitionPath = e.getKey()._1();
|
String partitionPath = e.getKey()._1();
|
||||||
Long count = e.getValue();
|
Long count = e.getValue();
|
||||||
Option<HoodieRecordLocation> locOption = e.getKey()._2();
|
Option<HoodieRecordLocation> locOption = e.getKey()._2();
|
||||||
|
|||||||
@@ -41,7 +41,8 @@ import org.apache.spark.api.java.JavaRDD;
|
|||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Driver program that uses the Hoodie client with synthetic workload, and performs basic operations. <p>
|
* Driver program that uses the Hoodie client with synthetic workload, and performs basic operations.
|
||||||
|
* <p>
|
||||||
*/
|
*/
|
||||||
public class HoodieClientExample {
|
public class HoodieClientExample {
|
||||||
|
|
||||||
@@ -82,18 +83,15 @@ public class HoodieClientExample {
|
|||||||
Path path = new Path(tablePath);
|
Path path = new Path(tablePath);
|
||||||
FileSystem fs = FSUtils.getFs(tablePath, jsc.hadoopConfiguration());
|
FileSystem fs = FSUtils.getFs(tablePath, jsc.hadoopConfiguration());
|
||||||
if (!fs.exists(path)) {
|
if (!fs.exists(path)) {
|
||||||
HoodieTableMetaClient
|
HoodieTableMetaClient.initTableType(jsc.hadoopConfiguration(), tablePath, HoodieTableType.valueOf(tableType),
|
||||||
.initTableType(jsc.hadoopConfiguration(), tablePath, HoodieTableType.valueOf(tableType), tableName,
|
tableName, HoodieAvroPayload.class.getName());
|
||||||
HoodieAvroPayload.class.getName());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create the write client to write some records in
|
// Create the write client to write some records in
|
||||||
HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(tablePath)
|
HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(tablePath)
|
||||||
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
|
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2).forTable(tableName)
|
||||||
.forTable(tableName)
|
|
||||||
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(IndexType.BLOOM).build())
|
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(IndexType.BLOOM).build())
|
||||||
.withCompactionConfig(
|
.withCompactionConfig(HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 3).build()).build();
|
||||||
HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 3).build()).build();
|
|
||||||
HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
|
HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -53,7 +53,7 @@ public abstract class HoodieClientTestHarness extends HoodieCommonTestHarness im
|
|||||||
protected transient ExecutorService executorService;
|
protected transient ExecutorService executorService;
|
||||||
protected transient HoodieTableMetaClient metaClient;
|
protected transient HoodieTableMetaClient metaClient;
|
||||||
|
|
||||||
//dfs
|
// dfs
|
||||||
protected String dfsBasePath;
|
protected String dfsBasePath;
|
||||||
protected transient HdfsTestService hdfsTestService;
|
protected transient HdfsTestService hdfsTestService;
|
||||||
protected transient MiniDFSCluster dfsCluster;
|
protected transient MiniDFSCluster dfsCluster;
|
||||||
@@ -74,6 +74,7 @@ public abstract class HoodieClientTestHarness extends HoodieCommonTestHarness im
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Cleanups resource group for the subclasses of {@link TestHoodieClientBase}.
|
* Cleanups resource group for the subclasses of {@link TestHoodieClientBase}.
|
||||||
|
*
|
||||||
* @throws IOException
|
* @throws IOException
|
||||||
*/
|
*/
|
||||||
public void cleanupResources() throws IOException {
|
public void cleanupResources() throws IOException {
|
||||||
@@ -84,8 +85,7 @@ public abstract class HoodieClientTestHarness extends HoodieCommonTestHarness im
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Initializes the Spark contexts ({@link JavaSparkContext} and {@link SQLContext})
|
* Initializes the Spark contexts ({@link JavaSparkContext} and {@link SQLContext}) with the given application name.
|
||||||
* with the given application name.
|
|
||||||
*
|
*
|
||||||
* @param appName The specified application name.
|
* @param appName The specified application name.
|
||||||
*/
|
*/
|
||||||
@@ -94,13 +94,13 @@ public abstract class HoodieClientTestHarness extends HoodieCommonTestHarness im
|
|||||||
jsc = new JavaSparkContext(HoodieClientTestUtils.getSparkConfForTest(appName));
|
jsc = new JavaSparkContext(HoodieClientTestUtils.getSparkConfForTest(appName));
|
||||||
jsc.setLogLevel("ERROR");
|
jsc.setLogLevel("ERROR");
|
||||||
|
|
||||||
//SQLContext stuff
|
// SQLContext stuff
|
||||||
sqlContext = new SQLContext(jsc);
|
sqlContext = new SQLContext(jsc);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Initializes the Spark contexts ({@link JavaSparkContext} and {@link SQLContext})
|
* Initializes the Spark contexts ({@link JavaSparkContext} and {@link SQLContext}) with a default name
|
||||||
* with a default name <b>TestHoodieClient</b>.
|
* <b>TestHoodieClient</b>.
|
||||||
*/
|
*/
|
||||||
protected void initSparkContexts() {
|
protected void initSparkContexts() {
|
||||||
initSparkContexts("TestHoodieClient");
|
initSparkContexts("TestHoodieClient");
|
||||||
@@ -155,8 +155,8 @@ public abstract class HoodieClientTestHarness extends HoodieCommonTestHarness im
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Initializes an instance of {@link HoodieTableMetaClient} with a special table type
|
* Initializes an instance of {@link HoodieTableMetaClient} with a special table type specified by
|
||||||
* specified by {@code getTableType()}.
|
* {@code getTableType()}.
|
||||||
*
|
*
|
||||||
* @throws IOException
|
* @throws IOException
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -73,15 +73,14 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
|
|||||||
|
|
||||||
private HoodieWriteConfig.Builder getConfigBuilder(Boolean autoCommit) {
|
private HoodieWriteConfig.Builder getConfigBuilder(Boolean autoCommit) {
|
||||||
return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
|
return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
|
||||||
.withAutoCommit(autoCommit).withAssumeDatePartitioning(true).withCompactionConfig(
|
.withAutoCommit(autoCommit).withAssumeDatePartitioning(true)
|
||||||
HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024 * 1024).withInlineCompaction(false)
|
.withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024 * 1024)
|
||||||
.withMaxNumDeltaCommitsBeforeCompaction(1).build())
|
.withInlineCompaction(false).withMaxNumDeltaCommitsBeforeCompaction(1).build())
|
||||||
.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024 * 1024).build())
|
.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024 * 1024).build())
|
||||||
.forTable("test-trip-table")
|
.forTable("test-trip-table")
|
||||||
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
|
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
|
||||||
.withEmbeddedTimelineServerEnabled(true).withFileSystemViewConfig(
|
.withEmbeddedTimelineServerEnabled(true).withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder()
|
||||||
FileSystemViewStorageConfig.newBuilder().withStorageType(FileSystemViewStorageType.EMBEDDED_KV_STORE)
|
.withStorageType(FileSystemViewStorageType.EMBEDDED_KV_STORE).build());
|
||||||
.build());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@@ -97,8 +96,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
|
|||||||
int numRecs = 2000;
|
int numRecs = 2000;
|
||||||
|
|
||||||
List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
|
List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
|
||||||
runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime),
|
runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true,
|
||||||
records, cfg, true, new ArrayList<>());
|
new ArrayList<>());
|
||||||
|
|
||||||
// Schedule compaction but do not run them
|
// Schedule compaction but do not run them
|
||||||
scheduleCompaction(compactionInstantTime, client, cfg);
|
scheduleCompaction(compactionInstantTime, client, cfg);
|
||||||
@@ -158,8 +157,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
|
|||||||
try (HoodieWriteClient client = getHoodieWriteClient(cfg, true);) {
|
try (HoodieWriteClient client = getHoodieWriteClient(cfg, true);) {
|
||||||
|
|
||||||
List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
|
List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
|
||||||
records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime),
|
records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true,
|
||||||
records, cfg, true, new ArrayList<>());
|
new ArrayList<>());
|
||||||
|
|
||||||
// Schedule compaction but do not run them
|
// Schedule compaction but do not run them
|
||||||
scheduleCompaction(compactionInstantTime, client, cfg);
|
scheduleCompaction(compactionInstantTime, client, cfg);
|
||||||
@@ -177,20 +176,18 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
|
|||||||
assertTrue("inflight instant has expected instant time",
|
assertTrue("inflight instant has expected instant time",
|
||||||
inflightInstant.getTimestamp().equals(inflightInstantTime));
|
inflightInstant.getTimestamp().equals(inflightInstantTime));
|
||||||
|
|
||||||
//This should rollback
|
// This should rollback
|
||||||
client.startCommitWithTime(nextInflightInstantTime);
|
client.startCommitWithTime(nextInflightInstantTime);
|
||||||
|
|
||||||
//Validate
|
// Validate
|
||||||
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
||||||
inflightInstant =
|
inflightInstant = metaClient.getActiveTimeline().filterInflightsExcludingCompaction().firstInstant().get();
|
||||||
metaClient.getActiveTimeline().filterInflightsExcludingCompaction().firstInstant().get();
|
|
||||||
assertTrue("inflight instant has expected instant time",
|
assertTrue("inflight instant has expected instant time",
|
||||||
inflightInstant.getTimestamp().equals(nextInflightInstantTime));
|
inflightInstant.getTimestamp().equals(nextInflightInstantTime));
|
||||||
assertTrue("Expect only one inflight instant",
|
assertTrue("Expect only one inflight instant",
|
||||||
metaClient.getActiveTimeline().filterInflightsExcludingCompaction().getInstants().count() == 1);
|
metaClient.getActiveTimeline().filterInflightsExcludingCompaction().getInstants().count() == 1);
|
||||||
//Expect pending Compaction to be present
|
// Expect pending Compaction to be present
|
||||||
pendingCompactionInstant =
|
pendingCompactionInstant = metaClient.getActiveTimeline().filterPendingCompactionTimeline().firstInstant().get();
|
||||||
metaClient.getActiveTimeline().filterPendingCompactionTimeline().firstInstant().get();
|
|
||||||
assertTrue("Pending Compaction instant has expected instant time",
|
assertTrue("Pending Compaction instant has expected instant time",
|
||||||
pendingCompactionInstant.getTimestamp().equals(compactionInstantTime));
|
pendingCompactionInstant.getTimestamp().equals(compactionInstantTime));
|
||||||
}
|
}
|
||||||
@@ -211,8 +208,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
|
|||||||
int numRecs = 2000;
|
int numRecs = 2000;
|
||||||
|
|
||||||
List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
|
List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
|
||||||
records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime),
|
records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true,
|
||||||
records, cfg, true, new ArrayList<>());
|
new ArrayList<>());
|
||||||
|
|
||||||
// Schedule and mark compaction instant as inflight
|
// Schedule and mark compaction instant as inflight
|
||||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
||||||
@@ -221,8 +218,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
|
|||||||
moveCompactionFromRequestedToInflight(compactionInstantTime, client, cfg);
|
moveCompactionFromRequestedToInflight(compactionInstantTime, client, cfg);
|
||||||
|
|
||||||
// Complete ingestions
|
// Complete ingestions
|
||||||
runNextDeltaCommits(client, Arrays.asList(thirdInstantTime, fourthInstantTime),
|
runNextDeltaCommits(client, Arrays.asList(thirdInstantTime, fourthInstantTime), records, cfg, false,
|
||||||
records, cfg, false, Arrays.asList(compactionInstantTime));
|
Arrays.asList(compactionInstantTime));
|
||||||
|
|
||||||
// execute inflight compaction
|
// execute inflight compaction
|
||||||
executeCompaction(compactionInstantTime, client, hoodieTable, cfg, numRecs, true);
|
executeCompaction(compactionInstantTime, client, hoodieTable, cfg, numRecs, true);
|
||||||
@@ -242,8 +239,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
|
|||||||
int numRecs = 2000;
|
int numRecs = 2000;
|
||||||
|
|
||||||
List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
|
List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
|
||||||
records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime),
|
records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true,
|
||||||
records, cfg, true, new ArrayList<>());
|
new ArrayList<>());
|
||||||
|
|
||||||
// Schedule compaction but do not run them
|
// Schedule compaction but do not run them
|
||||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
||||||
@@ -256,8 +253,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
|
|||||||
|
|
||||||
boolean gotException = false;
|
boolean gotException = false;
|
||||||
try {
|
try {
|
||||||
runNextDeltaCommits(client, Arrays.asList(failedInstantTime),
|
runNextDeltaCommits(client, Arrays.asList(failedInstantTime), records, cfg, false,
|
||||||
records, cfg, false, Arrays.asList(compactionInstantTime));
|
Arrays.asList(compactionInstantTime));
|
||||||
} catch (IllegalArgumentException iex) {
|
} catch (IllegalArgumentException iex) {
|
||||||
// Latest pending compaction instant time must be earlier than this instant time. Should fail here
|
// Latest pending compaction instant time must be earlier than this instant time. Should fail here
|
||||||
gotException = true;
|
gotException = true;
|
||||||
@@ -279,8 +276,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
|
|||||||
int numRecs = 2000;
|
int numRecs = 2000;
|
||||||
|
|
||||||
List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
|
List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
|
||||||
records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime),
|
records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true,
|
||||||
records, cfg, true, new ArrayList<>());
|
new ArrayList<>());
|
||||||
|
|
||||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
||||||
createNextDeltaCommit(inflightInstantTime, records, client, metaClient, cfg, true);
|
createNextDeltaCommit(inflightInstantTime, records, client, metaClient, cfg, true);
|
||||||
@@ -315,8 +312,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
|
|||||||
int numRecs = 2000;
|
int numRecs = 2000;
|
||||||
|
|
||||||
List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
|
List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
|
||||||
records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime),
|
records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true,
|
||||||
records, cfg, true, new ArrayList<>());
|
new ArrayList<>());
|
||||||
|
|
||||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
||||||
boolean gotException = false;
|
boolean gotException = false;
|
||||||
@@ -337,8 +334,7 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
|
|||||||
} catch (IllegalArgumentException iex) {
|
} catch (IllegalArgumentException iex) {
|
||||||
gotException = true;
|
gotException = true;
|
||||||
}
|
}
|
||||||
assertTrue("Compaction Instant to be scheduled cannot have same timestamp as committed instant",
|
assertTrue("Compaction Instant to be scheduled cannot have same timestamp as committed instant", gotException);
|
||||||
gotException);
|
|
||||||
|
|
||||||
compactionInstantTime = "006";
|
compactionInstantTime = "006";
|
||||||
scheduleCompaction(compactionInstantTime, client, cfg);
|
scheduleCompaction(compactionInstantTime, client, cfg);
|
||||||
@@ -349,8 +345,7 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
|
|||||||
} catch (IllegalArgumentException iex) {
|
} catch (IllegalArgumentException iex) {
|
||||||
gotException = true;
|
gotException = true;
|
||||||
}
|
}
|
||||||
assertTrue("Compaction Instant to be scheduled cannot have same timestamp as a pending compaction",
|
assertTrue("Compaction Instant to be scheduled cannot have same timestamp as a pending compaction", gotException);
|
||||||
gotException);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@@ -365,8 +360,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
|
|||||||
int numRecs = 2000;
|
int numRecs = 2000;
|
||||||
|
|
||||||
List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
|
List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
|
||||||
records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime),
|
records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true,
|
||||||
records, cfg, true, new ArrayList<>());
|
new ArrayList<>());
|
||||||
|
|
||||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
||||||
HoodieTable hoodieTable = getHoodieTable(metaClient, cfg);
|
HoodieTable hoodieTable = getHoodieTable(metaClient, cfg);
|
||||||
@@ -376,7 +371,7 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
|
|||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testInterleavedCompaction() throws Exception {
|
public void testInterleavedCompaction() throws Exception {
|
||||||
//Case: Two delta commits before and after compaction schedule
|
// Case: Two delta commits before and after compaction schedule
|
||||||
HoodieWriteConfig cfg = getConfig(true);
|
HoodieWriteConfig cfg = getConfig(true);
|
||||||
try (HoodieWriteClient client = getHoodieWriteClient(cfg, true);) {
|
try (HoodieWriteClient client = getHoodieWriteClient(cfg, true);) {
|
||||||
|
|
||||||
@@ -389,15 +384,15 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
|
|||||||
int numRecs = 2000;
|
int numRecs = 2000;
|
||||||
|
|
||||||
List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
|
List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
|
||||||
records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime),
|
records = runNextDeltaCommits(client, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true,
|
||||||
records, cfg, true, new ArrayList<>());
|
new ArrayList<>());
|
||||||
|
|
||||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
||||||
HoodieTable hoodieTable = getHoodieTable(metaClient, cfg);
|
HoodieTable hoodieTable = getHoodieTable(metaClient, cfg);
|
||||||
scheduleCompaction(compactionInstantTime, client, cfg);
|
scheduleCompaction(compactionInstantTime, client, cfg);
|
||||||
|
|
||||||
runNextDeltaCommits(client, Arrays.asList(thirdInstantTime, fourthInstantTime),
|
runNextDeltaCommits(client, Arrays.asList(thirdInstantTime, fourthInstantTime), records, cfg, false,
|
||||||
records, cfg, false, Arrays.asList(compactionInstantTime));
|
Arrays.asList(compactionInstantTime));
|
||||||
executeCompaction(compactionInstantTime, client, hoodieTable, cfg, numRecs, true);
|
executeCompaction(compactionInstantTime, client, hoodieTable, cfg, numRecs, true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -428,8 +423,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private List<HoodieRecord> runNextDeltaCommits(HoodieWriteClient client, List<String> deltaInstants,
|
private List<HoodieRecord> runNextDeltaCommits(HoodieWriteClient client, List<String> deltaInstants,
|
||||||
List<HoodieRecord> records, HoodieWriteConfig cfg, boolean insertFirst,
|
List<HoodieRecord> records, HoodieWriteConfig cfg, boolean insertFirst, List<String> expPendingCompactionInstants)
|
||||||
List<String> expPendingCompactionInstants) throws Exception {
|
throws Exception {
|
||||||
|
|
||||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
||||||
List<Pair<HoodieInstant, HoodieCompactionPlan>> pendingCompactions =
|
List<Pair<HoodieInstant, HoodieCompactionPlan>> pendingCompactions =
|
||||||
@@ -476,8 +471,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
|
|||||||
HoodieWriteConfig cfg) throws IOException {
|
HoodieWriteConfig cfg) throws IOException {
|
||||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
||||||
HoodieInstant compactionInstant = HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime);
|
HoodieInstant compactionInstant = HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime);
|
||||||
HoodieCompactionPlan workload = AvroUtils.deserializeCompactionPlan(
|
HoodieCompactionPlan workload = AvroUtils
|
||||||
metaClient.getActiveTimeline().getInstantAuxiliaryDetails(compactionInstant).get());
|
.deserializeCompactionPlan(metaClient.getActiveTimeline().getInstantAuxiliaryDetails(compactionInstant).get());
|
||||||
metaClient.getActiveTimeline().transitionCompactionRequestedToInflight(compactionInstant);
|
metaClient.getActiveTimeline().transitionCompactionRequestedToInflight(compactionInstant);
|
||||||
HoodieInstant instant = metaClient.getActiveTimeline().reload().filterPendingCompactionTimeline().getInstants()
|
HoodieInstant instant = metaClient.getActiveTimeline().reload().filterPendingCompactionTimeline().getInstants()
|
||||||
.filter(in -> in.getTimestamp().equals(compactionInstantTime)).findAny().get();
|
.filter(in -> in.getTimestamp().equals(compactionInstantTime)).findAny().get();
|
||||||
@@ -489,27 +484,23 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
|
|||||||
client.scheduleCompactionAtInstant(compactionInstantTime, Option.empty());
|
client.scheduleCompactionAtInstant(compactionInstantTime, Option.empty());
|
||||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
||||||
HoodieInstant instant = metaClient.getActiveTimeline().filterPendingCompactionTimeline().lastInstant().get();
|
HoodieInstant instant = metaClient.getActiveTimeline().filterPendingCompactionTimeline().lastInstant().get();
|
||||||
assertEquals("Last compaction instant must be the one set",
|
assertEquals("Last compaction instant must be the one set", instant.getTimestamp(), compactionInstantTime);
|
||||||
instant.getTimestamp(), compactionInstantTime);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void scheduleAndExecuteCompaction(String compactionInstantTime,
|
private void scheduleAndExecuteCompaction(String compactionInstantTime, HoodieWriteClient client, HoodieTable table,
|
||||||
HoodieWriteClient client, HoodieTable table, HoodieWriteConfig cfg, int expectedNumRecs,
|
HoodieWriteConfig cfg, int expectedNumRecs, boolean hasDeltaCommitAfterPendingCompaction) throws IOException {
|
||||||
boolean hasDeltaCommitAfterPendingCompaction) throws IOException {
|
|
||||||
scheduleCompaction(compactionInstantTime, client, cfg);
|
scheduleCompaction(compactionInstantTime, client, cfg);
|
||||||
executeCompaction(compactionInstantTime, client, table, cfg, expectedNumRecs, hasDeltaCommitAfterPendingCompaction);
|
executeCompaction(compactionInstantTime, client, table, cfg, expectedNumRecs, hasDeltaCommitAfterPendingCompaction);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void executeCompaction(String compactionInstantTime,
|
private void executeCompaction(String compactionInstantTime, HoodieWriteClient client, HoodieTable table,
|
||||||
HoodieWriteClient client, HoodieTable table, HoodieWriteConfig cfg, int expectedNumRecs,
|
HoodieWriteConfig cfg, int expectedNumRecs, boolean hasDeltaCommitAfterPendingCompaction) throws IOException {
|
||||||
boolean hasDeltaCommitAfterPendingCompaction) throws IOException {
|
|
||||||
|
|
||||||
client.compact(compactionInstantTime);
|
client.compact(compactionInstantTime);
|
||||||
List<FileSlice> fileSliceList = getCurrentLatestFileSlices(table, cfg);
|
List<FileSlice> fileSliceList = getCurrentLatestFileSlices(table, cfg);
|
||||||
assertTrue("Ensure latest file-slices are not empty", fileSliceList.stream().findAny().isPresent());
|
assertTrue("Ensure latest file-slices are not empty", fileSliceList.stream().findAny().isPresent());
|
||||||
assertFalse("Verify all file-slices have base-instant same as compaction instant",
|
assertFalse("Verify all file-slices have base-instant same as compaction instant", fileSliceList.stream()
|
||||||
fileSliceList.stream().filter(fs -> !fs.getBaseInstantTime().equals(compactionInstantTime))
|
.filter(fs -> !fs.getBaseInstantTime().equals(compactionInstantTime)).findAny().isPresent());
|
||||||
.findAny().isPresent());
|
|
||||||
assertFalse("Verify all file-slices have data-files",
|
assertFalse("Verify all file-slices have data-files",
|
||||||
fileSliceList.stream().filter(fs -> !fs.getDataFile().isPresent()).findAny().isPresent());
|
fileSliceList.stream().filter(fs -> !fs.getDataFile().isPresent()).findAny().isPresent());
|
||||||
|
|
||||||
@@ -522,12 +513,11 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// verify that there is a commit
|
// verify that there is a commit
|
||||||
table = getHoodieTable(
|
table = getHoodieTable(new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath(), true), cfg);
|
||||||
new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath(), true), cfg);
|
|
||||||
HoodieTimeline timeline = table.getMetaClient().getCommitTimeline().filterCompletedInstants();
|
HoodieTimeline timeline = table.getMetaClient().getCommitTimeline().filterCompletedInstants();
|
||||||
String latestCompactionCommitTime = timeline.lastInstant().get().getTimestamp();
|
String latestCompactionCommitTime = timeline.lastInstant().get().getTimestamp();
|
||||||
assertEquals("Expect compaction instant time to be the latest commit time",
|
assertEquals("Expect compaction instant time to be the latest commit time", latestCompactionCommitTime,
|
||||||
latestCompactionCommitTime, compactionInstantTime);
|
compactionInstantTime);
|
||||||
Assert.assertEquals("Must contain expected records", expectedNumRecs,
|
Assert.assertEquals("Must contain expected records", expectedNumRecs,
|
||||||
HoodieClientTestUtils.readSince(basePath, sqlContext, timeline, "000").count());
|
HoodieClientTestUtils.readSince(basePath, sqlContext, timeline, "000").count());
|
||||||
|
|
||||||
@@ -546,8 +536,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
|
|||||||
client.commit(instantTime, statuses);
|
client.commit(instantTime, statuses);
|
||||||
}
|
}
|
||||||
|
|
||||||
Option<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().reload().getDeltaCommitTimeline()
|
Option<HoodieInstant> deltaCommit =
|
||||||
.filterCompletedInstants().lastInstant();
|
metaClient.getActiveTimeline().reload().getDeltaCommitTimeline().filterCompletedInstants().lastInstant();
|
||||||
if (skipCommit && !cfg.shouldAutoCommit()) {
|
if (skipCommit && !cfg.shouldAutoCommit()) {
|
||||||
assertTrue("Delta commit should not be latest instant",
|
assertTrue("Delta commit should not be latest instant",
|
||||||
deltaCommit.get().getTimestamp().compareTo(instantTime) < 0);
|
deltaCommit.get().getTimestamp().compareTo(instantTime) < 0);
|
||||||
@@ -560,8 +550,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
|
|||||||
|
|
||||||
private List<HoodieDataFile> getCurrentLatestDataFiles(HoodieTable table, HoodieWriteConfig cfg) throws IOException {
|
private List<HoodieDataFile> getCurrentLatestDataFiles(HoodieTable table, HoodieWriteConfig cfg) throws IOException {
|
||||||
FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(table.getMetaClient().getFs(), cfg.getBasePath());
|
FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(table.getMetaClient().getFs(), cfg.getBasePath());
|
||||||
HoodieTableFileSystemView
|
HoodieTableFileSystemView view =
|
||||||
view = new HoodieTableFileSystemView(table.getMetaClient(), table.getCompletedCommitsTimeline(), allFiles);
|
new HoodieTableFileSystemView(table.getMetaClient(), table.getCompletedCommitsTimeline(), allFiles);
|
||||||
List<HoodieDataFile> dataFilesToRead = view.getLatestDataFiles().collect(Collectors.toList());
|
List<HoodieDataFile> dataFilesToRead = view.getLatestDataFiles().collect(Collectors.toList());
|
||||||
return dataFilesToRead;
|
return dataFilesToRead;
|
||||||
}
|
}
|
||||||
@@ -569,9 +559,8 @@ public class TestAsyncCompaction extends TestHoodieClientBase {
|
|||||||
private List<FileSlice> getCurrentLatestFileSlices(HoodieTable table, HoodieWriteConfig cfg) throws IOException {
|
private List<FileSlice> getCurrentLatestFileSlices(HoodieTable table, HoodieWriteConfig cfg) throws IOException {
|
||||||
HoodieTableFileSystemView view = new HoodieTableFileSystemView(table.getMetaClient(),
|
HoodieTableFileSystemView view = new HoodieTableFileSystemView(table.getMetaClient(),
|
||||||
table.getMetaClient().getActiveTimeline().reload().getCommitsAndCompactionTimeline());
|
table.getMetaClient().getActiveTimeline().reload().getCommitsAndCompactionTimeline());
|
||||||
List<FileSlice> fileSliceList =
|
List<FileSlice> fileSliceList = Arrays.asList(HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS).stream()
|
||||||
Arrays.asList(HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS).stream().flatMap(partition ->
|
.flatMap(partition -> view.getLatestFileSlices(partition)).collect(Collectors.toList());
|
||||||
view.getLatestFileSlices(partition)).collect(Collectors.toList());
|
|
||||||
return fileSliceList;
|
return fileSliceList;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -93,16 +93,13 @@ public class TestCleaner extends TestHoodieClientBase {
|
|||||||
* @param insertFn Insertion API for testing
|
* @param insertFn Insertion API for testing
|
||||||
* @throws Exception in case of error
|
* @throws Exception in case of error
|
||||||
*/
|
*/
|
||||||
private String insertFirstBigBatchForClientCleanerTest(
|
private String insertFirstBigBatchForClientCleanerTest(HoodieWriteConfig cfg, HoodieWriteClient client,
|
||||||
HoodieWriteConfig cfg,
|
|
||||||
HoodieWriteClient client,
|
|
||||||
Function2<List<HoodieRecord>, String, Integer> recordGenFunction,
|
Function2<List<HoodieRecord>, String, Integer> recordGenFunction,
|
||||||
Function3<JavaRDD<WriteStatus>, HoodieWriteClient, JavaRDD<HoodieRecord>, String> insertFn) throws Exception {
|
Function3<JavaRDD<WriteStatus>, HoodieWriteClient, JavaRDD<HoodieRecord>, String> insertFn) throws Exception {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* do a big insert
|
* do a big insert (this is basically same as insert part of upsert, just adding it here so we can catch breakages
|
||||||
* (this is basically same as insert part of upsert, just adding it here so we can
|
* in insert(), if the implementation diverges.)
|
||||||
* catch breakages in insert(), if the implementation diverges.)
|
|
||||||
*/
|
*/
|
||||||
String newCommitTime = client.startCommit();
|
String newCommitTime = client.startCommit();
|
||||||
|
|
||||||
@@ -145,8 +142,8 @@ public class TestCleaner extends TestHoodieClientBase {
|
|||||||
*/
|
*/
|
||||||
@Test
|
@Test
|
||||||
public void testInsertPreppedAndCleanByVersions() throws Exception {
|
public void testInsertPreppedAndCleanByVersions() throws Exception {
|
||||||
testInsertAndCleanByVersions(HoodieWriteClient::insertPreppedRecords,
|
testInsertAndCleanByVersions(HoodieWriteClient::insertPreppedRecords, HoodieWriteClient::upsertPreppedRecords,
|
||||||
HoodieWriteClient::upsertPreppedRecords, true);
|
true);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -173,20 +170,18 @@ public class TestCleaner extends TestHoodieClientBase {
|
|||||||
* @param insertFn Insert API to be tested
|
* @param insertFn Insert API to be tested
|
||||||
* @param upsertFn Upsert API to be tested
|
* @param upsertFn Upsert API to be tested
|
||||||
* @param isPreppedAPI Flag to indicate if a prepped-version is used. If true, a wrapper function will be used during
|
* @param isPreppedAPI Flag to indicate if a prepped-version is used. If true, a wrapper function will be used during
|
||||||
* record generation to also tag the regards (de-dupe is implicit as we use uniq record-gen APIs)
|
* record generation to also tag the regards (de-dupe is implicit as we use uniq record-gen APIs)
|
||||||
* @throws Exception in case of errors
|
* @throws Exception in case of errors
|
||||||
*/
|
*/
|
||||||
private void testInsertAndCleanByVersions(
|
private void testInsertAndCleanByVersions(
|
||||||
Function3<JavaRDD<WriteStatus>, HoodieWriteClient, JavaRDD<HoodieRecord>, String> insertFn,
|
Function3<JavaRDD<WriteStatus>, HoodieWriteClient, JavaRDD<HoodieRecord>, String> insertFn,
|
||||||
Function3<JavaRDD<WriteStatus>, HoodieWriteClient, JavaRDD<HoodieRecord>, String> upsertFn,
|
Function3<JavaRDD<WriteStatus>, HoodieWriteClient, JavaRDD<HoodieRecord>, String> upsertFn, boolean isPreppedAPI)
|
||||||
boolean isPreppedAPI
|
throws Exception {
|
||||||
) throws Exception {
|
|
||||||
int maxVersions = 2; // keep upto 2 versions for each file
|
int maxVersions = 2; // keep upto 2 versions for each file
|
||||||
HoodieWriteConfig cfg = getConfigBuilder().withCompactionConfig(
|
HoodieWriteConfig cfg = getConfigBuilder()
|
||||||
HoodieCompactionConfig.newBuilder().withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS)
|
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
|
||||||
.retainFileVersions(maxVersions).build())
|
.withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(maxVersions).build())
|
||||||
.withParallelism(1, 1).withBulkInsertParallelism(1)
|
.withParallelism(1, 1).withBulkInsertParallelism(1).withFinalizeWriteParallelism(1)
|
||||||
.withFinalizeWriteParallelism(1)
|
|
||||||
.withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build())
|
.withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build())
|
||||||
.build();
|
.build();
|
||||||
try (HoodieWriteClient client = getHoodieWriteClient(cfg);) {
|
try (HoodieWriteClient client = getHoodieWriteClient(cfg);) {
|
||||||
@@ -204,11 +199,10 @@ public class TestCleaner extends TestHoodieClientBase {
|
|||||||
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig(), jsc);
|
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig(), jsc);
|
||||||
for (String partitionPath : dataGen.getPartitionPaths()) {
|
for (String partitionPath : dataGen.getPartitionPaths()) {
|
||||||
TableFileSystemView fsView = table.getFileSystemView();
|
TableFileSystemView fsView = table.getFileSystemView();
|
||||||
Option<Boolean> added = Option.fromJavaOptional(fsView.getAllFileGroups(partitionPath).findFirst()
|
Option<Boolean> added = Option.fromJavaOptional(fsView.getAllFileGroups(partitionPath).findFirst().map(fg -> {
|
||||||
.map(fg -> {
|
fg.getLatestFileSlice().map(fs -> compactionFileIdToLatestFileSlice.put(fg.getFileGroupId(), fs));
|
||||||
fg.getLatestFileSlice().map(fs -> compactionFileIdToLatestFileSlice.put(fg.getFileGroupId(), fs));
|
return true;
|
||||||
return true;
|
}));
|
||||||
}));
|
|
||||||
if (added.isPresent()) {
|
if (added.isPresent()) {
|
||||||
// Select only one file-group for compaction
|
// Select only one file-group for compaction
|
||||||
break;
|
break;
|
||||||
@@ -234,8 +228,7 @@ public class TestCleaner extends TestHoodieClientBase {
|
|||||||
client.startCommitWithTime(newInstantTime);
|
client.startCommitWithTime(newInstantTime);
|
||||||
List<HoodieRecord> records = recordUpsertGenWrappedFunction.apply(newInstantTime, 100);
|
List<HoodieRecord> records = recordUpsertGenWrappedFunction.apply(newInstantTime, 100);
|
||||||
|
|
||||||
List<WriteStatus> statuses =
|
List<WriteStatus> statuses = upsertFn.apply(client, jsc.parallelize(records, 1), newInstantTime).collect();
|
||||||
upsertFn.apply(client, jsc.parallelize(records, 1), newInstantTime).collect();
|
|
||||||
// Verify there are no errors
|
// Verify there are no errors
|
||||||
assertNoWriteErrors(statuses);
|
assertNoWriteErrors(statuses);
|
||||||
|
|
||||||
@@ -249,8 +242,8 @@ public class TestCleaner extends TestHoodieClientBase {
|
|||||||
// compute all the versions of all files, from time 0
|
// compute all the versions of all files, from time 0
|
||||||
HashMap<String, TreeSet<String>> fileIdToVersions = new HashMap<>();
|
HashMap<String, TreeSet<String>> fileIdToVersions = new HashMap<>();
|
||||||
for (HoodieInstant entry : timeline.getInstants().collect(Collectors.toList())) {
|
for (HoodieInstant entry : timeline.getInstants().collect(Collectors.toList())) {
|
||||||
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
|
HoodieCommitMetadata commitMetadata =
|
||||||
.fromBytes(timeline.getInstantDetails(entry).get(), HoodieCommitMetadata.class);
|
HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(entry).get(), HoodieCommitMetadata.class);
|
||||||
|
|
||||||
for (HoodieWriteStat wstat : commitMetadata.getWriteStats(partitionPath)) {
|
for (HoodieWriteStat wstat : commitMetadata.getWriteStats(partitionPath)) {
|
||||||
if (!fileIdToVersions.containsKey(wstat.getFileId())) {
|
if (!fileIdToVersions.containsKey(wstat.getFileId())) {
|
||||||
@@ -267,8 +260,8 @@ public class TestCleaner extends TestHoodieClientBase {
|
|||||||
// Ensure latest file-slice selected for compaction is retained
|
// Ensure latest file-slice selected for compaction is retained
|
||||||
Option<HoodieDataFile> dataFileForCompactionPresent =
|
Option<HoodieDataFile> dataFileForCompactionPresent =
|
||||||
Option.fromJavaOptional(fileGroup.getAllDataFiles().filter(df -> {
|
Option.fromJavaOptional(fileGroup.getAllDataFiles().filter(df -> {
|
||||||
return compactionFileIdToLatestFileSlice.get(fileGroup.getFileGroupId())
|
return compactionFileIdToLatestFileSlice.get(fileGroup.getFileGroupId()).getBaseInstantTime()
|
||||||
.getBaseInstantTime().equals(df.getCommitTime());
|
.equals(df.getCommitTime());
|
||||||
}).findAny());
|
}).findAny());
|
||||||
Assert.assertTrue("Data File selected for compaction is retained",
|
Assert.assertTrue("Data File selected for compaction is retained",
|
||||||
dataFileForCompactionPresent.isPresent());
|
dataFileForCompactionPresent.isPresent());
|
||||||
@@ -310,8 +303,7 @@ public class TestCleaner extends TestHoodieClientBase {
|
|||||||
*/
|
*/
|
||||||
@Test
|
@Test
|
||||||
public void testInsertPreppedAndCleanByCommits() throws Exception {
|
public void testInsertPreppedAndCleanByCommits() throws Exception {
|
||||||
testInsertAndCleanByCommits(HoodieWriteClient::insertPreppedRecords,
|
testInsertAndCleanByCommits(HoodieWriteClient::insertPreppedRecords, HoodieWriteClient::upsertPreppedRecords, true);
|
||||||
HoodieWriteClient::upsertPreppedRecords, true);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -338,20 +330,18 @@ public class TestCleaner extends TestHoodieClientBase {
|
|||||||
* @param insertFn Insert API to be tested
|
* @param insertFn Insert API to be tested
|
||||||
* @param upsertFn Upsert API to be tested
|
* @param upsertFn Upsert API to be tested
|
||||||
* @param isPreppedAPI Flag to indicate if a prepped-version is used. If true, a wrapper function will be used during
|
* @param isPreppedAPI Flag to indicate if a prepped-version is used. If true, a wrapper function will be used during
|
||||||
* record generation to also tag the records (de-dupe is implicit as we use unique record-gen APIs)
|
* record generation to also tag the records (de-dupe is implicit as we use unique record-gen APIs)
|
||||||
* @throws Exception in case of errors
|
* @throws Exception in case of errors
|
||||||
*/
|
*/
|
||||||
private void testInsertAndCleanByCommits(
|
private void testInsertAndCleanByCommits(
|
||||||
Function3<JavaRDD<WriteStatus>, HoodieWriteClient, JavaRDD<HoodieRecord>, String> insertFn,
|
Function3<JavaRDD<WriteStatus>, HoodieWriteClient, JavaRDD<HoodieRecord>, String> insertFn,
|
||||||
Function3<JavaRDD<WriteStatus>, HoodieWriteClient, JavaRDD<HoodieRecord>, String> upsertFn,
|
Function3<JavaRDD<WriteStatus>, HoodieWriteClient, JavaRDD<HoodieRecord>, String> upsertFn, boolean isPreppedAPI)
|
||||||
boolean isPreppedAPI
|
throws Exception {
|
||||||
) throws Exception {
|
|
||||||
int maxCommits = 3; // keep upto 3 commits from the past
|
int maxCommits = 3; // keep upto 3 commits from the past
|
||||||
HoodieWriteConfig cfg = getConfigBuilder().withCompactionConfig(
|
HoodieWriteConfig cfg = getConfigBuilder()
|
||||||
HoodieCompactionConfig.newBuilder()
|
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
|
||||||
.withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainCommits(maxCommits).build())
|
.withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainCommits(maxCommits).build())
|
||||||
.withParallelism(1, 1).withBulkInsertParallelism(1)
|
.withParallelism(1, 1).withBulkInsertParallelism(1).withFinalizeWriteParallelism(1)
|
||||||
.withFinalizeWriteParallelism(1)
|
|
||||||
.withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build())
|
.withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build())
|
||||||
.build();
|
.build();
|
||||||
HoodieWriteClient client = getHoodieWriteClient(cfg);
|
HoodieWriteClient client = getHoodieWriteClient(cfg);
|
||||||
@@ -370,8 +360,7 @@ public class TestCleaner extends TestHoodieClientBase {
|
|||||||
client.startCommitWithTime(newCommitTime);
|
client.startCommitWithTime(newCommitTime);
|
||||||
List<HoodieRecord> records = recordUpsertGenWrappedFunction.apply(newCommitTime, 100);
|
List<HoodieRecord> records = recordUpsertGenWrappedFunction.apply(newCommitTime, 100);
|
||||||
|
|
||||||
List<WriteStatus> statuses =
|
List<WriteStatus> statuses = upsertFn.apply(client, jsc.parallelize(records, 1), newCommitTime).collect();
|
||||||
upsertFn.apply(client, jsc.parallelize(records, 1), newCommitTime).collect();
|
|
||||||
// Verify there are no errors
|
// Verify there are no errors
|
||||||
assertNoWriteErrors(statuses);
|
assertNoWriteErrors(statuses);
|
||||||
|
|
||||||
@@ -381,9 +370,9 @@ public class TestCleaner extends TestHoodieClientBase {
|
|||||||
Option<HoodieInstant> earliestRetainedCommit = activeTimeline.nthFromLastInstant(maxCommits - 1);
|
Option<HoodieInstant> earliestRetainedCommit = activeTimeline.nthFromLastInstant(maxCommits - 1);
|
||||||
Set<HoodieInstant> acceptableCommits = activeTimeline.getInstants().collect(Collectors.toSet());
|
Set<HoodieInstant> acceptableCommits = activeTimeline.getInstants().collect(Collectors.toSet());
|
||||||
if (earliestRetainedCommit.isPresent()) {
|
if (earliestRetainedCommit.isPresent()) {
|
||||||
acceptableCommits.removeAll(
|
acceptableCommits
|
||||||
activeTimeline.findInstantsInRange("000", earliestRetainedCommit.get().getTimestamp()).getInstants()
|
.removeAll(activeTimeline.findInstantsInRange("000", earliestRetainedCommit.get().getTimestamp())
|
||||||
.collect(Collectors.toSet()));
|
.getInstants().collect(Collectors.toSet()));
|
||||||
acceptableCommits.add(earliestRetainedCommit.get());
|
acceptableCommits.add(earliestRetainedCommit.get());
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -412,18 +401,19 @@ public class TestCleaner extends TestHoodieClientBase {
|
|||||||
*/
|
*/
|
||||||
@Test
|
@Test
|
||||||
public void testKeepLatestFileVersions() throws IOException {
|
public void testKeepLatestFileVersions() throws IOException {
|
||||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true)
|
HoodieWriteConfig config =
|
||||||
.withCompactionConfig(HoodieCompactionConfig.newBuilder().withCleanerPolicy(
|
HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true)
|
||||||
HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(1).build())
|
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
|
||||||
.build();
|
.withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(1).build())
|
||||||
|
.build();
|
||||||
|
|
||||||
// make 1 commit, with 1 file per partition
|
// make 1 commit, with 1 file per partition
|
||||||
HoodieTestUtils.createCommitFiles(basePath, "000");
|
HoodieTestUtils.createCommitFiles(basePath, "000");
|
||||||
|
|
||||||
String file1P0C0 = HoodieTestUtils
|
String file1P0C0 =
|
||||||
.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000");
|
HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000");
|
||||||
String file1P1C0 = HoodieTestUtils
|
String file1P1C0 =
|
||||||
.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000");
|
HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000");
|
||||||
metaClient = HoodieTableMetaClient.reload(metaClient);
|
metaClient = HoodieTableMetaClient.reload(metaClient);
|
||||||
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config, jsc);
|
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config, jsc);
|
||||||
|
|
||||||
@@ -434,24 +424,22 @@ public class TestCleaner extends TestHoodieClientBase {
|
|||||||
assertEquals("Must not clean any files", 0,
|
assertEquals("Must not clean any files", 0,
|
||||||
getCleanStat(hoodieCleanStatsOne, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles()
|
getCleanStat(hoodieCleanStatsOne, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles()
|
||||||
.size());
|
.size());
|
||||||
assertTrue(HoodieTestUtils
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000",
|
||||||
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0C0));
|
file1P0C0));
|
||||||
assertTrue(HoodieTestUtils
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000",
|
||||||
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000", file1P1C0));
|
file1P1C0));
|
||||||
|
|
||||||
// make next commit, with 1 insert & 1 update per partition
|
// make next commit, with 1 insert & 1 update per partition
|
||||||
HoodieTestUtils.createCommitFiles(basePath, "001");
|
HoodieTestUtils.createCommitFiles(basePath, "001");
|
||||||
metaClient = HoodieTableMetaClient.reload(metaClient);
|
metaClient = HoodieTableMetaClient.reload(metaClient);
|
||||||
table = HoodieTable.getHoodieTable(metaClient, config, jsc);
|
table = HoodieTable.getHoodieTable(metaClient, config, jsc);
|
||||||
|
|
||||||
String file2P0C1 = HoodieTestUtils
|
String file2P0C1 =
|
||||||
.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001"); // insert
|
HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001"); // insert
|
||||||
String file2P1C1 = HoodieTestUtils
|
String file2P1C1 =
|
||||||
.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001"); // insert
|
HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001"); // insert
|
||||||
HoodieTestUtils
|
HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0C0); // update
|
||||||
.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0C0); // update
|
HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001", file1P1C0); // update
|
||||||
HoodieTestUtils
|
|
||||||
.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001", file1P1C0); // update
|
|
||||||
|
|
||||||
List<HoodieCleanStat> hoodieCleanStatsTwo = table.clean(jsc);
|
List<HoodieCleanStat> hoodieCleanStatsTwo = table.clean(jsc);
|
||||||
assertEquals("Must clean 1 file", 1,
|
assertEquals("Must clean 1 file", 1,
|
||||||
@@ -460,47 +448,44 @@ public class TestCleaner extends TestHoodieClientBase {
|
|||||||
assertEquals("Must clean 1 file", 1,
|
assertEquals("Must clean 1 file", 1,
|
||||||
getCleanStat(hoodieCleanStatsTwo, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles()
|
getCleanStat(hoodieCleanStatsTwo, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles()
|
||||||
.size());
|
.size());
|
||||||
assertTrue(HoodieTestUtils
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001",
|
||||||
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file2P0C1));
|
file2P0C1));
|
||||||
assertTrue(HoodieTestUtils
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001",
|
||||||
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001", file2P1C1));
|
file2P1C1));
|
||||||
assertFalse(HoodieTestUtils
|
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000",
|
||||||
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0C0));
|
file1P0C0));
|
||||||
assertFalse(HoodieTestUtils
|
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH,
|
||||||
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000", file1P1C0));
|
"000", file1P1C0));
|
||||||
|
|
||||||
// make next commit, with 2 updates to existing files, and 1 insert
|
// make next commit, with 2 updates to existing files, and 1 insert
|
||||||
HoodieTestUtils.createCommitFiles(basePath, "002");
|
HoodieTestUtils.createCommitFiles(basePath, "002");
|
||||||
metaClient = HoodieTableMetaClient.reload(metaClient);
|
metaClient = HoodieTableMetaClient.reload(metaClient);
|
||||||
table = HoodieTable.getHoodieTable(metaClient, config, jsc);
|
table = HoodieTable.getHoodieTable(metaClient, config, jsc);
|
||||||
|
|
||||||
HoodieTestUtils
|
HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file1P0C0); // update
|
||||||
.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file1P0C0); // update
|
HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file2P0C1); // update
|
||||||
HoodieTestUtils
|
String file3P0C2 =
|
||||||
.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file2P0C1); // update
|
HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002");
|
||||||
String file3P0C2 = HoodieTestUtils
|
|
||||||
.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002");
|
|
||||||
|
|
||||||
List<HoodieCleanStat> hoodieCleanStatsThree = table.clean(jsc);
|
List<HoodieCleanStat> hoodieCleanStatsThree = table.clean(jsc);
|
||||||
assertEquals("Must clean two files", 2,
|
assertEquals("Must clean two files", 2,
|
||||||
getCleanStat(hoodieCleanStatsThree, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH)
|
getCleanStat(hoodieCleanStatsThree, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH)
|
||||||
.getSuccessDeleteFiles().size());
|
.getSuccessDeleteFiles().size());
|
||||||
assertFalse(HoodieTestUtils
|
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001",
|
||||||
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0C0));
|
file1P0C0));
|
||||||
assertFalse(HoodieTestUtils
|
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001",
|
||||||
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file2P0C1));
|
file2P0C1));
|
||||||
assertTrue(HoodieTestUtils
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002",
|
||||||
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file3P0C2));
|
file3P0C2));
|
||||||
|
|
||||||
// No cleaning on partially written file, with no commit.
|
// No cleaning on partially written file, with no commit.
|
||||||
HoodieTestUtils
|
HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003", file3P0C2); // update
|
||||||
.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003", file3P0C2); // update
|
|
||||||
List<HoodieCleanStat> hoodieCleanStatsFour = table.clean(jsc);
|
List<HoodieCleanStat> hoodieCleanStatsFour = table.clean(jsc);
|
||||||
assertEquals("Must not clean any files", 0,
|
assertEquals("Must not clean any files", 0,
|
||||||
getCleanStat(hoodieCleanStatsFour, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH).getSuccessDeleteFiles()
|
getCleanStat(hoodieCleanStatsFour, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH).getSuccessDeleteFiles()
|
||||||
.size());
|
.size());
|
||||||
assertTrue(HoodieTestUtils
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002",
|
||||||
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file3P0C2));
|
file3P0C2));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -509,37 +494,33 @@ public class TestCleaner extends TestHoodieClientBase {
|
|||||||
@Test
|
@Test
|
||||||
public void testKeepLatestFileVersionsMOR() throws IOException {
|
public void testKeepLatestFileVersionsMOR() throws IOException {
|
||||||
|
|
||||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true)
|
HoodieWriteConfig config =
|
||||||
.withCompactionConfig(HoodieCompactionConfig.newBuilder().withCleanerPolicy(
|
HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true)
|
||||||
HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(1).build())
|
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
|
||||||
.build();
|
.withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(1).build())
|
||||||
|
.build();
|
||||||
|
|
||||||
HoodieTableMetaClient metaClient = HoodieTestUtils.init(jsc.hadoopConfiguration(), basePath,
|
HoodieTableMetaClient metaClient =
|
||||||
HoodieTableType.MERGE_ON_READ);
|
HoodieTestUtils.init(jsc.hadoopConfiguration(), basePath, HoodieTableType.MERGE_ON_READ);
|
||||||
|
|
||||||
// Make 3 files, one base file and 2 log files associated with base file
|
// Make 3 files, one base file and 2 log files associated with base file
|
||||||
String file1P0 = HoodieTestUtils
|
String file1P0 =
|
||||||
.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000");
|
HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000");
|
||||||
String file2P0L0 = HoodieTestUtils
|
String file2P0L0 = HoodieTestUtils.createNewLogFile(fs, basePath,
|
||||||
.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0,
|
HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0, Option.empty());
|
||||||
Option.empty());
|
String file2P0L1 = HoodieTestUtils.createNewLogFile(fs, basePath,
|
||||||
String file2P0L1 = HoodieTestUtils
|
HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0, Option.of(2));
|
||||||
.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0,
|
|
||||||
Option.of(2));
|
|
||||||
// make 1 compaction commit
|
// make 1 compaction commit
|
||||||
HoodieTestUtils.createCompactionCommitFiles(fs, basePath, "000");
|
HoodieTestUtils.createCompactionCommitFiles(fs, basePath, "000");
|
||||||
|
|
||||||
// Make 4 files, one base file and 3 log files associated with base file
|
// Make 4 files, one base file and 3 log files associated with base file
|
||||||
HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0);
|
HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0);
|
||||||
file2P0L0 = HoodieTestUtils
|
file2P0L0 = HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH,
|
||||||
.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0,
|
"001", file1P0, Option.empty());
|
||||||
Option.empty());
|
file2P0L0 = HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH,
|
||||||
file2P0L0 = HoodieTestUtils
|
"001", file1P0, Option.of(2));
|
||||||
.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0,
|
file2P0L0 = HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH,
|
||||||
Option.of(2));
|
"001", file1P0, Option.of(3));
|
||||||
file2P0L0 = HoodieTestUtils
|
|
||||||
.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0,
|
|
||||||
Option.of(3));
|
|
||||||
// make 1 compaction commit
|
// make 1 compaction commit
|
||||||
HoodieTestUtils.createCompactionCommitFiles(fs, basePath, "001");
|
HoodieTestUtils.createCompactionCommitFiles(fs, basePath, "001");
|
||||||
|
|
||||||
@@ -548,16 +529,12 @@ public class TestCleaner extends TestHoodieClientBase {
|
|||||||
assertEquals("Must clean three files, one parquet and 2 log files", 3,
|
assertEquals("Must clean three files, one parquet and 2 log files", 3,
|
||||||
getCleanStat(hoodieCleanStats, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH).getSuccessDeleteFiles()
|
getCleanStat(hoodieCleanStats, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH).getSuccessDeleteFiles()
|
||||||
.size());
|
.size());
|
||||||
assertFalse(HoodieTestUtils
|
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000",
|
||||||
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0));
|
file1P0));
|
||||||
assertFalse(
|
assertFalse(HoodieTestUtils.doesLogFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000",
|
||||||
HoodieTestUtils
|
file2P0L0, Option.empty()));
|
||||||
.doesLogFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file2P0L0,
|
assertFalse(HoodieTestUtils.doesLogFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000",
|
||||||
Option.empty()));
|
file2P0L0, Option.of(2)));
|
||||||
assertFalse(
|
|
||||||
HoodieTestUtils
|
|
||||||
.doesLogFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file2P0L0,
|
|
||||||
Option.of(2)));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -566,16 +543,17 @@ public class TestCleaner extends TestHoodieClientBase {
|
|||||||
@Test
|
@Test
|
||||||
public void testKeepLatestCommits() throws IOException {
|
public void testKeepLatestCommits() throws IOException {
|
||||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true)
|
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true)
|
||||||
.withCompactionConfig(HoodieCompactionConfig.newBuilder().withCleanerPolicy(
|
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
|
||||||
HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build()).build();
|
.withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build())
|
||||||
|
.build();
|
||||||
|
|
||||||
// make 1 commit, with 1 file per partition
|
// make 1 commit, with 1 file per partition
|
||||||
HoodieTestUtils.createCommitFiles(basePath, "000");
|
HoodieTestUtils.createCommitFiles(basePath, "000");
|
||||||
|
|
||||||
String file1P0C0 = HoodieTestUtils
|
String file1P0C0 =
|
||||||
.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000");
|
HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000");
|
||||||
String file1P1C0 = HoodieTestUtils
|
String file1P1C0 =
|
||||||
.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000");
|
HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000");
|
||||||
|
|
||||||
metaClient = HoodieTableMetaClient.reload(metaClient);
|
metaClient = HoodieTableMetaClient.reload(metaClient);
|
||||||
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config, jsc);
|
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config, jsc);
|
||||||
@@ -587,24 +565,22 @@ public class TestCleaner extends TestHoodieClientBase {
|
|||||||
assertEquals("Must not clean any files", 0,
|
assertEquals("Must not clean any files", 0,
|
||||||
getCleanStat(hoodieCleanStatsOne, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles()
|
getCleanStat(hoodieCleanStatsOne, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles()
|
||||||
.size());
|
.size());
|
||||||
assertTrue(HoodieTestUtils
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000",
|
||||||
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0C0));
|
file1P0C0));
|
||||||
assertTrue(HoodieTestUtils
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000",
|
||||||
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000", file1P1C0));
|
file1P1C0));
|
||||||
|
|
||||||
// make next commit, with 1 insert & 1 update per partition
|
// make next commit, with 1 insert & 1 update per partition
|
||||||
HoodieTestUtils.createCommitFiles(basePath, "001");
|
HoodieTestUtils.createCommitFiles(basePath, "001");
|
||||||
metaClient = HoodieTableMetaClient.reload(metaClient);
|
metaClient = HoodieTableMetaClient.reload(metaClient);
|
||||||
table = HoodieTable.getHoodieTable(metaClient, config, jsc);
|
table = HoodieTable.getHoodieTable(metaClient, config, jsc);
|
||||||
|
|
||||||
String file2P0C1 = HoodieTestUtils
|
String file2P0C1 =
|
||||||
.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001"); // insert
|
HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001"); // insert
|
||||||
String file2P1C1 = HoodieTestUtils
|
String file2P1C1 =
|
||||||
.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001"); // insert
|
HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001"); // insert
|
||||||
HoodieTestUtils
|
HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0C0); // update
|
||||||
.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0C0); // update
|
HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001", file1P1C0); // update
|
||||||
HoodieTestUtils
|
|
||||||
.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001", file1P1C0); // update
|
|
||||||
|
|
||||||
List<HoodieCleanStat> hoodieCleanStatsTwo = table.clean(jsc);
|
List<HoodieCleanStat> hoodieCleanStatsTwo = table.clean(jsc);
|
||||||
assertEquals("Must not clean any files", 0,
|
assertEquals("Must not clean any files", 0,
|
||||||
@@ -613,78 +589,73 @@ public class TestCleaner extends TestHoodieClientBase {
|
|||||||
assertEquals("Must not clean any files", 0,
|
assertEquals("Must not clean any files", 0,
|
||||||
getCleanStat(hoodieCleanStatsTwo, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles()
|
getCleanStat(hoodieCleanStatsTwo, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles()
|
||||||
.size());
|
.size());
|
||||||
assertTrue(HoodieTestUtils
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001",
|
||||||
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file2P0C1));
|
file2P0C1));
|
||||||
assertTrue(HoodieTestUtils
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001",
|
||||||
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "001", file2P1C1));
|
file2P1C1));
|
||||||
assertTrue(HoodieTestUtils
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000",
|
||||||
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0C0));
|
file1P0C0));
|
||||||
assertTrue(HoodieTestUtils
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000",
|
||||||
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "000", file1P1C0));
|
file1P1C0));
|
||||||
|
|
||||||
// make next commit, with 2 updates to existing files, and 1 insert
|
// make next commit, with 2 updates to existing files, and 1 insert
|
||||||
HoodieTestUtils.createCommitFiles(basePath, "002");
|
HoodieTestUtils.createCommitFiles(basePath, "002");
|
||||||
metaClient = HoodieTableMetaClient.reload(metaClient);
|
metaClient = HoodieTableMetaClient.reload(metaClient);
|
||||||
table = HoodieTable.getHoodieTable(metaClient, config, jsc);
|
table = HoodieTable.getHoodieTable(metaClient, config, jsc);
|
||||||
|
|
||||||
HoodieTestUtils
|
HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file1P0C0); // update
|
||||||
.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file1P0C0); // update
|
HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file2P0C1); // update
|
||||||
HoodieTestUtils
|
String file3P0C2 =
|
||||||
.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file2P0C1); // update
|
HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002");
|
||||||
String file3P0C2 = HoodieTestUtils
|
|
||||||
.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002");
|
|
||||||
|
|
||||||
List<HoodieCleanStat> hoodieCleanStatsThree = table.clean(jsc);
|
List<HoodieCleanStat> hoodieCleanStatsThree = table.clean(jsc);
|
||||||
assertEquals("Must not clean any file. We have to keep 1 version before the latest commit time to keep", 0,
|
assertEquals("Must not clean any file. We have to keep 1 version before the latest commit time to keep", 0,
|
||||||
getCleanStat(hoodieCleanStatsThree, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH)
|
getCleanStat(hoodieCleanStatsThree, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH)
|
||||||
.getSuccessDeleteFiles().size());
|
.getSuccessDeleteFiles().size());
|
||||||
|
|
||||||
assertTrue(HoodieTestUtils
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000",
|
||||||
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0C0));
|
file1P0C0));
|
||||||
|
|
||||||
// make next commit, with 2 updates to existing files, and 1 insert
|
// make next commit, with 2 updates to existing files, and 1 insert
|
||||||
HoodieTestUtils.createCommitFiles(basePath, "003");
|
HoodieTestUtils.createCommitFiles(basePath, "003");
|
||||||
metaClient = HoodieTableMetaClient.reload(metaClient);
|
metaClient = HoodieTableMetaClient.reload(metaClient);
|
||||||
table = HoodieTable.getHoodieTable(metaClient, config, jsc);
|
table = HoodieTable.getHoodieTable(metaClient, config, jsc);
|
||||||
|
|
||||||
HoodieTestUtils
|
HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003", file1P0C0); // update
|
||||||
.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003", file1P0C0); // update
|
HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003", file2P0C1); // update
|
||||||
HoodieTestUtils
|
String file4P0C3 =
|
||||||
.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003", file2P0C1); // update
|
HoodieTestUtils.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003");
|
||||||
String file4P0C3 = HoodieTestUtils
|
|
||||||
.createNewDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003");
|
|
||||||
|
|
||||||
List<HoodieCleanStat> hoodieCleanStatsFour = table.clean(jsc);
|
List<HoodieCleanStat> hoodieCleanStatsFour = table.clean(jsc);
|
||||||
assertEquals("Must not clean one old file", 1,
|
assertEquals("Must not clean one old file", 1,
|
||||||
getCleanStat(hoodieCleanStatsFour, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH).getSuccessDeleteFiles()
|
getCleanStat(hoodieCleanStatsFour, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH).getSuccessDeleteFiles()
|
||||||
.size());
|
.size());
|
||||||
|
|
||||||
assertFalse(HoodieTestUtils
|
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000",
|
||||||
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "000", file1P0C0));
|
file1P0C0));
|
||||||
assertTrue(HoodieTestUtils
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001",
|
||||||
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0C0));
|
file1P0C0));
|
||||||
assertTrue(HoodieTestUtils
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002",
|
||||||
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file1P0C0));
|
file1P0C0));
|
||||||
assertTrue(HoodieTestUtils
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001",
|
||||||
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file2P0C1));
|
file2P0C1));
|
||||||
assertTrue(HoodieTestUtils
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002",
|
||||||
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file2P0C1));
|
file2P0C1));
|
||||||
assertTrue(HoodieTestUtils
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002",
|
||||||
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "002", file3P0C2));
|
file3P0C2));
|
||||||
assertTrue(HoodieTestUtils
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003",
|
||||||
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "003", file4P0C3));
|
file4P0C3));
|
||||||
|
|
||||||
// No cleaning on partially written file, with no commit.
|
// No cleaning on partially written file, with no commit.
|
||||||
HoodieTestUtils
|
HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "004", file3P0C2); // update
|
||||||
.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "004", file3P0C2); // update
|
|
||||||
List<HoodieCleanStat> hoodieCleanStatsFive = table.clean(jsc);
|
List<HoodieCleanStat> hoodieCleanStatsFive = table.clean(jsc);
|
||||||
assertEquals("Must not clean any files", 0,
|
assertEquals("Must not clean any files", 0,
|
||||||
getCleanStat(hoodieCleanStatsFive, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH).getSuccessDeleteFiles()
|
getCleanStat(hoodieCleanStatsFive, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH).getSuccessDeleteFiles()
|
||||||
.size());
|
.size());
|
||||||
assertTrue(HoodieTestUtils
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001",
|
||||||
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file1P0C0));
|
file1P0C0));
|
||||||
assertTrue(HoodieTestUtils
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001",
|
||||||
.doesDataFileExist(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "001", file2P0C1));
|
file2P0C1));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -711,8 +682,9 @@ public class TestCleaner extends TestHoodieClientBase {
|
|||||||
@Test
|
@Test
|
||||||
public void testCleaningWithZeroPartitonPaths() throws IOException {
|
public void testCleaningWithZeroPartitonPaths() throws IOException {
|
||||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true)
|
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true)
|
||||||
.withCompactionConfig(HoodieCompactionConfig.newBuilder().withCleanerPolicy(
|
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
|
||||||
HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build()).build();
|
.withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build())
|
||||||
|
.build();
|
||||||
|
|
||||||
// Make a commit, although there are no partitionPaths.
|
// Make a commit, although there are no partitionPaths.
|
||||||
// Example use-case of this is when a client wants to create a table
|
// Example use-case of this is when a client wants to create a table
|
||||||
@@ -732,8 +704,9 @@ public class TestCleaner extends TestHoodieClientBase {
|
|||||||
@Test
|
@Test
|
||||||
public void testCleaningSkewedPartitons() throws IOException {
|
public void testCleaningSkewedPartitons() throws IOException {
|
||||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true)
|
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true)
|
||||||
.withCompactionConfig(HoodieCompactionConfig.newBuilder().withCleanerPolicy(
|
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
|
||||||
HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build()).build();
|
.withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build())
|
||||||
|
.build();
|
||||||
Map<Long, Long> stageOneShuffleReadTaskRecordsCountMap = new HashMap<>();
|
Map<Long, Long> stageOneShuffleReadTaskRecordsCountMap = new HashMap<>();
|
||||||
|
|
||||||
// Since clean involves repartition in order to uniformly distribute data,
|
// Since clean involves repartition in order to uniformly distribute data,
|
||||||
@@ -783,22 +756,20 @@ public class TestCleaner extends TestHoodieClientBase {
|
|||||||
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config, jsc);
|
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config, jsc);
|
||||||
List<HoodieCleanStat> hoodieCleanStats = table.clean(jsc);
|
List<HoodieCleanStat> hoodieCleanStats = table.clean(jsc);
|
||||||
|
|
||||||
assertEquals(100,
|
assertEquals(100, getCleanStat(hoodieCleanStats, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH)
|
||||||
getCleanStat(hoodieCleanStats, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH).getSuccessDeleteFiles()
|
.getSuccessDeleteFiles().size());
|
||||||
.size());
|
assertEquals(10, getCleanStat(hoodieCleanStats, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH)
|
||||||
assertEquals(10,
|
.getSuccessDeleteFiles().size());
|
||||||
getCleanStat(hoodieCleanStats, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles()
|
assertEquals(10, getCleanStat(hoodieCleanStats, HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH)
|
||||||
.size());
|
.getSuccessDeleteFiles().size());
|
||||||
assertEquals(10,
|
|
||||||
getCleanStat(hoodieCleanStats, HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH).getSuccessDeleteFiles()
|
|
||||||
.size());
|
|
||||||
|
|
||||||
// 3 tasks are expected since the number of partitions is 3
|
// 3 tasks are expected since the number of partitions is 3
|
||||||
assertEquals(3, stageOneShuffleReadTaskRecordsCountMap.keySet().size());
|
assertEquals(3, stageOneShuffleReadTaskRecordsCountMap.keySet().size());
|
||||||
// Sum of all records processed = total number of files to clean
|
// Sum of all records processed = total number of files to clean
|
||||||
assertEquals(120,
|
assertEquals(120,
|
||||||
stageOneShuffleReadTaskRecordsCountMap.values().stream().reduce((a, b) -> a + b).get().intValue());
|
stageOneShuffleReadTaskRecordsCountMap.values().stream().reduce((a, b) -> a + b).get().intValue());
|
||||||
assertTrue("The skew in handling files to clean is not removed. "
|
assertTrue(
|
||||||
|
"The skew in handling files to clean is not removed. "
|
||||||
+ "Each task should handle more records than the partitionPath with least files "
|
+ "Each task should handle more records than the partitionPath with least files "
|
||||||
+ "and less records than the partitionPath with most files.",
|
+ "and less records than the partitionPath with most files.",
|
||||||
stageOneShuffleReadTaskRecordsCountMap.values().stream().filter(a -> a > 10 && a < 100).count() == 3);
|
stageOneShuffleReadTaskRecordsCountMap.values().stream().filter(a -> a > 10 && a < 100).count() == 3);
|
||||||
@@ -811,17 +782,18 @@ public class TestCleaner extends TestHoodieClientBase {
|
|||||||
@Test
|
@Test
|
||||||
public void testKeepLatestCommitsWithPendingCompactions() throws IOException {
|
public void testKeepLatestCommitsWithPendingCompactions() throws IOException {
|
||||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true)
|
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true)
|
||||||
.withCompactionConfig(HoodieCompactionConfig.newBuilder().withCleanerPolicy(
|
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
|
||||||
HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build()).build();
|
.withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build())
|
||||||
|
.build();
|
||||||
// Deletions:
|
// Deletions:
|
||||||
// . FileId Parquet Logs Total Retained Commits
|
// . FileId Parquet Logs Total Retained Commits
|
||||||
// FileId7 5 10 15 009, 011
|
// FileId7 5 10 15 009, 011
|
||||||
// FileId6 5 10 15 009
|
// FileId6 5 10 15 009
|
||||||
// FileId5 3 6 9 005
|
// FileId5 3 6 9 005
|
||||||
// FileId4 2 4 6 003
|
// FileId4 2 4 6 003
|
||||||
// FileId3 1 2 3 001
|
// FileId3 1 2 3 001
|
||||||
// FileId2 0 0 0 000
|
// FileId2 0 0 0 000
|
||||||
// FileId1 0 0 0 000
|
// FileId1 0 0 0 000
|
||||||
testPendingCompactions(config, 48, 18);
|
testPendingCompactions(config, 48, 18);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -830,18 +802,20 @@ public class TestCleaner extends TestHoodieClientBase {
|
|||||||
*/
|
*/
|
||||||
@Test
|
@Test
|
||||||
public void testKeepLatestVersionsWithPendingCompactions() throws IOException {
|
public void testKeepLatestVersionsWithPendingCompactions() throws IOException {
|
||||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true)
|
HoodieWriteConfig config =
|
||||||
.withCompactionConfig(HoodieCompactionConfig.newBuilder().withCleanerPolicy(
|
HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true)
|
||||||
HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(2).build()).build();
|
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
|
||||||
|
.withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(2).build())
|
||||||
|
.build();
|
||||||
// Deletions:
|
// Deletions:
|
||||||
// . FileId Parquet Logs Total Retained Commits
|
// . FileId Parquet Logs Total Retained Commits
|
||||||
// FileId7 5 10 15 009, 011
|
// FileId7 5 10 15 009, 011
|
||||||
// FileId6 4 8 12 007, 009
|
// FileId6 4 8 12 007, 009
|
||||||
// FileId5 2 4 6 003 005
|
// FileId5 2 4 6 003 005
|
||||||
// FileId4 1 2 3 001, 003
|
// FileId4 1 2 3 001, 003
|
||||||
// FileId3 0 0 0 000, 001
|
// FileId3 0 0 0 000, 001
|
||||||
// FileId2 0 0 0 000
|
// FileId2 0 0 0 000
|
||||||
// FileId1 0 0 0 000
|
// FileId1 0 0 0 000
|
||||||
testPendingCompactions(config, 36, 9);
|
testPendingCompactions(config, 36, 9);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -853,10 +827,10 @@ public class TestCleaner extends TestHoodieClientBase {
|
|||||||
*/
|
*/
|
||||||
public void testPendingCompactions(HoodieWriteConfig config, int expNumFilesDeleted,
|
public void testPendingCompactions(HoodieWriteConfig config, int expNumFilesDeleted,
|
||||||
int expNumFilesUnderCompactionDeleted) throws IOException {
|
int expNumFilesUnderCompactionDeleted) throws IOException {
|
||||||
HoodieTableMetaClient metaClient = HoodieTestUtils.init(jsc.hadoopConfiguration(), basePath,
|
HoodieTableMetaClient metaClient =
|
||||||
HoodieTableType.MERGE_ON_READ);
|
HoodieTestUtils.init(jsc.hadoopConfiguration(), basePath, HoodieTableType.MERGE_ON_READ);
|
||||||
String[] instants = new String[]{"000", "001", "003", "005", "007", "009", "011", "013"};
|
String[] instants = new String[] {"000", "001", "003", "005", "007", "009", "011", "013"};
|
||||||
String[] compactionInstants = new String[]{"002", "004", "006", "008", "010"};
|
String[] compactionInstants = new String[] {"002", "004", "006", "008", "010"};
|
||||||
Map<String, String> expFileIdToPendingCompaction = new HashMap<>();
|
Map<String, String> expFileIdToPendingCompaction = new HashMap<>();
|
||||||
Map<String, String> fileIdToLatestInstantBeforeCompaction = new HashMap<>();
|
Map<String, String> fileIdToLatestInstantBeforeCompaction = new HashMap<>();
|
||||||
Map<String, List<FileSlice>> compactionInstantsToFileSlices = new HashMap<>();
|
Map<String, List<FileSlice>> compactionInstantsToFileSlices = new HashMap<>();
|
||||||
@@ -870,13 +844,11 @@ public class TestCleaner extends TestHoodieClientBase {
|
|||||||
// compactions
|
// compactions
|
||||||
// FileIds 2-5 will be under compaction
|
// FileIds 2-5 will be under compaction
|
||||||
int maxNumFileIds = 7;
|
int maxNumFileIds = 7;
|
||||||
String[] fileIds = new String[]
|
String[] fileIds = new String[] {"fileId1", "fileId2", "fileId3", "fileId4", "fileId5", "fileId6", "fileId7"};
|
||||||
{"fileId1", "fileId2", "fileId3", "fileId4", "fileId5", "fileId6", "fileId7"};
|
|
||||||
int maxNumFileIdsForCompaction = 4;
|
int maxNumFileIdsForCompaction = 4;
|
||||||
for (int i = 0; i < maxNumFileIds; i++) {
|
for (int i = 0; i < maxNumFileIds; i++) {
|
||||||
final String fileId = HoodieTestUtils
|
final String fileId = HoodieTestUtils.createDataFile(basePath,
|
||||||
.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, instants[0],
|
HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, instants[0], fileIds[i]);
|
||||||
fileIds[i]);
|
|
||||||
HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, instants[0],
|
HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, instants[0],
|
||||||
fileId, Option.empty());
|
fileId, Option.empty());
|
||||||
HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, instants[0],
|
HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, instants[0],
|
||||||
@@ -887,9 +859,9 @@ public class TestCleaner extends TestHoodieClientBase {
|
|||||||
expFileIdToPendingCompaction.put(fileId, compactionInstants[j]);
|
expFileIdToPendingCompaction.put(fileId, compactionInstants[j]);
|
||||||
metaClient = HoodieTableMetaClient.reload(metaClient);
|
metaClient = HoodieTableMetaClient.reload(metaClient);
|
||||||
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config, jsc);
|
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config, jsc);
|
||||||
FileSlice slice = table.getRTFileSystemView().getLatestFileSlices(
|
FileSlice slice =
|
||||||
HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH)
|
table.getRTFileSystemView().getLatestFileSlices(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH)
|
||||||
.filter(fs -> fs.getFileId().equals(fileId)).findFirst().get();
|
.filter(fs -> fs.getFileId().equals(fileId)).findFirst().get();
|
||||||
List<FileSlice> slices = new ArrayList<>();
|
List<FileSlice> slices = new ArrayList<>();
|
||||||
if (compactionInstantsToFileSlices.containsKey(compactionInstants[j])) {
|
if (compactionInstantsToFileSlices.containsKey(compactionInstants[j])) {
|
||||||
slices = compactionInstantsToFileSlices.get(compactionInstants[j]);
|
slices = compactionInstantsToFileSlices.get(compactionInstants[j]);
|
||||||
@@ -898,20 +870,16 @@ public class TestCleaner extends TestHoodieClientBase {
|
|||||||
compactionInstantsToFileSlices.put(compactionInstants[j], slices);
|
compactionInstantsToFileSlices.put(compactionInstants[j], slices);
|
||||||
// Add log-files to simulate delta-commits after pending compaction
|
// Add log-files to simulate delta-commits after pending compaction
|
||||||
HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH,
|
HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH,
|
||||||
compactionInstants[j],
|
compactionInstants[j], fileId, Option.empty());
|
||||||
fileId, Option.empty());
|
|
||||||
HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH,
|
HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH,
|
||||||
compactionInstants[j],
|
compactionInstants[j], fileId, Option.of(2));
|
||||||
fileId, Option.of(2));
|
|
||||||
} else {
|
} else {
|
||||||
HoodieTestUtils
|
HoodieTestUtils.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, instants[j],
|
||||||
.createDataFile(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, instants[j], fileId);
|
fileId);
|
||||||
HoodieTestUtils
|
HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH,
|
||||||
.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, instants[j], fileId,
|
instants[j], fileId, Option.empty());
|
||||||
Option.empty());
|
HoodieTestUtils.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH,
|
||||||
HoodieTestUtils
|
instants[j], fileId, Option.of(2));
|
||||||
.createNewLogFile(fs, basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, instants[j], fileId,
|
|
||||||
Option.of(2));
|
|
||||||
fileIdToLatestInstantBeforeCompaction.put(fileId, instants[j]);
|
fileIdToLatestInstantBeforeCompaction.put(fileId, instants[j]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -921,9 +889,8 @@ public class TestCleaner extends TestHoodieClientBase {
|
|||||||
for (String instant : compactionInstants) {
|
for (String instant : compactionInstants) {
|
||||||
List<FileSlice> fileSliceList = compactionInstantsToFileSlices.get(instant);
|
List<FileSlice> fileSliceList = compactionInstantsToFileSlices.get(instant);
|
||||||
if (null != fileSliceList) {
|
if (null != fileSliceList) {
|
||||||
HoodieTestUtils.createCompactionRequest(metaClient, instant,
|
HoodieTestUtils.createCompactionRequest(metaClient, instant, fileSliceList.stream()
|
||||||
fileSliceList.stream().map(fs -> Pair.of(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, fs))
|
.map(fs -> Pair.of(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, fs)).collect(Collectors.toList()));
|
||||||
.collect(Collectors.toList()));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -939,38 +906,35 @@ public class TestCleaner extends TestHoodieClientBase {
|
|||||||
expFileIdToPendingCompaction.entrySet().stream().forEach(entry -> {
|
expFileIdToPendingCompaction.entrySet().stream().forEach(entry -> {
|
||||||
String fileId = entry.getKey();
|
String fileId = entry.getKey();
|
||||||
String baseInstantForCompaction = fileIdToLatestInstantBeforeCompaction.get(fileId);
|
String baseInstantForCompaction = fileIdToLatestInstantBeforeCompaction.get(fileId);
|
||||||
Option<FileSlice> fileSliceForCompaction =
|
Option<FileSlice> fileSliceForCompaction = Option.fromJavaOptional(hoodieTable.getRTFileSystemView()
|
||||||
Option.fromJavaOptional(
|
.getLatestFileSlicesBeforeOrOn(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, baseInstantForCompaction,
|
||||||
hoodieTable.getRTFileSystemView().getLatestFileSlicesBeforeOrOn(
|
true)
|
||||||
HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH,
|
.filter(fs -> fs.getFileId().equals(fileId)).findFirst());
|
||||||
baseInstantForCompaction, true).filter(fs -> fs.getFileId().equals(fileId)).findFirst());
|
|
||||||
Assert.assertTrue("Base Instant for Compaction must be preserved", fileSliceForCompaction.isPresent());
|
Assert.assertTrue("Base Instant for Compaction must be preserved", fileSliceForCompaction.isPresent());
|
||||||
Assert.assertTrue("FileSlice has data-file", fileSliceForCompaction.get().getDataFile().isPresent());
|
Assert.assertTrue("FileSlice has data-file", fileSliceForCompaction.get().getDataFile().isPresent());
|
||||||
Assert.assertEquals("FileSlice has log-files", 2,
|
Assert.assertEquals("FileSlice has log-files", 2, fileSliceForCompaction.get().getLogFiles().count());
|
||||||
fileSliceForCompaction.get().getLogFiles().count());
|
|
||||||
});
|
});
|
||||||
|
|
||||||
// Test for progress (Did we clean some files ?)
|
// Test for progress (Did we clean some files ?)
|
||||||
long numFilesUnderCompactionDeleted =
|
long numFilesUnderCompactionDeleted = hoodieCleanStats.stream().flatMap(cleanStat -> {
|
||||||
hoodieCleanStats.stream().flatMap(cleanStat -> {
|
return convertPathToFileIdWithCommitTime(newMetaClient, cleanStat.getDeletePathPatterns())
|
||||||
return convertPathToFileIdWithCommitTime(newMetaClient, cleanStat.getDeletePathPatterns()).map(
|
.map(fileIdWithCommitTime -> {
|
||||||
fileIdWithCommitTime -> {
|
if (expFileIdToPendingCompaction.containsKey(fileIdWithCommitTime.getKey())) {
|
||||||
if (expFileIdToPendingCompaction.containsKey(fileIdWithCommitTime.getKey())) {
|
Assert.assertTrue("Deleted instant time must be less than pending compaction",
|
||||||
Assert.assertTrue("Deleted instant time must be less than pending compaction",
|
HoodieTimeline.compareTimestamps(
|
||||||
HoodieTimeline.compareTimestamps(
|
fileIdToLatestInstantBeforeCompaction.get(fileIdWithCommitTime.getKey()),
|
||||||
fileIdToLatestInstantBeforeCompaction.get(fileIdWithCommitTime.getKey()),
|
fileIdWithCommitTime.getValue(), HoodieTimeline.GREATER));
|
||||||
fileIdWithCommitTime.getValue(), HoodieTimeline.GREATER));
|
return true;
|
||||||
return true;
|
}
|
||||||
}
|
return false;
|
||||||
return false;
|
});
|
||||||
});
|
}).filter(x -> x).count();
|
||||||
}).filter(x -> x).count();
|
long numDeleted =
|
||||||
long numDeleted = hoodieCleanStats.stream()
|
hoodieCleanStats.stream().flatMap(cleanStat -> cleanStat.getDeletePathPatterns().stream()).count();
|
||||||
.flatMap(cleanStat -> cleanStat.getDeletePathPatterns().stream()).count();
|
|
||||||
// Tighter check for regression
|
// Tighter check for regression
|
||||||
Assert.assertEquals("Correct number of files deleted", expNumFilesDeleted, numDeleted);
|
Assert.assertEquals("Correct number of files deleted", expNumFilesDeleted, numDeleted);
|
||||||
Assert.assertEquals("Correct number of files under compaction deleted",
|
Assert.assertEquals("Correct number of files under compaction deleted", expNumFilesUnderCompactionDeleted,
|
||||||
expNumFilesUnderCompactionDeleted, numFilesUnderCompactionDeleted);
|
numFilesUnderCompactionDeleted);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -991,6 +955,7 @@ public class TestCleaner extends TestHoodieClientBase {
|
|||||||
|
|
||||||
/***
|
/***
|
||||||
* Helper method to return temporary files count
|
* Helper method to return temporary files count
|
||||||
|
*
|
||||||
* @return Number of temporary files found
|
* @return Number of temporary files found
|
||||||
* @throws IOException in case of error
|
* @throws IOException in case of error
|
||||||
*/
|
*/
|
||||||
@@ -1004,22 +969,20 @@ public class TestCleaner extends TestHoodieClientBase {
|
|||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
private Stream<Pair<String, String>> convertPathToFileIdWithCommitTime(
|
private Stream<Pair<String, String>> convertPathToFileIdWithCommitTime(final HoodieTableMetaClient metaClient,
|
||||||
final HoodieTableMetaClient metaClient, List<String> paths) {
|
List<String> paths) {
|
||||||
Predicate<String> roFilePredicate = path ->
|
Predicate<String> roFilePredicate =
|
||||||
path.contains(metaClient.getTableConfig().getROFileFormat().getFileExtension());
|
path -> path.contains(metaClient.getTableConfig().getROFileFormat().getFileExtension());
|
||||||
Predicate<String> rtFilePredicate = path ->
|
Predicate<String> rtFilePredicate =
|
||||||
path.contains(metaClient.getTableConfig().getRTFileFormat().getFileExtension());
|
path -> path.contains(metaClient.getTableConfig().getRTFileFormat().getFileExtension());
|
||||||
Stream<Pair<String, String>> stream1 = paths.stream().filter(roFilePredicate)
|
Stream<Pair<String, String>> stream1 = paths.stream().filter(roFilePredicate).map(fullPath -> {
|
||||||
.map(fullPath -> {
|
String fileName = Paths.get(fullPath).getFileName().toString();
|
||||||
String fileName = Paths.get(fullPath).getFileName().toString();
|
return Pair.of(FSUtils.getFileId(fileName), FSUtils.getCommitTime(fileName));
|
||||||
return Pair.of(FSUtils.getFileId(fileName), FSUtils.getCommitTime(fileName));
|
});
|
||||||
});
|
Stream<Pair<String, String>> stream2 = paths.stream().filter(rtFilePredicate).map(path -> {
|
||||||
Stream<Pair<String, String>> stream2 = paths.stream().filter(rtFilePredicate)
|
return Pair.of(FSUtils.getFileIdFromLogPath(new Path(path)),
|
||||||
.map(path -> {
|
FSUtils.getBaseCommitTimeFromLogPath(new Path(path)));
|
||||||
return Pair.of(FSUtils.getFileIdFromLogPath(new Path(path)),
|
});
|
||||||
FSUtils.getBaseCommitTimeFromLogPath(new Path(path)));
|
|
||||||
});
|
|
||||||
return Stream.concat(stream1, stream2);
|
return Stream.concat(stream1, stream2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user