CodeStyle formatting to conform to basic Checkstyle rules.
The code-style rules follow Google style with two changes: (1) the line-length limit is increased from 100 to 120, and (2) Javadoc-related Checkstyle checks are disabled, as enabling them needs more manual work. Both source and test code are checked for code style.
This commit is contained in:
committed by
vinoth chandar
parent
987f5d6b96
commit
788e4f2d2e
@@ -7,8 +7,7 @@ permalink: dev_setup.html
|
|||||||
|
|
||||||
### Code Style
|
### Code Style
|
||||||
|
|
||||||
We have embraced the [Google Java code style](https://google.github.io/styleguide/javaguide.html). Please setup your IDE accordingly with style files from [here](https://github.com/google/styleguide/blob/gh-pages/intellij-java-google-style.xml)
|
We have adopted a code style largely based on the [Google Java style guide](https://google.github.io/styleguide/javaguide.html).
|
||||||
Also recommend setting up the [Save Action Plugin](https://plugins.jetbrains.com/plugin/7642-save-actions) to auto format & organize imports on save.
|
Please set up your IDE with the style files from [here](../style/).
|
||||||
|
We also recommend setting up the [Save Action Plugin](https://plugins.jetbrains.com/plugin/7642-save-actions) to auto format & organize imports on save.
|
||||||
|
The Maven compilation lifecycle will fail if there are Checkstyle violations.
|
||||||
|
|
||||||
|
|||||||
@@ -35,11 +35,11 @@ public class HoodiePrompt extends DefaultPromptProvider {
|
|||||||
case DATASET:
|
case DATASET:
|
||||||
return "hoodie:" + tableName + "->";
|
return "hoodie:" + tableName + "->";
|
||||||
case SYNC:
|
case SYNC:
|
||||||
return "hoodie:" + tableName + " <==> "
|
return "hoodie:" + tableName + " <==> " + HoodieCLI.syncTableMetadata.getTableConfig().getTableName() + "->";
|
||||||
+ HoodieCLI.syncTableMetadata.getTableConfig().getTableName() + "->";
|
default:
|
||||||
}
|
|
||||||
return "hoodie:" + tableName + "->";
|
return "hoodie:" + tableName + "->";
|
||||||
}
|
}
|
||||||
|
}
|
||||||
return "hoodie->";
|
return "hoodie->";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -24,22 +24,18 @@ import org.springframework.stereotype.Component;
|
|||||||
|
|
||||||
@Component
|
@Component
|
||||||
@Order(Ordered.HIGHEST_PRECEDENCE)
|
@Order(Ordered.HIGHEST_PRECEDENCE)
|
||||||
public class HoodieSplashScreen
|
public class HoodieSplashScreen extends DefaultBannerProvider {
|
||||||
extends DefaultBannerProvider {
|
|
||||||
|
|
||||||
private static String screen =
|
private static String screen = "============================================" + OsUtils.LINE_SEPARATOR
|
||||||
"============================================" + OsUtils.LINE_SEPARATOR +
|
+ "* *" + OsUtils.LINE_SEPARATOR
|
||||||
"* *" + OsUtils.LINE_SEPARATOR +
|
+ "* _ _ _ _ *" + OsUtils.LINE_SEPARATOR
|
||||||
"* _ _ _ _ *" + OsUtils.LINE_SEPARATOR +
|
+ "* | | | | | (_) *" + OsUtils.LINE_SEPARATOR
|
||||||
"* | | | | | (_) *" + OsUtils.LINE_SEPARATOR +
|
+ "* | |__| | ___ ___ __| |_ ___ *" + OsUtils.LINE_SEPARATOR
|
||||||
"* | |__| | ___ ___ __| |_ ___ *" + OsUtils.LINE_SEPARATOR +
|
+ "* | __ |/ _ \\ / _ \\ / _` | |/ _ \\ *" + OsUtils.LINE_SEPARATOR
|
||||||
"* | __ |/ _ \\ / _ \\ / _` | |/ _ \\ *" +
|
+ "* | | | | (_) | (_) | (_| | | __/ *" + OsUtils.LINE_SEPARATOR
|
||||||
OsUtils.LINE_SEPARATOR +
|
+ "* |_| |_|\\___/ \\___/ \\__,_|_|\\___| *" + OsUtils.LINE_SEPARATOR
|
||||||
"* | | | | (_) | (_) | (_| | | __/ *" + OsUtils.LINE_SEPARATOR +
|
+ "* *" + OsUtils.LINE_SEPARATOR
|
||||||
"* |_| |_|\\___/ \\___/ \\__,_|_|\\___| *" +
|
+ "============================================" + OsUtils.LINE_SEPARATOR;
|
||||||
OsUtils.LINE_SEPARATOR +
|
|
||||||
"* *" + OsUtils.LINE_SEPARATOR +
|
|
||||||
"============================================" + OsUtils.LINE_SEPARATOR;
|
|
||||||
|
|
||||||
public String getBanner() {
|
public String getBanner() {
|
||||||
return screen;
|
return screen;
|
||||||
|
|||||||
@@ -22,8 +22,7 @@ import org.springframework.shell.Bootstrap;
|
|||||||
public class Main {
|
public class Main {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Main class that delegates to Spring Shell's Bootstrap class in order to simplify debugging
|
* Main class that delegates to Spring Shell's Bootstrap class in order to simplify debugging inside an IDE
|
||||||
* inside an IDE
|
|
||||||
*/
|
*/
|
||||||
public static void main(String[] args) throws IOException {
|
public static void main(String[] args) throws IOException {
|
||||||
Bootstrap.main(args);
|
Bootstrap.main(args);
|
||||||
|
|||||||
@@ -47,13 +47,11 @@ public class ArchivedCommitsCommand implements CommandMarker {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@CliCommand(value = "show archived commits", help = "Read commits from archived files and show details")
|
@CliCommand(value = "show archived commits", help = "Read commits from archived files and show details")
|
||||||
public String showCommits(
|
public String showCommits(@CliOption(key = {
|
||||||
@CliOption(key = {
|
"limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10") final Integer limit)
|
||||||
"limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10")
|
throws IOException {
|
||||||
final Integer limit) throws IOException {
|
|
||||||
|
|
||||||
System.out
|
System.out.println("===============> Showing only " + limit + " archived commits <===============");
|
||||||
.println("===============> Showing only " + limit + " archived commits <===============");
|
|
||||||
String basePath = HoodieCLI.tableMetadata.getBasePath();
|
String basePath = HoodieCLI.tableMetadata.getBasePath();
|
||||||
FileStatus[] fsStatuses = FSUtils.getFs(basePath, HoodieCLI.conf)
|
FileStatus[] fsStatuses = FSUtils.getFs(basePath, HoodieCLI.conf)
|
||||||
.globStatus(new Path(basePath + "/.hoodie/.commits_.archive*"));
|
.globStatus(new Path(basePath + "/.hoodie/.commits_.archive*"));
|
||||||
@@ -61,8 +59,7 @@ public class ArchivedCommitsCommand implements CommandMarker {
|
|||||||
int commits = 0;
|
int commits = 0;
|
||||||
for (FileStatus fs : fsStatuses) {
|
for (FileStatus fs : fsStatuses) {
|
||||||
//read the archived file
|
//read the archived file
|
||||||
HoodieLogFormat.Reader reader = HoodieLogFormat
|
HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(basePath, HoodieCLI.conf),
|
||||||
.newReader(FSUtils.getFs(basePath, HoodieCLI.conf),
|
|
||||||
new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema());
|
new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema());
|
||||||
|
|
||||||
List<IndexedRecord> readRecords = new ArrayList<>();
|
List<IndexedRecord> readRecords = new ArrayList<>();
|
||||||
@@ -76,15 +73,14 @@ public class ArchivedCommitsCommand implements CommandMarker {
|
|||||||
}
|
}
|
||||||
commits++;
|
commits++;
|
||||||
}
|
}
|
||||||
List<String[]> readCommits = readRecords.stream().map(r -> (GenericRecord) r)
|
List<String[]> readCommits = readRecords.stream().map(r -> (GenericRecord) r).map(r -> readCommit(r))
|
||||||
.map(r -> readCommit(r)).collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
allCommits.addAll(readCommits);
|
allCommits.addAll(readCommits);
|
||||||
if (commits == limit) {
|
if (commits == limit) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return HoodiePrintHelper.print(
|
return HoodiePrintHelper.print(new String[] {"CommitTime", "CommitType", "CommitDetails"},
|
||||||
new String[]{"CommitTime", "CommitType", "CommitDetails"},
|
|
||||||
allCommits.toArray(new String[allCommits.size()][]));
|
allCommits.toArray(new String[allCommits.size()][]));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -122,6 +118,8 @@ public class ArchivedCommitsCommand implements CommandMarker {
|
|||||||
commitDetails.add(record.get("hoodieSavePointMetadata").toString());
|
commitDetails.add(record.get("hoodieSavePointMetadata").toString());
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
default:
|
||||||
|
return commitDetails.toArray(new String[commitDetails.size()]);
|
||||||
}
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
|
|||||||
@@ -13,6 +13,7 @@
|
|||||||
* See the License for the specific language governing permissions and
|
* See the License for the specific language governing permissions and
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
package com.uber.hoodie.cli.commands;
|
package com.uber.hoodie.cli.commands;
|
||||||
|
|
||||||
import com.uber.hoodie.avro.model.HoodieCleanMetadata;
|
import com.uber.hoodie.avro.model.HoodieCleanMetadata;
|
||||||
@@ -63,42 +64,37 @@ public class CleansCommand implements CommandMarker {
|
|||||||
Collections.reverse(cleans);
|
Collections.reverse(cleans);
|
||||||
for (int i = 0; i < cleans.size(); i++) {
|
for (int i = 0; i < cleans.size(); i++) {
|
||||||
HoodieInstant clean = cleans.get(i);
|
HoodieInstant clean = cleans.get(i);
|
||||||
HoodieCleanMetadata cleanMetadata =
|
HoodieCleanMetadata cleanMetadata = AvroUtils
|
||||||
AvroUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(clean).get());
|
.deserializeHoodieCleanMetadata(timeline.getInstantDetails(clean).get());
|
||||||
rows[i] = new String[] {clean.getTimestamp(), cleanMetadata.getEarliestCommitToRetain(),
|
rows[i] = new String[] {clean.getTimestamp(), cleanMetadata.getEarliestCommitToRetain(),
|
||||||
String.valueOf(cleanMetadata.getTotalFilesDeleted()),
|
String.valueOf(cleanMetadata.getTotalFilesDeleted()), String.valueOf(cleanMetadata.getTimeTakenInMillis())};
|
||||||
String.valueOf(cleanMetadata.getTimeTakenInMillis())};
|
|
||||||
}
|
}
|
||||||
return HoodiePrintHelper.print(
|
return HoodiePrintHelper
|
||||||
new String[]{"CleanTime", "EarliestCommandRetained", "Total Files Deleted",
|
.print(new String[] {"CleanTime", "EarliestCommandRetained", "Total Files Deleted", "Total Time Taken"},
|
||||||
"Total Time Taken"}, rows);
|
rows);
|
||||||
}
|
}
|
||||||
|
|
||||||
@CliCommand(value = "cleans refresh", help = "Refresh the commits")
|
@CliCommand(value = "cleans refresh", help = "Refresh the commits")
|
||||||
public String refreshCleans() throws IOException {
|
public String refreshCleans() throws IOException {
|
||||||
HoodieTableMetaClient metadata =
|
HoodieTableMetaClient metadata = new HoodieTableMetaClient(HoodieCLI.conf, HoodieCLI.tableMetadata.getBasePath());
|
||||||
new HoodieTableMetaClient(HoodieCLI.conf, HoodieCLI.tableMetadata.getBasePath());
|
|
||||||
HoodieCLI.setTableMetadata(metadata);
|
HoodieCLI.setTableMetadata(metadata);
|
||||||
return "Metadata for table " + metadata.getTableConfig().getTableName() + " refreshed.";
|
return "Metadata for table " + metadata.getTableConfig().getTableName() + " refreshed.";
|
||||||
}
|
}
|
||||||
|
|
||||||
@CliCommand(value = "clean showpartitions", help = "Show partition level details of a clean")
|
@CliCommand(value = "clean showpartitions", help = "Show partition level details of a clean")
|
||||||
public String showCleanPartitions(
|
public String showCleanPartitions(@CliOption(key = {"clean"}, help = "clean to show") final String commitTime)
|
||||||
@CliOption(key = {"clean"}, help = "clean to show")
|
throws Exception {
|
||||||
final String commitTime) throws Exception {
|
|
||||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||||
HoodieTimeline timeline = activeTimeline.getCleanerTimeline().filterCompletedInstants();
|
HoodieTimeline timeline = activeTimeline.getCleanerTimeline().filterCompletedInstants();
|
||||||
HoodieInstant cleanInstant =
|
HoodieInstant cleanInstant = new HoodieInstant(false, HoodieTimeline.CLEAN_ACTION, commitTime);
|
||||||
new HoodieInstant(false, HoodieTimeline.CLEAN_ACTION, commitTime);
|
|
||||||
|
|
||||||
if (!timeline.containsInstant(cleanInstant)) {
|
if (!timeline.containsInstant(cleanInstant)) {
|
||||||
return "Clean " + commitTime + " not found in metadata " + timeline;
|
return "Clean " + commitTime + " not found in metadata " + timeline;
|
||||||
}
|
}
|
||||||
HoodieCleanMetadata cleanMetadata =
|
HoodieCleanMetadata cleanMetadata = AvroUtils.deserializeHoodieCleanMetadata(
|
||||||
AvroUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(cleanInstant).get());
|
timeline.getInstantDetails(cleanInstant).get());
|
||||||
List<String[]> rows = new ArrayList<>();
|
List<String[]> rows = new ArrayList<>();
|
||||||
for (Map.Entry<String, HoodieCleanPartitionMetadata> entry : cleanMetadata
|
for (Map.Entry<String, HoodieCleanPartitionMetadata> entry : cleanMetadata.getPartitionMetadata().entrySet()) {
|
||||||
.getPartitionMetadata().entrySet()) {
|
|
||||||
String path = entry.getKey();
|
String path = entry.getKey();
|
||||||
HoodieCleanPartitionMetadata stats = entry.getValue();
|
HoodieCleanPartitionMetadata stats = entry.getValue();
|
||||||
String policy = stats.getPolicy();
|
String policy = stats.getPolicy();
|
||||||
|
|||||||
@@ -64,20 +64,17 @@ public class CommitsCommand implements CommandMarker {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@CliCommand(value = "commits show", help = "Show the commits")
|
@CliCommand(value = "commits show", help = "Show the commits")
|
||||||
public String showCommits(
|
public String showCommits(@CliOption(key = {
|
||||||
@CliOption(key = {
|
"limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10") final Integer limit)
|
||||||
"limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10")
|
throws IOException {
|
||||||
final Integer limit) throws IOException {
|
|
||||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||||
HoodieTimeline timeline = activeTimeline.getCommitsTimeline()
|
HoodieTimeline timeline = activeTimeline.getCommitsTimeline().filterCompletedInstants();
|
||||||
.filterCompletedInstants();
|
|
||||||
List<HoodieInstant> commits = timeline.getInstants().collect(Collectors.toList());
|
List<HoodieInstant> commits = timeline.getInstants().collect(Collectors.toList());
|
||||||
String[][] rows = new String[commits.size()][];
|
String[][] rows = new String[commits.size()][];
|
||||||
Collections.reverse(commits);
|
Collections.reverse(commits);
|
||||||
for (int i = 0; i < commits.size(); i++) {
|
for (int i = 0; i < commits.size(); i++) {
|
||||||
HoodieInstant commit = commits.get(i);
|
HoodieInstant commit = commits.get(i);
|
||||||
HoodieCommitMetadata commitMetadata =
|
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(commit).get());
|
||||||
HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(commit).get());
|
|
||||||
rows[i] = new String[] {commit.getTimestamp(),
|
rows[i] = new String[] {commit.getTimestamp(),
|
||||||
NumericUtils.humanReadableByteCount(commitMetadata.fetchTotalBytesWritten()),
|
NumericUtils.humanReadableByteCount(commitMetadata.fetchTotalBytesWritten()),
|
||||||
String.valueOf(commitMetadata.fetchTotalFilesInsert()),
|
String.valueOf(commitMetadata.fetchTotalFilesInsert()),
|
||||||
@@ -88,39 +85,32 @@ public class CommitsCommand implements CommandMarker {
|
|||||||
String.valueOf(commitMetadata.fetchTotalWriteErrors())};
|
String.valueOf(commitMetadata.fetchTotalWriteErrors())};
|
||||||
}
|
}
|
||||||
return HoodiePrintHelper.print(
|
return HoodiePrintHelper.print(
|
||||||
new String[]{"CommitTime", "Total Written (B)", "Total Files Added",
|
new String[] {"CommitTime", "Total Written (B)", "Total Files Added", "Total Files Updated",
|
||||||
"Total Files Updated", "Total Partitions Written", "Total Records Written",
|
"Total Partitions Written", "Total Records Written", "Total Update Records Written", "Total Errors"}, rows);
|
||||||
"Total Update Records Written", "Total Errors"}, rows);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@CliCommand(value = "commits refresh", help = "Refresh the commits")
|
@CliCommand(value = "commits refresh", help = "Refresh the commits")
|
||||||
public String refreshCommits() throws IOException {
|
public String refreshCommits() throws IOException {
|
||||||
HoodieTableMetaClient metadata =
|
HoodieTableMetaClient metadata = new HoodieTableMetaClient(HoodieCLI.conf, HoodieCLI.tableMetadata.getBasePath());
|
||||||
new HoodieTableMetaClient(HoodieCLI.conf, HoodieCLI.tableMetadata.getBasePath());
|
|
||||||
HoodieCLI.setTableMetadata(metadata);
|
HoodieCLI.setTableMetadata(metadata);
|
||||||
return "Metadata for table " + metadata.getTableConfig().getTableName() + " refreshed.";
|
return "Metadata for table " + metadata.getTableConfig().getTableName() + " refreshed.";
|
||||||
}
|
}
|
||||||
|
|
||||||
@CliCommand(value = "commit rollback", help = "Rollback a commit")
|
@CliCommand(value = "commit rollback", help = "Rollback a commit")
|
||||||
public String rollbackCommit(
|
public String rollbackCommit(@CliOption(key = {"commit"}, help = "Commit to rollback") final String commitTime,
|
||||||
@CliOption(key = {"commit"}, help = "Commit to rollback")
|
@CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path") final String sparkPropertiesPath)
|
||||||
final String commitTime,
|
throws Exception {
|
||||||
@CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path")
|
|
||||||
final String sparkPropertiesPath) throws Exception {
|
|
||||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||||
HoodieTimeline timeline = activeTimeline.getCommitsTimeline()
|
HoodieTimeline timeline = activeTimeline.getCommitsTimeline().filterCompletedInstants();
|
||||||
.filterCompletedInstants();
|
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
|
||||||
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION,
|
|
||||||
commitTime);
|
|
||||||
|
|
||||||
if (!timeline.containsInstant(commitInstant)) {
|
if (!timeline.containsInstant(commitInstant)) {
|
||||||
return "Commit " + commitTime + " not found in Commits " + timeline;
|
return "Commit " + commitTime + " not found in Commits " + timeline;
|
||||||
}
|
}
|
||||||
|
|
||||||
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
||||||
sparkLauncher.addAppArgs(SparkMain.SparkCommand.ROLLBACK.toString(),
|
sparkLauncher
|
||||||
commitTime,
|
.addAppArgs(SparkMain.SparkCommand.ROLLBACK.toString(), commitTime, HoodieCLI.tableMetadata.getBasePath());
|
||||||
HoodieCLI.tableMetadata.getBasePath());
|
|
||||||
Process process = sparkLauncher.launch();
|
Process process = sparkLauncher.launch();
|
||||||
InputStreamConsumer.captureOutput(process);
|
InputStreamConsumer.captureOutput(process);
|
||||||
int exitCode = process.waitFor();
|
int exitCode = process.waitFor();
|
||||||
@@ -133,23 +123,18 @@ public class CommitsCommand implements CommandMarker {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@CliCommand(value = "commit showpartitions", help = "Show partition level details of a commit")
|
@CliCommand(value = "commit showpartitions", help = "Show partition level details of a commit")
|
||||||
public String showCommitPartitions(
|
public String showCommitPartitions(@CliOption(key = {"commit"}, help = "Commit to show") final String commitTime)
|
||||||
@CliOption(key = {"commit"}, help = "Commit to show")
|
throws Exception {
|
||||||
final String commitTime) throws Exception {
|
|
||||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||||
HoodieTimeline timeline = activeTimeline.getCommitsTimeline()
|
HoodieTimeline timeline = activeTimeline.getCommitsTimeline().filterCompletedInstants();
|
||||||
.filterCompletedInstants();
|
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
|
||||||
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION,
|
|
||||||
commitTime);
|
|
||||||
|
|
||||||
if (!timeline.containsInstant(commitInstant)) {
|
if (!timeline.containsInstant(commitInstant)) {
|
||||||
return "Commit " + commitTime + " not found in Commits " + timeline;
|
return "Commit " + commitTime + " not found in Commits " + timeline;
|
||||||
}
|
}
|
||||||
HoodieCommitMetadata meta =
|
HoodieCommitMetadata meta = HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(commitInstant).get());
|
||||||
HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(commitInstant).get());
|
|
||||||
List<String[]> rows = new ArrayList<String[]>();
|
List<String[]> rows = new ArrayList<String[]>();
|
||||||
for (Map.Entry<String, List<HoodieWriteStat>> entry : meta.getPartitionToWriteStats()
|
for (Map.Entry<String, List<HoodieWriteStat>> entry : meta.getPartitionToWriteStats().entrySet()) {
|
||||||
.entrySet()) {
|
|
||||||
String path = entry.getKey();
|
String path = entry.getKey();
|
||||||
List<HoodieWriteStat> stats = entry.getValue();
|
List<HoodieWriteStat> stats = entry.getValue();
|
||||||
long totalFilesAdded = 0;
|
long totalFilesAdded = 0;
|
||||||
@@ -169,50 +154,40 @@ public class CommitsCommand implements CommandMarker {
|
|||||||
totalBytesWritten += stat.getTotalWriteBytes();
|
totalBytesWritten += stat.getTotalWriteBytes();
|
||||||
totalWriteErrors += stat.getTotalWriteErrors();
|
totalWriteErrors += stat.getTotalWriteErrors();
|
||||||
}
|
}
|
||||||
rows.add(new String[]{path, String.valueOf(totalFilesAdded),
|
rows.add(new String[] {path, String.valueOf(totalFilesAdded), String.valueOf(totalFilesUpdated),
|
||||||
String.valueOf(totalFilesUpdated), String.valueOf(totalRecordsInserted),
|
String.valueOf(totalRecordsInserted), String.valueOf(totalRecordsUpdated),
|
||||||
String.valueOf(totalRecordsUpdated),
|
NumericUtils.humanReadableByteCount(totalBytesWritten), String.valueOf(totalWriteErrors)});
|
||||||
NumericUtils.humanReadableByteCount(totalBytesWritten),
|
|
||||||
String.valueOf(totalWriteErrors)});
|
|
||||||
|
|
||||||
}
|
}
|
||||||
return HoodiePrintHelper.print(
|
return HoodiePrintHelper.print(
|
||||||
new String[]{"Partition Path", "Total Files Added", "Total Files Updated",
|
new String[] {"Partition Path", "Total Files Added", "Total Files Updated", "Total Records Inserted",
|
||||||
"Total Records Inserted", "Total Records Updated", "Total Bytes Written",
|
"Total Records Updated", "Total Bytes Written", "Total Errors"}, rows.toArray(new String[rows.size()][]));
|
||||||
"Total Errors"}, rows.toArray(new String[rows.size()][]));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@CliCommand(value = "commit showfiles", help = "Show file level details of a commit")
|
@CliCommand(value = "commit showfiles", help = "Show file level details of a commit")
|
||||||
public String showCommitFiles(
|
public String showCommitFiles(@CliOption(key = {"commit"}, help = "Commit to show") final String commitTime)
|
||||||
@CliOption(key = {"commit"}, help = "Commit to show")
|
throws Exception {
|
||||||
final String commitTime) throws Exception {
|
|
||||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||||
HoodieTimeline timeline = activeTimeline.getCommitsTimeline()
|
HoodieTimeline timeline = activeTimeline.getCommitsTimeline().filterCompletedInstants();
|
||||||
.filterCompletedInstants();
|
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
|
||||||
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION,
|
|
||||||
commitTime);
|
|
||||||
|
|
||||||
if (!timeline.containsInstant(commitInstant)) {
|
if (!timeline.containsInstant(commitInstant)) {
|
||||||
return "Commit " + commitTime + " not found in Commits " + timeline;
|
return "Commit " + commitTime + " not found in Commits " + timeline;
|
||||||
}
|
}
|
||||||
HoodieCommitMetadata meta =
|
HoodieCommitMetadata meta = HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(commitInstant).get());
|
||||||
HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(commitInstant).get());
|
|
||||||
List<String[]> rows = new ArrayList<String[]>();
|
List<String[]> rows = new ArrayList<String[]>();
|
||||||
for (Map.Entry<String, List<HoodieWriteStat>> entry : meta.getPartitionToWriteStats()
|
for (Map.Entry<String, List<HoodieWriteStat>> entry : meta.getPartitionToWriteStats().entrySet()) {
|
||||||
.entrySet()) {
|
|
||||||
String path = entry.getKey();
|
String path = entry.getKey();
|
||||||
List<HoodieWriteStat> stats = entry.getValue();
|
List<HoodieWriteStat> stats = entry.getValue();
|
||||||
for (HoodieWriteStat stat : stats) {
|
for (HoodieWriteStat stat : stats) {
|
||||||
rows.add(new String[]{path, stat.getFileId(), stat.getPrevCommit(),
|
rows.add(new String[] {path, stat.getFileId(), stat.getPrevCommit(), String.valueOf(stat.getNumUpdateWrites()),
|
||||||
String.valueOf(stat.getNumUpdateWrites()), String.valueOf(stat.getNumWrites()),
|
String.valueOf(stat.getNumWrites()), String.valueOf(stat.getTotalWriteBytes()),
|
||||||
String.valueOf(stat.getTotalWriteBytes()),
|
|
||||||
String.valueOf(stat.getTotalWriteErrors())});
|
String.valueOf(stat.getTotalWriteErrors())});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return HoodiePrintHelper.print(
|
return HoodiePrintHelper.print(
|
||||||
new String[]{"Partition Path", "File ID", "Previous Commit", "Total Records Updated",
|
new String[] {"Partition Path", "File ID", "Previous Commit", "Total Records Updated", "Total Records Written",
|
||||||
"Total Records Written", "Total Bytes Written", "Total Errors"},
|
"Total Bytes Written", "Total Errors"}, rows.toArray(new String[rows.size()][]));
|
||||||
rows.toArray(new String[rows.size()][]));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@CliAvailabilityIndicator({"commits compare"})
|
@CliAvailabilityIndicator({"commits compare"})
|
||||||
@@ -221,38 +196,30 @@ public class CommitsCommand implements CommandMarker {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@CliCommand(value = "commits compare", help = "Compare commits with another Hoodie dataset")
|
@CliCommand(value = "commits compare", help = "Compare commits with another Hoodie dataset")
|
||||||
public String compareCommits(
|
public String compareCommits(@CliOption(key = {"path"}, help = "Path of the dataset to compare to") final String path)
|
||||||
@CliOption(key = {"path"}, help = "Path of the dataset to compare to")
|
throws Exception {
|
||||||
final String path) throws Exception {
|
|
||||||
|
|
||||||
HoodieTableMetaClient target = new HoodieTableMetaClient(HoodieCLI.conf, path);
|
HoodieTableMetaClient target = new HoodieTableMetaClient(HoodieCLI.conf, path);
|
||||||
HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsTimeline()
|
HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
|
||||||
.filterCompletedInstants();
|
|
||||||
HoodieTableMetaClient source = HoodieCLI.tableMetadata;
|
HoodieTableMetaClient source = HoodieCLI.tableMetadata;
|
||||||
HoodieTimeline sourceTimeline = source.getActiveTimeline().getCommitsTimeline()
|
HoodieTimeline sourceTimeline = source.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
|
||||||
.filterCompletedInstants();
|
|
||||||
String targetLatestCommit =
|
String targetLatestCommit =
|
||||||
targetTimeline.getInstants().iterator().hasNext() ? "0"
|
targetTimeline.getInstants().iterator().hasNext() ? "0" : targetTimeline.lastInstant().get().getTimestamp();
|
||||||
: targetTimeline.lastInstant().get().getTimestamp();
|
|
||||||
String sourceLatestCommit =
|
String sourceLatestCommit =
|
||||||
sourceTimeline.getInstants().iterator().hasNext() ? "0"
|
sourceTimeline.getInstants().iterator().hasNext() ? "0" : sourceTimeline.lastInstant().get().getTimestamp();
|
||||||
: sourceTimeline.lastInstant().get().getTimestamp();
|
|
||||||
|
|
||||||
if (sourceLatestCommit != null &&
|
if (sourceLatestCommit != null && HoodieTimeline.compareTimestamps(targetLatestCommit, sourceLatestCommit,
|
||||||
HoodieTimeline
|
HoodieTimeline.GREATER)) {
|
||||||
.compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) {
|
|
||||||
// source is behind the target
|
// source is behind the target
|
||||||
List<String> commitsToCatchup =
|
List<String> commitsToCatchup = targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE)
|
||||||
targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE)
|
|
||||||
.getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList());
|
.getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList());
|
||||||
return "Source " + source.getTableConfig().getTableName() + " is behind by "
|
return "Source " + source.getTableConfig().getTableName() + " is behind by " + commitsToCatchup.size()
|
||||||
+ commitsToCatchup.size() + " commits. Commits to catch up - " + commitsToCatchup;
|
+ " commits. Commits to catch up - " + commitsToCatchup;
|
||||||
} else {
|
} else {
|
||||||
List<String> commitsToCatchup =
|
List<String> commitsToCatchup = sourceTimeline.findInstantsAfter(targetLatestCommit, Integer.MAX_VALUE)
|
||||||
sourceTimeline.findInstantsAfter(targetLatestCommit, Integer.MAX_VALUE)
|
|
||||||
.getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList());
|
.getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList());
|
||||||
return "Source " + source.getTableConfig().getTableName() + " is ahead by "
|
return "Source " + source.getTableConfig().getTableName() + " is ahead by " + commitsToCatchup.size()
|
||||||
+ commitsToCatchup.size() + " commits. Commits to catch up - " + commitsToCatchup;
|
+ " commits. Commits to catch up - " + commitsToCatchup;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -262,13 +229,12 @@ public class CommitsCommand implements CommandMarker {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@CliCommand(value = "commits sync", help = "Compare commits with another Hoodie dataset")
|
@CliCommand(value = "commits sync", help = "Compare commits with another Hoodie dataset")
|
||||||
public String syncCommits(
|
public String syncCommits(@CliOption(key = {"path"}, help = "Path of the dataset to compare to") final String path)
|
||||||
@CliOption(key = {"path"}, help = "Path of the dataset to compare to")
|
throws Exception {
|
||||||
final String path) throws Exception {
|
|
||||||
HoodieCLI.syncTableMetadata = new HoodieTableMetaClient(HoodieCLI.conf, path);
|
HoodieCLI.syncTableMetadata = new HoodieTableMetaClient(HoodieCLI.conf, path);
|
||||||
HoodieCLI.state = HoodieCLI.CLIState.SYNC;
|
HoodieCLI.state = HoodieCLI.CLIState.SYNC;
|
||||||
return "Load sync state between " + HoodieCLI.tableMetadata.getTableConfig().getTableName()
|
return "Load sync state between " + HoodieCLI.tableMetadata.getTableConfig().getTableName() + " and "
|
||||||
+ " and " + HoodieCLI.syncTableMetadata.getTableConfig().getTableName();
|
+ HoodieCLI.syncTableMetadata.getTableConfig().getTableName();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -29,13 +29,12 @@ public class DatasetsCommand implements CommandMarker {
|
|||||||
|
|
||||||
@CliCommand(value = "connect", help = "Connect to a hoodie dataset")
|
@CliCommand(value = "connect", help = "Connect to a hoodie dataset")
|
||||||
public String connect(
|
public String connect(
|
||||||
@CliOption(key = {"path"}, mandatory = true, help = "Base Path of the dataset")
|
@CliOption(key = {"path"}, mandatory = true, help = "Base Path of the dataset") final String path)
|
||||||
final String path) throws IOException {
|
throws IOException {
|
||||||
boolean initialized = HoodieCLI.initConf();
|
boolean initialized = HoodieCLI.initConf();
|
||||||
HoodieCLI.initFS(initialized);
|
HoodieCLI.initFS(initialized);
|
||||||
HoodieCLI.setTableMetadata(new HoodieTableMetaClient(HoodieCLI.conf, path));
|
HoodieCLI.setTableMetadata(new HoodieTableMetaClient(HoodieCLI.conf, path));
|
||||||
HoodieCLI.state = HoodieCLI.CLIState.DATASET;
|
HoodieCLI.state = HoodieCLI.CLIState.DATASET;
|
||||||
return "Metadata for table " + HoodieCLI.tableMetadata.getTableConfig().getTableName()
|
return "Metadata for table " + HoodieCLI.tableMetadata.getTableConfig().getTableName() + " loaded";
|
||||||
+ " loaded";
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -37,44 +37,33 @@ public class HDFSParquetImportCommand implements CommandMarker {
|
|||||||
|
|
||||||
@CliCommand(value = "hdfsparquetimport", help = "Imports hdfs dataset to a hoodie dataset")
|
@CliCommand(value = "hdfsparquetimport", help = "Imports hdfs dataset to a hoodie dataset")
|
||||||
public String convert(
|
public String convert(
|
||||||
@CliOption(key = "srcPath", mandatory = true, help = "Base path for the input dataset")
|
@CliOption(key = "srcPath", mandatory = true, help = "Base path for the input dataset") final String srcPath,
|
||||||
final String srcPath,
|
@CliOption(key = "srcType", mandatory = true, help = "Source type for the input dataset") final String srcType,
|
||||||
@CliOption(key = "srcType", mandatory = true, help = "Source type for the input dataset")
|
@CliOption(key = "targetPath", mandatory = true, help = "Base path for the target hoodie dataset") final String
|
||||||
final String srcType,
|
targetPath,
|
||||||
@CliOption(key = "targetPath", mandatory = true, help = "Base path for the target hoodie dataset")
|
@CliOption(key = "tableName", mandatory = true, help = "Table name") final String tableName,
|
||||||
final String targetPath,
|
@CliOption(key = "tableType", mandatory = true, help = "Table type") final String tableType,
|
||||||
@CliOption(key = "tableName", mandatory = true, help = "Table name")
|
@CliOption(key = "rowKeyField", mandatory = true, help = "Row key field name") final String rowKeyField,
|
||||||
final String tableName,
|
@CliOption(key = "partitionPathField", mandatory = true, help = "Partition path field name") final String
|
||||||
@CliOption(key = "tableType", mandatory = true, help = "Table type")
|
partitionPathField,
|
||||||
final String tableType,
|
@CliOption(key = {
|
||||||
@CliOption(key = "rowKeyField", mandatory = true, help = "Row key field name")
|
"parallelism"}, mandatory = true, help = "Parallelism for hoodie insert") final String parallelism,
|
||||||
final String rowKeyField,
|
@CliOption(key = "schemaFilePath", mandatory = true, help = "Path for Avro schema file") final String
|
||||||
@CliOption(key = "partitionPathField", mandatory = true, help = "Partition path field name")
|
schemaFilePath,
|
||||||
final String partitionPathField,
|
@CliOption(key = "format", mandatory = true, help = "Format for the input data") final String format,
|
||||||
@CliOption(key = {"parallelism"}, mandatory = true, help = "Parallelism for hoodie insert")
|
@CliOption(key = "sparkMemory", mandatory = true, help = "Spark executor memory") final String sparkMemory,
|
||||||
final String parallelism,
|
@CliOption(key = "retry", mandatory = true, help = "Number of retries") final String retry) throws Exception {
|
||||||
@CliOption(key = "schemaFilePath", mandatory = true, help = "Path for Avro schema file")
|
|
||||||
final String schemaFilePath,
|
|
||||||
@CliOption(key = "format", mandatory = true, help = "Format for the input data")
|
|
||||||
final String format,
|
|
||||||
@CliOption(key = "sparkMemory", mandatory = true, help = "Spark executor memory")
|
|
||||||
final String sparkMemory,
|
|
||||||
@CliOption(key = "retry", mandatory = true, help = "Number of retries")
|
|
||||||
final String retry)
|
|
||||||
throws Exception {
|
|
||||||
|
|
||||||
validate(format, srcType);
|
validate(format, srcType);
|
||||||
|
|
||||||
boolean initialized = HoodieCLI.initConf();
|
boolean initialized = HoodieCLI.initConf();
|
||||||
HoodieCLI.initFS(initialized);
|
HoodieCLI.initFS(initialized);
|
||||||
String sparkPropertiesPath = Utils
|
String sparkPropertiesPath = Utils.getDefaultPropertiesFile(
|
||||||
.getDefaultPropertiesFile(
|
|
||||||
scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
|
scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
|
||||||
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
||||||
|
|
||||||
sparkLauncher.addAppArgs(SparkCommand.IMPORT.toString(), srcPath, targetPath, tableName,
|
sparkLauncher.addAppArgs(SparkCommand.IMPORT.toString(), srcPath, targetPath, tableName, tableType, rowKeyField,
|
||||||
tableType, rowKeyField, partitionPathField, parallelism, schemaFilePath, sparkMemory,
|
partitionPathField, parallelism, schemaFilePath, sparkMemory, retry);
|
||||||
retry);
|
|
||||||
Process process = sparkLauncher.launch();
|
Process process = sparkLauncher.launch();
|
||||||
InputStreamConsumer.captureOutput(process);
|
InputStreamConsumer.captureOutput(process);
|
||||||
int exitCode = process.waitFor();
|
int exitCode = process.waitFor();
|
||||||
|
|||||||
@@ -64,25 +64,25 @@ public class HoodieLogFileCommand implements CommandMarker {
|
|||||||
|
|
||||||
@CliCommand(value = "show logfile metadata", help = "Read commit metadata from log files")
|
@CliCommand(value = "show logfile metadata", help = "Read commit metadata from log files")
|
||||||
public String showLogFileCommits(
|
public String showLogFileCommits(
|
||||||
@CliOption(key = "logFilePathPattern", mandatory = true, help = "Fully qualified path for the log file")
|
@CliOption(key = "logFilePathPattern", mandatory = true, help = "Fully qualified path for the log file") final
|
||||||
final String logFilePathPattern) throws IOException {
|
String logFilePathPattern)
|
||||||
|
throws IOException {
|
||||||
|
|
||||||
FileSystem fs = HoodieCLI.tableMetadata.getFs();
|
FileSystem fs = HoodieCLI.tableMetadata.getFs();
|
||||||
List<String> logFilePaths = Arrays.stream(fs.globStatus(new Path(logFilePathPattern)))
|
List<String> logFilePaths = Arrays.stream(fs.globStatus(new Path(logFilePathPattern)))
|
||||||
.map(status -> status.getPath().toString()).collect(Collectors.toList());
|
.map(status -> status.getPath().toString()).collect(Collectors.toList());
|
||||||
Map<String, List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer>>> commitCountAndMetadata = Maps
|
Map<String, List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType,
|
||||||
.newHashMap();
|
String>>, Integer>>>
|
||||||
|
commitCountAndMetadata = Maps.newHashMap();
|
||||||
int totalEntries = 0;
|
int totalEntries = 0;
|
||||||
int numCorruptBlocks = 0;
|
int numCorruptBlocks = 0;
|
||||||
|
|
||||||
for (String logFilePath : logFilePaths) {
|
for (String logFilePath : logFilePaths) {
|
||||||
FileStatus[] fsStatus = fs.listStatus(
|
FileStatus[] fsStatus = fs.listStatus(new Path(logFilePath));
|
||||||
new Path(logFilePath));
|
Schema writerSchema = new AvroSchemaConverter().convert(
|
||||||
Schema writerSchema = new AvroSchemaConverter()
|
SchemaUtil.readSchemaFromLogFile(HoodieCLI.tableMetadata.getFs(), new Path(logFilePath)));
|
||||||
.convert(SchemaUtil
|
HoodieLogFormat.Reader reader = HoodieLogFormat
|
||||||
.readSchemaFromLogFile(HoodieCLI.tableMetadata.getFs(), new Path(logFilePath)));
|
.newReader(fs, new HoodieLogFile(fsStatus[0].getPath()), writerSchema);
|
||||||
HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(fs,
|
|
||||||
new HoodieLogFile(fsStatus[0].getPath()), writerSchema);
|
|
||||||
|
|
||||||
// read the avro blocks
|
// read the avro blocks
|
||||||
while (reader.hasNext()) {
|
while (reader.hasNext()) {
|
||||||
@@ -104,15 +104,14 @@ public class HoodieLogFileCommand implements CommandMarker {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (commitCountAndMetadata.containsKey(instantTime)) {
|
if (commitCountAndMetadata.containsKey(instantTime)) {
|
||||||
commitCountAndMetadata.get(instantTime)
|
commitCountAndMetadata.get(instantTime).add(
|
||||||
.add(new Tuple3<>(n.getBlockType(),
|
new Tuple3<>(n.getBlockType(), new Tuple2<>(n.getLogBlockHeader(), n.getLogBlockFooter()), recordCount));
|
||||||
new Tuple2<>(n.getLogBlockHeader(), n.getLogBlockFooter()), recordCount));
|
|
||||||
totalEntries++;
|
totalEntries++;
|
||||||
} else {
|
} else {
|
||||||
List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer>> list
|
List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>,
|
||||||
= new ArrayList<>();
|
Integer>> list = new ArrayList<>();
|
||||||
list.add(new Tuple3<>(n.getBlockType(),
|
list.add(
|
||||||
new Tuple2<>(n.getLogBlockHeader(), n.getLogBlockFooter()), recordCount));
|
new Tuple3<>(n.getBlockType(), new Tuple2<>(n.getLogBlockHeader(), n.getLogBlockFooter()), recordCount));
|
||||||
commitCountAndMetadata.put(instantTime, list);
|
commitCountAndMetadata.put(instantTime, list);
|
||||||
totalEntries++;
|
totalEntries++;
|
||||||
}
|
}
|
||||||
@@ -121,11 +120,12 @@ public class HoodieLogFileCommand implements CommandMarker {
|
|||||||
String[][] rows = new String[totalEntries + 1][];
|
String[][] rows = new String[totalEntries + 1][];
|
||||||
int i = 0;
|
int i = 0;
|
||||||
ObjectMapper objectMapper = new ObjectMapper();
|
ObjectMapper objectMapper = new ObjectMapper();
|
||||||
for (Map.Entry<String, List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer>>> entry : commitCountAndMetadata
|
for (Map.Entry<String, List<Tuple3<HoodieLogBlockType,
|
||||||
.entrySet()) {
|
Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer>>> entry
|
||||||
|
: commitCountAndMetadata.entrySet()) {
|
||||||
String instantTime = entry.getKey().toString();
|
String instantTime = entry.getKey().toString();
|
||||||
for (Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer> tuple3 : entry
|
for (Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>,
|
||||||
.getValue()) {
|
Map<HeaderMetadataType, String>>, Integer> tuple3 : entry.getValue()) {
|
||||||
String[] output = new String[5];
|
String[] output = new String[5];
|
||||||
output[0] = instantTime;
|
output[0] = instantTime;
|
||||||
output[1] = String.valueOf(tuple3._3());
|
output[1] = String.valueOf(tuple3._3());
|
||||||
@@ -136,24 +136,21 @@ public class HoodieLogFileCommand implements CommandMarker {
|
|||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return HoodiePrintHelper.print(
|
return HoodiePrintHelper
|
||||||
new String[]{"InstantTime", "RecordCount", "BlockType", "HeaderMetadata", "FooterMetadata"},
|
.print(new String[] {"InstantTime", "RecordCount", "BlockType", "HeaderMetadata", "FooterMetadata"},
|
||||||
rows);
|
rows);
|
||||||
}
|
}
|
||||||
|
|
||||||
@CliCommand(value = "show logfile records", help = "Read records from log files")
|
@CliCommand(value = "show logfile records", help = "Read records from log files")
|
||||||
public String showLogFileRecords(
|
public String showLogFileRecords(@CliOption(key = {
|
||||||
@CliOption(key = {
|
"limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10") final Integer limit,
|
||||||
"limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10")
|
|
||||||
final Integer limit,
|
|
||||||
@CliOption(key = "logFilePathPattern", mandatory = true, help = "Fully qualified paths for the log files")
|
@CliOption(key = "logFilePathPattern", mandatory = true, help = "Fully qualified paths for the log files")
|
||||||
final String logFilePathPattern,
|
final String logFilePathPattern,
|
||||||
@CliOption(key = "mergeRecords", mandatory = false, help = "If the records in the log files should be merged",
|
@CliOption(key = "mergeRecords", mandatory = false, help = "If the records in the log files should be merged",
|
||||||
unspecifiedDefaultValue = "false")
|
unspecifiedDefaultValue = "false") final Boolean shouldMerge)
|
||||||
final Boolean shouldMerge) throws IOException {
|
throws IOException {
|
||||||
|
|
||||||
System.out
|
System.out.println("===============> Showing only " + limit + " records <===============");
|
||||||
.println("===============> Showing only " + limit + " records <===============");
|
|
||||||
|
|
||||||
FileSystem fs = HoodieCLI.tableMetadata.getFs();
|
FileSystem fs = HoodieCLI.tableMetadata.getFs();
|
||||||
List<String> logFilePaths = Arrays.stream(fs.globStatus(new Path(logFilePathPattern)))
|
List<String> logFilePaths = Arrays.stream(fs.globStatus(new Path(logFilePathPattern)))
|
||||||
@@ -162,9 +159,8 @@ public class HoodieLogFileCommand implements CommandMarker {
|
|||||||
// TODO : readerSchema can change across blocks/log files, fix this inside Scanner
|
// TODO : readerSchema can change across blocks/log files, fix this inside Scanner
|
||||||
AvroSchemaConverter converter = new AvroSchemaConverter();
|
AvroSchemaConverter converter = new AvroSchemaConverter();
|
||||||
// get schema from last log file
|
// get schema from last log file
|
||||||
Schema readerSchema = converter
|
Schema readerSchema = converter.convert(
|
||||||
.convert(SchemaUtil
|
SchemaUtil.readSchemaFromLogFile(fs, new Path(logFilePaths.get(logFilePaths.size() - 1))));
|
||||||
.readSchemaFromLogFile(fs, new Path(logFilePaths.get(logFilePaths.size() - 1))));
|
|
||||||
|
|
||||||
List<IndexedRecord> allRecords = new ArrayList<>();
|
List<IndexedRecord> allRecords = new ArrayList<>();
|
||||||
|
|
||||||
@@ -186,11 +182,10 @@ public class HoodieLogFileCommand implements CommandMarker {
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (String logFile : logFilePaths) {
|
for (String logFile : logFilePaths) {
|
||||||
Schema writerSchema = new AvroSchemaConverter()
|
Schema writerSchema = new AvroSchemaConverter().convert(
|
||||||
.convert(SchemaUtil
|
SchemaUtil.readSchemaFromLogFile(HoodieCLI.tableMetadata.getFs(), new Path(logFile)));
|
||||||
.readSchemaFromLogFile(HoodieCLI.tableMetadata.getFs(), new Path(logFile)));
|
HoodieLogFormat.Reader reader = HoodieLogFormat
|
||||||
HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(fs,
|
.newReader(fs, new HoodieLogFile(new Path(logFile)), writerSchema);
|
||||||
new HoodieLogFile(new Path(logFile)), writerSchema);
|
|
||||||
// read the avro blocks
|
// read the avro blocks
|
||||||
while (reader.hasNext()) {
|
while (reader.hasNext()) {
|
||||||
HoodieLogBlock n = reader.next();
|
HoodieLogBlock n = reader.next();
|
||||||
@@ -216,7 +211,6 @@ public class HoodieLogFileCommand implements CommandMarker {
|
|||||||
rows[i] = data;
|
rows[i] = data;
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
return HoodiePrintHelper.print(
|
return HoodiePrintHelper.print(new String[] {"Records"}, rows);
|
||||||
new String[]{"Records"}, rows);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -40,26 +40,22 @@ public class HoodieSyncCommand implements CommandMarker {
|
|||||||
|
|
||||||
@CliCommand(value = "sync validate", help = "Validate the sync by counting the number of records")
|
@CliCommand(value = "sync validate", help = "Validate the sync by counting the number of records")
|
||||||
public String validateSync(
|
public String validateSync(
|
||||||
@CliOption(key = {"mode"}, unspecifiedDefaultValue = "complete", help = "Check mode")
|
@CliOption(key = {"mode"}, unspecifiedDefaultValue = "complete", help = "Check mode") final String mode,
|
||||||
final String mode,
|
@CliOption(key = {"sourceDb"}, unspecifiedDefaultValue = "rawdata", help = "source database") final String srcDb,
|
||||||
@CliOption(key = {
|
@CliOption(key = {
|
||||||
"sourceDb"}, unspecifiedDefaultValue = "rawdata", help = "source database")
|
"targetDb"}, unspecifiedDefaultValue = "dwh_hoodie", help = "target database") final String tgtDb,
|
||||||
final String srcDb,
|
|
||||||
@CliOption(key = {
|
|
||||||
"targetDb"}, unspecifiedDefaultValue = "dwh_hoodie", help = "target database")
|
|
||||||
final String tgtDb,
|
|
||||||
@CliOption(key = {
|
@CliOption(key = {
|
||||||
"partitionCount"}, unspecifiedDefaultValue = "5", help = "total number of recent partitions to validate")
|
"partitionCount"}, unspecifiedDefaultValue = "5", help = "total number of recent partitions to validate")
|
||||||
final int partitionCount,
|
final int partitionCount,
|
||||||
@CliOption(key = {
|
@CliOption(key = {
|
||||||
"hiveServerUrl"}, mandatory = true, help = "hiveServerURL to connect to")
|
"hiveServerUrl"}, mandatory = true, help = "hiveServerURL to connect to") final String hiveServerUrl,
|
||||||
final String hiveServerUrl,
|
|
||||||
@CliOption(key = {
|
@CliOption(key = {
|
||||||
"hiveUser"}, mandatory = false, unspecifiedDefaultValue = "", help = "hive username to connect to")
|
"hiveUser"}, mandatory = false, unspecifiedDefaultValue = "", help = "hive username to connect to") final
|
||||||
final String hiveUser,
|
String hiveUser,
|
||||||
@CliOption(key = {
|
@CliOption(key = {
|
||||||
"hivePass"}, mandatory = true, unspecifiedDefaultValue = "", help = "hive password to connect to")
|
"hivePass"}, mandatory = true, unspecifiedDefaultValue = "", help = "hive password to connect to") final
|
||||||
final String hivePass) throws Exception {
|
String hivePass)
|
||||||
|
throws Exception {
|
||||||
HoodieTableMetaClient target = HoodieCLI.syncTableMetadata;
|
HoodieTableMetaClient target = HoodieCLI.syncTableMetadata;
|
||||||
HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsTimeline();
|
HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsTimeline();
|
||||||
HoodieTableMetaClient source = HoodieCLI.tableMetadata;
|
HoodieTableMetaClient source = HoodieCLI.tableMetadata;
|
||||||
@@ -70,52 +66,42 @@ public class HoodieSyncCommand implements CommandMarker {
|
|||||||
sourceCount = HiveUtil.countRecords(hiveServerUrl, source, srcDb, hiveUser, hivePass);
|
sourceCount = HiveUtil.countRecords(hiveServerUrl, source, srcDb, hiveUser, hivePass);
|
||||||
targetCount = HiveUtil.countRecords(hiveServerUrl, target, tgtDb, hiveUser, hivePass);
|
targetCount = HiveUtil.countRecords(hiveServerUrl, target, tgtDb, hiveUser, hivePass);
|
||||||
} else if ("latestPartitions".equals(mode)) {
|
} else if ("latestPartitions".equals(mode)) {
|
||||||
sourceCount = HiveUtil
|
sourceCount = HiveUtil.countRecords(hiveServerUrl, source, srcDb, partitionCount, hiveUser, hivePass);
|
||||||
.countRecords(hiveServerUrl, source, srcDb, partitionCount, hiveUser, hivePass);
|
targetCount = HiveUtil.countRecords(hiveServerUrl, target, tgtDb, partitionCount, hiveUser, hivePass);
|
||||||
targetCount = HiveUtil
|
|
||||||
.countRecords(hiveServerUrl, target, tgtDb, partitionCount, hiveUser, hivePass);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
String targetLatestCommit =
|
String targetLatestCommit =
|
||||||
targetTimeline.getInstants().iterator().hasNext() ? "0"
|
targetTimeline.getInstants().iterator().hasNext() ? "0" : targetTimeline.lastInstant().get().getTimestamp();
|
||||||
: targetTimeline.lastInstant().get().getTimestamp();
|
|
||||||
String sourceLatestCommit =
|
String sourceLatestCommit =
|
||||||
sourceTimeline.getInstants().iterator().hasNext() ? "0"
|
sourceTimeline.getInstants().iterator().hasNext() ? "0" : sourceTimeline.lastInstant().get().getTimestamp();
|
||||||
: sourceTimeline.lastInstant().get().getTimestamp();
|
|
||||||
|
|
||||||
if (sourceLatestCommit != null && HoodieTimeline
|
if (sourceLatestCommit != null && HoodieTimeline.compareTimestamps(targetLatestCommit, sourceLatestCommit,
|
||||||
.compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) {
|
HoodieTimeline.GREATER)) {
|
||||||
// source is behind the target
|
// source is behind the target
|
||||||
List<HoodieInstant> commitsToCatchup =
|
List<HoodieInstant> commitsToCatchup = targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE)
|
||||||
targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE).getInstants()
|
.getInstants().collect(Collectors.toList());
|
||||||
.collect(Collectors.toList());
|
|
||||||
if (commitsToCatchup.isEmpty()) {
|
if (commitsToCatchup.isEmpty()) {
|
||||||
return "Count difference now is (count(" + target.getTableConfig().getTableName()
|
return "Count difference now is (count(" + target.getTableConfig().getTableName() + ") - count("
|
||||||
+ ") - count(" + source.getTableConfig().getTableName() + ") == " + (targetCount
|
+ source.getTableConfig().getTableName() + ") == " + (targetCount - sourceCount);
|
||||||
- sourceCount);
|
|
||||||
} else {
|
} else {
|
||||||
long newInserts = CommitUtil.countNewRecords(target,
|
long newInserts = CommitUtil.countNewRecords(target,
|
||||||
commitsToCatchup.stream().map(HoodieInstant::getTimestamp)
|
commitsToCatchup.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList()));
|
||||||
.collect(Collectors.toList()));
|
return "Count difference now is (count(" + target.getTableConfig().getTableName() + ") - count("
|
||||||
return "Count difference now is (count(" + target.getTableConfig().getTableName()
|
+ source.getTableConfig().getTableName()
|
||||||
+ ") - count(" + source.getTableConfig().getTableName() + ") == " + (targetCount
|
+ ") == " + (targetCount - sourceCount) + ". Catch up count is " + newInserts;
|
||||||
- sourceCount) + ". Catch up count is " + newInserts;
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
List<HoodieInstant> commitsToCatchup =
|
List<HoodieInstant> commitsToCatchup = sourceTimeline.findInstantsAfter(targetLatestCommit, Integer.MAX_VALUE)
|
||||||
sourceTimeline.findInstantsAfter(targetLatestCommit, Integer.MAX_VALUE).getInstants()
|
.getInstants().collect(Collectors.toList());
|
||||||
.collect(Collectors.toList());
|
|
||||||
if (commitsToCatchup.isEmpty()) {
|
if (commitsToCatchup.isEmpty()) {
|
||||||
return "Count difference now is (count(" + source.getTableConfig().getTableName()
|
return "Count difference now is (count(" + source.getTableConfig().getTableName() + ") - count("
|
||||||
+ ") - count(" + target.getTableConfig().getTableName() + ") == " + (sourceCount
|
+ target.getTableConfig().getTableName() + ") == " + (sourceCount - targetCount);
|
||||||
- targetCount);
|
|
||||||
} else {
|
} else {
|
||||||
long newInserts = CommitUtil.countNewRecords(source,
|
long newInserts = CommitUtil.countNewRecords(source,
|
||||||
commitsToCatchup.stream().map(HoodieInstant::getTimestamp)
|
commitsToCatchup.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList()));
|
||||||
.collect(Collectors.toList()));
|
return "Count difference now is (count(" + source.getTableConfig().getTableName() + ") - count("
|
||||||
return "Count difference now is (count(" + source.getTableConfig().getTableName()
|
+ target.getTableConfig().getTableName()
|
||||||
+ ") - count(" + target.getTableConfig().getTableName() + ") == " + (sourceCount
|
+ ") == " + (sourceCount - targetCount) + ". Catch up count is " + newInserts;
|
||||||
- targetCount) + ". Catch up count is " + newInserts;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -45,20 +45,20 @@ public class RepairsCommand implements CommandMarker {
|
|||||||
return HoodieCLI.tableMetadata != null;
|
return HoodieCLI.tableMetadata != null;
|
||||||
}
|
}
|
||||||
|
|
||||||
@CliCommand(value = "repair deduplicate", help = "De-duplicate a partition path contains duplicates & produce repaired files to replace with")
|
@CliCommand(value = "repair deduplicate", help = "De-duplicate a partition path contains duplicates & produce "
|
||||||
public String deduplicate(
|
+ "repaired files to replace with")
|
||||||
|
public String deduplicate(@CliOption(key = {
|
||||||
|
"duplicatedPartitionPath"}, help = "Partition Path containing the duplicates", mandatory = true) final String
|
||||||
|
duplicatedPartitionPath,
|
||||||
@CliOption(key = {
|
@CliOption(key = {
|
||||||
"duplicatedPartitionPath"}, help = "Partition Path containing the duplicates", mandatory = true)
|
"repairedOutputPath"}, help = "Location to place the repaired files", mandatory = true) final String
|
||||||
final String duplicatedPartitionPath,
|
repairedOutputPath,
|
||||||
@CliOption(key = {
|
@CliOption(key = {
|
||||||
"repairedOutputPath"}, help = "Location to place the repaired files", mandatory = true)
|
"sparkProperties"}, help = "Spark Properites File Path", mandatory = true) final String sparkPropertiesPath)
|
||||||
final String repairedOutputPath,
|
throws Exception {
|
||||||
@CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path", mandatory = true)
|
|
||||||
final String sparkPropertiesPath) throws Exception {
|
|
||||||
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
||||||
sparkLauncher
|
sparkLauncher.addAppArgs(SparkMain.SparkCommand.DEDUPLICATE.toString(), duplicatedPartitionPath, repairedOutputPath,
|
||||||
.addAppArgs(SparkMain.SparkCommand.DEDUPLICATE.toString(), duplicatedPartitionPath,
|
HoodieCLI.tableMetadata.getBasePath());
|
||||||
repairedOutputPath, HoodieCLI.tableMetadata.getBasePath());
|
|
||||||
Process process = sparkLauncher.launch();
|
Process process = sparkLauncher.launch();
|
||||||
InputStreamConsumer.captureOutput(process);
|
InputStreamConsumer.captureOutput(process);
|
||||||
int exitCode = process.waitFor();
|
int exitCode = process.waitFor();
|
||||||
@@ -71,14 +71,12 @@ public class RepairsCommand implements CommandMarker {
|
|||||||
|
|
||||||
|
|
||||||
@CliCommand(value = "repair addpartitionmeta", help = "Add partition metadata to a dataset, if not present")
|
@CliCommand(value = "repair addpartitionmeta", help = "Add partition metadata to a dataset, if not present")
|
||||||
public String addPartitionMeta(
|
public String addPartitionMeta(@CliOption(key = {
|
||||||
@CliOption(key = {"dryrun"},
|
"dryrun"}, help = "Should we actually add or just print what would be done", unspecifiedDefaultValue = "true")
|
||||||
help = "Should we actually add or just print what would be done",
|
|
||||||
unspecifiedDefaultValue = "true")
|
|
||||||
final boolean dryRun) throws IOException {
|
final boolean dryRun) throws IOException {
|
||||||
|
|
||||||
String latestCommit = HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline()
|
String latestCommit = HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline().lastInstant().get()
|
||||||
.lastInstant().get().getTimestamp();
|
.getTimestamp();
|
||||||
List<String> partitionPaths = FSUtils.getAllFoldersThreeLevelsDown(HoodieCLI.fs,
|
List<String> partitionPaths = FSUtils.getAllFoldersThreeLevelsDown(HoodieCLI.fs,
|
||||||
HoodieCLI.tableMetadata.getBasePath());
|
HoodieCLI.tableMetadata.getBasePath());
|
||||||
Path basePath = new Path(HoodieCLI.tableMetadata.getBasePath());
|
Path basePath = new Path(HoodieCLI.tableMetadata.getBasePath());
|
||||||
@@ -94,10 +92,7 @@ public class RepairsCommand implements CommandMarker {
|
|||||||
if (!HoodiePartitionMetadata.hasPartitionMetadata(HoodieCLI.fs, partitionPath)) {
|
if (!HoodiePartitionMetadata.hasPartitionMetadata(HoodieCLI.fs, partitionPath)) {
|
||||||
row[1] = "No";
|
row[1] = "No";
|
||||||
if (!dryRun) {
|
if (!dryRun) {
|
||||||
HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(
|
HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(HoodieCLI.fs, latestCommit, basePath,
|
||||||
HoodieCLI.fs,
|
|
||||||
latestCommit,
|
|
||||||
basePath,
|
|
||||||
partitionPath);
|
partitionPath);
|
||||||
partitionMetadata.trySave(0);
|
partitionMetadata.trySave(0);
|
||||||
}
|
}
|
||||||
@@ -105,7 +100,6 @@ public class RepairsCommand implements CommandMarker {
|
|||||||
rows[ind++] = row;
|
rows[ind++] = row;
|
||||||
}
|
}
|
||||||
|
|
||||||
return HoodiePrintHelper.print(
|
return HoodiePrintHelper.print(new String[] {"Partition Path", "Metadata Present?", "Action"}, rows);
|
||||||
new String[]{"Partition Path", "Metadata Present?", "Action"}, rows);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -13,6 +13,7 @@
|
|||||||
* See the License for the specific language governing permissions and
|
* See the License for the specific language governing permissions and
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
package com.uber.hoodie.cli.commands;
|
package com.uber.hoodie.cli.commands;
|
||||||
|
|
||||||
import com.uber.hoodie.HoodieWriteClient;
|
import com.uber.hoodie.HoodieWriteClient;
|
||||||
@@ -60,8 +61,8 @@ public class SavepointsCommand implements CommandMarker {
|
|||||||
|
|
||||||
@CliAvailabilityIndicator({"savepoint rollback"})
|
@CliAvailabilityIndicator({"savepoint rollback"})
|
||||||
public boolean isRollbackToSavepointAvailable() {
|
public boolean isRollbackToSavepointAvailable() {
|
||||||
return HoodieCLI.tableMetadata != null && !HoodieCLI.tableMetadata.getActiveTimeline()
|
return HoodieCLI.tableMetadata != null && !HoodieCLI.tableMetadata.getActiveTimeline().getSavePointTimeline()
|
||||||
.getSavePointTimeline().filterCompletedInstants().empty();
|
.filterCompletedInstants().empty();
|
||||||
}
|
}
|
||||||
|
|
||||||
@CliCommand(value = "savepoints show", help = "Show the savepoints")
|
@CliCommand(value = "savepoints show", help = "Show the savepoints")
|
||||||
@@ -79,17 +80,13 @@ public class SavepointsCommand implements CommandMarker {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@CliCommand(value = "savepoint create", help = "Savepoint a commit")
|
@CliCommand(value = "savepoint create", help = "Savepoint a commit")
|
||||||
public String savepoint(
|
public String savepoint(@CliOption(key = {"commit"}, help = "Commit to savepoint") final String commitTime,
|
||||||
@CliOption(key = {"commit"}, help = "Commit to savepoint")
|
@CliOption(key = {"user"}, help = "User who is creating the savepoint") final String user,
|
||||||
final String commitTime,
|
@CliOption(key = {"comments"}, help = "Comments for creating the savepoint") final String comments)
|
||||||
@CliOption(key = {"user"}, help = "User who is creating the savepoint")
|
throws Exception {
|
||||||
final String user,
|
|
||||||
@CliOption(key = {"comments"}, help = "Comments for creating the savepoint")
|
|
||||||
final String comments) throws Exception {
|
|
||||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||||
HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants();
|
HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants();
|
||||||
HoodieInstant
|
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
|
||||||
commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
|
|
||||||
|
|
||||||
if (!timeline.containsInstant(commitInstant)) {
|
if (!timeline.containsInstant(commitInstant)) {
|
||||||
return "Commit " + commitTime + " not found in Commits " + timeline;
|
return "Commit " + commitTime + " not found in Commits " + timeline;
|
||||||
@@ -106,22 +103,19 @@ public class SavepointsCommand implements CommandMarker {
|
|||||||
|
|
||||||
@CliCommand(value = "savepoint rollback", help = "Savepoint a commit")
|
@CliCommand(value = "savepoint rollback", help = "Savepoint a commit")
|
||||||
public String rollbackToSavepoint(
|
public String rollbackToSavepoint(
|
||||||
@CliOption(key = {"savepoint"}, help = "Savepoint to rollback")
|
@CliOption(key = {"savepoint"}, help = "Savepoint to rollback") final String commitTime,
|
||||||
final String commitTime,
|
@CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path") final String sparkPropertiesPath)
|
||||||
@CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path")
|
throws Exception {
|
||||||
final String sparkPropertiesPath) throws Exception {
|
|
||||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||||
HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants();
|
HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants();
|
||||||
HoodieInstant
|
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
|
||||||
commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
|
|
||||||
|
|
||||||
if (!timeline.containsInstant(commitInstant)) {
|
if (!timeline.containsInstant(commitInstant)) {
|
||||||
return "Commit " + commitTime + " not found in Commits " + timeline;
|
return "Commit " + commitTime + " not found in Commits " + timeline;
|
||||||
}
|
}
|
||||||
|
|
||||||
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
||||||
sparkLauncher.addAppArgs(SparkMain.SparkCommand.ROLLBACK_TO_SAVEPOINT.toString(),
|
sparkLauncher.addAppArgs(SparkMain.SparkCommand.ROLLBACK_TO_SAVEPOINT.toString(), commitTime,
|
||||||
commitTime,
|
|
||||||
HoodieCLI.tableMetadata.getBasePath());
|
HoodieCLI.tableMetadata.getBasePath());
|
||||||
Process process = sparkLauncher.launch();
|
Process process = sparkLauncher.launch();
|
||||||
InputStreamConsumer.captureOutput(process);
|
InputStreamConsumer.captureOutput(process);
|
||||||
@@ -137,18 +131,14 @@ public class SavepointsCommand implements CommandMarker {
|
|||||||
|
|
||||||
@CliCommand(value = "savepoints refresh", help = "Refresh the savepoints")
|
@CliCommand(value = "savepoints refresh", help = "Refresh the savepoints")
|
||||||
public String refreshMetaClient() throws IOException {
|
public String refreshMetaClient() throws IOException {
|
||||||
HoodieTableMetaClient metadata =
|
HoodieTableMetaClient metadata = new HoodieTableMetaClient(HoodieCLI.conf, HoodieCLI.tableMetadata.getBasePath());
|
||||||
new HoodieTableMetaClient(HoodieCLI.conf, HoodieCLI.tableMetadata.getBasePath());
|
|
||||||
HoodieCLI.setTableMetadata(metadata);
|
HoodieCLI.setTableMetadata(metadata);
|
||||||
return "Metadata for table " + metadata.getTableConfig().getTableName() + " refreshed.";
|
return "Metadata for table " + metadata.getTableConfig().getTableName() + " refreshed.";
|
||||||
}
|
}
|
||||||
|
|
||||||
private static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath)
|
private static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath) throws Exception {
|
||||||
throws Exception {
|
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withIndexConfig(
|
||||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath)
|
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build();
|
||||||
.withIndexConfig(
|
|
||||||
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
|
|
||||||
.build();
|
|
||||||
return new HoodieWriteClient(jsc, config, false);
|
return new HoodieWriteClient(jsc, config, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -30,18 +30,14 @@ import org.apache.spark.sql.SQLContext;
|
|||||||
|
|
||||||
public class SparkMain {
|
public class SparkMain {
|
||||||
|
|
||||||
protected final static Logger LOG = Logger.getLogger(SparkMain.class);
|
protected static final Logger LOG = Logger.getLogger(SparkMain.class);
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Commands
|
* Commands
|
||||||
*/
|
*/
|
||||||
enum SparkCommand {
|
enum SparkCommand {
|
||||||
ROLLBACK,
|
ROLLBACK, DEDUPLICATE, ROLLBACK_TO_SAVEPOINT, SAVEPOINT, IMPORT
|
||||||
DEDUPLICATE,
|
|
||||||
ROLLBACK_TO_SAVEPOINT,
|
|
||||||
SAVEPOINT,
|
|
||||||
IMPORT
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
public static void main(String[] args) throws Exception {
|
||||||
@@ -67,18 +63,19 @@ public class SparkMain {
|
|||||||
break;
|
break;
|
||||||
case IMPORT:
|
case IMPORT:
|
||||||
assert (args.length == 11);
|
assert (args.length == 11);
|
||||||
returnCode = dataImport(jsc, args[1], args[2], args[3], args[4], args[5], args[6],
|
returnCode = dataImport(jsc, args[1], args[2], args[3], args[4], args[5], args[6], Integer.parseInt(args[7]),
|
||||||
Integer.parseInt(args[7]), args[8], SparkUtil.DEFUALT_SPARK_MASTER, args[9],
|
args[8], SparkUtil.DEFUALT_SPARK_MASTER, args[9], Integer.parseInt(args[10]));
|
||||||
Integer.parseInt(args[10]));
|
break;
|
||||||
|
default:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
System.exit(returnCode);
|
System.exit(returnCode);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static int dataImport(JavaSparkContext jsc, String srcPath, String targetPath,
|
private static int dataImport(JavaSparkContext jsc, String srcPath, String targetPath, String tableName,
|
||||||
String tableName, String tableType, String rowKey, String partitionKey, int parallelism,
|
String tableType, String rowKey, String partitionKey, int parallelism, String schemaFile, String sparkMaster,
|
||||||
String schemaFile, String sparkMaster, String sparkMemory, int retry) throws Exception {
|
String sparkMemory, int retry) throws Exception {
|
||||||
HDFSParquetImporter.Config cfg = new HDFSParquetImporter.Config();
|
HDFSParquetImporter.Config cfg = new HDFSParquetImporter.Config();
|
||||||
cfg.srcPath = srcPath;
|
cfg.srcPath = srcPath;
|
||||||
cfg.targetPath = targetPath;
|
cfg.targetPath = targetPath;
|
||||||
@@ -92,19 +89,15 @@ public class SparkMain {
|
|||||||
return new HDFSParquetImporter(cfg).dataImport(jsc, retry);
|
return new HDFSParquetImporter(cfg).dataImport(jsc, retry);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static int deduplicatePartitionPath(JavaSparkContext jsc,
|
private static int deduplicatePartitionPath(JavaSparkContext jsc, String duplicatedPartitionPath,
|
||||||
String duplicatedPartitionPath,
|
String repairedOutputPath, String basePath) throws Exception {
|
||||||
String repairedOutputPath,
|
DedupeSparkJob job = new DedupeSparkJob(basePath, duplicatedPartitionPath, repairedOutputPath, new SQLContext(jsc),
|
||||||
String basePath)
|
FSUtils.getFs(basePath, jsc.hadoopConfiguration()));
|
||||||
throws Exception {
|
|
||||||
DedupeSparkJob job = new DedupeSparkJob(basePath, duplicatedPartitionPath, repairedOutputPath,
|
|
||||||
new SQLContext(jsc), FSUtils.getFs(basePath, jsc.hadoopConfiguration()));
|
|
||||||
job.fixDuplicates(true);
|
job.fixDuplicates(true);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static int rollback(JavaSparkContext jsc, String commitTime, String basePath)
|
private static int rollback(JavaSparkContext jsc, String commitTime, String basePath) throws Exception {
|
||||||
throws Exception {
|
|
||||||
HoodieWriteClient client = createHoodieClient(jsc, basePath);
|
HoodieWriteClient client = createHoodieClient(jsc, basePath);
|
||||||
if (client.rollback(commitTime)) {
|
if (client.rollback(commitTime)) {
|
||||||
LOG.info(String.format("The commit \"%s\" rolled back.", commitTime));
|
LOG.info(String.format("The commit \"%s\" rolled back.", commitTime));
|
||||||
@@ -115,9 +108,7 @@ public class SparkMain {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static int rollbackToSavepoint(JavaSparkContext jsc, String savepointTime,
|
private static int rollbackToSavepoint(JavaSparkContext jsc, String savepointTime, String basePath) throws Exception {
|
||||||
String basePath)
|
|
||||||
throws Exception {
|
|
||||||
HoodieWriteClient client = createHoodieClient(jsc, basePath);
|
HoodieWriteClient client = createHoodieClient(jsc, basePath);
|
||||||
if (client.rollbackToSavepoint(savepointTime)) {
|
if (client.rollbackToSavepoint(savepointTime)) {
|
||||||
LOG.info(String.format("The commit \"%s\" rolled back.", savepointTime));
|
LOG.info(String.format("The commit \"%s\" rolled back.", savepointTime));
|
||||||
@@ -128,12 +119,9 @@ public class SparkMain {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath)
|
private static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath) throws Exception {
|
||||||
throws Exception {
|
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withIndexConfig(
|
||||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath)
|
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build();
|
||||||
.withIndexConfig(
|
|
||||||
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
|
|
||||||
.build();
|
|
||||||
return new HoodieWriteClient(jsc, config);
|
return new HoodieWriteClient(jsc, config);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -16,7 +16,6 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.cli.commands;
|
package com.uber.hoodie.cli.commands;
|
||||||
|
|
||||||
|
|
||||||
import com.codahale.metrics.Histogram;
|
import com.codahale.metrics.Histogram;
|
||||||
import com.codahale.metrics.Snapshot;
|
import com.codahale.metrics.Snapshot;
|
||||||
import com.codahale.metrics.UniformReservoir;
|
import com.codahale.metrics.UniformReservoir;
|
||||||
@@ -44,12 +43,15 @@ import org.springframework.stereotype.Component;
|
|||||||
@Component
|
@Component
|
||||||
public class StatsCommand implements CommandMarker {
|
public class StatsCommand implements CommandMarker {
|
||||||
|
|
||||||
|
private static final int MAX_FILES = 1000000;
|
||||||
|
|
||||||
@CliAvailabilityIndicator({"stats wa"})
|
@CliAvailabilityIndicator({"stats wa"})
|
||||||
public boolean isWriteAmpAvailable() {
|
public boolean isWriteAmpAvailable() {
|
||||||
return HoodieCLI.tableMetadata != null;
|
return HoodieCLI.tableMetadata != null;
|
||||||
}
|
}
|
||||||
|
|
||||||
@CliCommand(value = "stats wa", help = "Write Amplification. Ratio of how many records were upserted to how many records were actually written")
|
@CliCommand(value = "stats wa", help = "Write Amplification. Ratio of how many records were upserted to how many "
|
||||||
|
+ "records were actually written")
|
||||||
public String writeAmplificationStats() throws IOException {
|
public String writeAmplificationStats() throws IOException {
|
||||||
long totalRecordsUpserted = 0;
|
long totalRecordsUpserted = 0;
|
||||||
long totalRecordsWritten = 0;
|
long totalRecordsWritten = 0;
|
||||||
@@ -60,18 +62,13 @@ public class StatsCommand implements CommandMarker {
|
|||||||
String[][] rows = new String[new Long(timeline.countInstants()).intValue() + 1][];
|
String[][] rows = new String[new Long(timeline.countInstants()).intValue() + 1][];
|
||||||
int i = 0;
|
int i = 0;
|
||||||
DecimalFormat df = new DecimalFormat("#.00");
|
DecimalFormat df = new DecimalFormat("#.00");
|
||||||
for (HoodieInstant commitTime : timeline.getInstants().collect(
|
for (HoodieInstant commitTime : timeline.getInstants().collect(Collectors.toList())) {
|
||||||
Collectors.toList())) {
|
|
||||||
String waf = "0";
|
String waf = "0";
|
||||||
HoodieCommitMetadata commit = HoodieCommitMetadata
|
HoodieCommitMetadata commit = HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(commitTime).get());
|
||||||
.fromBytes(activeTimeline.getInstantDetails(commitTime).get());
|
|
||||||
if (commit.fetchTotalUpdateRecordsWritten() > 0) {
|
if (commit.fetchTotalUpdateRecordsWritten() > 0) {
|
||||||
waf = df.format(
|
waf = df.format((float) commit.fetchTotalRecordsWritten() / commit.fetchTotalUpdateRecordsWritten());
|
||||||
(float) commit.fetchTotalRecordsWritten() / commit
|
|
||||||
.fetchTotalUpdateRecordsWritten());
|
|
||||||
}
|
}
|
||||||
rows[i++] = new String[]{commitTime.getTimestamp(),
|
rows[i++] = new String[] {commitTime.getTimestamp(), String.valueOf(commit.fetchTotalUpdateRecordsWritten()),
|
||||||
String.valueOf(commit.fetchTotalUpdateRecordsWritten()),
|
|
||||||
String.valueOf(commit.fetchTotalRecordsWritten()), waf};
|
String.valueOf(commit.fetchTotalRecordsWritten()), waf};
|
||||||
totalRecordsUpserted += commit.fetchTotalUpdateRecordsWritten();
|
totalRecordsUpserted += commit.fetchTotalUpdateRecordsWritten();
|
||||||
totalRecordsWritten += commit.fetchTotalRecordsWritten();
|
totalRecordsWritten += commit.fetchTotalRecordsWritten();
|
||||||
@@ -80,43 +77,32 @@ public class StatsCommand implements CommandMarker {
|
|||||||
if (totalRecordsUpserted > 0) {
|
if (totalRecordsUpserted > 0) {
|
||||||
waf = df.format((float) totalRecordsWritten / totalRecordsUpserted);
|
waf = df.format((float) totalRecordsWritten / totalRecordsUpserted);
|
||||||
}
|
}
|
||||||
rows[i] = new String[]{"Total", String.valueOf(totalRecordsUpserted),
|
rows[i] = new String[] {"Total", String.valueOf(totalRecordsUpserted), String.valueOf(totalRecordsWritten), waf};
|
||||||
String.valueOf(totalRecordsWritten), waf};
|
return HoodiePrintHelper
|
||||||
return HoodiePrintHelper.print(
|
.print(new String[] {"CommitTime", "Total Upserted", "Total Written", "Write Amplifiation Factor"},
|
||||||
new String[]{"CommitTime", "Total Upserted", "Total Written",
|
rows);
|
||||||
"Write Amplifiation Factor"}, rows);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private String[] printFileSizeHistogram(String commitTime, Snapshot s) {
|
private String[] printFileSizeHistogram(String commitTime, Snapshot s) {
|
||||||
return new String[]{
|
return new String[] {commitTime, NumericUtils.humanReadableByteCount(s.getMin()),
|
||||||
commitTime,
|
NumericUtils.humanReadableByteCount(s.getValue(0.1)), NumericUtils.humanReadableByteCount(s.getMedian()),
|
||||||
NumericUtils.humanReadableByteCount(s.getMin()),
|
NumericUtils.humanReadableByteCount(s.getMean()), NumericUtils.humanReadableByteCount(s.get95thPercentile()),
|
||||||
NumericUtils.humanReadableByteCount(s.getValue(0.1)),
|
NumericUtils.humanReadableByteCount(s.getMax()), String.valueOf(s.size()),
|
||||||
NumericUtils.humanReadableByteCount(s.getMedian()),
|
NumericUtils.humanReadableByteCount(s.getStdDev())};
|
||||||
NumericUtils.humanReadableByteCount(s.getMean()),
|
|
||||||
NumericUtils.humanReadableByteCount(s.get95thPercentile()),
|
|
||||||
NumericUtils.humanReadableByteCount(s.getMax()),
|
|
||||||
String.valueOf(s.size()),
|
|
||||||
NumericUtils.humanReadableByteCount(s.getStdDev())
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@CliCommand(value = "stats filesizes", help = "File Sizes. Display summary stats on sizes of files")
|
@CliCommand(value = "stats filesizes", help = "File Sizes. Display summary stats on sizes of files")
|
||||||
public String fileSizeStats(
|
public String fileSizeStats(@CliOption(key = {
|
||||||
@CliOption(key = {
|
"partitionPath"}, help = "regex to select files, eg: 2016/08/02", unspecifiedDefaultValue = "*/*/*") final
|
||||||
"partitionPath"}, help = "regex to select files, eg: 2016/08/02", unspecifiedDefaultValue = "*/*/*")
|
String globRegex) throws IOException {
|
||||||
final String globRegex) throws IOException {
|
|
||||||
|
|
||||||
FileSystem fs = HoodieCLI.fs;
|
FileSystem fs = HoodieCLI.fs;
|
||||||
String globPath = String.format("%s/%s/*",
|
String globPath = String.format("%s/%s/*", HoodieCLI.tableMetadata.getBasePath(), globRegex);
|
||||||
HoodieCLI.tableMetadata.getBasePath(),
|
|
||||||
globRegex);
|
|
||||||
FileStatus[] statuses = fs.globStatus(new Path(globPath));
|
FileStatus[] statuses = fs.globStatus(new Path(globPath));
|
||||||
|
|
||||||
// max, min, #small files < 10MB, 50th, avg, 95th
|
// max, min, #small files < 10MB, 50th, avg, 95th
|
||||||
final int MAX_FILES = 1000000;
|
|
||||||
Histogram globalHistogram = new Histogram(new UniformReservoir(MAX_FILES));
|
Histogram globalHistogram = new Histogram(new UniformReservoir(MAX_FILES));
|
||||||
HashMap<String, Histogram> commitHistoMap = new HashMap<String, Histogram>();
|
HashMap<String, Histogram> commitHistoMap = new HashMap<String, Histogram>();
|
||||||
for (FileStatus fileStatus : statuses) {
|
for (FileStatus fileStatus : statuses) {
|
||||||
@@ -138,8 +124,8 @@ public class StatsCommand implements CommandMarker {
|
|||||||
Snapshot s = globalHistogram.getSnapshot();
|
Snapshot s = globalHistogram.getSnapshot();
|
||||||
rows[ind++] = printFileSizeHistogram("ALL", s);
|
rows[ind++] = printFileSizeHistogram("ALL", s);
|
||||||
|
|
||||||
return HoodiePrintHelper.print(
|
return HoodiePrintHelper
|
||||||
new String[]{"CommitTime", "Min", "10th", "50th", "avg", "95th", "Max", "NumFiles",
|
.print(new String[] {"CommitTime", "Min", "10th", "50th", "avg", "95th", "Max", "NumFiles", "StdDev"},
|
||||||
"StdDev"}, rows);
|
rows);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -25,9 +25,7 @@ import org.springframework.stereotype.Component;
|
|||||||
public class UtilsCommand implements CommandMarker {
|
public class UtilsCommand implements CommandMarker {
|
||||||
|
|
||||||
@CliCommand(value = "utils loadClass", help = "Load a class")
|
@CliCommand(value = "utils loadClass", help = "Load a class")
|
||||||
public String loadClass(
|
public String loadClass(@CliOption(key = {"class"}, help = "Check mode") final String clazz) throws Exception {
|
||||||
@CliOption(key = {"class"}, help = "Check mode") final String clazz
|
|
||||||
) throws Exception {
|
|
||||||
Class klass = Class.forName(clazz);
|
Class klass = Class.forName(clazz);
|
||||||
return klass.getProtectionDomain().getCodeSource().getLocation().toExternalForm();
|
return klass.getProtectionDomain().getCodeSource().getLocation().toExternalForm();
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -25,15 +25,12 @@ import java.util.List;
|
|||||||
|
|
||||||
public class CommitUtil {
|
public class CommitUtil {
|
||||||
|
|
||||||
public static long countNewRecords(HoodieTableMetaClient target, List<String> commitsToCatchup)
|
public static long countNewRecords(HoodieTableMetaClient target, List<String> commitsToCatchup) throws IOException {
|
||||||
throws IOException {
|
|
||||||
long totalNew = 0;
|
long totalNew = 0;
|
||||||
HoodieTimeline timeline = target.getActiveTimeline().reload().getCommitTimeline()
|
HoodieTimeline timeline = target.getActiveTimeline().reload().getCommitTimeline().filterCompletedInstants();
|
||||||
.filterCompletedInstants();
|
|
||||||
for (String commit : commitsToCatchup) {
|
for (String commit : commitsToCatchup) {
|
||||||
HoodieCommitMetadata c = HoodieCommitMetadata.fromBytes(timeline
|
HoodieCommitMetadata c = HoodieCommitMetadata.fromBytes(
|
||||||
.getInstantDetails(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commit))
|
timeline.getInstantDetails(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commit)).get());
|
||||||
.get());
|
|
||||||
totalNew += c.fetchTotalRecordsWritten() - c.fetchTotalUpdateRecordsWritten();
|
totalNew += c.fetchTotalRecordsWritten() - c.fetchTotalUpdateRecordsWritten();
|
||||||
}
|
}
|
||||||
return totalNew;
|
return totalNew;
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ import org.joda.time.DateTime;
|
|||||||
|
|
||||||
public class HiveUtil {
|
public class HiveUtil {
|
||||||
|
|
||||||
private static String driverName = "org.apache.hive.jdbc.HiveDriver";
|
private static final String driverName = "org.apache.hive.jdbc.HiveDriver";
|
||||||
|
|
||||||
static {
|
static {
|
||||||
try {
|
try {
|
||||||
@@ -39,8 +39,7 @@ public class HiveUtil {
|
|||||||
|
|
||||||
private static Connection connection;
|
private static Connection connection;
|
||||||
|
|
||||||
private static Connection getConnection(String jdbcUrl, String user, String pass)
|
private static Connection getConnection(String jdbcUrl, String user, String pass) throws SQLException {
|
||||||
throws SQLException {
|
|
||||||
DataSource ds = getDatasource(jdbcUrl, user, pass);
|
DataSource ds = getDatasource(jdbcUrl, user, pass);
|
||||||
return ds.getConnection();
|
return ds.getConnection();
|
||||||
}
|
}
|
||||||
@@ -54,8 +53,8 @@ public class HiveUtil {
|
|||||||
return ds;
|
return ds;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String dbName,
|
public static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String dbName, String user, String pass)
|
||||||
String user, String pass) throws SQLException {
|
throws SQLException {
|
||||||
Connection conn = HiveUtil.getConnection(jdbcUrl, user, pass);
|
Connection conn = HiveUtil.getConnection(jdbcUrl, user, pass);
|
||||||
ResultSet rs = null;
|
ResultSet rs = null;
|
||||||
Statement stmt = conn.createStatement();
|
Statement stmt = conn.createStatement();
|
||||||
@@ -64,15 +63,13 @@ public class HiveUtil {
|
|||||||
stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat");
|
stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat");
|
||||||
stmt.execute("set hive.stats.autogather=false");
|
stmt.execute("set hive.stats.autogather=false");
|
||||||
rs = stmt.executeQuery(
|
rs = stmt.executeQuery(
|
||||||
"select count(`_hoodie_commit_time`) as cnt from " + dbName + "." + source
|
"select count(`_hoodie_commit_time`) as cnt from " + dbName + "."
|
||||||
.getTableConfig()
|
+ source.getTableConfig().getTableName());
|
||||||
.getTableName());
|
|
||||||
long count = -1;
|
long count = -1;
|
||||||
if (rs.next()) {
|
if (rs.next()) {
|
||||||
count = rs.getLong("cnt");
|
count = rs.getLong("cnt");
|
||||||
}
|
}
|
||||||
System.out
|
System.out.println("Total records in " + source.getTableConfig().getTableName() + " is " + count);
|
||||||
.println("Total records in " + source.getTableConfig().getTableName() + " is " + count);
|
|
||||||
return count;
|
return count;
|
||||||
} finally {
|
} finally {
|
||||||
if (rs != null) {
|
if (rs != null) {
|
||||||
@@ -84,22 +81,19 @@ public class HiveUtil {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String srcDb,
|
public static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String srcDb, int partitions,
|
||||||
int partitions, String user, String pass) throws SQLException {
|
String user, String pass) throws SQLException {
|
||||||
DateTime dateTime = DateTime.now();
|
DateTime dateTime = DateTime.now();
|
||||||
String endDateStr =
|
String endDateStr = dateTime.getYear() + "-" + String.format("%02d", dateTime.getMonthOfYear()) + "-"
|
||||||
dateTime.getYear() + "-" + String.format("%02d", dateTime.getMonthOfYear()) + "-" +
|
+ String.format("%02d", dateTime.getDayOfMonth());
|
||||||
String.format("%02d", dateTime.getDayOfMonth());
|
|
||||||
dateTime = dateTime.minusDays(partitions);
|
dateTime = dateTime.minusDays(partitions);
|
||||||
String startDateStr =
|
String startDateStr = dateTime.getYear() + "-" + String.format("%02d", dateTime.getMonthOfYear()) + "-"
|
||||||
dateTime.getYear() + "-" + String.format("%02d", dateTime.getMonthOfYear()) + "-" +
|
+ String.format("%02d", dateTime.getDayOfMonth());
|
||||||
String.format("%02d", dateTime.getDayOfMonth());
|
|
||||||
System.out.println("Start date " + startDateStr + " and end date " + endDateStr);
|
System.out.println("Start date " + startDateStr + " and end date " + endDateStr);
|
||||||
return countRecords(jdbcUrl, source, srcDb, startDateStr, endDateStr, user, pass);
|
return countRecords(jdbcUrl, source, srcDb, startDateStr, endDateStr, user, pass);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String srcDb,
|
private static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String srcDb, String startDateStr,
|
||||||
String startDateStr,
|
|
||||||
String endDateStr, String user, String pass) throws SQLException {
|
String endDateStr, String user, String pass) throws SQLException {
|
||||||
Connection conn = HiveUtil.getConnection(jdbcUrl, user, pass);
|
Connection conn = HiveUtil.getConnection(jdbcUrl, user, pass);
|
||||||
ResultSet rs = null;
|
ResultSet rs = null;
|
||||||
@@ -109,9 +103,8 @@ public class HiveUtil {
|
|||||||
stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat");
|
stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat");
|
||||||
stmt.execute("set hive.stats.autogather=false");
|
stmt.execute("set hive.stats.autogather=false");
|
||||||
rs = stmt.executeQuery(
|
rs = stmt.executeQuery(
|
||||||
"select count(`_hoodie_commit_time`) as cnt from " + srcDb + "." + source.getTableConfig()
|
"select count(`_hoodie_commit_time`) as cnt from " + srcDb + "." + source.getTableConfig().getTableName()
|
||||||
.getTableName() + " where datestr>'" + startDateStr + "' and datestr<='"
|
+ " where datestr>'" + startDateStr + "' and datestr<='" + endDateStr + "'");
|
||||||
+ endDateStr + "'");
|
|
||||||
if (rs.next()) {
|
if (rs.next()) {
|
||||||
return rs.getLong("cnt");
|
return rs.getLong("cnt");
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ import java.util.logging.Logger;
|
|||||||
|
|
||||||
public class InputStreamConsumer extends Thread {
|
public class InputStreamConsumer extends Thread {
|
||||||
|
|
||||||
protected final static Logger LOG = Logger.getLogger(InputStreamConsumer.class.getName());
|
protected static final Logger LOG = Logger.getLogger(InputStreamConsumer.class.getName());
|
||||||
private InputStream is;
|
private InputStream is;
|
||||||
|
|
||||||
public InputStreamConsumer(InputStream is) {
|
public InputStreamConsumer(InputStream is) {
|
||||||
|
|||||||
@@ -35,12 +35,9 @@ public class SparkUtil {
|
|||||||
* TODO: Need to fix a bunch of hardcoded stuff here eg: history server, spark distro
|
* TODO: Need to fix a bunch of hardcoded stuff here eg: history server, spark distro
|
||||||
*/
|
*/
|
||||||
public static SparkLauncher initLauncher(String propertiesFile) throws URISyntaxException {
|
public static SparkLauncher initLauncher(String propertiesFile) throws URISyntaxException {
|
||||||
String currentJar = new File(
|
String currentJar = new File(SparkUtil.class.getProtectionDomain().getCodeSource().getLocation().toURI().getPath())
|
||||||
SparkUtil.class.getProtectionDomain().getCodeSource().getLocation().toURI().getPath())
|
|
||||||
.getAbsolutePath();
|
.getAbsolutePath();
|
||||||
SparkLauncher sparkLauncher =
|
SparkLauncher sparkLauncher = new SparkLauncher().setAppResource(currentJar).setMainClass(SparkMain.class.getName())
|
||||||
new SparkLauncher().setAppResource(currentJar)
|
|
||||||
.setMainClass(SparkMain.class.getName())
|
|
||||||
.setPropertiesFile(propertiesFile);
|
.setPropertiesFile(propertiesFile);
|
||||||
File libDirectory = new File(new File(currentJar).getParent(), "lib");
|
File libDirectory = new File(new File(currentJar).getParent(), "lib");
|
||||||
for (String library : libDirectory.list()) {
|
for (String library : libDirectory.list()) {
|
||||||
@@ -60,8 +57,7 @@ public class SparkUtil {
|
|||||||
// Configure hadoop conf
|
// Configure hadoop conf
|
||||||
sparkConf.set("spark.hadoop.mapred.output.compress", "true");
|
sparkConf.set("spark.hadoop.mapred.output.compress", "true");
|
||||||
sparkConf.set("spark.hadoop.mapred.output.compression.codec", "true");
|
sparkConf.set("spark.hadoop.mapred.output.compression.codec", "true");
|
||||||
sparkConf.set("spark.hadoop.mapred.output.compression.codec",
|
sparkConf.set("spark.hadoop.mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
|
||||||
"org.apache.hadoop.io.compress.GzipCodec");
|
|
||||||
sparkConf.set("spark.hadoop.mapred.output.compression.type", "BLOCK");
|
sparkConf.set("spark.hadoop.mapred.output.compression.type", "BLOCK");
|
||||||
|
|
||||||
sparkConf = HoodieWriteClient.registerClasses(sparkConf);
|
sparkConf = HoodieWriteClient.registerClasses(sparkConf);
|
||||||
|
|||||||
@@ -50,17 +50,17 @@ import scala.Tuple2;
|
|||||||
*/
|
*/
|
||||||
public class HoodieReadClient<T extends HoodieRecordPayload> implements Serializable {
|
public class HoodieReadClient<T extends HoodieRecordPayload> implements Serializable {
|
||||||
|
|
||||||
private static Logger logger = LogManager.getLogger(HoodieReadClient.class);
|
private static final Logger logger = LogManager.getLogger(HoodieReadClient.class);
|
||||||
|
|
||||||
private transient final JavaSparkContext jsc;
|
private final transient JavaSparkContext jsc;
|
||||||
|
|
||||||
private transient final FileSystem fs;
|
private final transient FileSystem fs;
|
||||||
/**
|
/**
|
||||||
* TODO: We need to persist the index type into hoodie.properties and be able to access the index
|
* TODO: We need to persist the index type into hoodie.properties and be able to access the index
|
||||||
* just with a simple basepath pointing to the dataset. Until, then just always assume a
|
* just with a simple basepath pointing to the dataset. Until, then just always assume a
|
||||||
* BloomIndex
|
* BloomIndex
|
||||||
*/
|
*/
|
||||||
private transient final HoodieIndex<T> index;
|
private final transient HoodieIndex<T> index;
|
||||||
private final HoodieTimeline commitTimeline;
|
private final HoodieTimeline commitTimeline;
|
||||||
private HoodieTable hoodieTable;
|
private HoodieTable hoodieTable;
|
||||||
private transient Optional<SQLContext> sqlContextOpt;
|
private transient Optional<SQLContext> sqlContextOpt;
|
||||||
@@ -69,8 +69,7 @@ public class HoodieReadClient<T extends HoodieRecordPayload> implements Serializ
|
|||||||
* @param basePath path to Hoodie dataset
|
* @param basePath path to Hoodie dataset
|
||||||
*/
|
*/
|
||||||
public HoodieReadClient(JavaSparkContext jsc, String basePath) {
|
public HoodieReadClient(JavaSparkContext jsc, String basePath) {
|
||||||
this(jsc, HoodieWriteConfig.newBuilder()
|
this(jsc, HoodieWriteConfig.newBuilder().withPath(basePath)
|
||||||
.withPath(basePath)
|
|
||||||
// by default we use HoodieBloomIndex
|
// by default we use HoodieBloomIndex
|
||||||
.withIndexConfig(
|
.withIndexConfig(
|
||||||
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
|
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
|
||||||
@@ -78,7 +77,6 @@ public class HoodieReadClient<T extends HoodieRecordPayload> implements Serializ
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
|
||||||
* @param jsc
|
* @param jsc
|
||||||
* @param basePath
|
* @param basePath
|
||||||
* @param sqlContext
|
* @param sqlContext
|
||||||
@@ -96,8 +94,8 @@ public class HoodieReadClient<T extends HoodieRecordPayload> implements Serializ
|
|||||||
this.jsc = jsc;
|
this.jsc = jsc;
|
||||||
this.fs = FSUtils.getFs(basePath, jsc.hadoopConfiguration());
|
this.fs = FSUtils.getFs(basePath, jsc.hadoopConfiguration());
|
||||||
// Create a Hoodie table which encapsulates the commits and files visible
|
// Create a Hoodie table which encapsulates the commits and files visible
|
||||||
this.hoodieTable = HoodieTable.getHoodieTable(
|
this.hoodieTable = HoodieTable
|
||||||
new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath, true),
|
.getHoodieTable(new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath, true),
|
||||||
clientConfig);
|
clientConfig);
|
||||||
this.commitTimeline = hoodieTable.getCommitTimeline().filterCompletedInstants();
|
this.commitTimeline = hoodieTable.getCommitTimeline().filterCompletedInstants();
|
||||||
this.index = HoodieIndex.createIndex(clientConfig, jsc);
|
this.index = HoodieIndex.createIndex(clientConfig, jsc);
|
||||||
@@ -126,33 +124,27 @@ public class HoodieReadClient<T extends HoodieRecordPayload> implements Serializ
|
|||||||
*
|
*
|
||||||
* @return a dataframe
|
* @return a dataframe
|
||||||
*/
|
*/
|
||||||
public Dataset<Row> read(JavaRDD<HoodieKey> hoodieKeys, int parallelism)
|
public Dataset<Row> read(JavaRDD<HoodieKey> hoodieKeys, int parallelism) throws Exception {
|
||||||
throws Exception {
|
|
||||||
|
|
||||||
assertSqlContext();
|
assertSqlContext();
|
||||||
JavaPairRDD<HoodieKey, Optional<String>> keyToFileRDD =
|
JavaPairRDD<HoodieKey, Optional<String>> keyToFileRDD = index
|
||||||
index.fetchRecordLocation(hoodieKeys, hoodieTable);
|
.fetchRecordLocation(hoodieKeys, hoodieTable);
|
||||||
List<String> paths = keyToFileRDD
|
List<String> paths = keyToFileRDD.filter(keyFileTuple -> keyFileTuple._2().isPresent())
|
||||||
.filter(keyFileTuple -> keyFileTuple._2().isPresent())
|
.map(keyFileTuple -> keyFileTuple._2().get()).collect();
|
||||||
.map(keyFileTuple -> keyFileTuple._2().get())
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
// record locations might be same for multiple keys, so need a unique list
|
// record locations might be same for multiple keys, so need a unique list
|
||||||
Set<String> uniquePaths = new HashSet<>(paths);
|
Set<String> uniquePaths = new HashSet<>(paths);
|
||||||
Dataset<Row> originalDF = sqlContextOpt.get().read()
|
Dataset<Row> originalDF = sqlContextOpt.get().read()
|
||||||
.parquet(uniquePaths.toArray(new String[uniquePaths.size()]));
|
.parquet(uniquePaths.toArray(new String[uniquePaths.size()]));
|
||||||
StructType schema = originalDF.schema();
|
StructType schema = originalDF.schema();
|
||||||
JavaPairRDD<HoodieKey, Row> keyRowRDD = originalDF.javaRDD()
|
JavaPairRDD<HoodieKey, Row> keyRowRDD = originalDF.javaRDD().mapToPair(row -> {
|
||||||
.mapToPair(row -> {
|
HoodieKey key = new HoodieKey(row.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD),
|
||||||
HoodieKey key = new HoodieKey(
|
|
||||||
row.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD),
|
|
||||||
row.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD));
|
row.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD));
|
||||||
return new Tuple2<>(key, row);
|
return new Tuple2<>(key, row);
|
||||||
});
|
});
|
||||||
|
|
||||||
// Now, we need to further filter out, for only rows that match the supplied hoodie keys
|
// Now, we need to further filter out, for only rows that match the supplied hoodie keys
|
||||||
JavaRDD<Row> rowRDD = keyRowRDD.join(keyToFileRDD, parallelism)
|
JavaRDD<Row> rowRDD = keyRowRDD.join(keyToFileRDD, parallelism).map(tuple -> tuple._2()._1());
|
||||||
.map(tuple -> tuple._2()._1());
|
|
||||||
|
|
||||||
return sqlContextOpt.get().createDataFrame(rowRDD, schema);
|
return sqlContextOpt.get().createDataFrame(rowRDD, schema);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -81,18 +81,18 @@ import scala.Tuple2;
|
|||||||
/**
|
/**
|
||||||
* Hoodie Write Client helps you build datasets on HDFS [insert()] and then perform efficient
|
* Hoodie Write Client helps you build datasets on HDFS [insert()] and then perform efficient
|
||||||
* mutations on a HDFS dataset [upsert()]
|
* mutations on a HDFS dataset [upsert()]
|
||||||
*
|
* <p>
|
||||||
* Note that, at any given time, there can only be one Spark job performing these operations on a
|
* Note that, at any given time, there can only be one Spark job performing these operations on a
|
||||||
* Hoodie dataset.
|
* Hoodie dataset.
|
||||||
*/
|
*/
|
||||||
public class HoodieWriteClient<T extends HoodieRecordPayload> implements Serializable {
|
public class HoodieWriteClient<T extends HoodieRecordPayload> implements Serializable {
|
||||||
|
|
||||||
private static Logger logger = LogManager.getLogger(HoodieWriteClient.class);
|
private static Logger logger = LogManager.getLogger(HoodieWriteClient.class);
|
||||||
private transient final FileSystem fs;
|
private final transient FileSystem fs;
|
||||||
private transient final JavaSparkContext jsc;
|
private final transient JavaSparkContext jsc;
|
||||||
private final HoodieWriteConfig config;
|
private final HoodieWriteConfig config;
|
||||||
private transient final HoodieMetrics metrics;
|
private final transient HoodieMetrics metrics;
|
||||||
private transient final HoodieIndex<T> index;
|
private final transient HoodieIndex<T> index;
|
||||||
private transient Timer.Context writeContext = null;
|
private transient Timer.Context writeContext = null;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -100,8 +100,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
* @param clientConfig
|
* @param clientConfig
|
||||||
* @throws Exception
|
* @throws Exception
|
||||||
*/
|
*/
|
||||||
public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig)
|
public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig) throws Exception {
|
||||||
throws Exception {
|
|
||||||
this(jsc, clientConfig, false);
|
this(jsc, clientConfig, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -129,6 +128,12 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static SparkConf registerClasses(SparkConf conf) {
|
||||||
|
conf.registerKryoClasses(
|
||||||
|
new Class[] {HoodieWriteConfig.class, HoodieRecord.class, HoodieKey.class});
|
||||||
|
return conf;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Filter out HoodieRecords that already exist in the output folder. This is useful in
|
* Filter out HoodieRecords that already exist in the output folder. This is useful in
|
||||||
* deduplication.
|
* deduplication.
|
||||||
@@ -139,8 +144,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
public JavaRDD<HoodieRecord<T>> filterExists(JavaRDD<HoodieRecord<T>> hoodieRecords) {
|
public JavaRDD<HoodieRecord<T>> filterExists(JavaRDD<HoodieRecord<T>> hoodieRecords) {
|
||||||
// Create a Hoodie table which encapsulates the commits and files visible
|
// Create a Hoodie table which encapsulates the commits and files visible
|
||||||
HoodieTable<T> table = HoodieTable.getHoodieTable(
|
HoodieTable<T> table = HoodieTable.getHoodieTable(
|
||||||
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true),
|
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config);
|
||||||
config);
|
|
||||||
|
|
||||||
JavaRDD<HoodieRecord<T>> recordsWithLocation = index.tagLocation(hoodieRecords, table);
|
JavaRDD<HoodieRecord<T>> recordsWithLocation = index.tagLocation(hoodieRecords, table);
|
||||||
return recordsWithLocation.filter(v1 -> !v1.isCurrentLocationKnown());
|
return recordsWithLocation.filter(v1 -> !v1.isCurrentLocationKnown());
|
||||||
@@ -153,9 +157,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
HoodieTable<T> table = getTableAndInitCtx();
|
HoodieTable<T> table = getTableAndInitCtx();
|
||||||
try {
|
try {
|
||||||
// De-dupe/merge if needed
|
// De-dupe/merge if needed
|
||||||
JavaRDD<HoodieRecord<T>> dedupedRecords =
|
JavaRDD<HoodieRecord<T>> dedupedRecords = combineOnCondition(
|
||||||
combineOnCondition(config.shouldCombineBeforeUpsert(), records,
|
config.shouldCombineBeforeUpsert(), records, config.getUpsertShuffleParallelism());
|
||||||
config.getUpsertShuffleParallelism());
|
|
||||||
|
|
||||||
// perform index lookup to get existing location of records
|
// perform index lookup to get existing location of records
|
||||||
JavaRDD<HoodieRecord<T>> taggedRecords = index.tagLocation(dedupedRecords, table);
|
JavaRDD<HoodieRecord<T>> taggedRecords = index.tagLocation(dedupedRecords, table);
|
||||||
@@ -170,7 +173,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Upserts the given prepared records into the Hoodie table, at the supplied commitTime.
|
* Upserts the given prepared records into the Hoodie table, at the supplied commitTime.
|
||||||
*
|
* <p>
|
||||||
* This implementation requires that the input records are already tagged, and de-duped if
|
* This implementation requires that the input records are already tagged, and de-duped if
|
||||||
* needed.
|
* needed.
|
||||||
*
|
*
|
||||||
@@ -187,15 +190,15 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
if (e instanceof HoodieUpsertException) {
|
if (e instanceof HoodieUpsertException) {
|
||||||
throw (HoodieUpsertException) e;
|
throw (HoodieUpsertException) e;
|
||||||
}
|
}
|
||||||
throw new HoodieUpsertException("Failed to upsert prepared records for commit time " +
|
throw new HoodieUpsertException(
|
||||||
commitTime, e);
|
"Failed to upsert prepared records for commit time " + commitTime, e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Inserts the given HoodieRecords, into the table. This API is intended to be used for normal
|
* Inserts the given HoodieRecords, into the table. This API is intended to be used for normal
|
||||||
* writes.
|
* writes.
|
||||||
*
|
* <p>
|
||||||
* This implementation skips the index check and is able to leverage benefits such as small file
|
* This implementation skips the index check and is able to leverage benefits such as small file
|
||||||
* handling/blocking alignment, as with upsert(), by profiling the workload
|
* handling/blocking alignment, as with upsert(), by profiling the workload
|
||||||
*
|
*
|
||||||
@@ -207,9 +210,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
HoodieTable<T> table = getTableAndInitCtx();
|
HoodieTable<T> table = getTableAndInitCtx();
|
||||||
try {
|
try {
|
||||||
// De-dupe/merge if needed
|
// De-dupe/merge if needed
|
||||||
JavaRDD<HoodieRecord<T>> dedupedRecords =
|
JavaRDD<HoodieRecord<T>> dedupedRecords = combineOnCondition(
|
||||||
combineOnCondition(config.shouldCombineBeforeInsert(), records,
|
config.shouldCombineBeforeInsert(), records, config.getInsertShuffleParallelism());
|
||||||
config.getInsertShuffleParallelism());
|
|
||||||
|
|
||||||
return upsertRecordsInternal(dedupedRecords, commitTime, table, false);
|
return upsertRecordsInternal(dedupedRecords, commitTime, table, false);
|
||||||
} catch (Throwable e) {
|
} catch (Throwable e) {
|
||||||
@@ -222,7 +224,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Inserts the given prepared records into the Hoodie table, at the supplied commitTime.
|
* Inserts the given prepared records into the Hoodie table, at the supplied commitTime.
|
||||||
*
|
* <p>
|
||||||
* This implementation skips the index check, skips de-duping and is able to leverage benefits
|
* This implementation skips the index check, skips de-duping and is able to leverage benefits
|
||||||
* such as small file handling/blocking alignment, as with insert(), by profiling the workload.
|
* such as small file handling/blocking alignment, as with insert(), by profiling the workload.
|
||||||
* The prepared HoodieRecords should be de-duped if needed.
|
* The prepared HoodieRecords should be de-duped if needed.
|
||||||
@@ -240,8 +242,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
if (e instanceof HoodieInsertException) {
|
if (e instanceof HoodieInsertException) {
|
||||||
throw e;
|
throw e;
|
||||||
}
|
}
|
||||||
throw new HoodieInsertException("Failed to insert prepared records for commit time " +
|
throw new HoodieInsertException(
|
||||||
commitTime, e);
|
"Failed to insert prepared records for commit time " + commitTime, e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -249,7 +251,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
* Loads the given HoodieRecords, as inserts into the table. This is suitable for doing big bulk
|
* Loads the given HoodieRecords, as inserts into the table. This is suitable for doing big bulk
|
||||||
* loads into a Hoodie table for the very first time (e.g: converting an existing dataset to
|
* loads into a Hoodie table for the very first time (e.g: converting an existing dataset to
|
||||||
* Hoodie).
|
* Hoodie).
|
||||||
*
|
* <p>
|
||||||
* This implementation uses sortBy (which does range partitioning based on reservoir sampling) and
|
* This implementation uses sortBy (which does range partitioning based on reservoir sampling) and
|
||||||
* attempts to control the numbers of files with less memory compared to the {@link
|
* attempts to control the numbers of files with less memory compared to the {@link
|
||||||
* HoodieWriteClient#insert(JavaRDD, String)}
|
* HoodieWriteClient#insert(JavaRDD, String)}
|
||||||
@@ -267,7 +269,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
* Loads the given HoodieRecords, as inserts into the table. This is suitable for doing big bulk
|
* Loads the given HoodieRecords, as inserts into the table. This is suitable for doing big bulk
|
||||||
* loads into a Hoodie table for the very first time (e.g: converting an existing dataset to
|
* loads into a Hoodie table for the very first time (e.g: converting an existing dataset to
|
||||||
* Hoodie).
|
* Hoodie).
|
||||||
*
|
* <p>
|
||||||
* This implementation uses sortBy (which does range partitioning based on reservoir sampling) and
|
* This implementation uses sortBy (which does range partitioning based on reservoir sampling) and
|
||||||
* attempts to control the numbers of files with less memory compared to the {@link
|
* attempts to control the numbers of files with less memory compared to the {@link
|
||||||
* HoodieWriteClient#insert(JavaRDD, String)}. Optionally it allows users to specify their own
|
* HoodieWriteClient#insert(JavaRDD, String)}. Optionally it allows users to specify their own
|
||||||
@@ -280,23 +282,20 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
* before they are inserted into hoodie.
|
* before they are inserted into hoodie.
|
||||||
* @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts
|
* @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts
|
||||||
*/
|
*/
|
||||||
public JavaRDD<WriteStatus> bulkInsert(JavaRDD<HoodieRecord<T>> records,
|
public JavaRDD<WriteStatus> bulkInsert(JavaRDD<HoodieRecord<T>> records, final String commitTime,
|
||||||
final String commitTime,
|
|
||||||
Option<UserDefinedBulkInsertPartitioner> bulkInsertPartitioner) {
|
Option<UserDefinedBulkInsertPartitioner> bulkInsertPartitioner) {
|
||||||
HoodieTable<T> table = getTableAndInitCtx();
|
HoodieTable<T> table = getTableAndInitCtx();
|
||||||
try {
|
try {
|
||||||
// De-dupe/merge if needed
|
// De-dupe/merge if needed
|
||||||
JavaRDD<HoodieRecord<T>> dedupedRecords =
|
JavaRDD<HoodieRecord<T>> dedupedRecords = combineOnCondition(
|
||||||
combineOnCondition(config.shouldCombineBeforeInsert(), records,
|
config.shouldCombineBeforeInsert(), records, config.getInsertShuffleParallelism());
|
||||||
config.getInsertShuffleParallelism());
|
|
||||||
|
|
||||||
return bulkInsertInternal(dedupedRecords, commitTime, table, bulkInsertPartitioner);
|
return bulkInsertInternal(dedupedRecords, commitTime, table, bulkInsertPartitioner);
|
||||||
} catch (Throwable e) {
|
} catch (Throwable e) {
|
||||||
if (e instanceof HoodieInsertException) {
|
if (e instanceof HoodieInsertException) {
|
||||||
throw e;
|
throw e;
|
||||||
}
|
}
|
||||||
throw new HoodieInsertException("Failed to bulk insert for commit time " + commitTime,
|
throw new HoodieInsertException("Failed to bulk insert for commit time " + commitTime, e);
|
||||||
e);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -304,7 +303,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
* Loads the given HoodieRecords, as inserts into the table. This is suitable for doing big bulk
|
* Loads the given HoodieRecords, as inserts into the table. This is suitable for doing big bulk
|
||||||
* loads into a Hoodie table for the very first time (e.g: converting an existing dataset to
|
* loads into a Hoodie table for the very first time (e.g: converting an existing dataset to
|
||||||
* Hoodie). The input records should contain no duplicates if needed.
|
* Hoodie). The input records should contain no duplicates if needed.
|
||||||
*
|
* <p>
|
||||||
* This implementation uses sortBy (which does range partitioning based on reservoir sampling) and
|
* This implementation uses sortBy (which does range partitioning based on reservoir sampling) and
|
||||||
* attempts to control the numbers of files with less memory compared to the {@link
|
* attempts to control the numbers of files with less memory compared to the {@link
|
||||||
* HoodieWriteClient#insert(JavaRDD, String)}. Optionally it allows users to specify their own
|
* HoodieWriteClient#insert(JavaRDD, String)}. Optionally it allows users to specify their own
|
||||||
@@ -318,8 +317,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
* @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts
|
* @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts
|
||||||
*/
|
*/
|
||||||
public JavaRDD<WriteStatus> bulkInsertPreppedRecords(JavaRDD<HoodieRecord<T>> preppedRecords,
|
public JavaRDD<WriteStatus> bulkInsertPreppedRecords(JavaRDD<HoodieRecord<T>> preppedRecords,
|
||||||
final String commitTime,
|
final String commitTime, Option<UserDefinedBulkInsertPartitioner> bulkInsertPartitioner) {
|
||||||
Option<UserDefinedBulkInsertPartitioner> bulkInsertPartitioner) {
|
|
||||||
HoodieTable<T> table = getTableAndInitCtx();
|
HoodieTable<T> table = getTableAndInitCtx();
|
||||||
try {
|
try {
|
||||||
return bulkInsertInternal(preppedRecords, commitTime, table, bulkInsertPartitioner);
|
return bulkInsertInternal(preppedRecords, commitTime, table, bulkInsertPartitioner);
|
||||||
@@ -327,35 +325,29 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
if (e instanceof HoodieInsertException) {
|
if (e instanceof HoodieInsertException) {
|
||||||
throw e;
|
throw e;
|
||||||
}
|
}
|
||||||
throw new HoodieInsertException("Failed to bulk insert prepared records for commit time " +
|
throw new HoodieInsertException(
|
||||||
commitTime, e);
|
"Failed to bulk insert prepared records for commit time " + commitTime, e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private JavaRDD<WriteStatus> bulkInsertInternal(
|
private JavaRDD<WriteStatus> bulkInsertInternal(JavaRDD<HoodieRecord<T>> dedupedRecords,
|
||||||
JavaRDD<HoodieRecord<T>> dedupedRecords,
|
String commitTime, HoodieTable<T> table,
|
||||||
String commitTime,
|
|
||||||
HoodieTable<T> table,
|
|
||||||
Option<UserDefinedBulkInsertPartitioner> bulkInsertPartitioner) {
|
Option<UserDefinedBulkInsertPartitioner> bulkInsertPartitioner) {
|
||||||
final JavaRDD<HoodieRecord<T>> repartitionedRecords;
|
final JavaRDD<HoodieRecord<T>> repartitionedRecords;
|
||||||
if (bulkInsertPartitioner.isDefined()) {
|
if (bulkInsertPartitioner.isDefined()) {
|
||||||
repartitionedRecords =
|
repartitionedRecords = bulkInsertPartitioner.get()
|
||||||
bulkInsertPartitioner.get().repartitionRecords(dedupedRecords,
|
.repartitionRecords(dedupedRecords, config.getBulkInsertShuffleParallelism());
|
||||||
config.getBulkInsertShuffleParallelism());
|
|
||||||
} else {
|
} else {
|
||||||
// Now, sort the records and line them up nicely for loading.
|
// Now, sort the records and line them up nicely for loading.
|
||||||
repartitionedRecords = dedupedRecords
|
repartitionedRecords = dedupedRecords.sortBy(record -> {
|
||||||
.sortBy(record -> {
|
|
||||||
// Let's use "partitionPath + key" as the sort key. Spark will ensure
|
// Let's use "partitionPath + key" as the sort key. Spark will ensure
|
||||||
// the records split evenly across RDD partitions, such that small partitions fit
|
// the records split evenly across RDD partitions, such that small partitions fit
|
||||||
// into 1 RDD partition, while big ones spread evenly across multiple RDD partitions
|
// into 1 RDD partition, while big ones spread evenly across multiple RDD partitions
|
||||||
return String
|
return String.format("%s+%s", record.getPartitionPath(), record.getRecordKey());
|
||||||
.format("%s+%s", record.getPartitionPath(), record.getRecordKey());
|
|
||||||
}, true, config.getBulkInsertShuffleParallelism());
|
}, true, config.getBulkInsertShuffleParallelism());
|
||||||
}
|
}
|
||||||
JavaRDD<WriteStatus> writeStatusRDD = repartitionedRecords
|
JavaRDD<WriteStatus> writeStatusRDD = repartitionedRecords
|
||||||
.mapPartitionsWithIndex(new BulkInsertMapFunction<T>(commitTime, config, table),
|
.mapPartitionsWithIndex(new BulkInsertMapFunction<T>(commitTime, config, table), true)
|
||||||
true)
|
|
||||||
.flatMap(writeStatuses -> writeStatuses.iterator());
|
.flatMap(writeStatuses -> writeStatuses.iterator());
|
||||||
|
|
||||||
return updateIndexAndCommitIfNeeded(writeStatusRDD, table, commitTime);
|
return updateIndexAndCommitIfNeeded(writeStatusRDD, table, commitTime);
|
||||||
@@ -375,8 +367,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
}
|
}
|
||||||
|
|
||||||
private JavaRDD<HoodieRecord<T>> combineOnCondition(boolean condition,
|
private JavaRDD<HoodieRecord<T>> combineOnCondition(boolean condition,
|
||||||
JavaRDD<HoodieRecord<T>> records,
|
JavaRDD<HoodieRecord<T>> records, int parallelism) {
|
||||||
int parallelism) {
|
|
||||||
if (condition) {
|
if (condition) {
|
||||||
return deduplicateRecords(records, parallelism);
|
return deduplicateRecords(records, parallelism);
|
||||||
}
|
}
|
||||||
@@ -390,8 +381,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
* files) are rolled back based on commit time. // TODO : Create a new WorkloadProfile metadata
|
* files) are rolled back based on commit time. // TODO : Create a new WorkloadProfile metadata
|
||||||
* file instead of using HoodieCommitMetadata
|
* file instead of using HoodieCommitMetadata
|
||||||
*/
|
*/
|
||||||
private void saveWorkloadProfileMetadataToInflight(WorkloadProfile profile,
|
private void saveWorkloadProfileMetadataToInflight(WorkloadProfile profile, HoodieTable<T> table,
|
||||||
HoodieTable<T> table, String commitTime) throws HoodieCommitException {
|
String commitTime) throws HoodieCommitException {
|
||||||
try {
|
try {
|
||||||
HoodieCommitMetadata metadata = new HoodieCommitMetadata();
|
HoodieCommitMetadata metadata = new HoodieCommitMetadata();
|
||||||
profile.getPartitionPaths().stream().forEach(path -> {
|
profile.getPartitionPaths().stream().forEach(path -> {
|
||||||
@@ -416,9 +407,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
}
|
}
|
||||||
|
|
||||||
private JavaRDD<WriteStatus> upsertRecordsInternal(JavaRDD<HoodieRecord<T>> preppedRecords,
|
private JavaRDD<WriteStatus> upsertRecordsInternal(JavaRDD<HoodieRecord<T>> preppedRecords,
|
||||||
String commitTime,
|
String commitTime, HoodieTable<T> hoodieTable, final boolean isUpsert) {
|
||||||
HoodieTable<T> hoodieTable,
|
|
||||||
final boolean isUpsert) {
|
|
||||||
|
|
||||||
// Cache the tagged records, so we don't end up computing both
|
// Cache the tagged records, so we don't end up computing both
|
||||||
// TODO: Consistent contract in HoodieWriteClient regarding preppedRecord storage level handling
|
// TODO: Consistent contract in HoodieWriteClient regarding preppedRecord storage level handling
|
||||||
@@ -441,20 +430,16 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
JavaRDD<WriteStatus> writeStatusRDD = partitionedRecords
|
JavaRDD<WriteStatus> writeStatusRDD = partitionedRecords
|
||||||
.mapPartitionsWithIndex((partition, recordItr) -> {
|
.mapPartitionsWithIndex((partition, recordItr) -> {
|
||||||
if (isUpsert) {
|
if (isUpsert) {
|
||||||
return hoodieTable
|
return hoodieTable.handleUpsertPartition(commitTime, partition, recordItr, partitioner);
|
||||||
.handleUpsertPartition(commitTime, partition, recordItr, partitioner);
|
|
||||||
} else {
|
} else {
|
||||||
return hoodieTable
|
return hoodieTable.handleInsertPartition(commitTime, partition, recordItr, partitioner);
|
||||||
.handleInsertPartition(commitTime, partition, recordItr, partitioner);
|
|
||||||
}
|
}
|
||||||
}, true)
|
}, true).flatMap(writeStatuses -> writeStatuses.iterator());
|
||||||
.flatMap(writeStatuses -> writeStatuses.iterator());
|
|
||||||
|
|
||||||
return updateIndexAndCommitIfNeeded(writeStatusRDD, hoodieTable, commitTime);
|
return updateIndexAndCommitIfNeeded(writeStatusRDD, hoodieTable, commitTime);
|
||||||
}
|
}
|
||||||
|
|
||||||
private Partitioner getPartitioner(HoodieTable table, boolean isUpsert,
|
private Partitioner getPartitioner(HoodieTable table, boolean isUpsert, WorkloadProfile profile) {
|
||||||
WorkloadProfile profile) {
|
|
||||||
if (isUpsert) {
|
if (isUpsert) {
|
||||||
return table.getUpsertPartitioner(profile);
|
return table.getUpsertPartitioner(profile);
|
||||||
} else {
|
} else {
|
||||||
@@ -474,13 +459,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
|
|
||||||
private JavaRDD<HoodieRecord<T>> partition(JavaRDD<HoodieRecord<T>> dedupedRecords,
|
private JavaRDD<HoodieRecord<T>> partition(JavaRDD<HoodieRecord<T>> dedupedRecords,
|
||||||
Partitioner partitioner) {
|
Partitioner partitioner) {
|
||||||
return dedupedRecords
|
return dedupedRecords.mapToPair(record -> new Tuple2<>(
|
||||||
.mapToPair(record ->
|
new Tuple2<>(record.getKey(), Option.apply(record.getCurrentLocation())), record))
|
||||||
new Tuple2<>(
|
.partitionBy(partitioner).map(tuple -> tuple._2());
|
||||||
new Tuple2<>(record.getKey(), Option.apply(record.getCurrentLocation())),
|
|
||||||
record))
|
|
||||||
.partitionBy(partitioner)
|
|
||||||
.map(tuple -> tuple._2());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -493,12 +474,10 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
/**
|
/**
|
||||||
* Commit changes performed at the given commitTime marker
|
* Commit changes performed at the given commitTime marker
|
||||||
*/
|
*/
|
||||||
public boolean commit(String commitTime,
|
public boolean commit(String commitTime, JavaRDD<WriteStatus> writeStatuses,
|
||||||
JavaRDD<WriteStatus> writeStatuses,
|
|
||||||
Optional<HashMap<String, String>> extraMetadata) {
|
Optional<HashMap<String, String>> extraMetadata) {
|
||||||
HoodieTable<T> table = HoodieTable.getHoodieTable(
|
HoodieTable<T> table = HoodieTable.getHoodieTable(
|
||||||
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true),
|
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config);
|
||||||
config);
|
|
||||||
return commit(commitTime, writeStatuses, extraMetadata, table.getCommitActionType());
|
return commit(commitTime, writeStatuses, extraMetadata, table.getCommitActionType());
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -508,15 +487,13 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
logger.info("Commiting " + commitTime);
|
logger.info("Commiting " + commitTime);
|
||||||
// Create a Hoodie table which encapsulates the commits and files visible
|
// Create a Hoodie table which encapsulates the commits and files visible
|
||||||
HoodieTable<T> table = HoodieTable.getHoodieTable(
|
HoodieTable<T> table = HoodieTable.getHoodieTable(
|
||||||
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true),
|
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config);
|
||||||
config);
|
|
||||||
|
|
||||||
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
|
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
|
||||||
|
|
||||||
List<Tuple2<String, HoodieWriteStat>> stats = writeStatuses
|
List<Tuple2<String, HoodieWriteStat>> stats = writeStatuses.mapToPair(
|
||||||
.mapToPair((PairFunction<WriteStatus, String, HoodieWriteStat>) writeStatus ->
|
(PairFunction<WriteStatus, String, HoodieWriteStat>) writeStatus -> new Tuple2<>(
|
||||||
new Tuple2<>(writeStatus.getPartitionPath(), writeStatus.getStat()))
|
writeStatus.getPartitionPath(), writeStatus.getStat())).collect();
|
||||||
.collect();
|
|
||||||
|
|
||||||
HoodieCommitMetadata metadata = new HoodieCommitMetadata();
|
HoodieCommitMetadata metadata = new HoodieCommitMetadata();
|
||||||
for (Tuple2<String, HoodieWriteStat> stat : stats) {
|
for (Tuple2<String, HoodieWriteStat> stat : stats) {
|
||||||
@@ -531,8 +508,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
durationInMs.ifPresent(duration -> {
|
durationInMs.ifPresent(duration -> {
|
||||||
logger.info("Finalize write elapsed time (milliseconds): " + duration);
|
logger.info("Finalize write elapsed time (milliseconds): " + duration);
|
||||||
metrics.updateFinalizeWriteMetrics(duration, result.get());
|
metrics.updateFinalizeWriteMetrics(duration, result.get());
|
||||||
}
|
});
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// add in extra metadata
|
// add in extra metadata
|
||||||
@@ -541,8 +517,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
activeTimeline.saveAsComplete(
|
activeTimeline.saveAsComplete(new HoodieInstant(true, actionType, commitTime),
|
||||||
new HoodieInstant(true, actionType, commitTime),
|
|
||||||
Optional.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
|
Optional.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
|
||||||
// Save was a success
|
// Save was a success
|
||||||
// Do an inline compaction if enabled
|
// Do an inline compaction if enabled
|
||||||
@@ -566,9 +541,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
}
|
}
|
||||||
if (writeContext != null) {
|
if (writeContext != null) {
|
||||||
long durationInMs = metrics.getDurationInMs(writeContext.stop());
|
long durationInMs = metrics.getDurationInMs(writeContext.stop());
|
||||||
metrics.updateCommitMetrics(
|
metrics
|
||||||
HoodieActiveTimeline.COMMIT_FORMATTER.parse(commitTime).getTime(), durationInMs,
|
.updateCommitMetrics(HoodieActiveTimeline.COMMIT_FORMATTER.parse(commitTime).getTime(),
|
||||||
metadata);
|
durationInMs, metadata);
|
||||||
writeContext = null;
|
writeContext = null;
|
||||||
}
|
}
|
||||||
logger.info("Committed " + commitTime);
|
logger.info("Committed " + commitTime);
|
||||||
@@ -587,10 +562,10 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
* Savepoint a specific commit. Latest version of data files as of the passed in commitTime will
|
* Savepoint a specific commit. Latest version of data files as of the passed in commitTime will
|
||||||
* be referenced in the savepoint and will never be cleaned. The savepointed commit will never be
|
* be referenced in the savepoint and will never be cleaned. The savepointed commit will never be
|
||||||
* rolledback or archived.
|
* rolledback or archived.
|
||||||
*
|
* <p>
|
||||||
* This gives an option to rollback the state to the savepoint anytime. Savepoint needs to be
|
* This gives an option to rollback the state to the savepoint anytime. Savepoint needs to be
|
||||||
* manually created and deleted.
|
* manually created and deleted.
|
||||||
*
|
* <p>
|
||||||
* Savepoint should be on a commit that could not have been cleaned.
|
* Savepoint should be on a commit that could not have been cleaned.
|
||||||
*
|
*
|
||||||
* @param user - User creating the savepoint
|
* @param user - User creating the savepoint
|
||||||
@@ -599,8 +574,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
*/
|
*/
|
||||||
public boolean savepoint(String user, String comment) {
|
public boolean savepoint(String user, String comment) {
|
||||||
HoodieTable<T> table = HoodieTable.getHoodieTable(
|
HoodieTable<T> table = HoodieTable.getHoodieTable(
|
||||||
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true),
|
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config);
|
||||||
config);
|
|
||||||
if (table.getCompletedCommitTimeline().empty()) {
|
if (table.getCompletedCommitTimeline().empty()) {
|
||||||
throw new HoodieSavepointException("Could not savepoint. Commit timeline is empty");
|
throw new HoodieSavepointException("Could not savepoint. Commit timeline is empty");
|
||||||
}
|
}
|
||||||
@@ -614,10 +588,10 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
* Savepoint a specific commit. Latest version of data files as of the passed in commitTime will
|
* Savepoint a specific commit. Latest version of data files as of the passed in commitTime will
|
||||||
* be referenced in the savepoint and will never be cleaned. The savepointed commit will never be
|
* be referenced in the savepoint and will never be cleaned. The savepointed commit will never be
|
||||||
* rolledback or archived.
|
* rolledback or archived.
|
||||||
*
|
* <p>
|
||||||
* This gives an option to rollback the state to the savepoint anytime. Savepoint needs to be
|
* This gives an option to rollback the state to the savepoint anytime. Savepoint needs to be
|
||||||
* manually created and deleted.
|
* manually created and deleted.
|
||||||
*
|
* <p>
|
||||||
* Savepoint should be on a commit that could not have been cleaned.
|
* Savepoint should be on a commit that could not have been cleaned.
|
||||||
*
|
*
|
||||||
* @param commitTime - commit that should be savepointed
|
* @param commitTime - commit that should be savepointed
|
||||||
@@ -627,8 +601,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
*/
|
*/
|
||||||
public boolean savepoint(String commitTime, String user, String comment) {
|
public boolean savepoint(String commitTime, String user, String comment) {
|
||||||
HoodieTable<T> table = HoodieTable.getHoodieTable(
|
HoodieTable<T> table = HoodieTable.getHoodieTable(
|
||||||
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true),
|
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config);
|
||||||
config);
|
|
||||||
Optional<HoodieInstant> cleanInstant = table.getCompletedCleanTimeline().lastInstant();
|
Optional<HoodieInstant> cleanInstant = table.getCompletedCleanTimeline().lastInstant();
|
||||||
|
|
||||||
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION,
|
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION,
|
||||||
@@ -646,8 +619,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
table.getActiveTimeline().getInstantDetails(cleanInstant.get()).get());
|
table.getActiveTimeline().getInstantDetails(cleanInstant.get()).get());
|
||||||
lastCommitRetained = cleanMetadata.getEarliestCommitToRetain();
|
lastCommitRetained = cleanMetadata.getEarliestCommitToRetain();
|
||||||
} else {
|
} else {
|
||||||
lastCommitRetained =
|
lastCommitRetained = table.getCompletedCommitTimeline().firstInstant().get().getTimestamp();
|
||||||
table.getCompletedCommitTimeline().firstInstant().get().getTimestamp();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Cannot allow savepoint time on a commit that could have been cleaned
|
// Cannot allow savepoint time on a commit that could have been cleaned
|
||||||
@@ -656,24 +628,23 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
"Could not savepoint commit " + commitTime + " as this is beyond the lookup window "
|
"Could not savepoint commit " + commitTime + " as this is beyond the lookup window "
|
||||||
+ lastCommitRetained);
|
+ lastCommitRetained);
|
||||||
|
|
||||||
Map<String, List<String>> latestFilesMap = jsc.parallelize(
|
Map<String, List<String>> latestFilesMap = jsc.parallelize(FSUtils
|
||||||
FSUtils.getAllPartitionPaths(fs, table.getMetaClient().getBasePath(),
|
.getAllPartitionPaths(fs, table.getMetaClient().getBasePath(),
|
||||||
config.shouldAssumeDatePartitioning()))
|
config.shouldAssumeDatePartitioning()))
|
||||||
.mapToPair((PairFunction<String, String, List<String>>) partitionPath -> {
|
.mapToPair((PairFunction<String, String, List<String>>) partitionPath -> {
|
||||||
// Scan all partitions files with this commit time
|
// Scan all partitions files with this commit time
|
||||||
logger.info("Collecting latest files in partition path " + partitionPath);
|
logger.info("Collecting latest files in partition path " + partitionPath);
|
||||||
TableFileSystemView.ReadOptimizedView view = table.getROFileSystemView();
|
TableFileSystemView.ReadOptimizedView view = table.getROFileSystemView();
|
||||||
List<String> latestFiles =
|
List<String> latestFiles = view.getLatestDataFilesBeforeOrOn(partitionPath, commitTime)
|
||||||
view.getLatestDataFilesBeforeOrOn(partitionPath, commitTime)
|
|
||||||
.map(HoodieDataFile::getFileName).collect(Collectors.toList());
|
.map(HoodieDataFile::getFileName).collect(Collectors.toList());
|
||||||
return new Tuple2<>(partitionPath, latestFiles);
|
return new Tuple2<>(partitionPath, latestFiles);
|
||||||
}).collectAsMap();
|
}).collectAsMap();
|
||||||
|
|
||||||
HoodieSavepointMetadata metadata =
|
HoodieSavepointMetadata metadata = AvroUtils
|
||||||
AvroUtils.convertSavepointMetadata(user, comment, latestFilesMap);
|
.convertSavepointMetadata(user, comment, latestFilesMap);
|
||||||
// Nothing to save in the savepoint
|
// Nothing to save in the savepoint
|
||||||
table.getActiveTimeline().saveAsComplete(
|
table.getActiveTimeline()
|
||||||
new HoodieInstant(true, HoodieTimeline.SAVEPOINT_ACTION, commitTime),
|
.saveAsComplete(new HoodieInstant(true, HoodieTimeline.SAVEPOINT_ACTION, commitTime),
|
||||||
AvroUtils.serializeSavepointMetadata(metadata));
|
AvroUtils.serializeSavepointMetadata(metadata));
|
||||||
logger.info("Savepoint " + commitTime + " created");
|
logger.info("Savepoint " + commitTime + " created");
|
||||||
return true;
|
return true;
|
||||||
@@ -691,22 +662,20 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
*/
|
*/
|
||||||
public void deleteSavepoint(String savepointTime) {
|
public void deleteSavepoint(String savepointTime) {
|
||||||
HoodieTable<T> table = HoodieTable.getHoodieTable(
|
HoodieTable<T> table = HoodieTable.getHoodieTable(
|
||||||
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true),
|
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config);
|
||||||
config);
|
|
||||||
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
|
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
|
||||||
|
|
||||||
HoodieInstant savePoint =
|
HoodieInstant savePoint = new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION,
|
||||||
new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION, savepointTime);
|
savepointTime);
|
||||||
boolean isSavepointPresent =
|
boolean isSavepointPresent = table.getCompletedSavepointTimeline().containsInstant(savePoint);
|
||||||
table.getCompletedSavepointTimeline().containsInstant(savePoint);
|
|
||||||
if (!isSavepointPresent) {
|
if (!isSavepointPresent) {
|
||||||
logger.warn("No savepoint present " + savepointTime);
|
logger.warn("No savepoint present " + savepointTime);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
activeTimeline.revertToInflight(savePoint);
|
activeTimeline.revertToInflight(savePoint);
|
||||||
activeTimeline.deleteInflight(
|
activeTimeline
|
||||||
new HoodieInstant(true, HoodieTimeline.SAVEPOINT_ACTION, savepointTime));
|
.deleteInflight(new HoodieInstant(true, HoodieTimeline.SAVEPOINT_ACTION, savepointTime));
|
||||||
logger.info("Savepoint " + savepointTime + " deleted");
|
logger.info("Savepoint " + savepointTime + " deleted");
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -719,30 +688,27 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
*/
|
*/
|
||||||
public boolean rollbackToSavepoint(String savepointTime) {
|
public boolean rollbackToSavepoint(String savepointTime) {
|
||||||
HoodieTable<T> table = HoodieTable.getHoodieTable(
|
HoodieTable<T> table = HoodieTable.getHoodieTable(
|
||||||
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true),
|
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config);
|
||||||
config);
|
|
||||||
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
|
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
|
||||||
HoodieTimeline commitTimeline = table.getCommitsTimeline();
|
HoodieTimeline commitTimeline = table.getCommitsTimeline();
|
||||||
|
|
||||||
HoodieInstant savePoint =
|
HoodieInstant savePoint = new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION,
|
||||||
new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION, savepointTime);
|
savepointTime);
|
||||||
boolean isSavepointPresent =
|
boolean isSavepointPresent = table.getCompletedSavepointTimeline().containsInstant(savePoint);
|
||||||
table.getCompletedSavepointTimeline().containsInstant(savePoint);
|
|
||||||
if (!isSavepointPresent) {
|
if (!isSavepointPresent) {
|
||||||
throw new HoodieRollbackException("No savepoint for commitTime " + savepointTime);
|
throw new HoodieRollbackException("No savepoint for commitTime " + savepointTime);
|
||||||
}
|
}
|
||||||
|
|
||||||
List<String> commitsToRollback =
|
List<String> commitsToRollback = commitTimeline
|
||||||
commitTimeline.findInstantsAfter(savepointTime, Integer.MAX_VALUE).getInstants()
|
.findInstantsAfter(savepointTime, Integer.MAX_VALUE).getInstants()
|
||||||
.map(HoodieInstant::getTimestamp).collect(Collectors.toList());
|
.map(HoodieInstant::getTimestamp).collect(Collectors.toList());
|
||||||
logger.info("Rolling back commits " + commitsToRollback);
|
logger.info("Rolling back commits " + commitsToRollback);
|
||||||
|
|
||||||
rollback(commitsToRollback);
|
rollback(commitsToRollback);
|
||||||
|
|
||||||
// Make sure the rollback was successful
|
// Make sure the rollback was successful
|
||||||
Optional<HoodieInstant> lastInstant =
|
Optional<HoodieInstant> lastInstant = activeTimeline.reload().getCommitsTimeline()
|
||||||
activeTimeline.reload().getCommitsTimeline().filterCompletedInstants()
|
.filterCompletedInstants().lastInstant();
|
||||||
.lastInstant();
|
|
||||||
Preconditions.checkArgument(lastInstant.isPresent());
|
Preconditions.checkArgument(lastInstant.isPresent());
|
||||||
Preconditions.checkArgument(lastInstant.get().getTimestamp().equals(savepointTime),
|
Preconditions.checkArgument(lastInstant.get().getTimestamp().equals(savepointTime),
|
||||||
savepointTime + "is not the last commit after rolling back " + commitsToRollback
|
savepointTime + "is not the last commit after rolling back " + commitsToRollback
|
||||||
@@ -771,16 +737,14 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
|
|
||||||
// Create a Hoodie table which encapsulated the commits and files visible
|
// Create a Hoodie table which encapsulated the commits and files visible
|
||||||
HoodieTable<T> table = HoodieTable.getHoodieTable(
|
HoodieTable<T> table = HoodieTable.getHoodieTable(
|
||||||
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true),
|
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config);
|
||||||
config);
|
|
||||||
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
|
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
|
||||||
HoodieTimeline inflightTimeline = table.getInflightCommitTimeline();
|
HoodieTimeline inflightTimeline = table.getInflightCommitTimeline();
|
||||||
HoodieTimeline commitTimeline = table.getCompletedCommitTimeline();
|
HoodieTimeline commitTimeline = table.getCompletedCommitTimeline();
|
||||||
|
|
||||||
// Check if any of the commits is a savepoint - do not allow rollback on those commits
|
// Check if any of the commits is a savepoint - do not allow rollback on those commits
|
||||||
List<String> savepoints =
|
List<String> savepoints = table.getCompletedSavepointTimeline().getInstants()
|
||||||
table.getCompletedSavepointTimeline().getInstants().map(HoodieInstant::getTimestamp)
|
.map(HoodieInstant::getTimestamp).collect(Collectors.toList());
|
||||||
.collect(Collectors.toList());
|
|
||||||
commits.forEach(s -> {
|
commits.forEach(s -> {
|
||||||
if (savepoints.contains(s)) {
|
if (savepoints.contains(s)) {
|
||||||
throw new HoodieRollbackException(
|
throw new HoodieRollbackException(
|
||||||
@@ -800,16 +764,15 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
String lastCommit = commits.get(commits.size() - 1);
|
String lastCommit = commits.get(commits.size() - 1);
|
||||||
if (!commitTimeline.empty() && !commitTimeline
|
if (!commitTimeline.empty() && !commitTimeline
|
||||||
.findInstantsAfter(lastCommit, Integer.MAX_VALUE).empty()) {
|
.findInstantsAfter(lastCommit, Integer.MAX_VALUE).empty()) {
|
||||||
throw new HoodieRollbackException("Found commits after time :" + lastCommit +
|
throw new HoodieRollbackException(
|
||||||
", please rollback greater commits first");
|
"Found commits after time :" + lastCommit + ", please rollback greater commits first");
|
||||||
}
|
}
|
||||||
|
|
||||||
List<String> inflights = inflightTimeline.getInstants().map(HoodieInstant::getTimestamp)
|
List<String> inflights = inflightTimeline.getInstants().map(HoodieInstant::getTimestamp)
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
if (!inflights.isEmpty() && inflights.indexOf(lastCommit) != inflights.size() - 1) {
|
if (!inflights.isEmpty() && inflights.indexOf(lastCommit) != inflights.size() - 1) {
|
||||||
throw new HoodieRollbackException(
|
throw new HoodieRollbackException("Found in-flight commits after time :" + lastCommit
|
||||||
"Found in-flight commits after time :" + lastCommit +
|
+ ", please rollback greater commits first");
|
||||||
", please rollback greater commits first");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
List<HoodieRollbackStat> stats = table.rollback(jsc, commits);
|
List<HoodieRollbackStat> stats = table.rollback(jsc, commits);
|
||||||
@@ -817,8 +780,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
// cleanup index entries
|
// cleanup index entries
|
||||||
commits.stream().forEach(s -> {
|
commits.stream().forEach(s -> {
|
||||||
if (!index.rollbackCommit(s)) {
|
if (!index.rollbackCommit(s)) {
|
||||||
throw new HoodieRollbackException(
|
throw new HoodieRollbackException("Rollback index changes failed, for time :" + s);
|
||||||
"Rollback index changes failed, for time :" + s);
|
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
logger.info("Index rolled back for commits " + commits);
|
logger.info("Index rolled back for commits " + commits);
|
||||||
@@ -826,13 +788,12 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
Optional<Long> durationInMs = Optional.empty();
|
Optional<Long> durationInMs = Optional.empty();
|
||||||
if (context != null) {
|
if (context != null) {
|
||||||
durationInMs = Optional.of(metrics.getDurationInMs(context.stop()));
|
durationInMs = Optional.of(metrics.getDurationInMs(context.stop()));
|
||||||
Long numFilesDeleted = stats.stream()
|
Long numFilesDeleted = stats.stream().mapToLong(stat -> stat.getSuccessDeleteFiles().size())
|
||||||
.mapToLong(stat -> stat.getSuccessDeleteFiles().size())
|
|
||||||
.sum();
|
.sum();
|
||||||
metrics.updateRollbackMetrics(durationInMs.get(), numFilesDeleted);
|
metrics.updateRollbackMetrics(durationInMs.get(), numFilesDeleted);
|
||||||
}
|
}
|
||||||
HoodieRollbackMetadata rollbackMetadata =
|
HoodieRollbackMetadata rollbackMetadata = AvroUtils
|
||||||
AvroUtils.convertRollbackMetadata(startRollbackTime, durationInMs, commits, stats);
|
.convertRollbackMetadata(startRollbackTime, durationInMs, commits, stats);
|
||||||
table.getActiveTimeline().saveAsComplete(
|
table.getActiveTimeline().saveAsComplete(
|
||||||
new HoodieInstant(true, HoodieTimeline.ROLLBACK_ACTION, startRollbackTime),
|
new HoodieInstant(true, HoodieTimeline.ROLLBACK_ACTION, startRollbackTime),
|
||||||
AvroUtils.serializeRollbackMetadata(rollbackMetadata));
|
AvroUtils.serializeRollbackMetadata(rollbackMetadata));
|
||||||
@@ -846,8 +807,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
table.getActiveTimeline().getRollbackTimeline().getInstants());
|
table.getActiveTimeline().getRollbackTimeline().getInstants());
|
||||||
}
|
}
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
throw new HoodieRollbackException("Failed to rollback " +
|
throw new HoodieRollbackException(
|
||||||
config.getBasePath() + " commits " + commits, e);
|
"Failed to rollback " + config.getBasePath() + " commits " + commits, e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -880,8 +841,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
|
|
||||||
// Create a Hoodie table which encapsulated the commits and files visible
|
// Create a Hoodie table which encapsulated the commits and files visible
|
||||||
HoodieTable<T> table = HoodieTable.getHoodieTable(
|
HoodieTable<T> table = HoodieTable.getHoodieTable(
|
||||||
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(),
|
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config);
|
||||||
true), config);
|
|
||||||
|
|
||||||
List<HoodieCleanStat> cleanStats = table.clean(jsc);
|
List<HoodieCleanStat> cleanStats = table.clean(jsc);
|
||||||
if (cleanStats.isEmpty()) {
|
if (cleanStats.isEmpty()) {
|
||||||
@@ -896,14 +856,14 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Create the metadata and save it
|
// Create the metadata and save it
|
||||||
HoodieCleanMetadata metadata =
|
HoodieCleanMetadata metadata = AvroUtils
|
||||||
AvroUtils.convertCleanMetadata(startCleanTime, durationInMs, cleanStats);
|
.convertCleanMetadata(startCleanTime, durationInMs, cleanStats);
|
||||||
logger.info("Cleaned " + metadata.getTotalFilesDeleted() + " files");
|
logger.info("Cleaned " + metadata.getTotalFilesDeleted() + " files");
|
||||||
metrics.updateCleanMetrics(durationInMs.orElseGet(() -> -1L),
|
metrics
|
||||||
metadata.getTotalFilesDeleted());
|
.updateCleanMetrics(durationInMs.orElseGet(() -> -1L), metadata.getTotalFilesDeleted());
|
||||||
|
|
||||||
table.getActiveTimeline().saveAsComplete(
|
table.getActiveTimeline()
|
||||||
new HoodieInstant(true, HoodieTimeline.CLEAN_ACTION, startCleanTime),
|
.saveAsComplete(new HoodieInstant(true, HoodieTimeline.CLEAN_ACTION, startCleanTime),
|
||||||
AvroUtils.serializeCleanMetadata(metadata));
|
AvroUtils.serializeCleanMetadata(metadata));
|
||||||
logger.info("Marked clean started on " + startCleanTime + " as complete");
|
logger.info("Marked clean started on " + startCleanTime + " as complete");
|
||||||
|
|
||||||
@@ -930,12 +890,10 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
public void startCommitWithTime(String commitTime) {
|
public void startCommitWithTime(String commitTime) {
|
||||||
logger.info("Generate a new commit time " + commitTime);
|
logger.info("Generate a new commit time " + commitTime);
|
||||||
HoodieTable<T> table = HoodieTable.getHoodieTable(
|
HoodieTable<T> table = HoodieTable.getHoodieTable(
|
||||||
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true),
|
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config);
|
||||||
config);
|
|
||||||
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
|
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
|
||||||
String commitActionType = table.getCommitActionType();
|
String commitActionType = table.getCommitActionType();
|
||||||
activeTimeline.createInflight(
|
activeTimeline.createInflight(new HoodieInstant(true, commitActionType, commitTime));
|
||||||
new HoodieInstant(true, commitActionType, commitTime));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -948,17 +906,16 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
return commitTime;
|
return commitTime;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Since MOR tableType default to {@link HoodieTimeline#DELTA_COMMIT_ACTION},
|
/**
|
||||||
* we need to explicitly set to {@link HoodieTimeline#COMMIT_ACTION} for compaction
|
* Since MOR tableType default to {@link HoodieTimeline#DELTA_COMMIT_ACTION}, we need to
|
||||||
|
* explicitly set to {@link HoodieTimeline#COMMIT_ACTION} for compaction
|
||||||
*/
|
*/
|
||||||
public void startCompactionWithTime(String commitTime) {
|
public void startCompactionWithTime(String commitTime) {
|
||||||
HoodieTable<T> table = HoodieTable.getHoodieTable(
|
HoodieTable<T> table = HoodieTable.getHoodieTable(
|
||||||
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true),
|
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config);
|
||||||
config);
|
|
||||||
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
|
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
|
||||||
String commitActionType = HoodieTimeline.COMMIT_ACTION;
|
String commitActionType = HoodieTimeline.COMMIT_ACTION;
|
||||||
activeTimeline.createInflight(
|
activeTimeline.createInflight(new HoodieInstant(true, commitActionType, commitTime));
|
||||||
new HoodieInstant(true, commitActionType, commitTime));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -968,8 +925,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
public JavaRDD<WriteStatus> compact(String commitTime) throws IOException {
|
public JavaRDD<WriteStatus> compact(String commitTime) throws IOException {
|
||||||
// Create a Hoodie table which encapsulated the commits and files visible
|
// Create a Hoodie table which encapsulated the commits and files visible
|
||||||
HoodieTable<T> table = HoodieTable.getHoodieTable(
|
HoodieTable<T> table = HoodieTable.getHoodieTable(
|
||||||
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true),
|
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config);
|
||||||
config);
|
|
||||||
JavaRDD<WriteStatus> statuses = table.compact(jsc, commitTime);
|
JavaRDD<WriteStatus> statuses = table.compact(jsc, commitTime);
|
||||||
// Trigger the insert and collect statuses
|
// Trigger the insert and collect statuses
|
||||||
statuses = statuses.persist(config.getWriteStatusStorageLevel());
|
statuses = statuses.persist(config.getWriteStatusStorageLevel());
|
||||||
@@ -980,9 +936,6 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Commit a compaction operation
|
* Commit a compaction operation
|
||||||
* @param commitTime
|
|
||||||
* @param writeStatuses
|
|
||||||
* @param extraMetadata
|
|
||||||
*/
|
*/
|
||||||
public void commitCompaction(String commitTime, JavaRDD<WriteStatus> writeStatuses,
|
public void commitCompaction(String commitTime, JavaRDD<WriteStatus> writeStatuses,
|
||||||
Optional<HashMap<String, String>> extraMetadata) {
|
Optional<HashMap<String, String>> extraMetadata) {
|
||||||
@@ -992,8 +945,6 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Commit a compaction operation
|
* Commit a compaction operation
|
||||||
* @param commitTime
|
|
||||||
* @param writeStatuses
|
|
||||||
*/
|
*/
|
||||||
public void commitCompaction(String commitTime, JavaRDD<WriteStatus> writeStatuses) {
|
public void commitCompaction(String commitTime, JavaRDD<WriteStatus> writeStatuses) {
|
||||||
String commitCompactionActionType = HoodieActiveTimeline.COMMIT_ACTION;
|
String commitCompactionActionType = HoodieActiveTimeline.COMMIT_ACTION;
|
||||||
@@ -1006,8 +957,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
*/
|
*/
|
||||||
private void forceCompact(String compactionCommitTime) throws IOException {
|
private void forceCompact(String compactionCommitTime) throws IOException {
|
||||||
// Create a Hoodie table which encapsulated the commits and files visible
|
// Create a Hoodie table which encapsulated the commits and files visible
|
||||||
HoodieTableMetaClient metaClient =
|
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
|
||||||
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true);
|
config.getBasePath(), true);
|
||||||
HoodieTable<T> table = HoodieTable.getHoodieTable(metaClient, config);
|
HoodieTable<T> table = HoodieTable.getHoodieTable(metaClient, config);
|
||||||
JavaRDD<WriteStatus> compactedStatuses = table.compact(jsc, compactionCommitTime);
|
JavaRDD<WriteStatus> compactedStatuses = table.compact(jsc, compactionCommitTime);
|
||||||
if (!compactedStatuses.isEmpty()) {
|
if (!compactedStatuses.isEmpty()) {
|
||||||
@@ -1029,8 +980,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
}
|
}
|
||||||
|
|
||||||
private void commitForceCompaction(JavaRDD<WriteStatus> writeStatuses,
|
private void commitForceCompaction(JavaRDD<WriteStatus> writeStatuses,
|
||||||
HoodieTableMetaClient metaClient,
|
HoodieTableMetaClient metaClient, String compactionCommitTime) {
|
||||||
String compactionCommitTime) {
|
|
||||||
List<HoodieWriteStat> updateStatusMap = writeStatuses.map(writeStatus -> writeStatus.getStat())
|
List<HoodieWriteStat> updateStatusMap = writeStatuses.map(writeStatus -> writeStatus.getStat())
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
@@ -1054,12 +1004,6 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static SparkConf registerClasses(SparkConf conf) {
|
|
||||||
conf.registerKryoClasses(
|
|
||||||
new Class[]{HoodieWriteConfig.class, HoodieRecord.class, HoodieKey.class});
|
|
||||||
return conf;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Deduplicate Hoodie records, using the given deduplication function.
|
* Deduplicate Hoodie records, using the given deduplication function.
|
||||||
*/
|
*/
|
||||||
@@ -1074,13 +1018,13 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
return new Tuple2<>(key, record);
|
return new Tuple2<>(key, record);
|
||||||
})
|
})
|
||||||
.reduceByKey((rec1, rec2) -> {
|
.reduceByKey((rec1, rec2) -> {
|
||||||
@SuppressWarnings("unchecked")
|
@SuppressWarnings("unchecked") T reducedData = (T) rec1.getData()
|
||||||
T reducedData = (T) rec1.getData().preCombine(rec2.getData());
|
.preCombine(rec2.getData());
|
||||||
// we cannot allow the user to change the key or partitionPath, since that will affect everything
|
// we cannot allow the user to change the key or partitionPath, since that will affect
|
||||||
|
// everything
|
||||||
// so pick it from one of the records.
|
// so pick it from one of the records.
|
||||||
return new HoodieRecord<T>(rec1.getKey(), reducedData);
|
return new HoodieRecord<T>(rec1.getKey(), reducedData);
|
||||||
}, parallelism)
|
}, parallelism).map(recordTuple -> recordTuple._2());
|
||||||
.map(recordTuple -> recordTuple._2());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -1088,8 +1032,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
*/
|
*/
|
||||||
private void rollbackInflightCommits() {
|
private void rollbackInflightCommits() {
|
||||||
HoodieTable<T> table = HoodieTable.getHoodieTable(
|
HoodieTable<T> table = HoodieTable.getHoodieTable(
|
||||||
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true),
|
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config);
|
||||||
config);
|
|
||||||
HoodieTimeline inflightTimeline = table.getCommitsTimeline().filterInflights();
|
HoodieTimeline inflightTimeline = table.getCommitsTimeline().filterInflights();
|
||||||
List<String> commits = inflightTimeline.getInstants().map(HoodieInstant::getTimestamp)
|
List<String> commits = inflightTimeline.getInstants().map(HoodieInstant::getTimestamp)
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
@@ -1103,7 +1046,6 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
writeContext = metrics.getCommitCtx();
|
writeContext = metrics.getCommitCtx();
|
||||||
// Create a Hoodie table which encapsulated the commits and files visible
|
// Create a Hoodie table which encapsulated the commits and files visible
|
||||||
return HoodieTable.getHoodieTable(
|
return HoodieTable.getHoodieTable(
|
||||||
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true),
|
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config);
|
||||||
config);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -106,14 +106,14 @@ public class WriteStatus implements Serializable {
|
|||||||
return globalError != null;
|
return globalError != null;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setGlobalError(Throwable t) {
|
|
||||||
this.globalError = t;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Throwable getGlobalError() {
|
public Throwable getGlobalError() {
|
||||||
return this.globalError;
|
return this.globalError;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void setGlobalError(Throwable t) {
|
||||||
|
this.globalError = t;
|
||||||
|
}
|
||||||
|
|
||||||
public List<HoodieRecord> getWrittenRecords() {
|
public List<HoodieRecord> getWrittenRecords() {
|
||||||
return writtenRecords;
|
return writtenRecords;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -30,10 +30,6 @@ public class DefaultHoodieConfig implements Serializable {
|
|||||||
this.props = props;
|
this.props = props;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Properties getProps() {
|
|
||||||
return props;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static void setDefaultOnCondition(Properties props, boolean condition, String propName,
|
public static void setDefaultOnCondition(Properties props, boolean condition, String propName,
|
||||||
String defaultValue) {
|
String defaultValue) {
|
||||||
if (condition) {
|
if (condition) {
|
||||||
@@ -48,4 +44,8 @@ public class DefaultHoodieConfig implements Serializable {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Properties getProps() {
|
||||||
|
return props;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -34,81 +34,76 @@ import javax.annotation.concurrent.Immutable;
|
|||||||
public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
||||||
|
|
||||||
public static final String CLEANER_POLICY_PROP = "hoodie.cleaner.policy";
|
public static final String CLEANER_POLICY_PROP = "hoodie.cleaner.policy";
|
||||||
private static final String DEFAULT_CLEANER_POLICY =
|
|
||||||
HoodieCleaningPolicy.KEEP_LATEST_COMMITS.name();
|
|
||||||
|
|
||||||
public static final String AUTO_CLEAN_PROP = "hoodie.clean.automatic";
|
public static final String AUTO_CLEAN_PROP = "hoodie.clean.automatic";
|
||||||
private static final String DEFAULT_AUTO_CLEAN = "true";
|
|
||||||
|
|
||||||
// Turn on inline compaction - after fw delta commits a inline compaction will be run
|
// Turn on inline compaction - after fw delta commits a inline compaction will be run
|
||||||
public static final String INLINE_COMPACT_PROP = "hoodie.compact.inline";
|
public static final String INLINE_COMPACT_PROP = "hoodie.compact.inline";
|
||||||
private static final String DEFAULT_INLINE_COMPACT = "false";
|
|
||||||
|
|
||||||
// Run a compaction every N delta commits
|
// Run a compaction every N delta commits
|
||||||
public static final String INLINE_COMPACT_NUM_DELTA_COMMITS_PROP = "hoodie.compact.inline.max.delta.commits";
|
public static final String INLINE_COMPACT_NUM_DELTA_COMMITS_PROP =
|
||||||
private static final String DEFAULT_INLINE_COMPACT_NUM_DELTA_COMMITS = "10";
|
"hoodie.compact.inline.max" + ".delta.commits";
|
||||||
|
|
||||||
public static final String CLEANER_FILE_VERSIONS_RETAINED_PROP =
|
public static final String CLEANER_FILE_VERSIONS_RETAINED_PROP =
|
||||||
"hoodie.cleaner.fileversions.retained";
|
"hoodie.cleaner.fileversions" + ".retained";
|
||||||
private static final String DEFAULT_CLEANER_FILE_VERSIONS_RETAINED = "3";
|
|
||||||
|
|
||||||
public static final String CLEANER_COMMITS_RETAINED_PROP = "hoodie.cleaner.commits.retained";
|
public static final String CLEANER_COMMITS_RETAINED_PROP = "hoodie.cleaner.commits.retained";
|
||||||
private static final String DEFAULT_CLEANER_COMMITS_RETAINED = "24";
|
|
||||||
|
|
||||||
public static final String MAX_COMMITS_TO_KEEP = "hoodie.keep.max.commits";
|
public static final String MAX_COMMITS_TO_KEEP = "hoodie.keep.max.commits";
|
||||||
private static final String DEFAULT_MAX_COMMITS_TO_KEEP = String.valueOf(128);
|
|
||||||
public static final String MIN_COMMITS_TO_KEEP = "hoodie.keep.min.commits";
|
public static final String MIN_COMMITS_TO_KEEP = "hoodie.keep.min.commits";
|
||||||
private static final String DEFAULT_MIN_COMMITS_TO_KEEP = String.valueOf(96);
|
|
||||||
// Upsert uses this file size to compact new data onto existing files..
|
// Upsert uses this file size to compact new data onto existing files..
|
||||||
public static final String PARQUET_SMALL_FILE_LIMIT_BYTES = "hoodie.parquet.small.file.limit";
|
public static final String PARQUET_SMALL_FILE_LIMIT_BYTES = "hoodie.parquet.small.file.limit";
|
||||||
// Turned off by default
|
// Turned off by default
|
||||||
public static final String DEFAULT_PARQUET_SMALL_FILE_LIMIT_BYTES = String.valueOf(0);
|
public static final String DEFAULT_PARQUET_SMALL_FILE_LIMIT_BYTES = String.valueOf(0);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Configs related to specific table types
|
* Configs related to specific table types
|
||||||
**/
|
**/
|
||||||
// Number of inserts, that will be put each partition/bucket for writing
|
// Number of inserts, that will be put each partition/bucket for writing
|
||||||
public static final String COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE = "hoodie.copyonwrite.insert.split.size";
|
public static final String COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE =
|
||||||
|
"hoodie.copyonwrite.insert" + ".split.size";
|
||||||
// The rationale to pick the insert parallelism is the following. Writing out 100MB files,
|
// The rationale to pick the insert parallelism is the following. Writing out 100MB files,
|
||||||
// with atleast 1kb records, means 100K records per file. we just overprovision to 500K
|
// with atleast 1kb records, means 100K records per file. we just overprovision to 500K
|
||||||
public static final String DEFAULT_COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE = String.valueOf(500000);
|
public static final String DEFAULT_COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE = String.valueOf(500000);
|
||||||
|
// Config to control whether we control insert split sizes automatically based on average
|
||||||
// Config to control whether we control insert split sizes automatically based on average record sizes
|
// record sizes
|
||||||
public static final String COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS = "hoodie.copyonwrite.insert.auto.split";
|
public static final String COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS =
|
||||||
|
"hoodie.copyonwrite.insert" + ".auto.split";
|
||||||
// its off by default
|
// its off by default
|
||||||
public static final String DEFAULT_COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS = String.valueOf(false);
|
public static final String DEFAULT_COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS = String.valueOf(false);
|
||||||
|
// This value is used as a guessimate for the record size, if we can't determine this from
|
||||||
|
// previous commits
|
||||||
// This value is used as a guessimate for the record size, if we can't determine this from previous commits
|
public static final String COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE =
|
||||||
public static final String COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = "hoodie.copyonwrite.record.size.estimate";
|
"hoodie.copyonwrite" + ".record.size.estimate";
|
||||||
// Used to determine how much more can be packed into a small file, before it exceeds the size limit.
|
// Used to determine how much more can be packed into a small file, before it exceeds the size
|
||||||
|
// limit.
|
||||||
public static final String DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = String
|
public static final String DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = String
|
||||||
.valueOf(1024);
|
.valueOf(1024);
|
||||||
|
|
||||||
public static final String CLEANER_PARALLELISM = "hoodie.cleaner.parallelism";
|
public static final String CLEANER_PARALLELISM = "hoodie.cleaner.parallelism";
|
||||||
public static final String DEFAULT_CLEANER_PARALLELISM = String.valueOf(200);
|
public static final String DEFAULT_CLEANER_PARALLELISM = String.valueOf(200);
|
||||||
|
|
||||||
public static final String TARGET_IO_PER_COMPACTION_IN_MB_PROP = "hoodie.compaction.target.io";
|
public static final String TARGET_IO_PER_COMPACTION_IN_MB_PROP = "hoodie.compaction.target.io";
|
||||||
// 500GB of target IO per compaction (both read and write)
|
// 500GB of target IO per compaction (both read and write)
|
||||||
public static final String DEFAULT_TARGET_IO_PER_COMPACTION_IN_MB = String.valueOf(500 * 1024);
|
public static final String DEFAULT_TARGET_IO_PER_COMPACTION_IN_MB = String.valueOf(500 * 1024);
|
||||||
|
|
||||||
public static final String COMPACTION_STRATEGY_PROP = "hoodie.compaction.strategy";
|
public static final String COMPACTION_STRATEGY_PROP = "hoodie.compaction.strategy";
|
||||||
// 200GB of target IO per compaction
|
// 200GB of target IO per compaction
|
||||||
public static final String DEFAULT_COMPACTION_STRATEGY = LogFileSizeBasedCompactionStrategy.class
|
public static final String DEFAULT_COMPACTION_STRATEGY = LogFileSizeBasedCompactionStrategy.class
|
||||||
.getName();
|
.getName();
|
||||||
|
|
||||||
// used to merge records written to log file
|
// used to merge records written to log file
|
||||||
public static final String DEFAULT_PAYLOAD_CLASS = HoodieAvroPayload.class.getName();
|
public static final String DEFAULT_PAYLOAD_CLASS = HoodieAvroPayload.class.getName();
|
||||||
public static final String PAYLOAD_CLASS_PROP = "hoodie.compaction.payload.class";
|
public static final String PAYLOAD_CLASS_PROP = "hoodie.compaction.payload.class";
|
||||||
|
|
||||||
// used to choose a trade off between IO vs Memory when performing compaction process
|
// used to choose a trade off between IO vs Memory when performing compaction process
|
||||||
// Depending on outputfile_size and memory provided, choose true to avoid OOM for large file size + small memory
|
// Depending on outputfile_size and memory provided, choose true to avoid OOM for large file
|
||||||
public static final String COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP = "hoodie.compaction.lazy.block.read";
|
// size + small memory
|
||||||
|
public static final String COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP =
|
||||||
|
"hoodie.compaction.lazy" + ".block.read";
|
||||||
public static final String DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED = "false";
|
public static final String DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED = "false";
|
||||||
|
|
||||||
// used to choose whether to enable reverse log reading (reverse log traversal)
|
// used to choose whether to enable reverse log reading (reverse log traversal)
|
||||||
public static final String COMPACTION_REVERSE_LOG_READ_ENABLED_PROP = "hoodie.compaction.reverse.log.read";
|
public static final String COMPACTION_REVERSE_LOG_READ_ENABLED_PROP =
|
||||||
|
"hoodie.compaction" + ".reverse.log.read";
|
||||||
public static final String DEFAULT_COMPACTION_REVERSE_LOG_READ_ENABLED = "false";
|
public static final String DEFAULT_COMPACTION_REVERSE_LOG_READ_ENABLED = "false";
|
||||||
|
private static final String DEFAULT_CLEANER_POLICY = HoodieCleaningPolicy.KEEP_LATEST_COMMITS
|
||||||
|
.name();
|
||||||
|
private static final String DEFAULT_AUTO_CLEAN = "true";
|
||||||
|
private static final String DEFAULT_INLINE_COMPACT = "false";
|
||||||
|
private static final String DEFAULT_INLINE_COMPACT_NUM_DELTA_COMMITS = "10";
|
||||||
|
private static final String DEFAULT_CLEANER_FILE_VERSIONS_RETAINED = "3";
|
||||||
|
private static final String DEFAULT_CLEANER_COMMITS_RETAINED = "24";
|
||||||
|
private static final String DEFAULT_MAX_COMMITS_TO_KEEP = String.valueOf(128);
|
||||||
|
private static final String DEFAULT_MIN_COMMITS_TO_KEEP = String.valueOf(96);
|
||||||
|
|
||||||
private HoodieCompactionConfig(Properties props) {
|
private HoodieCompactionConfig(Properties props) {
|
||||||
super(props);
|
super(props);
|
||||||
@@ -159,8 +154,7 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public Builder retainFileVersions(int fileVersionsRetained) {
|
public Builder retainFileVersions(int fileVersionsRetained) {
|
||||||
props.setProperty(CLEANER_FILE_VERSIONS_RETAINED_PROP,
|
props.setProperty(CLEANER_FILE_VERSIONS_RETAINED_PROP, String.valueOf(fileVersionsRetained));
|
||||||
String.valueOf(fileVersionsRetained));
|
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -238,22 +232,22 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
|||||||
|
|
||||||
public HoodieCompactionConfig build() {
|
public HoodieCompactionConfig build() {
|
||||||
HoodieCompactionConfig config = new HoodieCompactionConfig(props);
|
HoodieCompactionConfig config = new HoodieCompactionConfig(props);
|
||||||
setDefaultOnCondition(props, !props.containsKey(AUTO_CLEAN_PROP),
|
setDefaultOnCondition(props, !props.containsKey(AUTO_CLEAN_PROP), AUTO_CLEAN_PROP,
|
||||||
AUTO_CLEAN_PROP, DEFAULT_AUTO_CLEAN);
|
DEFAULT_AUTO_CLEAN);
|
||||||
setDefaultOnCondition(props, !props.containsKey(INLINE_COMPACT_PROP),
|
setDefaultOnCondition(props, !props.containsKey(INLINE_COMPACT_PROP), INLINE_COMPACT_PROP,
|
||||||
INLINE_COMPACT_PROP, DEFAULT_INLINE_COMPACT);
|
DEFAULT_INLINE_COMPACT);
|
||||||
setDefaultOnCondition(props, !props.containsKey(INLINE_COMPACT_NUM_DELTA_COMMITS_PROP),
|
setDefaultOnCondition(props, !props.containsKey(INLINE_COMPACT_NUM_DELTA_COMMITS_PROP),
|
||||||
INLINE_COMPACT_NUM_DELTA_COMMITS_PROP, DEFAULT_INLINE_COMPACT_NUM_DELTA_COMMITS);
|
INLINE_COMPACT_NUM_DELTA_COMMITS_PROP, DEFAULT_INLINE_COMPACT_NUM_DELTA_COMMITS);
|
||||||
setDefaultOnCondition(props, !props.containsKey(CLEANER_POLICY_PROP),
|
setDefaultOnCondition(props, !props.containsKey(CLEANER_POLICY_PROP), CLEANER_POLICY_PROP,
|
||||||
CLEANER_POLICY_PROP, DEFAULT_CLEANER_POLICY);
|
DEFAULT_CLEANER_POLICY);
|
||||||
setDefaultOnCondition(props, !props.containsKey(CLEANER_FILE_VERSIONS_RETAINED_PROP),
|
setDefaultOnCondition(props, !props.containsKey(CLEANER_FILE_VERSIONS_RETAINED_PROP),
|
||||||
CLEANER_FILE_VERSIONS_RETAINED_PROP, DEFAULT_CLEANER_FILE_VERSIONS_RETAINED);
|
CLEANER_FILE_VERSIONS_RETAINED_PROP, DEFAULT_CLEANER_FILE_VERSIONS_RETAINED);
|
||||||
setDefaultOnCondition(props, !props.containsKey(CLEANER_COMMITS_RETAINED_PROP),
|
setDefaultOnCondition(props, !props.containsKey(CLEANER_COMMITS_RETAINED_PROP),
|
||||||
CLEANER_COMMITS_RETAINED_PROP, DEFAULT_CLEANER_COMMITS_RETAINED);
|
CLEANER_COMMITS_RETAINED_PROP, DEFAULT_CLEANER_COMMITS_RETAINED);
|
||||||
setDefaultOnCondition(props, !props.containsKey(MAX_COMMITS_TO_KEEP),
|
setDefaultOnCondition(props, !props.containsKey(MAX_COMMITS_TO_KEEP), MAX_COMMITS_TO_KEEP,
|
||||||
MAX_COMMITS_TO_KEEP, DEFAULT_MAX_COMMITS_TO_KEEP);
|
DEFAULT_MAX_COMMITS_TO_KEEP);
|
||||||
setDefaultOnCondition(props, !props.containsKey(MIN_COMMITS_TO_KEEP),
|
setDefaultOnCondition(props, !props.containsKey(MIN_COMMITS_TO_KEEP), MIN_COMMITS_TO_KEEP,
|
||||||
MIN_COMMITS_TO_KEEP, DEFAULT_MIN_COMMITS_TO_KEEP);
|
DEFAULT_MIN_COMMITS_TO_KEEP);
|
||||||
setDefaultOnCondition(props, !props.containsKey(PARQUET_SMALL_FILE_LIMIT_BYTES),
|
setDefaultOnCondition(props, !props.containsKey(PARQUET_SMALL_FILE_LIMIT_BYTES),
|
||||||
PARQUET_SMALL_FILE_LIMIT_BYTES, DEFAULT_PARQUET_SMALL_FILE_LIMIT_BYTES);
|
PARQUET_SMALL_FILE_LIMIT_BYTES, DEFAULT_PARQUET_SMALL_FILE_LIMIT_BYTES);
|
||||||
setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE),
|
setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE),
|
||||||
@@ -263,8 +257,8 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
|||||||
setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE),
|
setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE),
|
||||||
COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE,
|
COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE,
|
||||||
DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE);
|
DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE);
|
||||||
setDefaultOnCondition(props, !props.containsKey(CLEANER_PARALLELISM),
|
setDefaultOnCondition(props, !props.containsKey(CLEANER_PARALLELISM), CLEANER_PARALLELISM,
|
||||||
CLEANER_PARALLELISM, DEFAULT_CLEANER_PARALLELISM);
|
DEFAULT_CLEANER_PARALLELISM);
|
||||||
setDefaultOnCondition(props, !props.containsKey(COMPACTION_STRATEGY_PROP),
|
setDefaultOnCondition(props, !props.containsKey(COMPACTION_STRATEGY_PROP),
|
||||||
COMPACTION_STRATEGY_PROP, DEFAULT_COMPACTION_STRATEGY);
|
COMPACTION_STRATEGY_PROP, DEFAULT_COMPACTION_STRATEGY);
|
||||||
setDefaultOnCondition(props, !props.containsKey(PAYLOAD_CLASS_PROP),
|
setDefaultOnCondition(props, !props.containsKey(PAYLOAD_CLASS_PROP),
|
||||||
@@ -277,8 +271,7 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
|||||||
COMPACTION_REVERSE_LOG_READ_ENABLED_PROP, DEFAULT_COMPACTION_REVERSE_LOG_READ_ENABLED);
|
COMPACTION_REVERSE_LOG_READ_ENABLED_PROP, DEFAULT_COMPACTION_REVERSE_LOG_READ_ENABLED);
|
||||||
|
|
||||||
HoodieCleaningPolicy.valueOf(props.getProperty(CLEANER_POLICY_PROP));
|
HoodieCleaningPolicy.valueOf(props.getProperty(CLEANER_POLICY_PROP));
|
||||||
Preconditions.checkArgument(
|
Preconditions.checkArgument(Integer.parseInt(props.getProperty(MAX_COMMITS_TO_KEEP)) > Integer
|
||||||
Integer.parseInt(props.getProperty(MAX_COMMITS_TO_KEEP)) > Integer
|
|
||||||
.parseInt(props.getProperty(MIN_COMMITS_TO_KEEP)));
|
.parseInt(props.getProperty(MIN_COMMITS_TO_KEEP)));
|
||||||
return config;
|
return config;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -40,23 +40,25 @@ public class HoodieIndexConfig extends DefaultHoodieConfig {
|
|||||||
public static final String BLOOM_INDEX_PARALLELISM_PROP = "hoodie.bloom.index.parallelism";
|
public static final String BLOOM_INDEX_PARALLELISM_PROP = "hoodie.bloom.index.parallelism";
|
||||||
// Disable explicit bloom index parallelism setting by default - hoodie auto computes
|
// Disable explicit bloom index parallelism setting by default - hoodie auto computes
|
||||||
public static final String DEFAULT_BLOOM_INDEX_PARALLELISM = "0";
|
public static final String DEFAULT_BLOOM_INDEX_PARALLELISM = "0";
|
||||||
public static final String BLOOM_INDEX_PRUNE_BY_RANGES_PROP = "hoodie.bloom.index.prune.by.ranges";
|
public static final String BLOOM_INDEX_PRUNE_BY_RANGES_PROP =
|
||||||
|
"hoodie.bloom.index.prune.by" + ".ranges";
|
||||||
public static final String DEFAULT_BLOOM_INDEX_PRUNE_BY_RANGES = "true";
|
public static final String DEFAULT_BLOOM_INDEX_PRUNE_BY_RANGES = "true";
|
||||||
public static final String BLOOM_INDEX_USE_CACHING_PROP = "hoodie.bloom.index.use.caching";
|
public static final String BLOOM_INDEX_USE_CACHING_PROP = "hoodie.bloom.index.use.caching";
|
||||||
public static final String DEFAULT_BLOOM_INDEX_USE_CACHING = "true";
|
public static final String DEFAULT_BLOOM_INDEX_USE_CACHING = "true";
|
||||||
public static final String BLOOM_INDEX_INPUT_STORAGE_LEVEL = "hoodie.bloom.index.input.storage.level";
|
public static final String BLOOM_INDEX_INPUT_STORAGE_LEVEL =
|
||||||
|
"hoodie.bloom.index.input.storage" + ".level";
|
||||||
public static final String DEFAULT_BLOOM_INDEX_INPUT_STORAGE_LEVEL = "MEMORY_AND_DISK_SER";
|
public static final String DEFAULT_BLOOM_INDEX_INPUT_STORAGE_LEVEL = "MEMORY_AND_DISK_SER";
|
||||||
|
|
||||||
// ***** HBase Index Configs *****
|
// ***** HBase Index Configs *****
|
||||||
public final static String HBASE_ZKQUORUM_PROP = "hoodie.index.hbase.zkquorum";
|
public static final String HBASE_ZKQUORUM_PROP = "hoodie.index.hbase.zkquorum";
|
||||||
public final static String HBASE_ZKPORT_PROP = "hoodie.index.hbase.zkport";
|
public static final String HBASE_ZKPORT_PROP = "hoodie.index.hbase.zkport";
|
||||||
public final static String HBASE_TABLENAME_PROP = "hoodie.index.hbase.table";
|
public static final String HBASE_TABLENAME_PROP = "hoodie.index.hbase.table";
|
||||||
public final static String HBASE_GET_BATCH_SIZE_PROP = "hoodie.index.hbase.get.batch.size";
|
public static final String HBASE_GET_BATCH_SIZE_PROP = "hoodie.index.hbase.get.batch.size";
|
||||||
public final static String HBASE_PUT_BATCH_SIZE_PROP = "hoodie.index.hbase.put.batch.size";
|
public static final String HBASE_PUT_BATCH_SIZE_PROP = "hoodie.index.hbase.put.batch.size";
|
||||||
public final static String DEFAULT_HBASE_BATCH_SIZE = "100";
|
public static final String DEFAULT_HBASE_BATCH_SIZE = "100";
|
||||||
|
|
||||||
// ***** Bucketed Index Configs *****
|
// ***** Bucketed Index Configs *****
|
||||||
public final static String BUCKETED_INDEX_NUM_BUCKETS_PROP = "hoodie.index.bucketed.numbuckets";
|
public static final String BUCKETED_INDEX_NUM_BUCKETS_PROP = "hoodie.index.bucketed.numbuckets";
|
||||||
|
|
||||||
private HoodieIndexConfig(Properties props) {
|
private HoodieIndexConfig(Properties props) {
|
||||||
super(props);
|
super(props);
|
||||||
@@ -152,12 +154,12 @@ public class HoodieIndexConfig extends DefaultHoodieConfig {
|
|||||||
|
|
||||||
public HoodieIndexConfig build() {
|
public HoodieIndexConfig build() {
|
||||||
HoodieIndexConfig config = new HoodieIndexConfig(props);
|
HoodieIndexConfig config = new HoodieIndexConfig(props);
|
||||||
setDefaultOnCondition(props, !props.containsKey(INDEX_TYPE_PROP),
|
setDefaultOnCondition(props, !props.containsKey(INDEX_TYPE_PROP), INDEX_TYPE_PROP,
|
||||||
INDEX_TYPE_PROP, DEFAULT_INDEX_TYPE);
|
DEFAULT_INDEX_TYPE);
|
||||||
setDefaultOnCondition(props, !props.containsKey(BLOOM_FILTER_NUM_ENTRIES),
|
setDefaultOnCondition(props, !props.containsKey(BLOOM_FILTER_NUM_ENTRIES),
|
||||||
BLOOM_FILTER_NUM_ENTRIES, DEFAULT_BLOOM_FILTER_NUM_ENTRIES);
|
BLOOM_FILTER_NUM_ENTRIES, DEFAULT_BLOOM_FILTER_NUM_ENTRIES);
|
||||||
setDefaultOnCondition(props, !props.containsKey(BLOOM_FILTER_FPP),
|
setDefaultOnCondition(props, !props.containsKey(BLOOM_FILTER_FPP), BLOOM_FILTER_FPP,
|
||||||
BLOOM_FILTER_FPP, DEFAULT_BLOOM_FILTER_FPP);
|
DEFAULT_BLOOM_FILTER_FPP);
|
||||||
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_PARALLELISM_PROP),
|
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_PARALLELISM_PROP),
|
||||||
BLOOM_INDEX_PARALLELISM_PROP, DEFAULT_BLOOM_INDEX_PARALLELISM);
|
BLOOM_INDEX_PARALLELISM_PROP, DEFAULT_BLOOM_INDEX_PARALLELISM);
|
||||||
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_PRUNE_BY_RANGES_PROP),
|
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_PRUNE_BY_RANGES_PROP),
|
||||||
|
|||||||
@@ -30,8 +30,8 @@ import org.apache.spark.util.Utils;
|
|||||||
@Immutable
|
@Immutable
|
||||||
public class HoodieMemoryConfig extends DefaultHoodieConfig {
|
public class HoodieMemoryConfig extends DefaultHoodieConfig {
|
||||||
|
|
||||||
// This fraction is multiplied with the spark.memory.fraction to get a final fraction of heap space to use during merge
|
// This fraction is multiplied with the spark.memory.fraction to get a final fraction of heap space to use
|
||||||
// This makes it easier to scale this value as one increases the spark.executor.memory
|
// during merge. This makes it easier to scale this value as one increases the spark.executor.memory
|
||||||
public static final String MAX_MEMORY_FRACTION_FOR_MERGE_PROP = "hoodie.memory.merge.fraction";
|
public static final String MAX_MEMORY_FRACTION_FOR_MERGE_PROP = "hoodie.memory.merge.fraction";
|
||||||
// Default max memory fraction during hash-merge, excess spills to disk
|
// Default max memory fraction during hash-merge, excess spills to disk
|
||||||
public static final String DEFAULT_MAX_MEMORY_FRACTION_FOR_MERGE = String.valueOf(0.6);
|
public static final String DEFAULT_MAX_MEMORY_FRACTION_FOR_MERGE = String.valueOf(0.6);
|
||||||
@@ -87,19 +87,21 @@ public class HoodieMemoryConfig extends DefaultHoodieConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Dynamic calculation of max memory to use for for spillable map. user.available.memory =
|
* Dynamic calculation of max memory to use for for spillable map. user.available.memory = spark.executor.memory *
|
||||||
* spark.executor.memory * (1 - spark.memory.fraction) spillable.available.memory =
|
* (1 - spark.memory.fraction) spillable.available.memory = user.available.memory * hoodie.memory.fraction. Anytime
|
||||||
* user.available.memory * hoodie.memory.fraction. Anytime the spark.executor.memory or the
|
* the spark.executor.memory or the spark.memory.fraction is changed, the memory used for spillable map changes
|
||||||
* spark.memory.fraction is changed, the memory used for spillable map changes accordingly
|
* accordingly
|
||||||
*/
|
*/
|
||||||
private long getMaxMemoryAllowedForMerge(String maxMemoryFraction) {
|
private long getMaxMemoryAllowedForMerge(String maxMemoryFraction) {
|
||||||
final String SPARK_EXECUTOR_MEMORY_PROP = "spark.executor.memory";
|
final String SPARK_EXECUTOR_MEMORY_PROP = "spark.executor.memory";
|
||||||
final String SPARK_EXECUTOR_MEMORY_FRACTION_PROP = "spark.memory.fraction";
|
final String SPARK_EXECUTOR_MEMORY_FRACTION_PROP = "spark.memory.fraction";
|
||||||
// This is hard-coded in spark code {@link https://github.com/apache/spark/blob/576c43fb4226e4efa12189b41c3bc862019862c6/core/src/main/scala/org/apache/spark/memory/UnifiedMemoryManager.scala#L231}
|
// This is hard-coded in spark code {@link
|
||||||
// so have to re-define this here
|
// https://github.com/apache/spark/blob/576c43fb4226e4efa12189b41c3bc862019862c6/core/src/main/scala/org/apache/
|
||||||
|
// spark/memory/UnifiedMemoryManager.scala#L231} so have to re-define this here
|
||||||
final String DEFAULT_SPARK_EXECUTOR_MEMORY_FRACTION = "0.6";
|
final String DEFAULT_SPARK_EXECUTOR_MEMORY_FRACTION = "0.6";
|
||||||
// This is hard-coded in spark code {@link https://github.com/apache/spark/blob/576c43fb4226e4efa12189b41c3bc862019862c6/core/src/main/scala/org/apache/spark/SparkContext.scala#L471}
|
// This is hard-coded in spark code {@link
|
||||||
// so have to re-define this here
|
// https://github.com/apache/spark/blob/576c43fb4226e4efa12189b41c3bc862019862c6/core/src/main/scala/org/apache/
|
||||||
|
// spark/SparkContext.scala#L471} so have to re-define this here
|
||||||
final String DEFAULT_SPARK_EXECUTOR_MEMORY_MB = "1024"; // in MB
|
final String DEFAULT_SPARK_EXECUTOR_MEMORY_MB = "1024"; // in MB
|
||||||
|
|
||||||
if (SparkEnv.get() != null) {
|
if (SparkEnv.get() != null) {
|
||||||
@@ -109,7 +111,8 @@ public class HoodieMemoryConfig extends DefaultHoodieConfig {
|
|||||||
DEFAULT_SPARK_EXECUTOR_MEMORY_MB)) * 1024
|
DEFAULT_SPARK_EXECUTOR_MEMORY_MB)) * 1024
|
||||||
* 1024L);
|
* 1024L);
|
||||||
// 0.6 is the default value used by Spark,
|
// 0.6 is the default value used by Spark,
|
||||||
// look at {@link https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/SparkConf.scala#L507}
|
// look at {@link
|
||||||
|
// https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/SparkConf.scala#L507}
|
||||||
double memoryFraction = Double
|
double memoryFraction = Double
|
||||||
.valueOf(SparkEnv.get().conf().get(SPARK_EXECUTOR_MEMORY_FRACTION_PROP,
|
.valueOf(SparkEnv.get().conf().get(SPARK_EXECUTOR_MEMORY_FRACTION_PROP,
|
||||||
DEFAULT_SPARK_EXECUTOR_MEMORY_FRACTION));
|
DEFAULT_SPARK_EXECUTOR_MEMORY_FRACTION));
|
||||||
@@ -143,5 +146,4 @@ public class HoodieMemoryConfig extends DefaultHoodieConfig {
|
|||||||
return config;
|
return config;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -29,22 +29,22 @@ import javax.annotation.concurrent.Immutable;
|
|||||||
@Immutable
|
@Immutable
|
||||||
public class HoodieMetricsConfig extends DefaultHoodieConfig {
|
public class HoodieMetricsConfig extends DefaultHoodieConfig {
|
||||||
|
|
||||||
public final static String METRIC_PREFIX = "hoodie.metrics";
|
public static final String METRIC_PREFIX = "hoodie.metrics";
|
||||||
public final static String METRICS_ON = METRIC_PREFIX + ".on";
|
public static final String METRICS_ON = METRIC_PREFIX + ".on";
|
||||||
public final static boolean DEFAULT_METRICS_ON = false;
|
public static final boolean DEFAULT_METRICS_ON = false;
|
||||||
public final static String METRICS_REPORTER_TYPE = METRIC_PREFIX + ".reporter.type";
|
public static final String METRICS_REPORTER_TYPE = METRIC_PREFIX + ".reporter.type";
|
||||||
public final static MetricsReporterType DEFAULT_METRICS_REPORTER_TYPE =
|
public static final MetricsReporterType DEFAULT_METRICS_REPORTER_TYPE = MetricsReporterType
|
||||||
MetricsReporterType.GRAPHITE;
|
.GRAPHITE;
|
||||||
|
|
||||||
// Graphite
|
// Graphite
|
||||||
public final static String GRAPHITE_PREFIX = METRIC_PREFIX + ".graphite";
|
public static final String GRAPHITE_PREFIX = METRIC_PREFIX + ".graphite";
|
||||||
public final static String GRAPHITE_SERVER_HOST = GRAPHITE_PREFIX + ".host";
|
public static final String GRAPHITE_SERVER_HOST = GRAPHITE_PREFIX + ".host";
|
||||||
public final static String DEFAULT_GRAPHITE_SERVER_HOST = "localhost";
|
public static final String DEFAULT_GRAPHITE_SERVER_HOST = "localhost";
|
||||||
|
|
||||||
public final static String GRAPHITE_SERVER_PORT = GRAPHITE_PREFIX + ".port";
|
public static final String GRAPHITE_SERVER_PORT = GRAPHITE_PREFIX + ".port";
|
||||||
public final static int DEFAULT_GRAPHITE_SERVER_PORT = 4756;
|
public static final int DEFAULT_GRAPHITE_SERVER_PORT = 4756;
|
||||||
|
|
||||||
public final static String GRAPHITE_METRIC_PREFIX = GRAPHITE_PREFIX + ".metric.prefix";
|
public static final String GRAPHITE_METRIC_PREFIX = GRAPHITE_PREFIX + ".metric.prefix";
|
||||||
|
|
||||||
private HoodieMetricsConfig(Properties props) {
|
private HoodieMetricsConfig(Properties props) {
|
||||||
super(props);
|
super(props);
|
||||||
@@ -103,14 +103,14 @@ public class HoodieMetricsConfig extends DefaultHoodieConfig {
|
|||||||
HoodieMetricsConfig config = new HoodieMetricsConfig(props);
|
HoodieMetricsConfig config = new HoodieMetricsConfig(props);
|
||||||
setDefaultOnCondition(props, !props.containsKey(METRICS_ON), METRICS_ON,
|
setDefaultOnCondition(props, !props.containsKey(METRICS_ON), METRICS_ON,
|
||||||
String.valueOf(DEFAULT_METRICS_ON));
|
String.valueOf(DEFAULT_METRICS_ON));
|
||||||
setDefaultOnCondition(props, !props.containsKey(METRICS_REPORTER_TYPE),
|
setDefaultOnCondition(props, !props.containsKey(METRICS_REPORTER_TYPE), METRICS_REPORTER_TYPE,
|
||||||
METRICS_REPORTER_TYPE, DEFAULT_METRICS_REPORTER_TYPE.name());
|
DEFAULT_METRICS_REPORTER_TYPE.name());
|
||||||
setDefaultOnCondition(props, !props.containsKey(GRAPHITE_SERVER_HOST),
|
setDefaultOnCondition(props, !props.containsKey(GRAPHITE_SERVER_HOST), GRAPHITE_SERVER_HOST,
|
||||||
GRAPHITE_SERVER_HOST, DEFAULT_GRAPHITE_SERVER_HOST);
|
DEFAULT_GRAPHITE_SERVER_HOST);
|
||||||
setDefaultOnCondition(props, !props.containsKey(GRAPHITE_SERVER_PORT),
|
setDefaultOnCondition(props, !props.containsKey(GRAPHITE_SERVER_PORT), GRAPHITE_SERVER_PORT,
|
||||||
GRAPHITE_SERVER_PORT, String.valueOf(DEFAULT_GRAPHITE_SERVER_PORT));
|
String.valueOf(DEFAULT_GRAPHITE_SERVER_PORT));
|
||||||
setDefaultOnCondition(props, !props.containsKey(GRAPHITE_SERVER_PORT),
|
setDefaultOnCondition(props, !props.containsKey(GRAPHITE_SERVER_PORT), GRAPHITE_SERVER_PORT,
|
||||||
GRAPHITE_SERVER_PORT, String.valueOf(DEFAULT_GRAPHITE_SERVER_PORT));
|
String.valueOf(DEFAULT_GRAPHITE_SERVER_PORT));
|
||||||
return config;
|
return config;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -36,7 +36,8 @@ public class HoodieStorageConfig extends DefaultHoodieConfig {
|
|||||||
public static final String DEFAULT_PARQUET_PAGE_SIZE_BYTES = String.valueOf(1 * 1024 * 1024);
|
public static final String DEFAULT_PARQUET_PAGE_SIZE_BYTES = String.valueOf(1 * 1024 * 1024);
|
||||||
// used to size log files
|
// used to size log files
|
||||||
public static final String LOGFILE_SIZE_MAX_BYTES = "hoodie.logfile.max.size";
|
public static final String LOGFILE_SIZE_MAX_BYTES = "hoodie.logfile.max.size";
|
||||||
public static final String DEFAULT_LOGFILE_SIZE_MAX_BYTES = String.valueOf(1024*1024*1024); // 1 GB
|
public static final String DEFAULT_LOGFILE_SIZE_MAX_BYTES = String
|
||||||
|
.valueOf(1024 * 1024 * 1024); // 1 GB
|
||||||
// used to size data blocks in log file
|
// used to size data blocks in log file
|
||||||
public static final String LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES = "hoodie.logfile.data.block.max.size";
|
public static final String LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES = "hoodie.logfile.data.block.max.size";
|
||||||
public static final String DEFAULT_LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES = String.valueOf(256 * 1024 * 1024); // 256 MB
|
public static final String DEFAULT_LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES = String.valueOf(256 * 1024 * 1024); // 256 MB
|
||||||
|
|||||||
@@ -16,7 +16,6 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.config;
|
package com.uber.hoodie.config;
|
||||||
|
|
||||||
|
|
||||||
import com.google.common.base.Preconditions;
|
import com.google.common.base.Preconditions;
|
||||||
import com.uber.hoodie.WriteStatus;
|
import com.uber.hoodie.WriteStatus;
|
||||||
import com.uber.hoodie.common.model.HoodieCleaningPolicy;
|
import com.uber.hoodie.common.model.HoodieCleaningPolicy;
|
||||||
@@ -24,15 +23,14 @@ import com.uber.hoodie.common.util.ReflectionUtils;
|
|||||||
import com.uber.hoodie.index.HoodieIndex;
|
import com.uber.hoodie.index.HoodieIndex;
|
||||||
import com.uber.hoodie.io.compact.strategy.CompactionStrategy;
|
import com.uber.hoodie.io.compact.strategy.CompactionStrategy;
|
||||||
import com.uber.hoodie.metrics.MetricsReporterType;
|
import com.uber.hoodie.metrics.MetricsReporterType;
|
||||||
import org.apache.spark.storage.StorageLevel;
|
|
||||||
|
|
||||||
import javax.annotation.concurrent.Immutable;
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileReader;
|
import java.io.FileReader;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Properties;
|
import java.util.Properties;
|
||||||
|
import javax.annotation.concurrent.Immutable;
|
||||||
|
import org.apache.spark.storage.StorageLevel;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Class storing configs for the {@link com.uber.hoodie.HoodieWriteClient}
|
* Class storing configs for the {@link com.uber.hoodie.HoodieWriteClient}
|
||||||
@@ -40,9 +38,9 @@ import java.util.Properties;
|
|||||||
@Immutable
|
@Immutable
|
||||||
public class HoodieWriteConfig extends DefaultHoodieConfig {
|
public class HoodieWriteConfig extends DefaultHoodieConfig {
|
||||||
|
|
||||||
|
public static final String TABLE_NAME = "hoodie.table.name";
|
||||||
private static final String BASE_PATH_PROP = "hoodie.base.path";
|
private static final String BASE_PATH_PROP = "hoodie.base.path";
|
||||||
private static final String AVRO_SCHEMA = "hoodie.avro.schema";
|
private static final String AVRO_SCHEMA = "hoodie.avro.schema";
|
||||||
public static final String TABLE_NAME = "hoodie.table.name";
|
|
||||||
private static final String DEFAULT_PARALLELISM = "200";
|
private static final String DEFAULT_PARALLELISM = "200";
|
||||||
private static final String INSERT_PARALLELISM = "hoodie.insert.shuffle.parallelism";
|
private static final String INSERT_PARALLELISM = "hoodie.insert.shuffle.parallelism";
|
||||||
private static final String BULKINSERT_PARALLELISM = "hoodie.bulkinsert.shuffle.parallelism";
|
private static final String BULKINSERT_PARALLELISM = "hoodie.bulkinsert.shuffle.parallelism";
|
||||||
@@ -57,13 +55,16 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
|||||||
private static final String DEFAULT_WRITE_STATUS_STORAGE_LEVEL = "MEMORY_AND_DISK_SER";
|
private static final String DEFAULT_WRITE_STATUS_STORAGE_LEVEL = "MEMORY_AND_DISK_SER";
|
||||||
private static final String HOODIE_AUTO_COMMIT_PROP = "hoodie.auto.commit";
|
private static final String HOODIE_AUTO_COMMIT_PROP = "hoodie.auto.commit";
|
||||||
private static final String DEFAULT_HOODIE_AUTO_COMMIT = "true";
|
private static final String DEFAULT_HOODIE_AUTO_COMMIT = "true";
|
||||||
private static final String HOODIE_ASSUME_DATE_PARTITIONING_PROP = "hoodie.assume.date.partitioning";
|
private static final String HOODIE_ASSUME_DATE_PARTITIONING_PROP =
|
||||||
|
"hoodie.assume.date" + ".partitioning";
|
||||||
private static final String DEFAULT_ASSUME_DATE_PARTITIONING = "false";
|
private static final String DEFAULT_ASSUME_DATE_PARTITIONING = "false";
|
||||||
private static final String HOODIE_WRITE_STATUS_CLASS_PROP = "hoodie.writestatus.class";
|
private static final String HOODIE_WRITE_STATUS_CLASS_PROP = "hoodie.writestatus.class";
|
||||||
private static final String DEFAULT_HOODIE_WRITE_STATUS_CLASS = WriteStatus.class.getName();
|
private static final String DEFAULT_HOODIE_WRITE_STATUS_CLASS = WriteStatus.class.getName();
|
||||||
private static final String HOODIE_COPYONWRITE_USE_TEMP_FOLDER_CREATE = "hoodie.copyonwrite.use.temp.folder.for.create";
|
private static final String HOODIE_COPYONWRITE_USE_TEMP_FOLDER_CREATE =
|
||||||
|
"hoodie.copyonwrite.use" + ".temp.folder.for.create";
|
||||||
private static final String DEFAULT_HOODIE_COPYONWRITE_USE_TEMP_FOLDER_CREATE = "false";
|
private static final String DEFAULT_HOODIE_COPYONWRITE_USE_TEMP_FOLDER_CREATE = "false";
|
||||||
private static final String HOODIE_COPYONWRITE_USE_TEMP_FOLDER_MERGE = "hoodie.copyonwrite.use.temp.folder.for.merge";
|
private static final String HOODIE_COPYONWRITE_USE_TEMP_FOLDER_MERGE =
|
||||||
|
"hoodie.copyonwrite.use" + ".temp.folder.for.merge";
|
||||||
private static final String DEFAULT_HOODIE_COPYONWRITE_USE_TEMP_FOLDER_MERGE = "false";
|
private static final String DEFAULT_HOODIE_COPYONWRITE_USE_TEMP_FOLDER_MERGE = "false";
|
||||||
private static final String FINALIZE_WRITE_PARALLELISM = "hoodie.finalize.write.parallelism";
|
private static final String FINALIZE_WRITE_PARALLELISM = "hoodie.finalize.write.parallelism";
|
||||||
private static final String DEFAULT_FINALIZE_WRITE_PARALLELISM = DEFAULT_PARALLELISM;
|
private static final String DEFAULT_FINALIZE_WRITE_PARALLELISM = DEFAULT_PARALLELISM;
|
||||||
@@ -72,6 +73,10 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
|||||||
super(props);
|
super(props);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static HoodieWriteConfig.Builder newBuilder() {
|
||||||
|
return new Builder();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* base properties
|
* base properties
|
||||||
**/
|
**/
|
||||||
@@ -137,8 +142,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public boolean shouldUseTempFolderForCopyOnWrite() {
|
public boolean shouldUseTempFolderForCopyOnWrite() {
|
||||||
return shouldUseTempFolderForCopyOnWriteForCreate() ||
|
return shouldUseTempFolderForCopyOnWriteForCreate()
|
||||||
shouldUseTempFolderForCopyOnWriteForMerge();
|
|| shouldUseTempFolderForCopyOnWriteForMerge();
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getFinalizeWriteParallelism() {
|
public int getFinalizeWriteParallelism() {
|
||||||
@@ -154,8 +159,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public int getCleanerFileVersionsRetained() {
|
public int getCleanerFileVersionsRetained() {
|
||||||
return Integer.parseInt(
|
return Integer
|
||||||
props.getProperty(HoodieCompactionConfig.CLEANER_FILE_VERSIONS_RETAINED_PROP));
|
.parseInt(props.getProperty(HoodieCompactionConfig.CLEANER_FILE_VERSIONS_RETAINED_PROP));
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getCleanerCommitsRetained() {
|
public int getCleanerCommitsRetained() {
|
||||||
@@ -177,8 +182,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public int getCopyOnWriteInsertSplitSize() {
|
public int getCopyOnWriteInsertSplitSize() {
|
||||||
return Integer.parseInt(
|
return Integer
|
||||||
props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE));
|
.parseInt(props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE));
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getCopyOnWriteRecordSizeEstimate() {
|
public int getCopyOnWriteRecordSizeEstimate() {
|
||||||
@@ -204,8 +209,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public int getInlineCompactDeltaCommitMax() {
|
public int getInlineCompactDeltaCommitMax() {
|
||||||
return Integer.parseInt(
|
return Integer
|
||||||
props.getProperty(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS_PROP));
|
.parseInt(props.getProperty(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS_PROP));
|
||||||
}
|
}
|
||||||
|
|
||||||
public CompactionStrategy getCompactionStrategy() {
|
public CompactionStrategy getCompactionStrategy() {
|
||||||
@@ -341,10 +346,6 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
|||||||
return props.getProperty(HoodieMetricsConfig.GRAPHITE_METRIC_PREFIX);
|
return props.getProperty(HoodieMetricsConfig.GRAPHITE_METRIC_PREFIX);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static HoodieWriteConfig.Builder newBuilder() {
|
|
||||||
return new Builder();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* memory configs
|
* memory configs
|
||||||
*/
|
*/
|
||||||
@@ -486,15 +487,15 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
|||||||
|
|
||||||
public Builder withUseTempFolderCopyOnWriteForCreate(
|
public Builder withUseTempFolderCopyOnWriteForCreate(
|
||||||
boolean shouldUseTempFolderCopyOnWriteForCreate) {
|
boolean shouldUseTempFolderCopyOnWriteForCreate) {
|
||||||
props.setProperty(HOODIE_COPYONWRITE_USE_TEMP_FOLDER_CREATE, String.valueOf
|
props.setProperty(HOODIE_COPYONWRITE_USE_TEMP_FOLDER_CREATE,
|
||||||
(shouldUseTempFolderCopyOnWriteForCreate));
|
String.valueOf(shouldUseTempFolderCopyOnWriteForCreate));
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Builder withUseTempFolderCopyOnWriteForMerge(
|
public Builder withUseTempFolderCopyOnWriteForMerge(
|
||||||
boolean shouldUseTempFolderCopyOnWriteForMerge) {
|
boolean shouldUseTempFolderCopyOnWriteForMerge) {
|
||||||
props.setProperty(HOODIE_COPYONWRITE_USE_TEMP_FOLDER_MERGE, String.valueOf
|
props.setProperty(HOODIE_COPYONWRITE_USE_TEMP_FOLDER_MERGE,
|
||||||
(shouldUseTempFolderCopyOnWriteForMerge));
|
String.valueOf(shouldUseTempFolderCopyOnWriteForMerge));
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -510,8 +511,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
|||||||
setDefaultOnCondition(props, !props.containsKey(INSERT_PARALLELISM), INSERT_PARALLELISM,
|
setDefaultOnCondition(props, !props.containsKey(INSERT_PARALLELISM), INSERT_PARALLELISM,
|
||||||
DEFAULT_PARALLELISM);
|
DEFAULT_PARALLELISM);
|
||||||
setDefaultOnCondition(props, !props.containsKey(BULKINSERT_PARALLELISM),
|
setDefaultOnCondition(props, !props.containsKey(BULKINSERT_PARALLELISM),
|
||||||
BULKINSERT_PARALLELISM,
|
BULKINSERT_PARALLELISM, DEFAULT_PARALLELISM);
|
||||||
DEFAULT_PARALLELISM);
|
|
||||||
setDefaultOnCondition(props, !props.containsKey(UPSERT_PARALLELISM), UPSERT_PARALLELISM,
|
setDefaultOnCondition(props, !props.containsKey(UPSERT_PARALLELISM), UPSERT_PARALLELISM,
|
||||||
DEFAULT_PARALLELISM);
|
DEFAULT_PARALLELISM);
|
||||||
setDefaultOnCondition(props, !props.containsKey(COMBINE_BEFORE_INSERT_PROP),
|
setDefaultOnCondition(props, !props.containsKey(COMBINE_BEFORE_INSERT_PROP),
|
||||||
|
|||||||
@@ -21,12 +21,6 @@ import com.google.common.base.Preconditions;
|
|||||||
import com.uber.hoodie.common.model.HoodieRecord;
|
import com.uber.hoodie.common.model.HoodieRecord;
|
||||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||||
import com.uber.hoodie.exception.HoodieException;
|
import com.uber.hoodie.exception.HoodieException;
|
||||||
import org.apache.avro.Schema;
|
|
||||||
import org.apache.avro.generic.IndexedRecord;
|
|
||||||
import org.apache.log4j.LogManager;
|
|
||||||
import org.apache.log4j.Logger;
|
|
||||||
import org.apache.spark.util.SizeEstimator;
|
|
||||||
|
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.concurrent.LinkedBlockingQueue;
|
import java.util.concurrent.LinkedBlockingQueue;
|
||||||
@@ -35,53 +29,63 @@ import java.util.concurrent.TimeUnit;
|
|||||||
import java.util.concurrent.atomic.AtomicBoolean;
|
import java.util.concurrent.atomic.AtomicBoolean;
|
||||||
import java.util.concurrent.atomic.AtomicLong;
|
import java.util.concurrent.atomic.AtomicLong;
|
||||||
import java.util.concurrent.atomic.AtomicReference;
|
import java.util.concurrent.atomic.AtomicReference;
|
||||||
|
import org.apache.avro.Schema;
|
||||||
|
import org.apache.avro.generic.IndexedRecord;
|
||||||
|
import org.apache.log4j.LogManager;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import org.apache.spark.util.SizeEstimator;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Used for buffering input records. Buffer limit is controlled by {@link #bufferMemoryLimit}. It internally samples
|
* Used for buffering input records. Buffer limit is controlled by {@link #bufferMemoryLimit}. It
|
||||||
* every {@link #RECORD_SAMPLING_RATE}th record and adjusts number of records in buffer accordingly. This is done to
|
* internally samples every {@link #RECORD_SAMPLING_RATE}th record and adjusts number of records in
|
||||||
* ensure that we don't OOM.
|
* buffer accordingly. This is done to ensure that we don't OOM.
|
||||||
*/
|
*/
|
||||||
public class BufferedIterator<K extends HoodieRecordPayload, T extends HoodieRecord<K>>
|
public class BufferedIterator<K extends HoodieRecordPayload, T extends HoodieRecord<K>> implements
|
||||||
implements Iterator<BufferedIterator.BufferedIteratorPayload<T>> {
|
Iterator<BufferedIterator.BufferedIteratorPayload<T>> {
|
||||||
|
|
||||||
private static Logger logger = LogManager.getLogger(BufferedIterator.class);
|
|
||||||
// interval used for polling records in the queue.
|
// interval used for polling records in the queue.
|
||||||
public static final int RECORD_POLL_INTERVAL_SEC = 5;
|
public static final int RECORD_POLL_INTERVAL_SEC = 5;
|
||||||
// rate used for sampling records to determine avg record size in bytes.
|
// rate used for sampling records to determine avg record size in bytes.
|
||||||
public static final int RECORD_SAMPLING_RATE = 64;
|
public static final int RECORD_SAMPLING_RATE = 64;
|
||||||
// maximum records that will be cached
|
// maximum records that will be cached
|
||||||
private static final int RECORD_CACHING_LIMIT = 128 * 1024;
|
private static final int RECORD_CACHING_LIMIT = 128 * 1024;
|
||||||
// It indicates number of records to cache. We will be using sampled record's average size to determine how many
|
private static Logger logger = LogManager.getLogger(BufferedIterator.class);
|
||||||
|
// It indicates number of records to cache. We will be using sampled record's average size to
|
||||||
|
// determine how many
|
||||||
// records we should cache and will change (increase/decrease) permits accordingly.
|
// records we should cache and will change (increase/decrease) permits accordingly.
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
public final Semaphore rateLimiter = new Semaphore(1);
|
public final Semaphore rateLimiter = new Semaphore(1);
|
||||||
// used for sampling records with "RECORD_SAMPLING_RATE" frequency.
|
// used for sampling records with "RECORD_SAMPLING_RATE" frequency.
|
||||||
public final AtomicLong samplingRecordCounter = new AtomicLong(-1);
|
public final AtomicLong samplingRecordCounter = new AtomicLong(-1);
|
||||||
// indicates rate limit (number of records to cache). it is updated whenever there is a change in avg record size.
|
|
||||||
@VisibleForTesting
|
|
||||||
public int currentRateLimit = 1;
|
|
||||||
// internal buffer to cache buffered records.
|
// internal buffer to cache buffered records.
|
||||||
private final LinkedBlockingQueue<Optional<BufferedIteratorPayload<T>>> buffer = new LinkedBlockingQueue<>();
|
private final LinkedBlockingQueue<Optional<BufferedIteratorPayload<T>>> buffer = new
|
||||||
|
LinkedBlockingQueue<>();
|
||||||
// maximum amount of memory to be used for buffering records.
|
// maximum amount of memory to be used for buffering records.
|
||||||
private final long bufferMemoryLimit;
|
private final long bufferMemoryLimit;
|
||||||
|
// original iterator from where records are read for buffering.
|
||||||
|
private final Iterator<T> inputIterator;
|
||||||
|
// it holds the root cause of the exception in case either buffering records (reading from
|
||||||
|
// inputIterator) fails or
|
||||||
|
// thread reading records from buffer fails.
|
||||||
|
private final AtomicReference<Exception> hasFailed = new AtomicReference(null);
|
||||||
|
// used for indicating that all the records from buffer are read successfully.
|
||||||
|
private final AtomicBoolean isDone = new AtomicBoolean(false);
|
||||||
|
// schema used for fetching insertValue from HoodieRecord.
|
||||||
|
private final Schema schema;
|
||||||
|
// indicates rate limit (number of records to cache). it is updated whenever there is a change
|
||||||
|
// in avg record size.
|
||||||
|
@VisibleForTesting
|
||||||
|
public int currentRateLimit = 1;
|
||||||
// indicates avg record size in bytes. It is updated whenever a new record is sampled.
|
// indicates avg record size in bytes. It is updated whenever a new record is sampled.
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
public long avgRecordSizeInBytes = 0;
|
public long avgRecordSizeInBytes = 0;
|
||||||
// indicates number of samples collected so far.
|
// indicates number of samples collected so far.
|
||||||
private long numSamples = 0;
|
private long numSamples = 0;
|
||||||
// original iterator from where records are read for buffering.
|
|
||||||
private final Iterator<T> inputIterator;
|
|
||||||
// it holds the root cause of the exception in case either buffering records (reading from inputIterator) fails or
|
|
||||||
// thread reading records from buffer fails.
|
|
||||||
private final AtomicReference<Exception> hasFailed = new AtomicReference(null);
|
|
||||||
// used for indicating that all the records from buffer are read successfully.
|
|
||||||
private final AtomicBoolean isDone = new AtomicBoolean(false);
|
|
||||||
// next record to be read from buffer.
|
// next record to be read from buffer.
|
||||||
private BufferedIteratorPayload<T> nextRecord;
|
private BufferedIteratorPayload<T> nextRecord;
|
||||||
// schema used for fetching insertValue from HoodieRecord.
|
|
||||||
private final Schema schema;
|
|
||||||
|
|
||||||
public BufferedIterator(final Iterator<T> iterator, final long bufferMemoryLimit, final Schema schema) {
|
public BufferedIterator(final Iterator<T> iterator, final long bufferMemoryLimit,
|
||||||
|
final Schema schema) {
|
||||||
this.inputIterator = iterator;
|
this.inputIterator = iterator;
|
||||||
this.bufferMemoryLimit = bufferMemoryLimit;
|
this.bufferMemoryLimit = bufferMemoryLimit;
|
||||||
this.schema = schema;
|
this.schema = schema;
|
||||||
@@ -92,23 +96,28 @@ public class BufferedIterator<K extends HoodieRecordPayload, T extends HoodieRec
|
|||||||
return this.buffer.size();
|
return this.buffer.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
// It samples records with "RECORD_SAMPLING_RATE" frequency and computes average record size in bytes. It is used
|
// It samples records with "RECORD_SAMPLING_RATE" frequency and computes average record size in
|
||||||
// for determining how many maximum records to buffer. Based on change in avg size it may increase or decrease
|
// bytes. It is used
|
||||||
|
// for determining how many maximum records to buffer. Based on change in avg size it may
|
||||||
|
// increase or decrease
|
||||||
// available permits.
|
// available permits.
|
||||||
private void adjustBufferSizeIfNeeded(final T record) throws InterruptedException {
|
private void adjustBufferSizeIfNeeded(final T record) throws InterruptedException {
|
||||||
if (this.samplingRecordCounter.incrementAndGet() % RECORD_SAMPLING_RATE != 0) {
|
if (this.samplingRecordCounter.incrementAndGet() % RECORD_SAMPLING_RATE != 0) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
final long recordSizeInBytes = SizeEstimator.estimate(record);
|
final long recordSizeInBytes = SizeEstimator.estimate(record);
|
||||||
final long newAvgRecordSizeInBytes =
|
final long newAvgRecordSizeInBytes = Math
|
||||||
Math.max(1, (avgRecordSizeInBytes * numSamples + recordSizeInBytes) / (numSamples + 1));
|
.max(1, (avgRecordSizeInBytes * numSamples + recordSizeInBytes) / (numSamples + 1));
|
||||||
final int newRateLimit =
|
final int newRateLimit = (int) Math
|
||||||
(int) Math.min(RECORD_CACHING_LIMIT, Math.max(1, this.bufferMemoryLimit / newAvgRecordSizeInBytes));
|
.min(RECORD_CACHING_LIMIT, Math.max(1, this.bufferMemoryLimit / newAvgRecordSizeInBytes));
|
||||||
// System.out.println("recordSizeInBytes:" + recordSizeInBytes + ":newAvgRecordSizeInBytes:" + newAvgRecordSizeInBytes
|
// System.out.println("recordSizeInBytes:" + recordSizeInBytes + ":newAvgRecordSizeInBytes:" +
|
||||||
// + ":newRateLimit:" + newRateLimit + ":currentRateLimit:" + currentRateLimit + ":numSamples:" + numSamples
|
// newAvgRecordSizeInBytes
|
||||||
|
// + ":newRateLimit:" + newRateLimit + ":currentRateLimit:" + currentRateLimit +
|
||||||
|
// ":numSamples:" + numSamples
|
||||||
// + ":avgRecordSizeInBytes:" + avgRecordSizeInBytes);
|
// + ":avgRecordSizeInBytes:" + avgRecordSizeInBytes);
|
||||||
|
|
||||||
// If there is any change in number of records to cache then we will either release (if it increased) or acquire
|
// If there is any change in number of records to cache then we will either release (if it
|
||||||
|
// increased) or acquire
|
||||||
// (if it decreased) to adjust rate limiting to newly computed value.
|
// (if it decreased) to adjust rate limiting to newly computed value.
|
||||||
if (newRateLimit > currentRateLimit) {
|
if (newRateLimit > currentRateLimit) {
|
||||||
rateLimiter.release(newRateLimit - currentRateLimit);
|
rateLimiter.release(newRateLimit - currentRateLimit);
|
||||||
@@ -120,12 +129,14 @@ public class BufferedIterator<K extends HoodieRecordPayload, T extends HoodieRec
|
|||||||
numSamples++;
|
numSamples++;
|
||||||
}
|
}
|
||||||
|
|
||||||
// inserts record into internal buffer. It also fetches insert value from the record to offload computation work on to
|
// inserts record into internal buffer. It also fetches insert value from the record to offload
|
||||||
|
// computation work on to
|
||||||
// buffering thread.
|
// buffering thread.
|
||||||
private void insertRecord(T t) throws Exception {
|
private void insertRecord(T t) throws Exception {
|
||||||
rateLimiter.acquire();
|
rateLimiter.acquire();
|
||||||
adjustBufferSizeIfNeeded(t);
|
adjustBufferSizeIfNeeded(t);
|
||||||
// We are retrieving insert value in the record buffering thread to offload computation around schema validation
|
// We are retrieving insert value in the record buffering thread to offload computation
|
||||||
|
// around schema validation
|
||||||
// and record creation to it.
|
// and record creation to it.
|
||||||
final BufferedIteratorPayload<T> payload = new BufferedIteratorPayload<>(t, this.schema);
|
final BufferedIteratorPayload<T> payload = new BufferedIteratorPayload<>(t, this.schema);
|
||||||
buffer.put(Optional.of(payload));
|
buffer.put(Optional.of(payload));
|
||||||
@@ -198,12 +209,15 @@ public class BufferedIterator<K extends HoodieRecordPayload, T extends HoodieRec
|
|||||||
|
|
||||||
public void markAsFailed(Exception e) {
|
public void markAsFailed(Exception e) {
|
||||||
this.hasFailed.set(e);
|
this.hasFailed.set(e);
|
||||||
// release the permits so that if the buffering thread is waiting for permits then it will get it.
|
// release the permits so that if the buffering thread is waiting for permits then it will
|
||||||
|
// get it.
|
||||||
this.rateLimiter.release(RECORD_CACHING_LIMIT + 1);
|
this.rateLimiter.release(RECORD_CACHING_LIMIT + 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Used for caching HoodieRecord along with insertValue. We need this to offload computation work to buffering thread.
|
// Used for caching HoodieRecord along with insertValue. We need this to offload computation
|
||||||
|
// work to buffering thread.
|
||||||
static class BufferedIteratorPayload<T extends HoodieRecord> {
|
static class BufferedIteratorPayload<T extends HoodieRecord> {
|
||||||
|
|
||||||
public T record;
|
public T record;
|
||||||
public Optional<IndexedRecord> insertValue;
|
public Optional<IndexedRecord> insertValue;
|
||||||
// It caches the exception seen while fetching insert value.
|
// It caches the exception seen while fetching insert value.
|
||||||
|
|||||||
@@ -29,8 +29,8 @@ import org.apache.spark.api.java.function.Function2;
|
|||||||
/**
|
/**
|
||||||
* Map function that handles a sorted stream of HoodieRecords
|
* Map function that handles a sorted stream of HoodieRecords
|
||||||
*/
|
*/
|
||||||
public class BulkInsertMapFunction<T extends HoodieRecordPayload>
|
public class BulkInsertMapFunction<T extends HoodieRecordPayload> implements
|
||||||
implements Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<List<WriteStatus>>> {
|
Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<List<WriteStatus>>> {
|
||||||
|
|
||||||
private String commitTime;
|
private String commitTime;
|
||||||
private HoodieWriteConfig config;
|
private HoodieWriteConfig config;
|
||||||
@@ -45,8 +45,7 @@ public class BulkInsertMapFunction<T extends HoodieRecordPayload>
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Iterator<List<WriteStatus>> call(Integer partition,
|
public Iterator<List<WriteStatus>> call(Integer partition,
|
||||||
Iterator<HoodieRecord<T>> sortedRecordItr)
|
Iterator<HoodieRecord<T>> sortedRecordItr) throws Exception {
|
||||||
throws Exception {
|
|
||||||
return new LazyInsertIterable<>(sortedRecordItr, config, commitTime, hoodieTable);
|
return new LazyInsertIterable<>(sortedRecordItr, config, commitTime, hoodieTable);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -24,11 +24,6 @@ import com.uber.hoodie.exception.HoodieException;
|
|||||||
import com.uber.hoodie.io.HoodieCreateHandle;
|
import com.uber.hoodie.io.HoodieCreateHandle;
|
||||||
import com.uber.hoodie.io.HoodieIOHandle;
|
import com.uber.hoodie.io.HoodieIOHandle;
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
import org.apache.log4j.LogManager;
|
|
||||||
import org.apache.log4j.Logger;
|
|
||||||
import org.apache.spark.TaskContext;
|
|
||||||
import org.apache.spark.TaskContext$;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
@@ -38,6 +33,10 @@ import java.util.Set;
|
|||||||
import java.util.concurrent.ExecutorService;
|
import java.util.concurrent.ExecutorService;
|
||||||
import java.util.concurrent.Executors;
|
import java.util.concurrent.Executors;
|
||||||
import java.util.concurrent.Future;
|
import java.util.concurrent.Future;
|
||||||
|
import org.apache.log4j.LogManager;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import org.apache.spark.TaskContext;
|
||||||
|
import org.apache.spark.TaskContext$;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Lazy Iterable, that writes a stream of HoodieRecords sorted by the partitionPath, into new
|
* Lazy Iterable, that writes a stream of HoodieRecords sorted by the partitionPath, into new
|
||||||
@@ -68,21 +67,22 @@ public class LazyInsertIterable<T extends HoodieRecordPayload> extends
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected List<WriteStatus> computeNext() {
|
protected List<WriteStatus> computeNext() {
|
||||||
// Need to set current spark thread's TaskContext into newly launched thread so that new thread can access
|
// Need to set current spark thread's TaskContext into newly launched thread so that new
|
||||||
|
// thread can access
|
||||||
// TaskContext properties.
|
// TaskContext properties.
|
||||||
final TaskContext sparkThreadTaskContext = TaskContext.get();
|
final TaskContext sparkThreadTaskContext = TaskContext.get();
|
||||||
// Executor service used for launching writer thread.
|
// Executor service used for launching writer thread.
|
||||||
final ExecutorService writerService = Executors.newFixedThreadPool(1);
|
final ExecutorService writerService = Executors.newFixedThreadPool(1);
|
||||||
try {
|
try {
|
||||||
// Used for buffering records which is controlled by HoodieWriteConfig#WRITE_BUFFER_LIMIT_BYTES.
|
// Used for buffering records which is controlled by
|
||||||
final BufferedIterator<T, HoodieRecord<T>> bufferedIterator =
|
// HoodieWriteConfig#WRITE_BUFFER_LIMIT_BYTES.
|
||||||
new BufferedIterator<>(inputItr, hoodieConfig.getWriteBufferLimitBytes(),
|
final BufferedIterator<T, HoodieRecord<T>> bufferedIterator = new BufferedIterator<>(inputItr,
|
||||||
|
hoodieConfig.getWriteBufferLimitBytes(),
|
||||||
HoodieIOHandle.createHoodieWriteSchema(hoodieConfig));
|
HoodieIOHandle.createHoodieWriteSchema(hoodieConfig));
|
||||||
Future<List<WriteStatus>> writerResult =
|
Future<List<WriteStatus>> writerResult = writerService.submit(() -> {
|
||||||
writerService.submit(
|
|
||||||
() -> {
|
|
||||||
logger.info("starting hoodie writer thread");
|
logger.info("starting hoodie writer thread");
|
||||||
// Passing parent thread's TaskContext to newly launched thread for it to access original TaskContext
|
// Passing parent thread's TaskContext to newly launched thread for it to access original
|
||||||
|
// TaskContext
|
||||||
// properties.
|
// properties.
|
||||||
TaskContext$.MODULE$.setTaskContext(sparkThreadTaskContext);
|
TaskContext$.MODULE$.setTaskContext(sparkThreadTaskContext);
|
||||||
List<WriteStatus> statuses = new LinkedList<>();
|
List<WriteStatus> statuses = new LinkedList<>();
|
||||||
@@ -96,7 +96,8 @@ public class LazyInsertIterable<T extends HoodieRecordPayload> extends
|
|||||||
throw e;
|
throw e;
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
// Buffering records into internal buffer. This can throw exception either if reading records from spark fails or
|
// Buffering records into internal buffer. This can throw exception either if reading
|
||||||
|
// records from spark fails or
|
||||||
// if writing buffered records into parquet file fails.
|
// if writing buffered records into parquet file fails.
|
||||||
bufferedIterator.startBuffering();
|
bufferedIterator.startBuffering();
|
||||||
logger.info("waiting for hoodie write to finish");
|
logger.info("waiting for hoodie write to finish");
|
||||||
@@ -110,28 +111,27 @@ public class LazyInsertIterable<T extends HoodieRecordPayload> extends
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<WriteStatus> handleWrite(final BufferedIterator<T, HoodieRecord<T>> bufferedIterator) {
|
private List<WriteStatus> handleWrite(
|
||||||
|
final BufferedIterator<T, HoodieRecord<T>> bufferedIterator) {
|
||||||
List<WriteStatus> statuses = new ArrayList<>();
|
List<WriteStatus> statuses = new ArrayList<>();
|
||||||
while (bufferedIterator.hasNext()) {
|
while (bufferedIterator.hasNext()) {
|
||||||
final BufferedIterator.BufferedIteratorPayload<HoodieRecord<T>> payload = bufferedIterator.next();
|
final BufferedIterator.BufferedIteratorPayload<HoodieRecord<T>> payload = bufferedIterator
|
||||||
|
.next();
|
||||||
|
|
||||||
// clean up any partial failures
|
// clean up any partial failures
|
||||||
if (!partitionsCleaned.contains(payload.record.getPartitionPath())) {
|
if (!partitionsCleaned.contains(payload.record.getPartitionPath())) {
|
||||||
// This insert task could fail multiple times, but Spark will faithfully retry with
|
// This insert task could fail multiple times, but Spark will faithfully retry with
|
||||||
// the same data again. Thus, before we open any files under a given partition, we
|
// the same data again. Thus, before we open any files under a given partition, we
|
||||||
// first delete any files in the same partitionPath written by same Spark partition
|
// first delete any files in the same partitionPath written by same Spark partition
|
||||||
HoodieIOHandle.cleanupTmpFilesFromCurrentCommit(hoodieConfig,
|
HoodieIOHandle.cleanupTmpFilesFromCurrentCommit(hoodieConfig, commitTime,
|
||||||
commitTime,
|
payload.record.getPartitionPath(), TaskContext.getPartitionId(), hoodieTable);
|
||||||
payload.record.getPartitionPath(),
|
|
||||||
TaskContext.getPartitionId(),
|
|
||||||
hoodieTable);
|
|
||||||
partitionsCleaned.add(payload.record.getPartitionPath());
|
partitionsCleaned.add(payload.record.getPartitionPath());
|
||||||
}
|
}
|
||||||
|
|
||||||
// lazily initialize the handle, for the first time
|
// lazily initialize the handle, for the first time
|
||||||
if (handle == null) {
|
if (handle == null) {
|
||||||
handle =
|
handle = new HoodieCreateHandle(hoodieConfig, commitTime, hoodieTable,
|
||||||
new HoodieCreateHandle(hoodieConfig, commitTime, hoodieTable, payload.record.getPartitionPath());
|
payload.record.getPartitionPath());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (handle.canWrite(payload.record)) {
|
if (handle.canWrite(payload.record)) {
|
||||||
@@ -141,9 +141,10 @@ public class LazyInsertIterable<T extends HoodieRecordPayload> extends
|
|||||||
// handle is full.
|
// handle is full.
|
||||||
statuses.add(handle.close());
|
statuses.add(handle.close());
|
||||||
// Need to handle the rejected payload & open new handle
|
// Need to handle the rejected payload & open new handle
|
||||||
handle =
|
handle = new HoodieCreateHandle(hoodieConfig, commitTime, hoodieTable,
|
||||||
new HoodieCreateHandle(hoodieConfig, commitTime, hoodieTable, payload.record.getPartitionPath());
|
payload.record.getPartitionPath());
|
||||||
handle.write(payload.record, payload.insertValue, payload.exception); // we should be able to write 1 payload.
|
handle.write(payload.record, payload.insertValue,
|
||||||
|
payload.exception); // we should be able to write 1 payload.
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -23,9 +23,9 @@ import java.util.Iterator;
|
|||||||
* inputItr classes in order to simplify the implementation of lazy iterators for mapPartitions use
|
* inputItr classes in order to simplify the implementation of lazy iterators for mapPartitions use
|
||||||
* cases. Note [SPARK-3369], which gives the reasons for backwards compatibility with regard to the
|
* cases. Note [SPARK-3369], which gives the reasons for backwards compatibility with regard to the
|
||||||
* iterable API despite Spark's single pass nature.
|
* iterable API despite Spark's single pass nature.
|
||||||
*
|
* <p>
|
||||||
* Provide a way to obtain an inputItr of type O (output), out of an inputItr of type I (input)
|
* Provide a way to obtain an inputItr of type O (output), out of an inputItr of type I (input)
|
||||||
*
|
* <p>
|
||||||
* Things to remember: - Assumes Spark calls hasNext() to check for elements, before calling next()
|
* Things to remember: - Assumes Spark calls hasNext() to check for elements, before calling next()
|
||||||
* to obtain them - Assumes hasNext() gets called at least once. - Concrete Implementation is
|
* to obtain them - Assumes hasNext() gets called at least once. - Concrete Implementation is
|
||||||
* responsible for calling inputIterator.next() and doing the processing in computeNext()
|
* responsible for calling inputIterator.next() and doing the processing in computeNext()
|
||||||
|
|||||||
@@ -37,22 +37,30 @@ import org.apache.spark.api.java.JavaSparkContext;
|
|||||||
*/
|
*/
|
||||||
public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Serializable {
|
public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Serializable {
|
||||||
|
|
||||||
protected transient JavaSparkContext jsc = null;
|
|
||||||
|
|
||||||
public enum IndexType {
|
|
||||||
HBASE,
|
|
||||||
INMEMORY,
|
|
||||||
BLOOM,
|
|
||||||
BUCKETED
|
|
||||||
}
|
|
||||||
|
|
||||||
protected final HoodieWriteConfig config;
|
protected final HoodieWriteConfig config;
|
||||||
|
protected transient JavaSparkContext jsc = null;
|
||||||
|
|
||||||
protected HoodieIndex(HoodieWriteConfig config, JavaSparkContext jsc) {
|
protected HoodieIndex(HoodieWriteConfig config, JavaSparkContext jsc) {
|
||||||
this.config = config;
|
this.config = config;
|
||||||
this.jsc = jsc;
|
this.jsc = jsc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static <T extends HoodieRecordPayload> HoodieIndex<T> createIndex(HoodieWriteConfig config,
|
||||||
|
JavaSparkContext jsc) throws HoodieIndexException {
|
||||||
|
switch (config.getIndexType()) {
|
||||||
|
case HBASE:
|
||||||
|
return new HBaseIndex<>(config, jsc);
|
||||||
|
case INMEMORY:
|
||||||
|
return new InMemoryHashIndex<>(config, jsc);
|
||||||
|
case BLOOM:
|
||||||
|
return new HoodieBloomIndex<>(config, jsc);
|
||||||
|
case BUCKETED:
|
||||||
|
return new BucketedIndex<>(config, jsc);
|
||||||
|
default:
|
||||||
|
throw new HoodieIndexException("Index type unspecified, set " + config.getIndexType());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Checks if the given [Keys] exists in the hoodie table and returns [Key, Optional[FullFilePath]]
|
* Checks if the given [Keys] exists in the hoodie table and returns [Key, Optional[FullFilePath]]
|
||||||
* If the optional FullFilePath value is not present, then the key is not found. If the
|
* If the optional FullFilePath value is not present, then the key is not found. If the
|
||||||
@@ -71,7 +79,7 @@ public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Seri
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Extracts the location of written records, and updates the index.
|
* Extracts the location of written records, and updates the index.
|
||||||
*
|
* <p>
|
||||||
* TODO(vc): We may need to propagate the record as well in a WriteStatus class
|
* TODO(vc): We may need to propagate the record as well in a WriteStatus class
|
||||||
*/
|
*/
|
||||||
public abstract JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD,
|
public abstract JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD,
|
||||||
@@ -107,18 +115,7 @@ public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Seri
|
|||||||
public abstract boolean isImplicitWithStorage();
|
public abstract boolean isImplicitWithStorage();
|
||||||
|
|
||||||
|
|
||||||
public static <T extends HoodieRecordPayload> HoodieIndex<T> createIndex(
|
public enum IndexType {
|
||||||
HoodieWriteConfig config, JavaSparkContext jsc) throws HoodieIndexException {
|
HBASE, INMEMORY, BLOOM, BUCKETED
|
||||||
switch (config.getIndexType()) {
|
|
||||||
case HBASE:
|
|
||||||
return new HBaseIndex<>(config, jsc);
|
|
||||||
case INMEMORY:
|
|
||||||
return new InMemoryHashIndex<>(config, jsc);
|
|
||||||
case BLOOM:
|
|
||||||
return new HoodieBloomIndex<>(config, jsc);
|
|
||||||
case BUCKETED:
|
|
||||||
return new BucketedIndex<>(config, jsc);
|
|
||||||
}
|
|
||||||
throw new HoodieIndexException("Index type unspecified, set " + config.getIndexType());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -49,32 +49,11 @@ public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieInde
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(
|
public JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(JavaRDD<HoodieKey> hoodieKeys,
|
||||||
JavaRDD<HoodieKey> hoodieKeys, final HoodieTable<T> table) {
|
final HoodieTable<T> table) {
|
||||||
throw new UnsupportedOperationException("InMemory index does not implement check exist yet");
|
throw new UnsupportedOperationException("InMemory index does not implement check exist yet");
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Function that tags each HoodieRecord with an existing location, if known.
|
|
||||||
*/
|
|
||||||
class LocationTagFunction
|
|
||||||
implements Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<HoodieRecord<T>>> {
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Iterator<HoodieRecord<T>> call(Integer partitionNum,
|
|
||||||
Iterator<HoodieRecord<T>> hoodieRecordIterator) {
|
|
||||||
List<HoodieRecord<T>> taggedRecords = new ArrayList<>();
|
|
||||||
while (hoodieRecordIterator.hasNext()) {
|
|
||||||
HoodieRecord<T> rec = hoodieRecordIterator.next();
|
|
||||||
if (recordLocationMap.containsKey(rec.getKey())) {
|
|
||||||
rec.setCurrentLocation(recordLocationMap.get(rec.getKey()));
|
|
||||||
}
|
|
||||||
taggedRecords.add(rec);
|
|
||||||
}
|
|
||||||
return taggedRecords.iterator();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD,
|
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD,
|
||||||
HoodieTable<T> hoodieTable) {
|
HoodieTable<T> hoodieTable) {
|
||||||
@@ -132,4 +111,25 @@ public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieInde
|
|||||||
public boolean isImplicitWithStorage() {
|
public boolean isImplicitWithStorage() {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Function that tags each HoodieRecord with an existing location, if known.
|
||||||
|
*/
|
||||||
|
class LocationTagFunction implements
|
||||||
|
Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<HoodieRecord<T>>> {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Iterator<HoodieRecord<T>> call(Integer partitionNum,
|
||||||
|
Iterator<HoodieRecord<T>> hoodieRecordIterator) {
|
||||||
|
List<HoodieRecord<T>> taggedRecords = new ArrayList<>();
|
||||||
|
while (hoodieRecordIterator.hasNext()) {
|
||||||
|
HoodieRecord<T> rec = hoodieRecordIterator.next();
|
||||||
|
if (recordLocationMap.containsKey(rec.getKey())) {
|
||||||
|
rec.setCurrentLocation(recordLocationMap.get(rec.getKey()));
|
||||||
|
}
|
||||||
|
taggedRecords.add(rec);
|
||||||
|
}
|
||||||
|
return taggedRecords.iterator();
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -64,8 +64,7 @@ public class BloomIndexFileInfo implements Serializable {
|
|||||||
* Does the given key fall within the range (inclusive)
|
* Does the given key fall within the range (inclusive)
|
||||||
*/
|
*/
|
||||||
public boolean isKeyInRange(String recordKey) {
|
public boolean isKeyInRange(String recordKey) {
|
||||||
return minRecordKey.compareTo(recordKey) <= 0 &&
|
return minRecordKey.compareTo(recordKey) <= 0 && maxRecordKey.compareTo(recordKey) >= 0;
|
||||||
maxRecordKey.compareTo(recordKey) >= 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@@ -78,9 +77,8 @@ public class BloomIndexFileInfo implements Serializable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
BloomIndexFileInfo that = (BloomIndexFileInfo) o;
|
BloomIndexFileInfo that = (BloomIndexFileInfo) o;
|
||||||
return Objects.equal(that.fileName, fileName) &&
|
return Objects.equal(that.fileName, fileName) && Objects.equal(that.minRecordKey, minRecordKey)
|
||||||
Objects.equal(that.minRecordKey, minRecordKey) &&
|
&& Objects.equal(that.maxRecordKey, maxRecordKey);
|
||||||
Objects.equal(that.maxRecordKey, maxRecordKey);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -56,12 +56,11 @@ import scala.Tuple2;
|
|||||||
*/
|
*/
|
||||||
public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
||||||
|
|
||||||
private static Logger logger = LogManager.getLogger(HoodieBloomIndex.class);
|
|
||||||
|
|
||||||
// we need to limit the join such that it stays within 1.5GB per Spark partition. (SPARK-1476)
|
// we need to limit the join such that it stays within 1.5GB per Spark partition. (SPARK-1476)
|
||||||
private static final int SPARK_MAXIMUM_BYTES_PER_PARTITION = 1500 * 1024 * 1024;
|
private static final int SPARK_MAXIMUM_BYTES_PER_PARTITION = 1500 * 1024 * 1024;
|
||||||
// this is how much a triplet of (partitionPath, fileId, recordKey) costs.
|
// this is how much a triplet of (partitionPath, fileId, recordKey) costs.
|
||||||
private static final int BYTES_PER_PARTITION_FILE_KEY_TRIPLET = 300;
|
private static final int BYTES_PER_PARTITION_FILE_KEY_TRIPLET = 300;
|
||||||
|
private static Logger logger = LogManager.getLogger(HoodieBloomIndex.class);
|
||||||
private static int MAX_ITEMS_PER_SHUFFLE_PARTITION =
|
private static int MAX_ITEMS_PER_SHUFFLE_PARTITION =
|
||||||
SPARK_MAXIMUM_BYTES_PER_PARTITION / BYTES_PER_PARTITION_FILE_KEY_TRIPLET;
|
SPARK_MAXIMUM_BYTES_PER_PARTITION / BYTES_PER_PARTITION_FILE_KEY_TRIPLET;
|
||||||
|
|
||||||
@@ -108,27 +107,26 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
return taggedRecordRDD;
|
return taggedRecordRDD;
|
||||||
}
|
}
|
||||||
|
|
||||||
public JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(
|
public JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(JavaRDD<HoodieKey> hoodieKeys,
|
||||||
JavaRDD<HoodieKey> hoodieKeys, final HoodieTable<T> table) {
|
final HoodieTable<T> table) {
|
||||||
JavaPairRDD<String, String> partitionRecordKeyPairRDD =
|
JavaPairRDD<String, String> partitionRecordKeyPairRDD = hoodieKeys
|
||||||
hoodieKeys.mapToPair(key -> new Tuple2<>(key.getPartitionPath(), key.getRecordKey()));
|
.mapToPair(key -> new Tuple2<>(key.getPartitionPath(), key.getRecordKey()));
|
||||||
|
|
||||||
// Lookup indexes for all the partition/recordkey pair
|
// Lookup indexes for all the partition/recordkey pair
|
||||||
JavaPairRDD<String, String> rowKeyFilenamePairRDD =
|
JavaPairRDD<String, String> rowKeyFilenamePairRDD = lookupIndex(partitionRecordKeyPairRDD,
|
||||||
lookupIndex(partitionRecordKeyPairRDD, table);
|
table);
|
||||||
|
|
||||||
JavaPairRDD<String, HoodieKey> rowKeyHoodieKeyPairRDD =
|
JavaPairRDD<String, HoodieKey> rowKeyHoodieKeyPairRDD = hoodieKeys
|
||||||
hoodieKeys.mapToPair(key -> new Tuple2<>(key.getRecordKey(), key));
|
.mapToPair(key -> new Tuple2<>(key.getRecordKey(), key));
|
||||||
|
|
||||||
return rowKeyHoodieKeyPairRDD.leftOuterJoin(rowKeyFilenamePairRDD)
|
return rowKeyHoodieKeyPairRDD.leftOuterJoin(rowKeyFilenamePairRDD).mapToPair(keyPathTuple -> {
|
||||||
.mapToPair(keyPathTuple -> {
|
|
||||||
Optional<String> recordLocationPath;
|
Optional<String> recordLocationPath;
|
||||||
if (keyPathTuple._2._2.isPresent()) {
|
if (keyPathTuple._2._2.isPresent()) {
|
||||||
String fileName = keyPathTuple._2._2.get();
|
String fileName = keyPathTuple._2._2.get();
|
||||||
String partitionPath = keyPathTuple._2._1.getPartitionPath();
|
String partitionPath = keyPathTuple._2._1.getPartitionPath();
|
||||||
recordLocationPath = Optional.of(new Path(
|
recordLocationPath = Optional
|
||||||
new Path(table.getMetaClient().getBasePath(), partitionPath),
|
.of(new Path(new Path(table.getMetaClient().getBasePath(), partitionPath), fileName)
|
||||||
fileName).toUri().getPath());
|
.toUri().getPath());
|
||||||
} else {
|
} else {
|
||||||
recordLocationPath = Optional.absent();
|
recordLocationPath = Optional.absent();
|
||||||
}
|
}
|
||||||
@@ -152,21 +150,21 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo = fileInfoList.stream()
|
final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo = fileInfoList.stream()
|
||||||
.collect(groupingBy(Tuple2::_1, mapping(Tuple2::_2, toList())));
|
.collect(groupingBy(Tuple2::_1, mapping(Tuple2::_2, toList())));
|
||||||
|
|
||||||
// Step 3: Obtain a RDD, for each incoming record, that already exists, with the file id, that contains it.
|
// Step 3: Obtain a RDD, for each incoming record, that already exists, with the file id,
|
||||||
|
// that contains it.
|
||||||
int parallelism = autoComputeParallelism(recordsPerPartition, partitionToFileInfo,
|
int parallelism = autoComputeParallelism(recordsPerPartition, partitionToFileInfo,
|
||||||
partitionRecordKeyPairRDD);
|
partitionRecordKeyPairRDD);
|
||||||
return findMatchingFilesForRecordKeys(hoodieTable, partitionToFileInfo,
|
return findMatchingFilesForRecordKeys(hoodieTable, partitionToFileInfo,
|
||||||
partitionRecordKeyPairRDD,
|
partitionRecordKeyPairRDD, parallelism);
|
||||||
parallelism);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The index lookup can be skewed in three dimensions : #files, #partitions, #records
|
* The index lookup can be skewed in three dimensions : #files, #partitions, #records
|
||||||
*
|
* <p>
|
||||||
* To be able to smoothly handle skews, we need to compute how to split each partitions into
|
* To be able to smoothly handle skews, we need to compute how to split each partitions into
|
||||||
* subpartitions. We do it here, in a way that keeps the amount of each Spark join partition to <
|
* subpartitions. We do it here, in a way that keeps the amount of each Spark join partition to <
|
||||||
* 2GB.
|
* 2GB.
|
||||||
*
|
* <p>
|
||||||
* If {@link com.uber.hoodie.config.HoodieIndexConfig#BLOOM_INDEX_PARALLELISM_PROP} is specified
|
* If {@link com.uber.hoodie.config.HoodieIndexConfig#BLOOM_INDEX_PARALLELISM_PROP} is specified
|
||||||
* as a NON-zero number, then that is used explicitly.
|
* as a NON-zero number, then that is used explicitly.
|
||||||
*/
|
*/
|
||||||
@@ -184,7 +182,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
// records for a partition.
|
// records for a partition.
|
||||||
Map<String, Long> filesPerPartition = partitionToFileInfo.entrySet().stream()
|
Map<String, Long> filesPerPartition = partitionToFileInfo.entrySet().stream()
|
||||||
.collect(Collectors.toMap(Map.Entry::getKey, e -> Long.valueOf(e.getValue().size())));
|
.collect(Collectors.toMap(Map.Entry::getKey, e -> Long.valueOf(e.getValue().size())));
|
||||||
long totalFiles = 0, totalRecords = 0;
|
long totalFiles = 0;
|
||||||
|
long totalRecords = 0;
|
||||||
for (String partitionPath : recordsPerPartition.keySet()) {
|
for (String partitionPath : recordsPerPartition.keySet()) {
|
||||||
long numRecords = recordsPerPartition.get(partitionPath);
|
long numRecords = recordsPerPartition.get(partitionPath);
|
||||||
long numFiles =
|
long numFiles =
|
||||||
@@ -210,22 +209,22 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* It's crucial to pick the right parallelism.
|
* It's crucial to pick the right parallelism.
|
||||||
*
|
* <p>
|
||||||
* totalSubPartitions : this is deemed safe limit, to be nice with Spark. inputParallelism :
|
* totalSubPartitions : this is deemed safe limit, to be nice with Spark. inputParallelism :
|
||||||
* typically number of input file splits
|
* typically number of input file splits
|
||||||
*
|
* <p>
|
||||||
* We pick the max such that, we are always safe, but go higher if, say, there are a lot of input
|
* We pick the max such that, we are always safe, but go higher if, say, there are a lot of input
|
||||||
* files. (otherwise, we will fallback to number of partitions in input and end up with slow
|
* files. (otherwise, we will fallback to number of partitions in input and end up with slow
|
||||||
* performance)
|
* performance)
|
||||||
*/
|
*/
|
||||||
private int determineParallelism(int inputParallelism, int totalSubPartitions) {
|
private int determineParallelism(int inputParallelism, int totalSubPartitions) {
|
||||||
// If bloom index parallelism is set, use it to check against the input parallelism and take the max
|
// If bloom index parallelism is set, use it to check against the input parallelism and
|
||||||
|
// take the max
|
||||||
int indexParallelism = Math.max(inputParallelism, config.getBloomIndexParallelism());
|
int indexParallelism = Math.max(inputParallelism, config.getBloomIndexParallelism());
|
||||||
int joinParallelism = Math.max(totalSubPartitions, indexParallelism);
|
int joinParallelism = Math.max(totalSubPartitions, indexParallelism);
|
||||||
logger.info("InputParallelism: ${" + inputParallelism + "}, " +
|
logger.info("InputParallelism: ${" + inputParallelism + "}, " + "IndexParallelism: ${" + config
|
||||||
"IndexParallelism: ${" + config.getBloomIndexParallelism() + "}, " +
|
.getBloomIndexParallelism() + "}, " + "TotalSubParts: ${" + totalSubPartitions + "}, "
|
||||||
"TotalSubParts: ${" + totalSubPartitions + "}, " +
|
+ "Join Parallelism set to : " + joinParallelism);
|
||||||
"Join Parallelism set to : " + joinParallelism);
|
|
||||||
return joinParallelism;
|
return joinParallelism;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -237,29 +236,24 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
final HoodieTable<T> hoodieTable) {
|
final HoodieTable<T> hoodieTable) {
|
||||||
// Obtain the latest data files from all the partitions.
|
// Obtain the latest data files from all the partitions.
|
||||||
List<Tuple2<String, HoodieDataFile>> dataFilesList = jsc
|
List<Tuple2<String, HoodieDataFile>> dataFilesList = jsc
|
||||||
.parallelize(partitions, Math.max(partitions.size(), 1))
|
.parallelize(partitions, Math.max(partitions.size(), 1)).flatMapToPair(partitionPath -> {
|
||||||
.flatMapToPair(partitionPath -> {
|
java.util.Optional<HoodieInstant> latestCommitTime = hoodieTable.getCommitsTimeline()
|
||||||
java.util.Optional<HoodieInstant> latestCommitTime =
|
.filterCompletedInstants().lastInstant();
|
||||||
hoodieTable.getCommitsTimeline().filterCompletedInstants().lastInstant();
|
|
||||||
List<Tuple2<String, HoodieDataFile>> filteredFiles = new ArrayList<>();
|
List<Tuple2<String, HoodieDataFile>> filteredFiles = new ArrayList<>();
|
||||||
if (latestCommitTime.isPresent()) {
|
if (latestCommitTime.isPresent()) {
|
||||||
filteredFiles =
|
filteredFiles = hoodieTable.getROFileSystemView()
|
||||||
hoodieTable.getROFileSystemView().getLatestDataFilesBeforeOrOn(partitionPath,
|
.getLatestDataFilesBeforeOrOn(partitionPath, latestCommitTime.get().getTimestamp())
|
||||||
latestCommitTime.get().getTimestamp())
|
.map(f -> new Tuple2<>(partitionPath, f)).collect(toList());
|
||||||
.map(f -> new Tuple2<>(partitionPath, f))
|
|
||||||
.collect(toList());
|
|
||||||
}
|
}
|
||||||
return filteredFiles.iterator();
|
return filteredFiles.iterator();
|
||||||
}).collect();
|
}).collect();
|
||||||
|
|
||||||
if (config.getBloomIndexPruneByRanges()) {
|
if (config.getBloomIndexPruneByRanges()) {
|
||||||
// also obtain file ranges, if range pruning is enabled
|
// also obtain file ranges, if range pruning is enabled
|
||||||
return jsc.parallelize(dataFilesList, Math.max(dataFilesList.size(), 1))
|
return jsc.parallelize(dataFilesList, Math.max(dataFilesList.size(), 1)).mapToPair(ft -> {
|
||||||
.mapToPair(ft -> {
|
|
||||||
try {
|
try {
|
||||||
String[] minMaxKeys = ParquetUtils
|
String[] minMaxKeys = ParquetUtils
|
||||||
.readMinMaxRecordKeys(hoodieTable.getHadoopConf(),
|
.readMinMaxRecordKeys(hoodieTable.getHadoopConf(), ft._2().getFileStatus().getPath());
|
||||||
ft._2().getFileStatus().getPath());
|
|
||||||
return new Tuple2<>(ft._1(),
|
return new Tuple2<>(ft._1(),
|
||||||
new BloomIndexFileInfo(ft._2().getFileName(), minMaxKeys[0], minMaxKeys[1]));
|
new BloomIndexFileInfo(ft._2().getFileName(), minMaxKeys[0], minMaxKeys[1]));
|
||||||
} catch (MetadataNotFoundException me) {
|
} catch (MetadataNotFoundException me) {
|
||||||
@@ -320,21 +314,20 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
* (e.g: timestamp as prefix), the number of files to be compared gets cut down a lot from range
|
* (e.g: timestamp as prefix), the number of files to be compared gets cut down a lot from range
|
||||||
* pruning.
|
* pruning.
|
||||||
*/
|
*/
|
||||||
// sub-partition to ensure the records can be looked up against files & also prune file<=>record comparisons based on recordKey
|
// sub-partition to ensure the records can be looked up against files & also prune
|
||||||
|
// file<=>record comparisons based on recordKey
|
||||||
// ranges in the index info.
|
// ranges in the index info.
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
JavaPairRDD<String, Tuple2<String, HoodieKey>> explodeRecordRDDWithFileComparisons(
|
JavaPairRDD<String, Tuple2<String, HoodieKey>> explodeRecordRDDWithFileComparisons(
|
||||||
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
|
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
|
||||||
JavaPairRDD<String, String> partitionRecordKeyPairRDD) {
|
JavaPairRDD<String, String> partitionRecordKeyPairRDD) {
|
||||||
return partitionRecordKeyPairRDD
|
return partitionRecordKeyPairRDD.map(partitionRecordKeyPair -> {
|
||||||
.map(partitionRecordKeyPair -> {
|
|
||||||
String recordKey = partitionRecordKeyPair._2();
|
String recordKey = partitionRecordKeyPair._2();
|
||||||
String partitionPath = partitionRecordKeyPair._1();
|
String partitionPath = partitionRecordKeyPair._1();
|
||||||
|
|
||||||
List<BloomIndexFileInfo> indexInfos = partitionToFileIndexInfo.get(partitionPath);
|
List<BloomIndexFileInfo> indexInfos = partitionToFileIndexInfo.get(partitionPath);
|
||||||
List<Tuple2<String, Tuple2<String, HoodieKey>>> recordComparisons = new ArrayList<>();
|
List<Tuple2<String, Tuple2<String, HoodieKey>>> recordComparisons = new ArrayList<>();
|
||||||
if (indexInfos
|
if (indexInfos != null) { // could be null, if there are no files in a given partition yet.
|
||||||
!= null) { // could be null, if there are no files in a given partition yet.
|
|
||||||
// for each candidate file in partition, that needs to be compared.
|
// for each candidate file in partition, that needs to be compared.
|
||||||
for (BloomIndexFileInfo indexInfo : indexInfos) {
|
for (BloomIndexFileInfo indexInfo : indexInfos) {
|
||||||
if (shouldCompareWithFile(indexInfo, recordKey)) {
|
if (shouldCompareWithFile(indexInfo, recordKey)) {
|
||||||
@@ -346,35 +339,34 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
return recordComparisons;
|
return recordComparisons;
|
||||||
})
|
}).flatMapToPair(t -> t.iterator());
|
||||||
.flatMapToPair(t -> t.iterator());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Find out <RowKey, filename> pair. All workload grouped by file-level.
|
* Find out <RowKey, filename> pair. All workload grouped by file-level.
|
||||||
*
|
* <p>
|
||||||
* Join PairRDD(PartitionPath, RecordKey) and PairRDD(PartitionPath, File) & then repartition such
|
* Join PairRDD(PartitionPath, RecordKey) and PairRDD(PartitionPath, File) & then repartition such
|
||||||
* that each RDD partition is a file, then for each file, we do (1) load bloom filter, (2) load
|
* that each RDD partition is a file, then for each file, we do (1) load bloom filter, (2) load
|
||||||
* rowKeys, (3) Tag rowKey
|
* rowKeys, (3) Tag rowKey
|
||||||
*
|
* <p>
|
||||||
* Make sure the parallelism is at least the groupby parallelism for tagging location
|
* Make sure the parallelism is at least the groupby parallelism for tagging location
|
||||||
*/
|
*/
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
JavaPairRDD<String, String> findMatchingFilesForRecordKeys(HoodieTable hoodieTable,
|
JavaPairRDD<String, String> findMatchingFilesForRecordKeys(HoodieTable hoodieTable,
|
||||||
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
|
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
|
||||||
JavaPairRDD<String, String> partitionRecordKeyPairRDD,
|
JavaPairRDD<String, String> partitionRecordKeyPairRDD, int totalSubpartitions) {
|
||||||
int totalSubpartitions) {
|
|
||||||
|
|
||||||
int joinParallelism = determineParallelism(partitionRecordKeyPairRDD.partitions().size(),
|
int joinParallelism = determineParallelism(partitionRecordKeyPairRDD.partitions().size(),
|
||||||
totalSubpartitions);
|
totalSubpartitions);
|
||||||
|
|
||||||
JavaPairRDD<String, Tuple2<String, HoodieKey>> fileSortedTripletRDD = explodeRecordRDDWithFileComparisons(
|
JavaPairRDD<String, Tuple2<String, HoodieKey>> fileSortedTripletRDD =
|
||||||
|
explodeRecordRDDWithFileComparisons(
|
||||||
partitionToFileIndexInfo, partitionRecordKeyPairRDD)
|
partitionToFileIndexInfo, partitionRecordKeyPairRDD)
|
||||||
// sort further based on filename, such that all checking for the file can happen within a single partition, on-the-fly
|
// sort further based on filename, such that all checking for the file can happen within
|
||||||
|
// a single partition, on-the-fly
|
||||||
.sortByKey(true, joinParallelism);
|
.sortByKey(true, joinParallelism);
|
||||||
|
|
||||||
return fileSortedTripletRDD
|
return fileSortedTripletRDD.mapPartitionsWithIndex(
|
||||||
.mapPartitionsWithIndex(
|
|
||||||
new HoodieBloomIndexCheckFunction(hoodieTable, config.getBasePath()), true)
|
new HoodieBloomIndexCheckFunction(hoodieTable, config.getBasePath()), true)
|
||||||
.flatMap(indexLookupResults -> indexLookupResults.iterator())
|
.flatMap(indexLookupResults -> indexLookupResults.iterator())
|
||||||
.filter(lookupResult -> lookupResult.getMatchingRecordKeys().size() > 0)
|
.filter(lookupResult -> lookupResult.getMatchingRecordKeys().size() > 0)
|
||||||
@@ -391,14 +383,13 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
* Tag the <rowKey, filename> back to the original HoodieRecord RDD.
|
* Tag the <rowKey, filename> back to the original HoodieRecord RDD.
|
||||||
*/
|
*/
|
||||||
private JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(
|
private JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(
|
||||||
JavaPairRDD<String, String> rowKeyFilenamePairRDD,
|
JavaPairRDD<String, String> rowKeyFilenamePairRDD, JavaRDD<HoodieRecord<T>> recordRDD) {
|
||||||
JavaRDD<HoodieRecord<T>> recordRDD) {
|
|
||||||
JavaPairRDD<String, HoodieRecord<T>> rowKeyRecordPairRDD = recordRDD
|
JavaPairRDD<String, HoodieRecord<T>> rowKeyRecordPairRDD = recordRDD
|
||||||
.mapToPair(record -> new Tuple2<>(record.getRecordKey(), record));
|
.mapToPair(record -> new Tuple2<>(record.getRecordKey(), record));
|
||||||
|
|
||||||
// Here as the recordRDD might have more data than rowKeyRDD (some rowKeys' fileId is null), so we do left outer join.
|
// Here as the recordRDD might have more data than rowKeyRDD (some rowKeys' fileId is null),
|
||||||
return rowKeyRecordPairRDD.leftOuterJoin(rowKeyFilenamePairRDD).values().map(
|
// so we do left outer join.
|
||||||
v1 -> {
|
return rowKeyRecordPairRDD.leftOuterJoin(rowKeyFilenamePairRDD).values().map(v1 -> {
|
||||||
HoodieRecord<T> record = v1._1();
|
HoodieRecord<T> record = v1._1();
|
||||||
if (v1._2().isPresent()) {
|
if (v1._2().isPresent()) {
|
||||||
String filename = v1._2().get();
|
String filename = v1._2().get();
|
||||||
@@ -408,8 +399,7 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
return record;
|
return record;
|
||||||
}
|
});
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|||||||
@@ -41,7 +41,8 @@ import scala.Tuple2;
|
|||||||
* actual files
|
* actual files
|
||||||
*/
|
*/
|
||||||
public class HoodieBloomIndexCheckFunction implements
|
public class HoodieBloomIndexCheckFunction implements
|
||||||
Function2<Integer, Iterator<Tuple2<String, Tuple2<String, HoodieKey>>>, Iterator<List<IndexLookupResult>>> {
|
Function2<Integer, Iterator<Tuple2<String, Tuple2<String, HoodieKey>>>,
|
||||||
|
Iterator<List<IndexLookupResult>>> {
|
||||||
|
|
||||||
private static Logger logger = LogManager.getLogger(HoodieBloomIndexCheckFunction.class);
|
private static Logger logger = LogManager.getLogger(HoodieBloomIndexCheckFunction.class);
|
||||||
|
|
||||||
@@ -58,8 +59,7 @@ public class HoodieBloomIndexCheckFunction implements
|
|||||||
* Given a list of row keys and one file, return only row keys existing in that file.
|
* Given a list of row keys and one file, return only row keys existing in that file.
|
||||||
*/
|
*/
|
||||||
public static List<String> checkCandidatesAgainstFile(Configuration configuration,
|
public static List<String> checkCandidatesAgainstFile(Configuration configuration,
|
||||||
List<String> candidateRecordKeys,
|
List<String> candidateRecordKeys, Path filePath) throws HoodieIndexException {
|
||||||
Path filePath) throws HoodieIndexException {
|
|
||||||
List<String> foundRecordKeys = new ArrayList<>();
|
List<String> foundRecordKeys = new ArrayList<>();
|
||||||
try {
|
try {
|
||||||
// Load all rowKeys from the file, to double-confirm
|
// Load all rowKeys from the file, to double-confirm
|
||||||
@@ -86,6 +86,13 @@ public class HoodieBloomIndexCheckFunction implements
|
|||||||
return foundRecordKeys;
|
return foundRecordKeys;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Iterator<List<IndexLookupResult>> call(Integer partition,
|
||||||
|
Iterator<Tuple2<String, Tuple2<String, HoodieKey>>> fileParitionRecordKeyTripletItr)
|
||||||
|
throws Exception {
|
||||||
|
return new LazyKeyCheckIterator(fileParitionRecordKeyTripletItr);
|
||||||
|
}
|
||||||
|
|
||||||
class LazyKeyCheckIterator extends
|
class LazyKeyCheckIterator extends
|
||||||
LazyIterableIterator<Tuple2<String, Tuple2<String, HoodieKey>>, List<IndexLookupResult>> {
|
LazyIterableIterator<Tuple2<String, Tuple2<String, HoodieKey>>, List<IndexLookupResult>> {
|
||||||
|
|
||||||
@@ -143,7 +150,8 @@ public class HoodieBloomIndexCheckFunction implements
|
|||||||
|
|
||||||
// if continue on current file)
|
// if continue on current file)
|
||||||
if (fileName.equals(currentFile)) {
|
if (fileName.equals(currentFile)) {
|
||||||
// check record key against bloom filter of current file & add to possible keys if needed
|
// check record key against bloom filter of current file & add to possible keys if
|
||||||
|
// needed
|
||||||
if (bloomFilter.mightContain(recordKey)) {
|
if (bloomFilter.mightContain(recordKey)) {
|
||||||
if (logger.isDebugEnabled()) {
|
if (logger.isDebugEnabled()) {
|
||||||
logger.debug("#1 Adding " + recordKey + " as candidate for file " + fileName);
|
logger.debug("#1 Adding " + recordKey + " as candidate for file " + fileName);
|
||||||
@@ -201,12 +209,4 @@ public class HoodieBloomIndexCheckFunction implements
|
|||||||
protected void end() {
|
protected void end() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Iterator<List<IndexLookupResult>> call(Integer partition,
|
|
||||||
Iterator<Tuple2<String, Tuple2<String, HoodieKey>>> fileParitionRecordKeyTripletItr)
|
|
||||||
throws Exception {
|
|
||||||
return new LazyKeyCheckIterator(fileParitionRecordKeyTripletItr);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -38,9 +38,9 @@ import scala.Tuple2;
|
|||||||
/**
|
/**
|
||||||
* An `stateless` index implementation that will using a deterministic mapping function to determine
|
* An `stateless` index implementation that will using a deterministic mapping function to determine
|
||||||
* the fileID for a given record.
|
* the fileID for a given record.
|
||||||
*
|
* <p>
|
||||||
* Pros: - Fast
|
* Pros: - Fast
|
||||||
*
|
* <p>
|
||||||
* Cons : - Need to tune the number of buckets per partition path manually (FIXME: Need to autotune
|
* Cons : - Need to tune the number of buckets per partition path manually (FIXME: Need to autotune
|
||||||
* this) - Could increase write amplification on copy-on-write storage since inserts always rewrite
|
* this) - Could increase write amplification on copy-on-write storage since inserts always rewrite
|
||||||
* files - Not global.
|
* files - Not global.
|
||||||
|
|||||||
@@ -27,12 +27,16 @@ import com.uber.hoodie.common.model.HoodieRecordLocation;
|
|||||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||||
import com.uber.hoodie.config.HoodieIndexConfig;
|
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.exception.HoodieDependentSystemUnavailableException;
|
import com.uber.hoodie.exception.HoodieDependentSystemUnavailableException;
|
||||||
import com.uber.hoodie.exception.HoodieIndexException;
|
import com.uber.hoodie.exception.HoodieIndexException;
|
||||||
import com.uber.hoodie.index.HoodieIndex;
|
import com.uber.hoodie.index.HoodieIndex;
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.hbase.HBaseConfiguration;
|
import org.apache.hadoop.hbase.HBaseConfiguration;
|
||||||
import org.apache.hadoop.hbase.TableName;
|
import org.apache.hadoop.hbase.TableName;
|
||||||
@@ -51,23 +55,18 @@ import org.apache.spark.api.java.JavaRDD;
|
|||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
import org.apache.spark.api.java.function.Function2;
|
import org.apache.spark.api.java.function.Function2;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.LinkedList;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Hoodie Index implementation backed by HBase
|
* Hoodie Index implementation backed by HBase
|
||||||
*/
|
*/
|
||||||
public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
||||||
private final static byte[] SYSTEM_COLUMN_FAMILY = Bytes.toBytes("_s");
|
|
||||||
private final static byte[] COMMIT_TS_COLUMN = Bytes.toBytes("commit_ts");
|
private static final byte[] SYSTEM_COLUMN_FAMILY = Bytes.toBytes("_s");
|
||||||
private final static byte[] FILE_NAME_COLUMN = Bytes.toBytes("file_name");
|
private static final byte[] COMMIT_TS_COLUMN = Bytes.toBytes("commit_ts");
|
||||||
private final static byte[] PARTITION_PATH_COLUMN = Bytes.toBytes("partition_path");
|
private static final byte[] FILE_NAME_COLUMN = Bytes.toBytes("file_name");
|
||||||
|
private static final byte[] PARTITION_PATH_COLUMN = Bytes.toBytes("partition_path");
|
||||||
|
|
||||||
private static Logger logger = LogManager.getLogger(HBaseIndex.class);
|
private static Logger logger = LogManager.getLogger(HBaseIndex.class);
|
||||||
|
private static Connection hbaseConnection = null;
|
||||||
private final String tableName;
|
private final String tableName;
|
||||||
|
|
||||||
public HBaseIndex(HoodieWriteConfig config, JavaSparkContext jsc) {
|
public HBaseIndex(HoodieWriteConfig config, JavaSparkContext jsc) {
|
||||||
@@ -77,14 +76,12 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(
|
public JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(JavaRDD<HoodieKey> hoodieKeys,
|
||||||
JavaRDD<HoodieKey> hoodieKeys, HoodieTable<T> table) {
|
HoodieTable<T> table) {
|
||||||
//TODO : Change/Remove filterExists in HoodieReadClient() and revisit
|
//TODO : Change/Remove filterExists in HoodieReadClient() and revisit
|
||||||
throw new UnsupportedOperationException("HBase index does not implement check exist");
|
throw new UnsupportedOperationException("HBase index does not implement check exist");
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Connection hbaseConnection = null;
|
|
||||||
|
|
||||||
private Connection getHBaseConnection() {
|
private Connection getHBaseConnection() {
|
||||||
Configuration hbaseConfig = HBaseConfiguration.create();
|
Configuration hbaseConfig = HBaseConfiguration.create();
|
||||||
String quorum = config.getHbaseZkQuorum();
|
String quorum = config.getHbaseZkQuorum();
|
||||||
@@ -100,8 +97,8 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Since we are sharing the HbaseConnection across tasks in a JVM, make sure the HbaseConnectio is closed when
|
* Since we are sharing the HbaseConnection across tasks in a JVM, make sure the HbaseConnectio is
|
||||||
* JVM exits
|
* closed when JVM exits
|
||||||
*/
|
*/
|
||||||
private void addShutDownHook() {
|
private void addShutDownHook() {
|
||||||
Runtime.getRuntime().addShutdownHook(new Thread() {
|
Runtime.getRuntime().addShutdownHook(new Thread() {
|
||||||
@@ -126,10 +123,11 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
|||||||
HoodieTimeline commitTimeline = hoodieTable.getCompletedCommitTimeline();
|
HoodieTimeline commitTimeline = hoodieTable.getCompletedCommitTimeline();
|
||||||
// Check if the last commit ts for this row is 1) present in the timeline or
|
// Check if the last commit ts for this row is 1) present in the timeline or
|
||||||
// 2) is less than the first commit ts in the timeline
|
// 2) is less than the first commit ts in the timeline
|
||||||
return !commitTimeline.empty() && (commitTimeline.containsInstant(
|
return !commitTimeline.empty() && (commitTimeline
|
||||||
new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTs)) ||
|
.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTs))
|
||||||
HoodieTimeline.compareTimestamps(commitTimeline.firstInstant().get().getTimestamp(),
|
|| HoodieTimeline
|
||||||
commitTs, HoodieTimeline.GREATER));
|
.compareTimestamps(commitTimeline.firstInstant().get().getTimestamp(), commitTs,
|
||||||
|
HoodieTimeline.GREATER));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -171,16 +169,17 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
|||||||
HoodieRecord currentRecord = currentBatchOfRecords.remove(0);
|
HoodieRecord currentRecord = currentBatchOfRecords.remove(0);
|
||||||
if (result.getRow() != null) {
|
if (result.getRow() != null) {
|
||||||
String keyFromResult = Bytes.toString(result.getRow());
|
String keyFromResult = Bytes.toString(result.getRow());
|
||||||
String commitTs =
|
String commitTs = Bytes
|
||||||
Bytes.toString(result.getValue(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN));
|
.toString(result.getValue(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN));
|
||||||
String fileId =
|
String fileId = Bytes
|
||||||
Bytes.toString(result.getValue(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN));
|
.toString(result.getValue(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN));
|
||||||
String partitionPath =
|
String partitionPath = Bytes
|
||||||
Bytes.toString(result.getValue(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN));
|
.toString(result.getValue(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN));
|
||||||
|
|
||||||
if (checkIfValidCommit(hoodieTable, commitTs)) {
|
if (checkIfValidCommit(hoodieTable, commitTs)) {
|
||||||
currentRecord = new HoodieRecord(new HoodieKey(currentRecord.getRecordKey(),
|
currentRecord = new HoodieRecord(
|
||||||
partitionPath), currentRecord.getData());
|
new HoodieKey(currentRecord.getRecordKey(), partitionPath),
|
||||||
|
currentRecord.getData());
|
||||||
currentRecord.setCurrentLocation(new HoodieRecordLocation(commitTs, fileId));
|
currentRecord.setCurrentLocation(new HoodieRecordLocation(commitTs, fileId));
|
||||||
taggedRecords.add(currentRecord);
|
taggedRecords.add(currentRecord);
|
||||||
// the key from Result and the key being processed should be same
|
// the key from Result and the key being processed should be same
|
||||||
@@ -217,10 +216,10 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
|||||||
return recordRDD.mapPartitionsWithIndex(locationTagFunction(hoodieTable), true);
|
return recordRDD.mapPartitionsWithIndex(locationTagFunction(hoodieTable), true);
|
||||||
}
|
}
|
||||||
|
|
||||||
private Function2<Integer, Iterator<WriteStatus>, Iterator<WriteStatus>> updateLocationFunction() {
|
private Function2<Integer, Iterator<WriteStatus>, Iterator<WriteStatus>>
|
||||||
|
updateLocationFunction() {
|
||||||
return (Function2<Integer, Iterator<WriteStatus>, Iterator<WriteStatus>>) (partition, statusIterator) -> {
|
return (Function2<Integer, Iterator<WriteStatus>, Iterator<WriteStatus>>) (partition,
|
||||||
|
statusIterator) -> {
|
||||||
Integer multiPutBatchSize = config.getHbaseIndexPutBatchSize();
|
Integer multiPutBatchSize = config.getHbaseIndexPutBatchSize();
|
||||||
|
|
||||||
List<WriteStatus> writeStatusList = new ArrayList<>();
|
List<WriteStatus> writeStatusList = new ArrayList<>();
|
||||||
@@ -292,12 +291,9 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Helper method to facilitate performing puts and deletes in Hbase
|
* Helper method to facilitate performing puts and deletes in Hbase
|
||||||
* @param hTable
|
|
||||||
* @param puts
|
|
||||||
* @param deletes
|
|
||||||
* @throws IOException
|
|
||||||
*/
|
*/
|
||||||
private void doPutsAndDeletes(HTable hTable, List<Put> puts, List<Delete> deletes) throws IOException {
|
private void doPutsAndDeletes(HTable hTable, List<Put> puts, List<Delete> deletes)
|
||||||
|
throws IOException {
|
||||||
if (puts.size() > 0) {
|
if (puts.size() > 0) {
|
||||||
hTable.put(puts);
|
hTable.put(puts);
|
||||||
}
|
}
|
||||||
@@ -323,7 +319,6 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Only looks up by recordKey
|
* Only looks up by recordKey
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public boolean isGlobal() {
|
public boolean isGlobal() {
|
||||||
@@ -332,7 +327,6 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Mapping is available in HBase already.
|
* Mapping is available in HBase already.
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public boolean canIndexLogFiles() {
|
public boolean canIndexLogFiles() {
|
||||||
@@ -341,7 +335,6 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Index needs to be explicitly updated after storage write.
|
* Index needs to be explicitly updated after storage write.
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public boolean isImplicitWithStorage() {
|
public boolean isImplicitWithStorage() {
|
||||||
|
|||||||
@@ -37,14 +37,6 @@ import com.uber.hoodie.config.HoodieWriteConfig;
|
|||||||
import com.uber.hoodie.exception.HoodieAppendException;
|
import com.uber.hoodie.exception.HoodieAppendException;
|
||||||
import com.uber.hoodie.exception.HoodieUpsertException;
|
import com.uber.hoodie.exception.HoodieUpsertException;
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
import org.apache.avro.generic.GenericRecord;
|
|
||||||
import org.apache.avro.generic.IndexedRecord;
|
|
||||||
import org.apache.hadoop.fs.Path;
|
|
||||||
import org.apache.log4j.LogManager;
|
|
||||||
import org.apache.log4j.Logger;
|
|
||||||
import org.apache.spark.TaskContext;
|
|
||||||
import org.apache.spark.util.SizeEstimator;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
@@ -53,6 +45,13 @@ import java.util.List;
|
|||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.concurrent.atomic.AtomicLong;
|
import java.util.concurrent.atomic.AtomicLong;
|
||||||
|
import org.apache.avro.generic.GenericRecord;
|
||||||
|
import org.apache.avro.generic.IndexedRecord;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.apache.log4j.LogManager;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import org.apache.spark.TaskContext;
|
||||||
|
import org.apache.spark.util.SizeEstimator;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* IO Operation to append data onto an existing file.
|
* IO Operation to append data onto an existing file.
|
||||||
@@ -61,14 +60,13 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOH
|
|||||||
|
|
||||||
private static Logger logger = LogManager.getLogger(HoodieAppendHandle.class);
|
private static Logger logger = LogManager.getLogger(HoodieAppendHandle.class);
|
||||||
private static AtomicLong recordIndex = new AtomicLong(1);
|
private static AtomicLong recordIndex = new AtomicLong(1);
|
||||||
|
|
||||||
private TableFileSystemView.RealtimeView fileSystemView;
|
|
||||||
private final WriteStatus writeStatus;
|
private final WriteStatus writeStatus;
|
||||||
private final String fileId;
|
private final String fileId;
|
||||||
private String partitionPath;
|
|
||||||
private Iterator<HoodieRecord<T>> recordItr;
|
|
||||||
List<IndexedRecord> recordList = new ArrayList<>();
|
List<IndexedRecord> recordList = new ArrayList<>();
|
||||||
List<String> keysToDelete = new ArrayList<>();
|
List<String> keysToDelete = new ArrayList<>();
|
||||||
|
private TableFileSystemView.RealtimeView fileSystemView;
|
||||||
|
private String partitionPath;
|
||||||
|
private Iterator<HoodieRecord<T>> recordItr;
|
||||||
private long recordsWritten = 0;
|
private long recordsWritten = 0;
|
||||||
private long recordsDeleted = 0;
|
private long recordsDeleted = 0;
|
||||||
private long averageRecordSize = 0;
|
private long averageRecordSize = 0;
|
||||||
@@ -76,11 +74,8 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOH
|
|||||||
private Writer writer;
|
private Writer writer;
|
||||||
private boolean doInit = true;
|
private boolean doInit = true;
|
||||||
|
|
||||||
public HoodieAppendHandle(HoodieWriteConfig config,
|
public HoodieAppendHandle(HoodieWriteConfig config, String commitTime, HoodieTable<T> hoodieTable,
|
||||||
String commitTime,
|
String fileId, Iterator<HoodieRecord<T>> recordItr) {
|
||||||
HoodieTable<T> hoodieTable,
|
|
||||||
String fileId,
|
|
||||||
Iterator<HoodieRecord<T>> recordItr) {
|
|
||||||
super(config, commitTime, hoodieTable);
|
super(config, commitTime, hoodieTable);
|
||||||
WriteStatus writeStatus = ReflectionUtils.loadClass(config.getWriteStatusClassName());
|
WriteStatus writeStatus = ReflectionUtils.loadClass(config.getWriteStatusClassName());
|
||||||
writeStatus.setStat(new HoodieDeltaWriteStat());
|
writeStatus.setStat(new HoodieDeltaWriteStat());
|
||||||
@@ -94,8 +89,8 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOH
|
|||||||
|
|
||||||
// extract some information from the first record
|
// extract some information from the first record
|
||||||
FileSlice fileSlice = fileSystemView.getLatestFileSlices(partitionPath)
|
FileSlice fileSlice = fileSystemView.getLatestFileSlices(partitionPath)
|
||||||
.filter(fileSlice1 -> fileSlice1.getDataFile().get().getFileId().equals(fileId))
|
.filter(fileSlice1 -> fileSlice1.getDataFile().get().getFileId().equals(fileId)).findFirst()
|
||||||
.findFirst().get();
|
.get();
|
||||||
// HACK(vc) This also assumes a base file. It will break, if appending without one.
|
// HACK(vc) This also assumes a base file. It will break, if appending without one.
|
||||||
String latestValidFilePath = fileSlice.getDataFile().get().getFileName();
|
String latestValidFilePath = fileSlice.getDataFile().get().getFileName();
|
||||||
String baseCommitTime = FSUtils.getCommitTime(latestValidFilePath);
|
String baseCommitTime = FSUtils.getCommitTime(latestValidFilePath);
|
||||||
@@ -108,23 +103,21 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOH
|
|||||||
try {
|
try {
|
||||||
this.writer = HoodieLogFormat.newWriterBuilder()
|
this.writer = HoodieLogFormat.newWriterBuilder()
|
||||||
.onParentPath(new Path(hoodieTable.getMetaClient().getBasePath(), partitionPath))
|
.onParentPath(new Path(hoodieTable.getMetaClient().getBasePath(), partitionPath))
|
||||||
.withFileId(fileId).overBaseCommit(baseCommitTime).withLogVersion(fileSlice.getLogFiles()
|
.withFileId(fileId).overBaseCommit(baseCommitTime).withLogVersion(
|
||||||
.map(logFile -> logFile.getLogVersion())
|
fileSlice.getLogFiles().map(logFile -> logFile.getLogVersion())
|
||||||
.max(Comparator.naturalOrder()).orElse(HoodieLogFile.LOGFILE_BASE_VERSION))
|
.max(Comparator.naturalOrder()).orElse(HoodieLogFile.LOGFILE_BASE_VERSION))
|
||||||
.withSizeThreshold(config.getLogFileMaxSize())
|
.withSizeThreshold(config.getLogFileMaxSize()).withFs(fs)
|
||||||
.withFs(fs).withFileExtension(HoodieLogFile.DELTA_EXTENSION).build();
|
.withFileExtension(HoodieLogFile.DELTA_EXTENSION).build();
|
||||||
this.currentLogFile = writer.getLogFile();
|
this.currentLogFile = writer.getLogFile();
|
||||||
((HoodieDeltaWriteStat) writeStatus.getStat())
|
((HoodieDeltaWriteStat) writeStatus.getStat()).setLogVersion(currentLogFile.getLogVersion());
|
||||||
.setLogVersion(currentLogFile.getLogVersion());
|
((HoodieDeltaWriteStat) writeStatus.getStat()).setLogOffset(writer.getCurrentSize());
|
||||||
((HoodieDeltaWriteStat) writeStatus.getStat())
|
|
||||||
.setLogOffset(writer.getCurrentSize());
|
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
logger.error("Error in update task at commit " + commitTime, e);
|
logger.error("Error in update task at commit " + commitTime, e);
|
||||||
writeStatus.setGlobalError(e);
|
writeStatus.setGlobalError(e);
|
||||||
throw new HoodieUpsertException(
|
throw new HoodieUpsertException(
|
||||||
"Failed to initialize HoodieUpdateHandle for FileId: " + fileId
|
"Failed to initialize HoodieUpdateHandle for FileId: " + fileId + " on commit "
|
||||||
+ " on commit " + commitTime + " on HDFS path " + hoodieTable
|
+ commitTime + " on HDFS path " + hoodieTable.getMetaClient().getBasePath()
|
||||||
.getMetaClient().getBasePath() + partitionPath, e);
|
+ partitionPath, e);
|
||||||
}
|
}
|
||||||
Path path = new Path(partitionPath,
|
Path path = new Path(partitionPath,
|
||||||
FSUtils.makeDataFileName(commitTime, TaskContext.getPartitionId(), fileId));
|
FSUtils.makeDataFileName(commitTime, TaskContext.getPartitionId(), fileId));
|
||||||
@@ -150,7 +143,8 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOH
|
|||||||
}
|
}
|
||||||
|
|
||||||
writeStatus.markSuccess(hoodieRecord, recordMetadata);
|
writeStatus.markSuccess(hoodieRecord, recordMetadata);
|
||||||
// deflate record payload after recording success. This will help users access payload as a part of marking
|
// deflate record payload after recording success. This will help users access payload as a
|
||||||
|
// part of marking
|
||||||
// record successful.
|
// record successful.
|
||||||
hoodieRecord.deflate();
|
hoodieRecord.deflate();
|
||||||
return avroRecord;
|
return avroRecord;
|
||||||
@@ -165,7 +159,8 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOH
|
|||||||
// to make sure we don't append records with older (shorter) schema than already appended
|
// to make sure we don't append records with older (shorter) schema than already appended
|
||||||
public void doAppend() {
|
public void doAppend() {
|
||||||
|
|
||||||
int maxBlockSize = config.getLogFileDataBlockMaxSize(); int numberOfRecords = 0;
|
int maxBlockSize = config.getLogFileDataBlockMaxSize();
|
||||||
|
int numberOfRecords = 0;
|
||||||
Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap();
|
Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap();
|
||||||
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, commitTime);
|
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, commitTime);
|
||||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
|
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
|
||||||
@@ -180,7 +175,8 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOH
|
|||||||
}
|
}
|
||||||
// Append if max number of records reached to achieve block size
|
// Append if max number of records reached to achieve block size
|
||||||
if (numberOfRecords >= (int) (maxBlockSize / averageRecordSize)) {
|
if (numberOfRecords >= (int) (maxBlockSize / averageRecordSize)) {
|
||||||
// Recompute averageRecordSize before writing a new block and update existing value with avg of new and old
|
// Recompute averageRecordSize before writing a new block and update existing value with
|
||||||
|
// avg of new and old
|
||||||
logger.info("AvgRecordSize => " + averageRecordSize);
|
logger.info("AvgRecordSize => " + averageRecordSize);
|
||||||
averageRecordSize = (averageRecordSize + SizeEstimator.estimate(record)) / 2;
|
averageRecordSize = (averageRecordSize + SizeEstimator.estimate(record)) / 2;
|
||||||
doAppend(header);
|
doAppend(header);
|
||||||
|
|||||||
@@ -68,8 +68,7 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
|
|||||||
throws IOException {
|
throws IOException {
|
||||||
logger.info("Cleaning " + partitionPath + ", retaining latest " + config
|
logger.info("Cleaning " + partitionPath + ", retaining latest " + config
|
||||||
.getCleanerFileVersionsRetained() + " file versions. ");
|
.getCleanerFileVersionsRetained() + " file versions. ");
|
||||||
List<HoodieFileGroup> fileGroups =
|
List<HoodieFileGroup> fileGroups = fileSystemView.getAllFileGroups(partitionPath)
|
||||||
fileSystemView.getAllFileGroups(partitionPath)
|
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
List<String> deletePaths = new ArrayList<>();
|
List<String> deletePaths = new ArrayList<>();
|
||||||
// Collect all the datafiles savepointed by all the savepoints
|
// Collect all the datafiles savepointed by all the savepoints
|
||||||
@@ -94,11 +93,9 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
|
|||||||
FileSlice nextSlice = fileSliceIterator.next();
|
FileSlice nextSlice = fileSliceIterator.next();
|
||||||
HoodieDataFile dataFile = nextSlice.getDataFile().get();
|
HoodieDataFile dataFile = nextSlice.getDataFile().get();
|
||||||
deletePaths.add(dataFile.getFileStatus().getPath().toString());
|
deletePaths.add(dataFile.getFileStatus().getPath().toString());
|
||||||
if (hoodieTable.getMetaClient().getTableType()
|
if (hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) {
|
||||||
== HoodieTableType.MERGE_ON_READ) {
|
|
||||||
// If merge on read, then clean the log files for the commits as well
|
// If merge on read, then clean the log files for the commits as well
|
||||||
deletePaths.addAll(nextSlice.getLogFiles()
|
deletePaths.addAll(nextSlice.getLogFiles().map(file -> file.getPath().toString())
|
||||||
.map(file -> file.getPath().toString())
|
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -121,8 +118,8 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
|
|||||||
private List<String> getFilesToCleanKeepingLatestCommits(String partitionPath)
|
private List<String> getFilesToCleanKeepingLatestCommits(String partitionPath)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
int commitsRetained = config.getCleanerCommitsRetained();
|
int commitsRetained = config.getCleanerCommitsRetained();
|
||||||
logger.info(
|
logger
|
||||||
"Cleaning " + partitionPath + ", retaining latest " + commitsRetained + " commits. ");
|
.info("Cleaning " + partitionPath + ", retaining latest " + commitsRetained + " commits. ");
|
||||||
List<String> deletePaths = new ArrayList<>();
|
List<String> deletePaths = new ArrayList<>();
|
||||||
|
|
||||||
// Collect all the datafiles savepointed by all the savepoints
|
// Collect all the datafiles savepointed by all the savepoints
|
||||||
@@ -132,15 +129,14 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
|
|||||||
// determine if we have enough commits, to start cleaning.
|
// determine if we have enough commits, to start cleaning.
|
||||||
if (commitTimeline.countInstants() > commitsRetained) {
|
if (commitTimeline.countInstants() > commitsRetained) {
|
||||||
HoodieInstant earliestCommitToRetain = getEarliestCommitToRetain().get();
|
HoodieInstant earliestCommitToRetain = getEarliestCommitToRetain().get();
|
||||||
List<HoodieFileGroup> fileGroups =
|
List<HoodieFileGroup> fileGroups = fileSystemView.getAllFileGroups(partitionPath)
|
||||||
fileSystemView.getAllFileGroups(partitionPath)
|
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
for (HoodieFileGroup fileGroup : fileGroups) {
|
for (HoodieFileGroup fileGroup : fileGroups) {
|
||||||
List<FileSlice> fileSliceList = fileGroup.getAllFileSlices().collect(Collectors.toList());
|
List<FileSlice> fileSliceList = fileGroup.getAllFileSlices().collect(Collectors.toList());
|
||||||
HoodieDataFile dataFile = fileSliceList.get(0).getDataFile().get();
|
HoodieDataFile dataFile = fileSliceList.get(0).getDataFile().get();
|
||||||
String lastVersion = dataFile.getCommitTime();
|
String lastVersion = dataFile.getCommitTime();
|
||||||
String lastVersionBeforeEarliestCommitToRetain =
|
String lastVersionBeforeEarliestCommitToRetain = getLatestVersionBeforeCommit(fileSliceList,
|
||||||
getLatestVersionBeforeCommit(fileSliceList, earliestCommitToRetain);
|
earliestCommitToRetain);
|
||||||
|
|
||||||
// Ensure there are more than 1 version of the file (we only clean old files from updates)
|
// Ensure there are more than 1 version of the file (we only clean old files from updates)
|
||||||
// i.e always spare the last commit.
|
// i.e always spare the last commit.
|
||||||
@@ -151,28 +147,26 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
|
|||||||
// do not clean up a savepoint data file
|
// do not clean up a savepoint data file
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
// Dont delete the latest commit and also the last commit before the earliest commit we are retaining
|
// Dont delete the latest commit and also the last commit before the earliest commit we
|
||||||
// The window of commit retain == max query run time. So a query could be running which still
|
// are retaining
|
||||||
|
// The window of commit retain == max query run time. So a query could be running which
|
||||||
|
// still
|
||||||
// uses this file.
|
// uses this file.
|
||||||
if (fileCommitTime.equals(lastVersion) || (
|
if (fileCommitTime.equals(lastVersion) || (lastVersionBeforeEarliestCommitToRetain != null
|
||||||
lastVersionBeforeEarliestCommitToRetain != null && fileCommitTime
|
&& fileCommitTime.equals(lastVersionBeforeEarliestCommitToRetain))) {
|
||||||
.equals(lastVersionBeforeEarliestCommitToRetain))) {
|
|
||||||
// move on to the next file
|
// move on to the next file
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Always keep the last commit
|
// Always keep the last commit
|
||||||
if (HoodieTimeline.compareTimestamps(
|
if (HoodieTimeline
|
||||||
earliestCommitToRetain.getTimestamp(),
|
.compareTimestamps(earliestCommitToRetain.getTimestamp(), fileCommitTime,
|
||||||
fileCommitTime,
|
|
||||||
HoodieTimeline.GREATER)) {
|
HoodieTimeline.GREATER)) {
|
||||||
// this is a commit, that should be cleaned.
|
// this is a commit, that should be cleaned.
|
||||||
deletePaths.add(aFile.getFileStatus().getPath().toString());
|
deletePaths.add(aFile.getFileStatus().getPath().toString());
|
||||||
if (hoodieTable.getMetaClient().getTableType()
|
if (hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) {
|
||||||
== HoodieTableType.MERGE_ON_READ) {
|
|
||||||
// If merge on read, then clean the log files for the commits as well
|
// If merge on read, then clean the log files for the commits as well
|
||||||
deletePaths.addAll(aSlice.getLogFiles()
|
deletePaths.addAll(aSlice.getLogFiles().map(file -> file.getPath().toString())
|
||||||
.map(file -> file.getPath().toString())
|
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -190,9 +184,10 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
|
|||||||
HoodieInstant commitTime) {
|
HoodieInstant commitTime) {
|
||||||
for (FileSlice file : fileSliceList) {
|
for (FileSlice file : fileSliceList) {
|
||||||
String fileCommitTime = file.getDataFile().get().getCommitTime();
|
String fileCommitTime = file.getDataFile().get().getCommitTime();
|
||||||
if (HoodieTimeline.compareTimestamps(commitTime.getTimestamp(), fileCommitTime,
|
if (HoodieTimeline
|
||||||
HoodieTimeline.GREATER)) {
|
.compareTimestamps(commitTime.getTimestamp(), fileCommitTime, HoodieTimeline.GREATER)) {
|
||||||
// fileList is sorted on the reverse, so the first commit we find <= commitTime is the one we want
|
// fileList is sorted on the reverse, so the first commit we find <= commitTime is the
|
||||||
|
// one we want
|
||||||
return fileCommitTime;
|
return fileCommitTime;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -213,8 +208,7 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
|
|||||||
} else {
|
} else {
|
||||||
throw new IllegalArgumentException("Unknown cleaning policy : " + policy.name());
|
throw new IllegalArgumentException("Unknown cleaning policy : " + policy.name());
|
||||||
}
|
}
|
||||||
logger.info(
|
logger.info(deletePaths.size() + " patterns used to delete in partition path:" + partitionPath);
|
||||||
deletePaths.size() + " patterns used to delete in partition path:" + partitionPath);
|
|
||||||
|
|
||||||
return deletePaths;
|
return deletePaths;
|
||||||
}
|
}
|
||||||
@@ -227,8 +221,8 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
|
|||||||
int commitsRetained = config.getCleanerCommitsRetained();
|
int commitsRetained = config.getCleanerCommitsRetained();
|
||||||
if (config.getCleanerPolicy() == HoodieCleaningPolicy.KEEP_LATEST_COMMITS
|
if (config.getCleanerPolicy() == HoodieCleaningPolicy.KEEP_LATEST_COMMITS
|
||||||
&& commitTimeline.countInstants() > commitsRetained) {
|
&& commitTimeline.countInstants() > commitsRetained) {
|
||||||
earliestCommitToRetain =
|
earliestCommitToRetain = commitTimeline
|
||||||
commitTimeline.nthInstant(commitTimeline.countInstants() - commitsRetained);
|
.nthInstant(commitTimeline.countInstants() - commitsRetained);
|
||||||
}
|
}
|
||||||
return earliestCommitToRetain;
|
return earliestCommitToRetain;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -40,13 +40,6 @@ import com.uber.hoodie.exception.HoodieCommitException;
|
|||||||
import com.uber.hoodie.exception.HoodieException;
|
import com.uber.hoodie.exception.HoodieException;
|
||||||
import com.uber.hoodie.exception.HoodieIOException;
|
import com.uber.hoodie.exception.HoodieIOException;
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
import org.apache.avro.Schema;
|
|
||||||
import org.apache.avro.file.DataFileStream;
|
|
||||||
import org.apache.avro.generic.IndexedRecord;
|
|
||||||
import org.apache.hadoop.fs.Path;
|
|
||||||
import org.apache.log4j.LogManager;
|
|
||||||
import org.apache.log4j.Logger;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@@ -54,6 +47,11 @@ import java.util.Map;
|
|||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
import org.apache.avro.Schema;
|
||||||
|
import org.apache.avro.generic.IndexedRecord;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.apache.log4j.LogManager;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Archiver to bound the growth of <action>.commit files
|
* Archiver to bound the growth of <action>.commit files
|
||||||
@@ -76,11 +74,9 @@ public class HoodieCommitArchiveLog {
|
|||||||
private HoodieLogFormat.Writer openWriter() {
|
private HoodieLogFormat.Writer openWriter() {
|
||||||
try {
|
try {
|
||||||
if (this.writer == null) {
|
if (this.writer == null) {
|
||||||
return HoodieLogFormat.newWriterBuilder()
|
return HoodieLogFormat.newWriterBuilder().onParentPath(archiveFilePath.getParent())
|
||||||
.onParentPath(archiveFilePath.getParent())
|
|
||||||
.withFileId(archiveFilePath.getName())
|
.withFileId(archiveFilePath.getName())
|
||||||
.withFileExtension(HoodieArchivedLogFile.ARCHIVE_EXTENSION)
|
.withFileExtension(HoodieArchivedLogFile.ARCHIVE_EXTENSION).withFs(metaClient.getFs())
|
||||||
.withFs(metaClient.getFs())
|
|
||||||
.overBaseCommit("").build();
|
.overBaseCommit("").build();
|
||||||
} else {
|
} else {
|
||||||
return this.writer;
|
return this.writer;
|
||||||
@@ -136,21 +132,19 @@ public class HoodieCommitArchiveLog {
|
|||||||
.getTimelineOfActions(Sets.newHashSet(HoodieTimeline.CLEAN_ACTION))
|
.getTimelineOfActions(Sets.newHashSet(HoodieTimeline.CLEAN_ACTION))
|
||||||
.filterCompletedInstants();
|
.filterCompletedInstants();
|
||||||
Stream<HoodieInstant> instants = cleanAndRollbackTimeline.getInstants()
|
Stream<HoodieInstant> instants = cleanAndRollbackTimeline.getInstants()
|
||||||
.collect(Collectors.groupingBy(s -> s.getAction()))
|
.collect(Collectors.groupingBy(s -> s.getAction())).entrySet().stream().map(i -> {
|
||||||
.entrySet()
|
|
||||||
.stream()
|
|
||||||
.map(i -> {
|
|
||||||
if (i.getValue().size() > maxCommitsToKeep) {
|
if (i.getValue().size() > maxCommitsToKeep) {
|
||||||
return i.getValue().subList(0, i.getValue().size() - minCommitsToKeep);
|
return i.getValue().subList(0, i.getValue().size() - minCommitsToKeep);
|
||||||
} else {
|
} else {
|
||||||
return new ArrayList<HoodieInstant>();
|
return new ArrayList<HoodieInstant>();
|
||||||
}
|
}
|
||||||
})
|
}).flatMap(i -> i.stream());
|
||||||
.flatMap(i -> i.stream());
|
|
||||||
|
|
||||||
//TODO (na) : Add a way to return actions associated with a timeline and then merge/unify with logic above to avoid Stream.concats
|
//TODO (na) : Add a way to return actions associated with a timeline and then merge/unify
|
||||||
|
// with logic above to avoid Stream.concats
|
||||||
HoodieTimeline commitTimeline = table.getCompletedCommitTimeline();
|
HoodieTimeline commitTimeline = table.getCompletedCommitTimeline();
|
||||||
// We cannot have any holes in the commit timeline. We cannot archive any commits which are made after the first savepoint present.
|
// We cannot have any holes in the commit timeline. We cannot archive any commits which are
|
||||||
|
// made after the first savepoint present.
|
||||||
Optional<HoodieInstant> firstSavepoint = table.getCompletedSavepointTimeline().firstInstant();
|
Optional<HoodieInstant> firstSavepoint = table.getCompletedSavepointTimeline().firstInstant();
|
||||||
if (!commitTimeline.empty() && commitTimeline.countInstants() > maxCommitsToKeep) {
|
if (!commitTimeline.empty() && commitTimeline.countInstants() > maxCommitsToKeep) {
|
||||||
// Actually do the commits
|
// Actually do the commits
|
||||||
@@ -169,16 +163,14 @@ public class HoodieCommitArchiveLog {
|
|||||||
log.info("Deleting instants " + archivedInstants);
|
log.info("Deleting instants " + archivedInstants);
|
||||||
boolean success = true;
|
boolean success = true;
|
||||||
for (HoodieInstant archivedInstant : archivedInstants) {
|
for (HoodieInstant archivedInstant : archivedInstants) {
|
||||||
Path commitFile =
|
Path commitFile = new Path(metaClient.getMetaPath(), archivedInstant.getFileName());
|
||||||
new Path(metaClient.getMetaPath(), archivedInstant.getFileName());
|
|
||||||
try {
|
try {
|
||||||
if (metaClient.getFs().exists(commitFile)) {
|
if (metaClient.getFs().exists(commitFile)) {
|
||||||
success &= metaClient.getFs().delete(commitFile, false);
|
success &= metaClient.getFs().delete(commitFile, false);
|
||||||
log.info("Archived and deleted instant file " + commitFile);
|
log.info("Archived and deleted instant file " + commitFile);
|
||||||
}
|
}
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
throw new HoodieIOException("Failed to delete archived instant " + archivedInstant,
|
throw new HoodieIOException("Failed to delete archived instant " + archivedInstant, e);
|
||||||
e);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return success;
|
return success;
|
||||||
@@ -186,8 +178,8 @@ public class HoodieCommitArchiveLog {
|
|||||||
|
|
||||||
public void archive(List<HoodieInstant> instants) throws HoodieCommitException {
|
public void archive(List<HoodieInstant> instants) throws HoodieCommitException {
|
||||||
try {
|
try {
|
||||||
HoodieTimeline commitTimeline =
|
HoodieTimeline commitTimeline = metaClient.getActiveTimeline().getAllCommitsTimeline()
|
||||||
metaClient.getActiveTimeline().getAllCommitsTimeline().filterCompletedInstants();
|
.filterCompletedInstants();
|
||||||
Schema wrapperSchema = HoodieArchivedMetaEntry.getClassSchema();
|
Schema wrapperSchema = HoodieArchivedMetaEntry.getClassSchema();
|
||||||
log.info("Wrapper schema " + wrapperSchema.toString());
|
log.info("Wrapper schema " + wrapperSchema.toString());
|
||||||
List<IndexedRecord> records = new ArrayList<>();
|
List<IndexedRecord> records = new ArrayList<>();
|
||||||
@@ -247,6 +239,8 @@ public class HoodieCommitArchiveLog {
|
|||||||
archivedMetaWrapper.setActionType(ActionType.commit.name());
|
archivedMetaWrapper.setActionType(ActionType.commit.name());
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
default:
|
||||||
|
throw new UnsupportedOperationException("Action not fully supported yet");
|
||||||
}
|
}
|
||||||
return archivedMetaWrapper;
|
return archivedMetaWrapper;
|
||||||
}
|
}
|
||||||
@@ -256,9 +250,8 @@ public class HoodieCommitArchiveLog {
|
|||||||
ObjectMapper mapper = new ObjectMapper();
|
ObjectMapper mapper = new ObjectMapper();
|
||||||
//Need this to ignore other public get() methods
|
//Need this to ignore other public get() methods
|
||||||
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||||
com.uber.hoodie.avro.model.HoodieCommitMetadata avroMetaData =
|
com.uber.hoodie.avro.model.HoodieCommitMetadata avroMetaData = mapper
|
||||||
mapper.convertValue(hoodieCommitMetadata,
|
.convertValue(hoodieCommitMetadata, com.uber.hoodie.avro.model.HoodieCommitMetadata.class);
|
||||||
com.uber.hoodie.avro.model.HoodieCommitMetadata.class);
|
|
||||||
return avroMetaData;
|
return avroMetaData;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -49,8 +49,8 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOH
|
|||||||
private long recordsWritten = 0;
|
private long recordsWritten = 0;
|
||||||
private long recordsDeleted = 0;
|
private long recordsDeleted = 0;
|
||||||
|
|
||||||
public HoodieCreateHandle(HoodieWriteConfig config, String commitTime,
|
public HoodieCreateHandle(HoodieWriteConfig config, String commitTime, HoodieTable<T> hoodieTable,
|
||||||
HoodieTable<T> hoodieTable, String partitionPath) {
|
String partitionPath) {
|
||||||
super(config, commitTime, hoodieTable);
|
super(config, commitTime, hoodieTable);
|
||||||
this.status = ReflectionUtils.loadClass(config.getWriteStatusClassName());
|
this.status = ReflectionUtils.loadClass(config.getWriteStatusClassName());
|
||||||
status.setFileId(UUID.randomUUID().toString());
|
status.setFileId(UUID.randomUUID().toString());
|
||||||
@@ -64,13 +64,10 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOH
|
|||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(fs,
|
HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(fs, commitTime,
|
||||||
commitTime,
|
new Path(config.getBasePath()), new Path(config.getBasePath(), partitionPath));
|
||||||
new Path(config.getBasePath()),
|
|
||||||
new Path(config.getBasePath(), partitionPath));
|
|
||||||
partitionMetadata.trySave(TaskContext.getPartitionId());
|
partitionMetadata.trySave(TaskContext.getPartitionId());
|
||||||
this.storageWriter =
|
this.storageWriter = HoodieStorageWriterFactory
|
||||||
HoodieStorageWriterFactory
|
|
||||||
.getStorageWriter(commitTime, getStorageWriterPath(), hoodieTable, config, schema);
|
.getStorageWriter(commitTime, getStorageWriterPath(), hoodieTable, config, schema);
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
throw new HoodieInsertException(
|
throw new HoodieInsertException(
|
||||||
@@ -81,13 +78,12 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOH
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Determines whether we can accept the incoming records, into the current file, depending on
|
* Determines whether we can accept the incoming records, into the current file, depending on
|
||||||
*
|
* <p>
|
||||||
* - Whether it belongs to the same partitionPath as existing records - Whether the current file
|
* - Whether it belongs to the same partitionPath as existing records - Whether the current file
|
||||||
* written bytes lt max file size
|
* written bytes lt max file size
|
||||||
*/
|
*/
|
||||||
public boolean canWrite(HoodieRecord record) {
|
public boolean canWrite(HoodieRecord record) {
|
||||||
return storageWriter.canWrite() && record.getPartitionPath()
|
return storageWriter.canWrite() && record.getPartitionPath().equals(status.getPartitionPath());
|
||||||
.equals(status.getPartitionPath());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -111,7 +107,8 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOH
|
|||||||
recordsDeleted++;
|
recordsDeleted++;
|
||||||
}
|
}
|
||||||
status.markSuccess(record, recordMetadata);
|
status.markSuccess(record, recordMetadata);
|
||||||
// deflate record payload after recording success. This will help users access payload as a part of marking
|
// deflate record payload after recording success. This will help users access payload as a
|
||||||
|
// part of marking
|
||||||
// record successful.
|
// record successful.
|
||||||
record.deflate();
|
record.deflate();
|
||||||
} catch (Throwable t) {
|
} catch (Throwable t) {
|
||||||
@@ -126,8 +123,7 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOH
|
|||||||
* Performs actions to durably, persist the current changes and returns a WriteStatus object
|
* Performs actions to durably, persist the current changes and returns a WriteStatus object
|
||||||
*/
|
*/
|
||||||
public WriteStatus close() {
|
public WriteStatus close() {
|
||||||
logger.info(
|
logger.info("Closing the file " + status.getFileId() + " as we are done with all the records "
|
||||||
"Closing the file " + status.getFileId() + " as we are done with all the records "
|
|
||||||
+ recordsWritten);
|
+ recordsWritten);
|
||||||
try {
|
try {
|
||||||
storageWriter.close();
|
storageWriter.close();
|
||||||
@@ -144,8 +140,7 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOH
|
|||||||
|
|
||||||
return status;
|
return status;
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
throw new HoodieInsertException("Failed to close the Insert Handle for path " + path,
|
throw new HoodieInsertException("Failed to close the Insert Handle for path " + path, e);
|
||||||
e);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -39,11 +39,10 @@ public abstract class HoodieIOHandle<T extends HoodieRecordPayload> {
|
|||||||
protected final HoodieWriteConfig config;
|
protected final HoodieWriteConfig config;
|
||||||
protected final FileSystem fs;
|
protected final FileSystem fs;
|
||||||
protected final HoodieTable<T> hoodieTable;
|
protected final HoodieTable<T> hoodieTable;
|
||||||
protected HoodieTimeline hoodieTimeline;
|
|
||||||
protected final Schema schema;
|
protected final Schema schema;
|
||||||
|
protected HoodieTimeline hoodieTimeline;
|
||||||
|
|
||||||
public HoodieIOHandle(HoodieWriteConfig config, String commitTime,
|
public HoodieIOHandle(HoodieWriteConfig config, String commitTime, HoodieTable<T> hoodieTable) {
|
||||||
HoodieTable<T> hoodieTable) {
|
|
||||||
this.commitTime = commitTime;
|
this.commitTime = commitTime;
|
||||||
this.config = config;
|
this.config = config;
|
||||||
this.fs = hoodieTable.getMetaClient().getFs();
|
this.fs = hoodieTable.getMetaClient().getFs();
|
||||||
@@ -52,6 +51,32 @@ public abstract class HoodieIOHandle<T extends HoodieRecordPayload> {
|
|||||||
this.schema = createHoodieWriteSchema(config);
|
this.schema = createHoodieWriteSchema(config);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Deletes any new tmp files written during the current commit, into the partition
|
||||||
|
*/
|
||||||
|
public static void cleanupTmpFilesFromCurrentCommit(HoodieWriteConfig config, String commitTime,
|
||||||
|
String partitionPath, int taskPartitionId, HoodieTable hoodieTable) {
|
||||||
|
FileSystem fs = hoodieTable.getMetaClient().getFs();
|
||||||
|
try {
|
||||||
|
FileStatus[] prevFailedFiles = fs.globStatus(new Path(String
|
||||||
|
.format("%s/%s/%s", config.getBasePath(), partitionPath,
|
||||||
|
FSUtils.maskWithoutFileId(commitTime, taskPartitionId))));
|
||||||
|
if (prevFailedFiles != null) {
|
||||||
|
logger.info(
|
||||||
|
"Deleting " + prevFailedFiles.length + " files generated by previous failed attempts.");
|
||||||
|
for (FileStatus status : prevFailedFiles) {
|
||||||
|
fs.delete(status.getPath(), false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new HoodieIOException("Failed to cleanup Temp files from commit " + commitTime, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Schema createHoodieWriteSchema(HoodieWriteConfig config) {
|
||||||
|
return HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()));
|
||||||
|
}
|
||||||
|
|
||||||
public Path makeNewPath(String partitionPath, int taskPartitionId, String fileName) {
|
public Path makeNewPath(String partitionPath, int taskPartitionId, String fileName) {
|
||||||
Path path = new Path(config.getBasePath(), partitionPath);
|
Path path = new Path(config.getBasePath(), partitionPath);
|
||||||
try {
|
try {
|
||||||
@@ -72,37 +97,7 @@ public abstract class HoodieIOHandle<T extends HoodieRecordPayload> {
|
|||||||
taskAttemptId));
|
taskAttemptId));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Deletes any new tmp files written during the current commit, into the partition
|
|
||||||
*/
|
|
||||||
public static void cleanupTmpFilesFromCurrentCommit(HoodieWriteConfig config,
|
|
||||||
String commitTime,
|
|
||||||
String partitionPath,
|
|
||||||
int taskPartitionId,
|
|
||||||
HoodieTable hoodieTable) {
|
|
||||||
FileSystem fs = hoodieTable.getMetaClient().getFs();
|
|
||||||
try {
|
|
||||||
FileStatus[] prevFailedFiles = fs.globStatus(new Path(String
|
|
||||||
.format("%s/%s/%s", config.getBasePath(), partitionPath,
|
|
||||||
FSUtils.maskWithoutFileId(commitTime, taskPartitionId))));
|
|
||||||
if (prevFailedFiles != null) {
|
|
||||||
logger.info("Deleting " + prevFailedFiles.length
|
|
||||||
+ " files generated by previous failed attempts.");
|
|
||||||
for (FileStatus status : prevFailedFiles) {
|
|
||||||
fs.delete(status.getPath(), false);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (IOException e) {
|
|
||||||
throw new HoodieIOException("Failed to cleanup Temp files from commit " + commitTime,
|
|
||||||
e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public Schema getSchema() {
|
public Schema getSchema() {
|
||||||
return schema;
|
return schema;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Schema createHoodieWriteSchema(HoodieWriteConfig config) {
|
|
||||||
return HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -26,14 +26,18 @@ import com.uber.hoodie.common.table.TableFileSystemView;
|
|||||||
import com.uber.hoodie.common.util.FSUtils;
|
import com.uber.hoodie.common.util.FSUtils;
|
||||||
import com.uber.hoodie.common.util.ReflectionUtils;
|
import com.uber.hoodie.common.util.ReflectionUtils;
|
||||||
import com.uber.hoodie.common.util.collection.ExternalSpillableMap;
|
import com.uber.hoodie.common.util.collection.ExternalSpillableMap;
|
||||||
import com.uber.hoodie.common.util.collection.converter.StringConverter;
|
|
||||||
import com.uber.hoodie.common.util.collection.converter.HoodieRecordConverter;
|
import com.uber.hoodie.common.util.collection.converter.HoodieRecordConverter;
|
||||||
|
import com.uber.hoodie.common.util.collection.converter.StringConverter;
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.exception.HoodieIOException;
|
import com.uber.hoodie.exception.HoodieIOException;
|
||||||
import com.uber.hoodie.exception.HoodieUpsertException;
|
import com.uber.hoodie.exception.HoodieUpsertException;
|
||||||
import com.uber.hoodie.io.storage.HoodieStorageWriter;
|
import com.uber.hoodie.io.storage.HoodieStorageWriter;
|
||||||
import com.uber.hoodie.io.storage.HoodieStorageWriterFactory;
|
import com.uber.hoodie.io.storage.HoodieStorageWriterFactory;
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Optional;
|
||||||
import org.apache.avro.generic.GenericRecord;
|
import org.apache.avro.generic.GenericRecord;
|
||||||
import org.apache.avro.generic.IndexedRecord;
|
import org.apache.avro.generic.IndexedRecord;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
@@ -41,11 +45,6 @@ import org.apache.log4j.LogManager;
|
|||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.apache.spark.TaskContext;
|
import org.apache.spark.TaskContext;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Optional;
|
|
||||||
|
|
||||||
@SuppressWarnings("Duplicates")
|
@SuppressWarnings("Duplicates")
|
||||||
public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHandle<T> {
|
public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHandle<T> {
|
||||||
|
|
||||||
@@ -62,59 +61,46 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
|
|||||||
private long recordsDeleted = 0;
|
private long recordsDeleted = 0;
|
||||||
private long updatedRecordsWritten = 0;
|
private long updatedRecordsWritten = 0;
|
||||||
|
|
||||||
public HoodieMergeHandle(HoodieWriteConfig config,
|
public HoodieMergeHandle(HoodieWriteConfig config, String commitTime, HoodieTable<T> hoodieTable,
|
||||||
String commitTime,
|
Iterator<HoodieRecord<T>> recordItr, String fileId) {
|
||||||
HoodieTable<T> hoodieTable,
|
|
||||||
Iterator<HoodieRecord<T>> recordItr,
|
|
||||||
String fileId) {
|
|
||||||
super(config, commitTime, hoodieTable);
|
super(config, commitTime, hoodieTable);
|
||||||
this.fileSystemView = hoodieTable.getROFileSystemView();
|
this.fileSystemView = hoodieTable.getROFileSystemView();
|
||||||
init(fileId, init(fileId, recordItr));
|
init(fileId, init(fileId, recordItr));
|
||||||
}
|
}
|
||||||
|
|
||||||
public HoodieMergeHandle(HoodieWriteConfig config,
|
public HoodieMergeHandle(HoodieWriteConfig config, String commitTime, HoodieTable<T> hoodieTable,
|
||||||
String commitTime,
|
Map<String, HoodieRecord<T>> keyToNewRecords, String fileId) {
|
||||||
HoodieTable<T> hoodieTable,
|
|
||||||
Map<String, HoodieRecord<T>> keyToNewRecords,
|
|
||||||
String fileId) {
|
|
||||||
super(config, commitTime, hoodieTable);
|
super(config, commitTime, hoodieTable);
|
||||||
this.fileSystemView = hoodieTable.getROFileSystemView();
|
this.fileSystemView = hoodieTable.getROFileSystemView();
|
||||||
this.keyToNewRecords = keyToNewRecords;
|
this.keyToNewRecords = keyToNewRecords;
|
||||||
init(fileId, keyToNewRecords.get(keyToNewRecords.keySet().stream().findFirst().get()).getPartitionPath());
|
init(fileId, keyToNewRecords.get(keyToNewRecords.keySet().stream().findFirst().get())
|
||||||
|
.getPartitionPath());
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extract old file path, initialize StorageWriter and WriteStatus
|
* Extract old file path, initialize StorageWriter and WriteStatus
|
||||||
* @param fileId
|
|
||||||
* @param partitionPath
|
|
||||||
*/
|
*/
|
||||||
private void init(String fileId, String partitionPath) {
|
private void init(String fileId, String partitionPath) {
|
||||||
WriteStatus writeStatus = ReflectionUtils.loadClass(config.getWriteStatusClassName());
|
WriteStatus writeStatus = ReflectionUtils.loadClass(config.getWriteStatusClassName());
|
||||||
writeStatus.setStat(new HoodieWriteStat());
|
writeStatus.setStat(new HoodieWriteStat());
|
||||||
this.writeStatus = writeStatus;
|
this.writeStatus = writeStatus;
|
||||||
try {
|
try {
|
||||||
String latestValidFilePath = fileSystemView
|
String latestValidFilePath = fileSystemView.getLatestDataFiles(partitionPath)
|
||||||
.getLatestDataFiles(partitionPath)
|
.filter(dataFile -> dataFile.getFileId().equals(fileId)).findFirst().get().getFileName();
|
||||||
.filter(dataFile -> dataFile.getFileId().equals(fileId))
|
|
||||||
.findFirst()
|
|
||||||
.get().getFileName();
|
|
||||||
writeStatus.getStat().setPrevCommit(FSUtils.getCommitTime(latestValidFilePath));
|
writeStatus.getStat().setPrevCommit(FSUtils.getCommitTime(latestValidFilePath));
|
||||||
|
|
||||||
HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(fs,
|
HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(fs, commitTime,
|
||||||
commitTime,
|
new Path(config.getBasePath()), new Path(config.getBasePath(), partitionPath));
|
||||||
new Path(config.getBasePath()),
|
|
||||||
new Path(config.getBasePath(), partitionPath));
|
|
||||||
partitionMetadata.trySave(TaskContext.getPartitionId());
|
partitionMetadata.trySave(TaskContext.getPartitionId());
|
||||||
|
|
||||||
oldFilePath = new Path(
|
oldFilePath = new Path(
|
||||||
config.getBasePath() + "/" + partitionPath + "/"
|
config.getBasePath() + "/" + partitionPath + "/" + latestValidFilePath);
|
||||||
+ latestValidFilePath);
|
|
||||||
String relativePath = new Path(partitionPath + "/" + FSUtils
|
String relativePath = new Path(partitionPath + "/" + FSUtils
|
||||||
.makeDataFileName(commitTime, TaskContext.getPartitionId(), fileId)).toString();
|
.makeDataFileName(commitTime, TaskContext.getPartitionId(), fileId)).toString();
|
||||||
newFilePath = new Path(config.getBasePath(), relativePath);
|
newFilePath = new Path(config.getBasePath(), relativePath);
|
||||||
if (config.shouldUseTempFolderForCopyOnWriteForMerge()) {
|
if (config.shouldUseTempFolderForCopyOnWriteForMerge()) {
|
||||||
this.tempPath = makeTempPath(partitionPath, TaskContext.getPartitionId(),
|
this.tempPath = makeTempPath(partitionPath, TaskContext.getPartitionId(), fileId,
|
||||||
fileId, TaskContext.get().stageId(), TaskContext.get().taskAttemptId());
|
TaskContext.get().stageId(), TaskContext.get().taskAttemptId());
|
||||||
}
|
}
|
||||||
|
|
||||||
// handle cases of partial failures, for update task
|
// handle cases of partial failures, for update task
|
||||||
@@ -122,8 +108,9 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
|
|||||||
fs.delete(newFilePath, false);
|
fs.delete(newFilePath, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.info(String.format("Merging new data into oldPath %s, as newPath %s",
|
logger.info(String
|
||||||
oldFilePath.toString(), getStorageWriterPath().toString()));
|
.format("Merging new data into oldPath %s, as newPath %s", oldFilePath.toString(),
|
||||||
|
getStorageWriterPath().toString()));
|
||||||
// file name is same for all records, in this bunch
|
// file name is same for all records, in this bunch
|
||||||
writeStatus.setFileId(fileId);
|
writeStatus.setFileId(fileId);
|
||||||
writeStatus.setPartitionPath(partitionPath);
|
writeStatus.setPartitionPath(partitionPath);
|
||||||
@@ -143,9 +130,6 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Load the new incoming records in a map and return partitionPath
|
* Load the new incoming records in a map and return partitionPath
|
||||||
* @param fileId
|
|
||||||
* @param newRecordsItr
|
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
private String init(String fileId, Iterator<HoodieRecord<T>> newRecordsItr) {
|
private String init(String fileId, Iterator<HoodieRecord<T>> newRecordsItr) {
|
||||||
try {
|
try {
|
||||||
@@ -164,14 +148,14 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
|
|||||||
// update the new location of the record, so we know where to find it next
|
// update the new location of the record, so we know where to find it next
|
||||||
record.setNewLocation(new HoodieRecordLocation(commitTime, fileId));
|
record.setNewLocation(new HoodieRecordLocation(commitTime, fileId));
|
||||||
}
|
}
|
||||||
logger.debug("Number of entries in MemoryBasedMap => " +
|
logger.debug("Number of entries in MemoryBasedMap => "
|
||||||
((ExternalSpillableMap) keyToNewRecords).getInMemoryMapNumEntries()
|
+ ((ExternalSpillableMap) keyToNewRecords).getInMemoryMapNumEntries()
|
||||||
+ "Total size in bytes of MemoryBasedMap => " +
|
+ "Total size in bytes of MemoryBasedMap => "
|
||||||
((ExternalSpillableMap) keyToNewRecords).getCurrentInMemoryMapSize()
|
+ ((ExternalSpillableMap) keyToNewRecords).getCurrentInMemoryMapSize()
|
||||||
+ "Number of entries in DiskBasedMap => " +
|
+ "Number of entries in DiskBasedMap => "
|
||||||
((ExternalSpillableMap) keyToNewRecords).getDiskBasedMapNumEntries()
|
+ ((ExternalSpillableMap) keyToNewRecords).getDiskBasedMapNumEntries()
|
||||||
+ "Size of file spilled to disk => " +
|
+ "Size of file spilled to disk => "
|
||||||
((ExternalSpillableMap) keyToNewRecords).getSizeOfFileOnDiskInBytes());
|
+ ((ExternalSpillableMap) keyToNewRecords).getSizeOfFileOnDiskInBytes());
|
||||||
|
|
||||||
return partitionPath;
|
return partitionPath;
|
||||||
}
|
}
|
||||||
@@ -189,7 +173,8 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
|
|||||||
}
|
}
|
||||||
|
|
||||||
writeStatus.markSuccess(hoodieRecord, recordMetadata);
|
writeStatus.markSuccess(hoodieRecord, recordMetadata);
|
||||||
// deflate record payload after recording success. This will help users access payload as a part of marking
|
// deflate record payload after recording success. This will help users access payload as a
|
||||||
|
// part of marking
|
||||||
// record successful.
|
// record successful.
|
||||||
hoodieRecord.deflate();
|
hoodieRecord.deflate();
|
||||||
return true;
|
return true;
|
||||||
@@ -201,8 +186,7 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Go through an old record. Here if we detect a newer version shows up, we write the new one to
|
* Go through an old record. Here if we detect a newer version shows up, we write the new one to the file.
|
||||||
* the file.
|
|
||||||
*/
|
*/
|
||||||
public void write(GenericRecord oldRecord) {
|
public void write(GenericRecord oldRecord) {
|
||||||
String key = oldRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
|
String key = oldRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
|
||||||
@@ -236,8 +220,7 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
|
|||||||
try {
|
try {
|
||||||
storageWriter.writeAvro(key, oldRecord);
|
storageWriter.writeAvro(key, oldRecord);
|
||||||
} catch (ClassCastException e) {
|
} catch (ClassCastException e) {
|
||||||
logger.error(
|
logger.error("Schema mismatch when rewriting old record " + oldRecord + " from file "
|
||||||
"Schema mismatch when rewriting old record " + oldRecord + " from file "
|
|
||||||
+ getOldFilePath() + " to file " + getStorageWriterPath() + " with schema " + schema
|
+ getOldFilePath() + " to file " + getStorageWriterPath() + " with schema " + schema
|
||||||
.toString(true));
|
.toString(true));
|
||||||
throw new HoodieUpsertException(errMsg, e);
|
throw new HoodieUpsertException(errMsg, e);
|
||||||
|
|||||||
@@ -53,8 +53,8 @@ public class CompactionOperation implements Serializable {
|
|||||||
this.partitionPath = partitionPath;
|
this.partitionPath = partitionPath;
|
||||||
this.dataFileCommitTime = dataFile.getCommitTime();
|
this.dataFileCommitTime = dataFile.getCommitTime();
|
||||||
this.dataFileSize = dataFile.getFileSize();
|
this.dataFileSize = dataFile.getFileSize();
|
||||||
this.deltaFilePaths = logFiles.stream().map(s -> s.getPath().toString()).collect(
|
this.deltaFilePaths = logFiles.stream().map(s -> s.getPath().toString())
|
||||||
Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
this.metrics = writeConfig.getCompactionStrategy()
|
this.metrics = writeConfig.getCompactionStrategy()
|
||||||
.captureMetrics(dataFile, partitionPath, logFiles);
|
.captureMetrics(dataFile, partitionPath, logFiles);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,17 +17,15 @@
|
|||||||
package com.uber.hoodie.io.compact;
|
package com.uber.hoodie.io.compact;
|
||||||
|
|
||||||
import com.uber.hoodie.WriteStatus;
|
import com.uber.hoodie.WriteStatus;
|
||||||
import com.uber.hoodie.common.model.HoodieCommitMetadata;
|
|
||||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||||
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
|
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
|
||||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.Date;
|
import java.util.Date;
|
||||||
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A HoodieCompactor runs compaction on a hoodie table
|
* A HoodieCompactor runs compaction on a hoodie table
|
||||||
|
|||||||
@@ -17,6 +17,7 @@
|
|||||||
package com.uber.hoodie.io.compact;
|
package com.uber.hoodie.io.compact;
|
||||||
|
|
||||||
import static java.util.stream.Collectors.toList;
|
import static java.util.stream.Collectors.toList;
|
||||||
|
|
||||||
import com.google.common.base.Preconditions;
|
import com.google.common.base.Preconditions;
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
@@ -70,9 +71,8 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private JavaRDD<WriteStatus> executeCompaction(JavaSparkContext jsc,
|
private JavaRDD<WriteStatus> executeCompaction(JavaSparkContext jsc,
|
||||||
List<CompactionOperation> operations,
|
List<CompactionOperation> operations, HoodieTable hoodieTable, HoodieWriteConfig config,
|
||||||
HoodieTable hoodieTable,
|
String compactionCommitTime) throws IOException {
|
||||||
HoodieWriteConfig config, String compactionCommitTime) throws IOException {
|
|
||||||
|
|
||||||
log.info("After filtering, Compacting " + operations + " files");
|
log.info("After filtering, Compacting " + operations + " files");
|
||||||
return jsc.parallelize(operations, operations.size())
|
return jsc.parallelize(operations, operations.size())
|
||||||
@@ -80,18 +80,19 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
|
|||||||
.flatMap(writeStatusesItr -> writeStatusesItr.iterator());
|
.flatMap(writeStatusesItr -> writeStatusesItr.iterator());
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<WriteStatus> compact(HoodieTable hoodieTable,
|
private List<WriteStatus> compact(HoodieTable hoodieTable, HoodieWriteConfig config,
|
||||||
HoodieWriteConfig config, CompactionOperation operation, String commitTime)
|
CompactionOperation operation, String commitTime) throws IOException {
|
||||||
throws IOException {
|
|
||||||
FileSystem fs = hoodieTable.getMetaClient().getFs();
|
FileSystem fs = hoodieTable.getMetaClient().getFs();
|
||||||
Schema readerSchema =
|
Schema readerSchema = HoodieAvroUtils
|
||||||
HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()));
|
.addMetadataFields(new Schema.Parser().parse(config.getSchema()));
|
||||||
|
|
||||||
log.info("Compacting base " + operation.getDataFilePath() + " with delta files " + operation
|
log.info("Compacting base " + operation.getDataFilePath() + " with delta files " + operation
|
||||||
.getDeltaFilePaths() + " for commit " + commitTime);
|
.getDeltaFilePaths() + " for commit " + commitTime);
|
||||||
// TODO - FIX THIS
|
// TODO - FIX THIS
|
||||||
// Reads the entire avro file. Always only specific blocks should be read from the avro file (failure recover).
|
// Reads the entire avro file. Always only specific blocks should be read from the avro file
|
||||||
// Load all the delta commits since the last compaction commit and get all the blocks to be loaded and load it using CompositeAvroLogReader
|
// (failure recover).
|
||||||
|
// Load all the delta commits since the last compaction commit and get all the blocks to be
|
||||||
|
// loaded and load it using CompositeAvroLogReader
|
||||||
// Since a DeltaCommit is not defined yet, reading all the records. revisit this soon.
|
// Since a DeltaCommit is not defined yet, reading all the records. revisit this soon.
|
||||||
HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
|
HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
|
||||||
String maxInstantTime = metaClient.getActiveTimeline()
|
String maxInstantTime = metaClient.getActiveTimeline()
|
||||||
@@ -114,25 +115,22 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
|
|||||||
Iterator<List<WriteStatus>> result = table
|
Iterator<List<WriteStatus>> result = table
|
||||||
.handleUpdate(commitTime, operation.getFileId(), scanner.getRecords());
|
.handleUpdate(commitTime, operation.getFileId(), scanner.getRecords());
|
||||||
Iterable<List<WriteStatus>> resultIterable = () -> result;
|
Iterable<List<WriteStatus>> resultIterable = () -> result;
|
||||||
return StreamSupport.stream(resultIterable.spliterator(), false)
|
return StreamSupport.stream(resultIterable.spliterator(), false).flatMap(Collection::stream)
|
||||||
.flatMap(Collection::stream)
|
|
||||||
.map(s -> {
|
.map(s -> {
|
||||||
s.getStat().setTotalRecordsToBeUpdate(scanner.getTotalRecordsToUpdate());
|
s.getStat().setTotalRecordsToBeUpdate(scanner.getTotalRecordsToUpdate());
|
||||||
s.getStat().setTotalLogFiles(scanner.getTotalLogFiles());
|
s.getStat().setTotalLogFiles(scanner.getTotalLogFiles());
|
||||||
s.getStat().setTotalLogRecords(scanner.getTotalLogRecords());
|
s.getStat().setTotalLogRecords(scanner.getTotalLogRecords());
|
||||||
s.getStat().setPartitionPath(operation.getPartitionPath());
|
s.getStat().setPartitionPath(operation.getPartitionPath());
|
||||||
return s;
|
return s;
|
||||||
})
|
}).collect(toList());
|
||||||
.collect(toList());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<CompactionOperation> getCompactionWorkload(JavaSparkContext jsc,
|
private List<CompactionOperation> getCompactionWorkload(JavaSparkContext jsc,
|
||||||
HoodieTable hoodieTable,
|
HoodieTable hoodieTable, HoodieWriteConfig config, String compactionCommitTime)
|
||||||
HoodieWriteConfig config, String compactionCommitTime)
|
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
|
||||||
Preconditions.checkArgument(
|
Preconditions
|
||||||
hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ,
|
.checkArgument(hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ,
|
||||||
"HoodieRealtimeTableCompactor can only compact table of type "
|
"HoodieRealtimeTableCompactor can only compact table of type "
|
||||||
+ HoodieTableType.MERGE_ON_READ + " and not " + hoodieTable.getMetaClient()
|
+ HoodieTableType.MERGE_ON_READ + " and not " + hoodieTable.getMetaClient()
|
||||||
.getTableType().name());
|
.getTableType().name());
|
||||||
@@ -141,25 +139,23 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
|
|||||||
// TODO - rollback any compactions in flight
|
// TODO - rollback any compactions in flight
|
||||||
HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
|
HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
|
||||||
log.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommitTime);
|
log.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommitTime);
|
||||||
List<String> partitionPaths =
|
List<String> partitionPaths = FSUtils
|
||||||
FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(),
|
.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(),
|
||||||
config.shouldAssumeDatePartitioning());
|
config.shouldAssumeDatePartitioning());
|
||||||
|
|
||||||
TableFileSystemView.RealtimeView fileSystemView = hoodieTable.getRTFileSystemView();
|
TableFileSystemView.RealtimeView fileSystemView = hoodieTable.getRTFileSystemView();
|
||||||
log.info("Compaction looking for files to compact in " + partitionPaths + " partitions");
|
log.info("Compaction looking for files to compact in " + partitionPaths + " partitions");
|
||||||
List<CompactionOperation> operations =
|
List<CompactionOperation> operations = jsc.parallelize(partitionPaths, partitionPaths.size())
|
||||||
jsc.parallelize(partitionPaths, partitionPaths.size())
|
|
||||||
.flatMap((FlatMapFunction<String, CompactionOperation>) partitionPath -> fileSystemView
|
.flatMap((FlatMapFunction<String, CompactionOperation>) partitionPath -> fileSystemView
|
||||||
.getLatestFileSlices(partitionPath)
|
.getLatestFileSlices(partitionPath).map(
|
||||||
.map(s -> new CompactionOperation(s.getDataFile().get(),
|
s -> new CompactionOperation(s.getDataFile().get(), partitionPath,
|
||||||
partitionPath,
|
|
||||||
s.getLogFiles().sorted(HoodieLogFile.getLogVersionComparator().reversed())
|
s.getLogFiles().sorted(HoodieLogFile.getLogVersionComparator().reversed())
|
||||||
.collect(Collectors.toList()), config))
|
.collect(Collectors.toList()), config))
|
||||||
.filter(c -> !c.getDeltaFilePaths().isEmpty())
|
.filter(c -> !c.getDeltaFilePaths().isEmpty()).collect(toList()).iterator()).collect();
|
||||||
.collect(toList()).iterator()).collect();
|
|
||||||
log.info("Total of " + operations.size() + " compactions are retrieved");
|
log.info("Total of " + operations.size() + " compactions are retrieved");
|
||||||
|
|
||||||
// Filter the compactions with the passed in filter. This lets us choose most effective compactions only
|
// Filter the compactions with the passed in filter. This lets us choose most effective
|
||||||
|
// compactions only
|
||||||
operations = config.getCompactionStrategy().orderAndFilter(config, operations);
|
operations = config.getCompactionStrategy().orderAndFilter(config, operations);
|
||||||
if (operations.isEmpty()) {
|
if (operations.isEmpty()) {
|
||||||
log.warn("After filtering, Nothing to compact for " + metaClient.getBasePath());
|
log.warn("After filtering, Nothing to compact for " + metaClient.getBasePath());
|
||||||
|
|||||||
@@ -44,9 +44,9 @@ public class BoundedIOCompactionStrategy implements CompactionStrategy {
|
|||||||
List<HoodieLogFile> logFiles) {
|
List<HoodieLogFile> logFiles) {
|
||||||
Map<String, Object> metrics = Maps.newHashMap();
|
Map<String, Object> metrics = Maps.newHashMap();
|
||||||
// Total size of all the log files
|
// Total size of all the log files
|
||||||
Long totalLogFileSize = logFiles.stream().map(HoodieLogFile::getFileSize).filter(
|
Long totalLogFileSize = logFiles.stream().map(HoodieLogFile::getFileSize)
|
||||||
Optional::isPresent).map(Optional::get).reduce(
|
.filter(Optional::isPresent).map(Optional::get).reduce((size1, size2) -> size1 + size2)
|
||||||
(size1, size2) -> size1 + size2).orElse(0L);
|
.orElse(0L);
|
||||||
// Total read will be the base file + all the log files
|
// Total read will be the base file + all the log files
|
||||||
Long totalIORead = FSUtils.getSizeInMB(dataFile.getFileSize() + totalLogFileSize);
|
Long totalIORead = FSUtils.getSizeInMB(dataFile.getFileSize() + totalLogFileSize);
|
||||||
// Total write will be similar to the size of the base file
|
// Total write will be similar to the size of the base file
|
||||||
@@ -64,7 +64,8 @@ public class BoundedIOCompactionStrategy implements CompactionStrategy {
|
|||||||
@Override
|
@Override
|
||||||
public List<CompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig,
|
public List<CompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig,
|
||||||
List<CompactionOperation> operations) {
|
List<CompactionOperation> operations) {
|
||||||
// Iterate through the operations in order and accept operations as long as we are within the IO limit
|
// Iterate through the operations in order and accept operations as long as we are within the
|
||||||
|
// IO limit
|
||||||
// Preserves the original ordering of compactions
|
// Preserves the original ordering of compactions
|
||||||
List<CompactionOperation> finalOperations = Lists.newArrayList();
|
List<CompactionOperation> finalOperations = Lists.newArrayList();
|
||||||
long targetIORemaining = writeConfig.getTargetIOPerCompactionInMB();
|
long targetIORemaining = writeConfig.getTargetIOPerCompactionInMB();
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ import java.util.Map;
|
|||||||
* Strategy for compaction. Pluggable implementation of define how compaction should be done. The
|
* Strategy for compaction. Pluggable implementation of define how compaction should be done. The
|
||||||
* implementations of this interface can capture the relevant metrics to order and filter the final
|
* implementations of this interface can capture the relevant metrics to order and filter the final
|
||||||
* list of compaction operation to run in a single compaction.
|
* list of compaction operation to run in a single compaction.
|
||||||
*
|
* <p>
|
||||||
* Implementation of CompactionStrategy cannot hold any state. Difference instantiations can be
|
* Implementation of CompactionStrategy cannot hold any state. Difference instantiations can be
|
||||||
* passed in every time
|
* passed in every time
|
||||||
*
|
*
|
||||||
|
|||||||
@@ -20,7 +20,6 @@ package com.uber.hoodie.io.compact.strategy;
|
|||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.exception.HoodieException;
|
import com.uber.hoodie.exception.HoodieException;
|
||||||
import com.uber.hoodie.io.compact.CompactionOperation;
|
import com.uber.hoodie.io.compact.CompactionOperation;
|
||||||
|
|
||||||
import java.text.ParseException;
|
import java.text.ParseException;
|
||||||
import java.text.SimpleDateFormat;
|
import java.text.SimpleDateFormat;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
@@ -30,15 +29,16 @@ import java.util.Locale;
|
|||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This strategy orders compactions in reverse order of creation of Hive Partitions.
|
* This strategy orders compactions in reverse order of creation of Hive Partitions. It helps to
|
||||||
* It helps to compact data in latest partitions first and then older capped at the Total_IO allowed.
|
* compact data in latest partitions first and then older capped at the Total_IO allowed.
|
||||||
*/
|
*/
|
||||||
public class DayBasedCompactionStrategy extends BoundedIOCompactionStrategy {
|
public class DayBasedCompactionStrategy extends BoundedIOCompactionStrategy {
|
||||||
|
|
||||||
// For now, use SimpleDateFormat as default partition format
|
// For now, use SimpleDateFormat as default partition format
|
||||||
private static String datePartitionFormat = "yyyy/MM/dd";
|
private static String datePartitionFormat = "yyyy/MM/dd";
|
||||||
// Sorts compaction in LastInFirstCompacted order
|
// Sorts compaction in LastInFirstCompacted order
|
||||||
private static Comparator<CompactionOperation> comparator = (CompactionOperation leftC, CompactionOperation rightC) -> {
|
private static Comparator<CompactionOperation> comparator = (CompactionOperation leftC,
|
||||||
|
CompactionOperation rightC) -> {
|
||||||
try {
|
try {
|
||||||
Date left = new SimpleDateFormat(datePartitionFormat, Locale.ENGLISH)
|
Date left = new SimpleDateFormat(datePartitionFormat, Locale.ENGLISH)
|
||||||
.parse(leftC.getPartitionPath());
|
.parse(leftC.getPartitionPath());
|
||||||
@@ -55,8 +55,10 @@ public class DayBasedCompactionStrategy extends BoundedIOCompactionStrategy {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<CompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig, List<CompactionOperation> operations) {
|
public List<CompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig,
|
||||||
|
List<CompactionOperation> operations) {
|
||||||
// Iterate through the operations and accept operations as long as we are within the IO limit
|
// Iterate through the operations and accept operations as long as we are within the IO limit
|
||||||
return super.orderAndFilter(writeConfig, operations.stream().sorted(comparator).collect(Collectors.toList()));
|
return super.orderAndFilter(writeConfig,
|
||||||
|
operations.stream().sorted(comparator).collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -44,9 +44,9 @@ public class LogFileSizeBasedCompactionStrategy extends BoundedIOCompactionStrat
|
|||||||
|
|
||||||
Map<String, Object> metrics = super.captureMetrics(dataFile, partitionPath, logFiles);
|
Map<String, Object> metrics = super.captureMetrics(dataFile, partitionPath, logFiles);
|
||||||
// Total size of all the log files
|
// Total size of all the log files
|
||||||
Long totalLogFileSize = logFiles.stream().map(HoodieLogFile::getFileSize).filter(
|
Long totalLogFileSize = logFiles.stream().map(HoodieLogFile::getFileSize)
|
||||||
Optional::isPresent).map(Optional::get).reduce(
|
.filter(Optional::isPresent).map(Optional::get).reduce((size1, size2) -> size1 + size2)
|
||||||
(size1, size2) -> size1 + size2).orElse(0L);
|
.orElse(0L);
|
||||||
// save the metrics needed during the order
|
// save the metrics needed during the order
|
||||||
metrics.put(TOTAL_LOG_FILE_SIZE, totalLogFileSize);
|
metrics.put(TOTAL_LOG_FILE_SIZE, totalLogFileSize);
|
||||||
return metrics;
|
return metrics;
|
||||||
|
|||||||
@@ -36,8 +36,8 @@ import org.apache.spark.TaskContext;
|
|||||||
* HoodieParquetWriter extends the ParquetWriter to help limit the size of underlying file. Provides
|
* HoodieParquetWriter extends the ParquetWriter to help limit the size of underlying file. Provides
|
||||||
* a way to check if the current file can take more records with the <code>canWrite()</code>
|
* a way to check if the current file can take more records with the <code>canWrite()</code>
|
||||||
*/
|
*/
|
||||||
public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends IndexedRecord>
|
public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends IndexedRecord> extends
|
||||||
extends ParquetWriter<IndexedRecord> implements HoodieStorageWriter<R> {
|
ParquetWriter<IndexedRecord> implements HoodieStorageWriter<R> {
|
||||||
|
|
||||||
private static AtomicLong recordIndex = new AtomicLong(1);
|
private static AtomicLong recordIndex = new AtomicLong(1);
|
||||||
|
|
||||||
@@ -49,6 +49,29 @@ public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends Indexe
|
|||||||
private final Schema schema;
|
private final Schema schema;
|
||||||
|
|
||||||
|
|
||||||
|
public HoodieParquetWriter(String commitTime, Path file, HoodieParquetConfig parquetConfig,
|
||||||
|
Schema schema) throws IOException {
|
||||||
|
super(HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()),
|
||||||
|
ParquetFileWriter.Mode.CREATE, parquetConfig.getWriteSupport(),
|
||||||
|
parquetConfig.getCompressionCodecName(), parquetConfig.getBlockSize(),
|
||||||
|
parquetConfig.getPageSize(), parquetConfig.getPageSize(),
|
||||||
|
ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED, ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED,
|
||||||
|
ParquetWriter.DEFAULT_WRITER_VERSION,
|
||||||
|
registerFileSystem(file, parquetConfig.getHadoopConf()));
|
||||||
|
this.file = HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf());
|
||||||
|
this.fs = (HoodieWrapperFileSystem) this.file
|
||||||
|
.getFileSystem(registerFileSystem(file, parquetConfig.getHadoopConf()));
|
||||||
|
// We cannot accurately measure the snappy compressed output file size. We are choosing a
|
||||||
|
// conservative 10%
|
||||||
|
// TODO - compute this compression ratio dynamically by looking at the bytes written to the
|
||||||
|
// stream and the actual file size reported by HDFS
|
||||||
|
this.maxFileSize = parquetConfig.getMaxFileSize() + Math
|
||||||
|
.round(parquetConfig.getMaxFileSize() * parquetConfig.getCompressionRatio());
|
||||||
|
this.writeSupport = parquetConfig.getWriteSupport();
|
||||||
|
this.commitTime = commitTime;
|
||||||
|
this.schema = schema;
|
||||||
|
}
|
||||||
|
|
||||||
private static Configuration registerFileSystem(Path file, Configuration conf) {
|
private static Configuration registerFileSystem(Path file, Configuration conf) {
|
||||||
Configuration returnConf = new Configuration(conf);
|
Configuration returnConf = new Configuration(conf);
|
||||||
String scheme = FSUtils.getFs(file.toString(), conf).getScheme();
|
String scheme = FSUtils.getFs(file.toString(), conf).getScheme();
|
||||||
@@ -57,37 +80,12 @@ public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends Indexe
|
|||||||
return returnConf;
|
return returnConf;
|
||||||
}
|
}
|
||||||
|
|
||||||
public HoodieParquetWriter(String commitTime, Path file,
|
|
||||||
HoodieParquetConfig parquetConfig, Schema schema) throws IOException {
|
|
||||||
super(HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()),
|
|
||||||
ParquetFileWriter.Mode.CREATE, parquetConfig.getWriteSupport(),
|
|
||||||
parquetConfig.getCompressionCodecName(), parquetConfig.getBlockSize(),
|
|
||||||
parquetConfig.getPageSize(), parquetConfig.getPageSize(),
|
|
||||||
ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED,
|
|
||||||
ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED, ParquetWriter.DEFAULT_WRITER_VERSION,
|
|
||||||
registerFileSystem(file, parquetConfig.getHadoopConf()));
|
|
||||||
this.file =
|
|
||||||
HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf());
|
|
||||||
this.fs = (HoodieWrapperFileSystem) this.file
|
|
||||||
.getFileSystem(registerFileSystem(file, parquetConfig.getHadoopConf()));
|
|
||||||
// We cannot accurately measure the snappy compressed output file size. We are choosing a conservative 10%
|
|
||||||
// TODO - compute this compression ratio dynamically by looking at the bytes written to the stream and the actual file size reported by HDFS
|
|
||||||
this.maxFileSize = parquetConfig.getMaxFileSize() + Math
|
|
||||||
.round(parquetConfig.getMaxFileSize() * parquetConfig.getCompressionRatio());
|
|
||||||
this.writeSupport = parquetConfig.getWriteSupport();
|
|
||||||
this.commitTime = commitTime;
|
|
||||||
this.schema = schema;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void writeAvroWithMetadata(R avroRecord, HoodieRecord record) throws IOException {
|
public void writeAvroWithMetadata(R avroRecord, HoodieRecord record) throws IOException {
|
||||||
String seqId = HoodieRecord.generateSequenceId(commitTime, TaskContext.getPartitionId(),
|
String seqId = HoodieRecord.generateSequenceId(commitTime, TaskContext.getPartitionId(),
|
||||||
recordIndex.getAndIncrement());
|
recordIndex.getAndIncrement());
|
||||||
HoodieAvroUtils.addHoodieKeyToRecord((GenericRecord) avroRecord,
|
HoodieAvroUtils.addHoodieKeyToRecord((GenericRecord) avroRecord, record.getRecordKey(),
|
||||||
record.getRecordKey(),
|
record.getPartitionPath(), file.getName());
|
||||||
record.getPartitionPath(),
|
|
||||||
file.getName());
|
|
||||||
HoodieAvroUtils.addCommitMetadataToRecord((GenericRecord) avroRecord, commitTime, seqId);
|
HoodieAvroUtils.addCommitMetadataToRecord((GenericRecord) avroRecord, commitTime, seqId);
|
||||||
super.write(avroRecord);
|
super.write(avroRecord);
|
||||||
writeSupport.add(record.getRecordKey());
|
writeSupport.add(record.getRecordKey());
|
||||||
|
|||||||
@@ -30,22 +30,22 @@ import org.apache.parquet.hadoop.metadata.CompressionCodecName;
|
|||||||
|
|
||||||
public class HoodieStorageWriterFactory {
|
public class HoodieStorageWriterFactory {
|
||||||
|
|
||||||
public static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieStorageWriter<R> getStorageWriter(
|
public static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieStorageWriter<R>
|
||||||
String commitTime, Path path, HoodieTable<T> hoodieTable, HoodieWriteConfig config,
|
getStorageWriter(String commitTime, Path path, HoodieTable<T> hoodieTable,
|
||||||
Schema schema)
|
HoodieWriteConfig config, Schema schema) throws IOException {
|
||||||
throws IOException {
|
|
||||||
//TODO - based on the metadata choose the implementation of HoodieStorageWriter
|
//TODO - based on the metadata choose the implementation of HoodieStorageWriter
|
||||||
// Currently only parquet is supported
|
// Currently only parquet is supported
|
||||||
return newParquetStorageWriter(commitTime, path, config, schema, hoodieTable);
|
return newParquetStorageWriter(commitTime, path, config, schema, hoodieTable);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieStorageWriter<R> newParquetStorageWriter(
|
private static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieStorageWriter<R>
|
||||||
|
newParquetStorageWriter(
|
||||||
String commitTime, Path path, HoodieWriteConfig config, Schema schema,
|
String commitTime, Path path, HoodieWriteConfig config, Schema schema,
|
||||||
HoodieTable hoodieTable) throws IOException {
|
HoodieTable hoodieTable) throws IOException {
|
||||||
BloomFilter filter =
|
BloomFilter filter = new BloomFilter(config.getBloomFilterNumEntries(),
|
||||||
new BloomFilter(config.getBloomFilterNumEntries(), config.getBloomFilterFPP());
|
config.getBloomFilterFPP());
|
||||||
HoodieAvroWriteSupport writeSupport =
|
HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(
|
||||||
new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter);
|
new AvroSchemaConverter().convert(schema), schema, filter);
|
||||||
|
|
||||||
HoodieParquetConfig parquetConfig =
|
HoodieParquetConfig parquetConfig =
|
||||||
new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP,
|
new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP,
|
||||||
|
|||||||
@@ -59,8 +59,8 @@ import org.apache.hadoop.util.Progressable;
|
|||||||
*/
|
*/
|
||||||
public class HoodieWrapperFileSystem extends FileSystem {
|
public class HoodieWrapperFileSystem extends FileSystem {
|
||||||
|
|
||||||
private static final Set<String> SUPPORT_SCHEMES;
|
|
||||||
public static final String HOODIE_SCHEME_PREFIX = "hoodie-";
|
public static final String HOODIE_SCHEME_PREFIX = "hoodie-";
|
||||||
|
private static final Set<String> SUPPORT_SCHEMES;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
SUPPORT_SCHEMES = new HashSet<>();
|
SUPPORT_SCHEMES = new HashSet<>();
|
||||||
@@ -69,18 +69,50 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
|||||||
SUPPORT_SCHEMES.add("s3");
|
SUPPORT_SCHEMES.add("s3");
|
||||||
SUPPORT_SCHEMES.add("s3a");
|
SUPPORT_SCHEMES.add("s3a");
|
||||||
|
|
||||||
|
|
||||||
// Hoodie currently relies on underlying object store being fully
|
// Hoodie currently relies on underlying object store being fully
|
||||||
// consistent so only regional buckets should be used.
|
// consistent so only regional buckets should be used.
|
||||||
SUPPORT_SCHEMES.add("gs");
|
SUPPORT_SCHEMES.add("gs");
|
||||||
SUPPORT_SCHEMES.add("viewfs");
|
SUPPORT_SCHEMES.add("viewfs");
|
||||||
}
|
}
|
||||||
|
|
||||||
private ConcurrentMap<String, SizeAwareFSDataOutputStream> openStreams =
|
private ConcurrentMap<String, SizeAwareFSDataOutputStream> openStreams = new
|
||||||
new ConcurrentHashMap<>();
|
ConcurrentHashMap<>();
|
||||||
private FileSystem fileSystem;
|
private FileSystem fileSystem;
|
||||||
private URI uri;
|
private URI uri;
|
||||||
|
|
||||||
|
public static Path convertToHoodiePath(Path file, Configuration conf) {
|
||||||
|
try {
|
||||||
|
String scheme = FSUtils.getFs(file.toString(), conf).getScheme();
|
||||||
|
return convertPathWithScheme(file, getHoodieScheme(scheme));
|
||||||
|
} catch (HoodieIOException e) {
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Path convertPathWithScheme(Path oldPath, String newScheme) {
|
||||||
|
URI oldURI = oldPath.toUri();
|
||||||
|
URI newURI;
|
||||||
|
try {
|
||||||
|
newURI = new URI(newScheme, oldURI.getUserInfo(), oldURI.getHost(), oldURI.getPort(),
|
||||||
|
oldURI.getPath(), oldURI.getQuery(), oldURI.getFragment());
|
||||||
|
return new Path(newURI);
|
||||||
|
} catch (URISyntaxException e) {
|
||||||
|
// TODO - Better Exception handling
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String getHoodieScheme(String scheme) {
|
||||||
|
String newScheme;
|
||||||
|
if (SUPPORT_SCHEMES.contains(scheme)) {
|
||||||
|
newScheme = HOODIE_SCHEME_PREFIX + scheme;
|
||||||
|
} else {
|
||||||
|
throw new IllegalArgumentException(
|
||||||
|
"BlockAlignedAvroParquetWriter does not support scheme " + scheme);
|
||||||
|
}
|
||||||
|
return newScheme;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void initialize(URI uri, Configuration conf) throws IOException {
|
public void initialize(URI uri, Configuration conf) throws IOException {
|
||||||
// Get the default filesystem to decorate
|
// Get the default filesystem to decorate
|
||||||
@@ -90,7 +122,8 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
|||||||
path = new Path(path.toString().replace(HOODIE_SCHEME_PREFIX, ""));
|
path = new Path(path.toString().replace(HOODIE_SCHEME_PREFIX, ""));
|
||||||
}
|
}
|
||||||
this.fileSystem = FSUtils.getFs(path.toString(), conf);
|
this.fileSystem = FSUtils.getFs(path.toString(), conf);
|
||||||
// Do not need to explicitly initialize the default filesystem, its done already in the above FileSystem.get
|
// Do not need to explicitly initialize the default filesystem, its done already in the above
|
||||||
|
// FileSystem.get
|
||||||
// fileSystem.initialize(FileSystem.getDefaultUri(conf), conf);
|
// fileSystem.initialize(FileSystem.getDefaultUri(conf), conf);
|
||||||
// fileSystem.setConf(conf);
|
// fileSystem.setConf(conf);
|
||||||
this.uri = uri;
|
this.uri = uri;
|
||||||
@@ -108,8 +141,7 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public FSDataOutputStream create(Path f, FsPermission permission, boolean overwrite,
|
public FSDataOutputStream create(Path f, FsPermission permission, boolean overwrite,
|
||||||
int bufferSize, short replication, long blockSize, Progressable progress)
|
int bufferSize, short replication, long blockSize, Progressable progress) throws IOException {
|
||||||
throws IOException {
|
|
||||||
final Path translatedPath = convertToDefaultPath(f);
|
final Path translatedPath = convertToDefaultPath(f);
|
||||||
return wrapOutputStream(f, fileSystem
|
return wrapOutputStream(f, fileSystem
|
||||||
.create(translatedPath, permission, overwrite, bufferSize, replication, blockSize,
|
.create(translatedPath, permission, overwrite, bufferSize, replication, blockSize,
|
||||||
@@ -122,8 +154,8 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
|||||||
return fsDataOutputStream;
|
return fsDataOutputStream;
|
||||||
}
|
}
|
||||||
|
|
||||||
SizeAwareFSDataOutputStream os =
|
SizeAwareFSDataOutputStream os = new SizeAwareFSDataOutputStream(fsDataOutputStream,
|
||||||
new SizeAwareFSDataOutputStream(fsDataOutputStream, new Runnable() {
|
new Runnable() {
|
||||||
@Override
|
@Override
|
||||||
public void run() {
|
public void run() {
|
||||||
openStreams.remove(path.getName());
|
openStreams.remove(path.getName());
|
||||||
@@ -160,14 +192,13 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize)
|
public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize) throws IOException {
|
||||||
throws IOException {
|
|
||||||
return fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize);
|
return fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize,
|
public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, Progressable progress)
|
||||||
Progressable progress) throws IOException {
|
throws IOException {
|
||||||
return fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize, progress);
|
return fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize, progress);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -175,14 +206,12 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
|||||||
public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, short replication,
|
public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, short replication,
|
||||||
long blockSize, Progressable progress) throws IOException {
|
long blockSize, Progressable progress) throws IOException {
|
||||||
return fileSystem
|
return fileSystem
|
||||||
.create(convertToDefaultPath(f), overwrite, bufferSize, replication, blockSize,
|
.create(convertToDefaultPath(f), overwrite, bufferSize, replication, blockSize, progress);
|
||||||
progress);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public FSDataOutputStream create(Path f, FsPermission permission, EnumSet<CreateFlag> flags,
|
public FSDataOutputStream create(Path f, FsPermission permission, EnumSet<CreateFlag> flags,
|
||||||
int bufferSize, short replication, long blockSize, Progressable progress)
|
int bufferSize, short replication, long blockSize, Progressable progress) throws IOException {
|
||||||
throws IOException {
|
|
||||||
return fileSystem
|
return fileSystem
|
||||||
.create(convertToDefaultPath(f), permission, flags, bufferSize, replication, blockSize,
|
.create(convertToDefaultPath(f), permission, flags, bufferSize, replication, blockSize,
|
||||||
progress);
|
progress);
|
||||||
@@ -197,7 +226,6 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
|||||||
progress, checksumOpt);
|
progress, checksumOpt);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, short replication,
|
public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, short replication,
|
||||||
long blockSize) throws IOException {
|
long blockSize) throws IOException {
|
||||||
@@ -205,7 +233,6 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
|||||||
.create(convertToDefaultPath(f), overwrite, bufferSize, replication, blockSize);
|
.create(convertToDefaultPath(f), overwrite, bufferSize, replication, blockSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public FSDataOutputStream append(Path f, int bufferSize, Progressable progress)
|
public FSDataOutputStream append(Path f, int bufferSize, Progressable progress)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
@@ -228,13 +255,13 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void setWorkingDirectory(Path new_dir) {
|
public Path getWorkingDirectory() {
|
||||||
fileSystem.setWorkingDirectory(convertToDefaultPath(new_dir));
|
return convertToHoodiePath(fileSystem.getWorkingDirectory());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Path getWorkingDirectory() {
|
public void setWorkingDirectory(Path newDir) {
|
||||||
return convertToHoodiePath(fileSystem.getWorkingDirectory());
|
fileSystem.setWorkingDirectory(convertToDefaultPath(newDir));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@@ -290,8 +317,7 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public BlockLocation[] getFileBlockLocations(Path p, long start, long len)
|
public BlockLocation[] getFileBlockLocations(Path p, long start, long len) throws IOException {
|
||||||
throws IOException {
|
|
||||||
return fileSystem.getFileBlockLocations(convertToDefaultPath(p), start, len);
|
return fileSystem.getFileBlockLocations(convertToDefaultPath(p), start, len);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -319,17 +345,16 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
|||||||
public FSDataOutputStream createNonRecursive(Path f, boolean overwrite, int bufferSize,
|
public FSDataOutputStream createNonRecursive(Path f, boolean overwrite, int bufferSize,
|
||||||
short replication, long blockSize, Progressable progress) throws IOException {
|
short replication, long blockSize, Progressable progress) throws IOException {
|
||||||
return fileSystem
|
return fileSystem
|
||||||
.createNonRecursive(convertToDefaultPath(f), overwrite, bufferSize, replication,
|
.createNonRecursive(convertToDefaultPath(f), overwrite, bufferSize, replication, blockSize,
|
||||||
blockSize, progress);
|
progress);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public FSDataOutputStream createNonRecursive(Path f, FsPermission permission, boolean overwrite,
|
public FSDataOutputStream createNonRecursive(Path f, FsPermission permission, boolean overwrite,
|
||||||
int bufferSize, short replication, long blockSize, Progressable progress)
|
int bufferSize, short replication, long blockSize, Progressable progress) throws IOException {
|
||||||
throws IOException {
|
|
||||||
return fileSystem
|
return fileSystem
|
||||||
.createNonRecursive(convertToDefaultPath(f), permission, overwrite, bufferSize,
|
.createNonRecursive(convertToDefaultPath(f), permission, overwrite, bufferSize, replication,
|
||||||
replication, blockSize, progress);
|
blockSize, progress);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@@ -418,20 +443,17 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public FileStatus[] listStatus(Path f, PathFilter filter)
|
public FileStatus[] listStatus(Path f, PathFilter filter) throws IOException {
|
||||||
throws IOException {
|
|
||||||
return fileSystem.listStatus(convertToDefaultPath(f), filter);
|
return fileSystem.listStatus(convertToDefaultPath(f), filter);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public FileStatus[] listStatus(Path[] files)
|
public FileStatus[] listStatus(Path[] files) throws IOException {
|
||||||
throws IOException {
|
|
||||||
return fileSystem.listStatus(convertDefaults(files));
|
return fileSystem.listStatus(convertDefaults(files));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public FileStatus[] listStatus(Path[] files, PathFilter filter)
|
public FileStatus[] listStatus(Path[] files, PathFilter filter) throws IOException {
|
||||||
throws IOException {
|
|
||||||
return fileSystem.listStatus(convertDefaults(files), filter);
|
return fileSystem.listStatus(convertDefaults(files), filter);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -441,20 +463,17 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public FileStatus[] globStatus(Path pathPattern, PathFilter filter)
|
public FileStatus[] globStatus(Path pathPattern, PathFilter filter) throws IOException {
|
||||||
throws IOException {
|
|
||||||
return fileSystem.globStatus(convertToDefaultPath(pathPattern), filter);
|
return fileSystem.globStatus(convertToDefaultPath(pathPattern), filter);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public RemoteIterator<LocatedFileStatus> listLocatedStatus(Path f)
|
public RemoteIterator<LocatedFileStatus> listLocatedStatus(Path f) throws IOException {
|
||||||
throws IOException {
|
|
||||||
return fileSystem.listLocatedStatus(convertToDefaultPath(f));
|
return fileSystem.listLocatedStatus(convertToDefaultPath(f));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public RemoteIterator<LocatedFileStatus> listFiles(Path f, boolean recursive)
|
public RemoteIterator<LocatedFileStatus> listFiles(Path f, boolean recursive) throws IOException {
|
||||||
throws IOException {
|
|
||||||
return fileSystem.listFiles(convertToDefaultPath(f), recursive);
|
return fileSystem.listFiles(convertToDefaultPath(f), recursive);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -498,8 +517,8 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
|||||||
@Override
|
@Override
|
||||||
public void copyFromLocalFile(boolean delSrc, boolean overwrite, Path src, Path dst)
|
public void copyFromLocalFile(boolean delSrc, boolean overwrite, Path src, Path dst)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
fileSystem.copyFromLocalFile(delSrc, overwrite, convertToDefaultPath(src),
|
fileSystem
|
||||||
convertToDefaultPath(dst));
|
.copyFromLocalFile(delSrc, overwrite, convertToDefaultPath(src), convertToDefaultPath(dst));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@@ -525,15 +544,13 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile)
|
public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile) throws IOException {
|
||||||
throws IOException {
|
return convertToHoodiePath(fileSystem
|
||||||
return convertToHoodiePath(fileSystem.startLocalOutput(convertToDefaultPath(fsOutputFile),
|
.startLocalOutput(convertToDefaultPath(fsOutputFile), convertToDefaultPath(tmpLocalFile)));
|
||||||
convertToDefaultPath(tmpLocalFile)));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile)
|
public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile) throws IOException {
|
||||||
throws IOException {
|
|
||||||
fileSystem.completeLocalOutput(convertToDefaultPath(fsOutputFile),
|
fileSystem.completeLocalOutput(convertToDefaultPath(fsOutputFile),
|
||||||
convertToDefaultPath(tmpLocalFile));
|
convertToDefaultPath(tmpLocalFile));
|
||||||
}
|
}
|
||||||
@@ -574,23 +591,18 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void access(Path path, FsAction mode)
|
public void access(Path path, FsAction mode) throws IOException {
|
||||||
throws IOException {
|
|
||||||
fileSystem.access(convertToDefaultPath(path), mode);
|
fileSystem.access(convertToDefaultPath(path), mode);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void createSymlink(Path target, Path link, boolean createParent)
|
public void createSymlink(Path target, Path link, boolean createParent) throws IOException {
|
||||||
throws
|
|
||||||
IOException {
|
|
||||||
fileSystem
|
fileSystem
|
||||||
.createSymlink(convertToDefaultPath(target), convertToDefaultPath(link), createParent);
|
.createSymlink(convertToDefaultPath(target), convertToDefaultPath(link), createParent);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public FileStatus getFileLinkStatus(Path f)
|
public FileStatus getFileLinkStatus(Path f) throws IOException {
|
||||||
throws
|
|
||||||
IOException {
|
|
||||||
return fileSystem.getFileLinkStatus(convertToDefaultPath(f));
|
return fileSystem.getFileLinkStatus(convertToDefaultPath(f));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -651,8 +663,7 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Path createSnapshot(Path path, String snapshotName) throws IOException {
|
public Path createSnapshot(Path path, String snapshotName) throws IOException {
|
||||||
return convertToHoodiePath(
|
return convertToHoodiePath(fileSystem.createSnapshot(convertToDefaultPath(path), snapshotName));
|
||||||
fileSystem.createSnapshot(convertToDefaultPath(path), snapshotName));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@@ -718,8 +729,7 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Map<String, byte[]> getXAttrs(Path path, List<String> names)
|
public Map<String, byte[]> getXAttrs(Path path, List<String> names) throws IOException {
|
||||||
throws IOException {
|
|
||||||
return fileSystem.getXAttrs(convertToDefaultPath(path), names);
|
return fileSystem.getXAttrs(convertToDefaultPath(path), names);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -734,13 +744,13 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void setConf(Configuration conf) {
|
public Configuration getConf() {
|
||||||
// ignore this. we will set conf on init
|
return fileSystem.getConf();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Configuration getConf() {
|
public void setConf(Configuration conf) {
|
||||||
return fileSystem.getConf();
|
// ignore this. we will set conf on init
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@@ -762,15 +772,6 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
|||||||
return convertPathWithScheme(oldPath, getHoodieScheme(fileSystem.getScheme()));
|
return convertPathWithScheme(oldPath, getHoodieScheme(fileSystem.getScheme()));
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Path convertToHoodiePath(Path file, Configuration conf) {
|
|
||||||
try {
|
|
||||||
String scheme = FSUtils.getFs(file.toString(), conf).getScheme();
|
|
||||||
return convertPathWithScheme(file, getHoodieScheme(scheme));
|
|
||||||
} catch (HoodieIOException e) {
|
|
||||||
throw e;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private Path convertToDefaultPath(Path oldPath) {
|
private Path convertToDefaultPath(Path oldPath) {
|
||||||
return convertPathWithScheme(oldPath, fileSystem.getScheme());
|
return convertPathWithScheme(oldPath, fileSystem.getScheme());
|
||||||
}
|
}
|
||||||
@@ -783,30 +784,6 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
|||||||
return psrcsNew;
|
return psrcsNew;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Path convertPathWithScheme(Path oldPath, String newScheme) {
|
|
||||||
URI oldURI = oldPath.toUri();
|
|
||||||
URI newURI;
|
|
||||||
try {
|
|
||||||
newURI = new URI(newScheme, oldURI.getUserInfo(), oldURI.getHost(), oldURI.getPort(),
|
|
||||||
oldURI.getPath(), oldURI.getQuery(), oldURI.getFragment());
|
|
||||||
return new Path(newURI);
|
|
||||||
} catch (URISyntaxException e) {
|
|
||||||
// TODO - Better Exception handling
|
|
||||||
throw new RuntimeException(e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public static String getHoodieScheme(String scheme) {
|
|
||||||
String newScheme;
|
|
||||||
if (SUPPORT_SCHEMES.contains(scheme)) {
|
|
||||||
newScheme = HOODIE_SCHEME_PREFIX + scheme;
|
|
||||||
} else {
|
|
||||||
throw new IllegalArgumentException(
|
|
||||||
"BlockAlignedAvroParquetWriter does not support scheme " + scheme);
|
|
||||||
}
|
|
||||||
return newScheme;
|
|
||||||
}
|
|
||||||
|
|
||||||
public long getBytesWritten(Path file) {
|
public long getBytesWritten(Path file) {
|
||||||
if (openStreams.containsKey(file.getName())) {
|
if (openStreams.containsKey(file.getName())) {
|
||||||
return openStreams.get(file.getName()).getBytesWritten();
|
return openStreams.get(file.getName()).getBytesWritten();
|
||||||
|
|||||||
@@ -30,14 +30,14 @@ import org.apache.log4j.Logger;
|
|||||||
*/
|
*/
|
||||||
public class HoodieMetrics {
|
public class HoodieMetrics {
|
||||||
|
|
||||||
private HoodieWriteConfig config = null;
|
|
||||||
private String tableName = null;
|
|
||||||
private static Logger logger = LogManager.getLogger(HoodieMetrics.class);
|
private static Logger logger = LogManager.getLogger(HoodieMetrics.class);
|
||||||
// Some timers
|
// Some timers
|
||||||
public String rollbackTimerName = null;
|
public String rollbackTimerName = null;
|
||||||
public String cleanTimerName = null;
|
public String cleanTimerName = null;
|
||||||
public String commitTimerName = null;
|
public String commitTimerName = null;
|
||||||
public String finalizeTimerName = null;
|
public String finalizeTimerName = null;
|
||||||
|
private HoodieWriteConfig config = null;
|
||||||
|
private String tableName = null;
|
||||||
private Timer rollbackTimer = null;
|
private Timer rollbackTimer = null;
|
||||||
private Timer cleanTimer = null;
|
private Timer cleanTimer = null;
|
||||||
private Timer commitTimer = null;
|
private Timer commitTimer = null;
|
||||||
@@ -113,8 +113,9 @@ public class HoodieMetrics {
|
|||||||
|
|
||||||
public void updateRollbackMetrics(long durationInMs, long numFilesDeleted) {
|
public void updateRollbackMetrics(long durationInMs, long numFilesDeleted) {
|
||||||
if (config.isMetricsOn()) {
|
if (config.isMetricsOn()) {
|
||||||
logger.info(String.format("Sending rollback metrics (duration=%d, numFilesDeleted=$d)",
|
logger.info(String
|
||||||
durationInMs, numFilesDeleted));
|
.format("Sending rollback metrics (duration=%d, numFilesDeleted=$d)", durationInMs,
|
||||||
|
numFilesDeleted));
|
||||||
registerGauge(getMetricsName("rollback", "duration"), durationInMs);
|
registerGauge(getMetricsName("rollback", "duration"), durationInMs);
|
||||||
registerGauge(getMetricsName("rollback", "numFilesDeleted"), numFilesDeleted);
|
registerGauge(getMetricsName("rollback", "numFilesDeleted"), numFilesDeleted);
|
||||||
}
|
}
|
||||||
@@ -122,8 +123,9 @@ public class HoodieMetrics {
|
|||||||
|
|
||||||
public void updateCleanMetrics(long durationInMs, int numFilesDeleted) {
|
public void updateCleanMetrics(long durationInMs, int numFilesDeleted) {
|
||||||
if (config.isMetricsOn()) {
|
if (config.isMetricsOn()) {
|
||||||
logger.info(String.format("Sending clean metrics (duration=%d, numFilesDeleted=%d)",
|
logger.info(String
|
||||||
durationInMs, numFilesDeleted));
|
.format("Sending clean metrics (duration=%d, numFilesDeleted=%d)", durationInMs,
|
||||||
|
numFilesDeleted));
|
||||||
registerGauge(getMetricsName("clean", "duration"), durationInMs);
|
registerGauge(getMetricsName("clean", "duration"), durationInMs);
|
||||||
registerGauge(getMetricsName("clean", "numFilesDeleted"), numFilesDeleted);
|
registerGauge(getMetricsName("clean", "numFilesDeleted"), numFilesDeleted);
|
||||||
}
|
}
|
||||||
@@ -131,7 +133,8 @@ public class HoodieMetrics {
|
|||||||
|
|
||||||
public void updateFinalizeWriteMetrics(long durationInMs, int numFilesFinalized) {
|
public void updateFinalizeWriteMetrics(long durationInMs, int numFilesFinalized) {
|
||||||
if (config.isMetricsOn()) {
|
if (config.isMetricsOn()) {
|
||||||
logger.info(String.format("Sending finalize write metrics (duration=%d, numFilesFinalized=%d)",
|
logger.info(String
|
||||||
|
.format("Sending finalize write metrics (duration=%d, numFilesFinalized=%d)",
|
||||||
durationInMs, numFilesFinalized));
|
durationInMs, numFilesFinalized));
|
||||||
registerGauge(getMetricsName("finalize", "duration"), durationInMs);
|
registerGauge(getMetricsName("finalize", "duration"), durationInMs);
|
||||||
registerGauge(getMetricsName("finalize", "numFilesFinalized"), numFilesFinalized);
|
registerGauge(getMetricsName("finalize", "numFilesFinalized"), numFilesFinalized);
|
||||||
@@ -140,8 +143,7 @@ public class HoodieMetrics {
|
|||||||
|
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
String getMetricsName(String action, String metric) {
|
String getMetricsName(String action, String metric) {
|
||||||
return config == null ? null :
|
return config == null ? null : String.format("%s.%s.%s", tableName, action, metric);
|
||||||
String.format("%s.%s.%s", tableName, action, metric);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void registerGauge(String metricName, final long value) {
|
void registerGauge(String metricName, final long value) {
|
||||||
@@ -154,7 +156,8 @@ public class HoodieMetrics {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
// Here we catch all exception, so the major upsert pipeline will not be affected if the metrics system
|
// Here we catch all exception, so the major upsert pipeline will not be affected if the
|
||||||
|
// metrics system
|
||||||
// has some issues.
|
// has some issues.
|
||||||
logger.error("Failed to send metrics: ", e);
|
logger.error("Failed to send metrics: ", e);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -33,14 +33,13 @@ import org.apache.log4j.Logger;
|
|||||||
*/
|
*/
|
||||||
public class MetricsGraphiteReporter extends MetricsReporter {
|
public class MetricsGraphiteReporter extends MetricsReporter {
|
||||||
|
|
||||||
|
private static Logger logger = LogManager.getLogger(MetricsGraphiteReporter.class);
|
||||||
private final MetricRegistry registry;
|
private final MetricRegistry registry;
|
||||||
private final GraphiteReporter graphiteReporter;
|
private final GraphiteReporter graphiteReporter;
|
||||||
private final HoodieWriteConfig config;
|
private final HoodieWriteConfig config;
|
||||||
private String serverHost;
|
private String serverHost;
|
||||||
private int serverPort;
|
private int serverPort;
|
||||||
|
|
||||||
private static Logger logger = LogManager.getLogger(MetricsGraphiteReporter.class);
|
|
||||||
|
|
||||||
public MetricsGraphiteReporter(HoodieWriteConfig config, MetricRegistry registry) {
|
public MetricsGraphiteReporter(HoodieWriteConfig config, MetricRegistry registry) {
|
||||||
this.registry = registry;
|
this.registry = registry;
|
||||||
this.config = config;
|
this.config = config;
|
||||||
@@ -49,8 +48,8 @@ public class MetricsGraphiteReporter extends MetricsReporter {
|
|||||||
this.serverHost = config.getGraphiteServerHost();
|
this.serverHost = config.getGraphiteServerHost();
|
||||||
this.serverPort = config.getGraphiteServerPort();
|
this.serverPort = config.getGraphiteServerPort();
|
||||||
if (serverHost == null || serverPort == 0) {
|
if (serverHost == null || serverPort == 0) {
|
||||||
throw new RuntimeException(
|
throw new RuntimeException(String
|
||||||
String.format("Graphite cannot be initialized with serverHost[%s] and serverPort[%s].",
|
.format("Graphite cannot be initialized with serverHost[%s] and serverPort[%s].",
|
||||||
serverHost, serverPort));
|
serverHost, serverPort));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -81,14 +80,10 @@ public class MetricsGraphiteReporter extends MetricsReporter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private GraphiteReporter createGraphiteReport() {
|
private GraphiteReporter createGraphiteReport() {
|
||||||
Graphite graphite = new Graphite(
|
Graphite graphite = new Graphite(new InetSocketAddress(serverHost, serverPort));
|
||||||
new InetSocketAddress(serverHost, serverPort));
|
|
||||||
String reporterPrefix = config.getGraphiteMetricPrefix();
|
String reporterPrefix = config.getGraphiteMetricPrefix();
|
||||||
return GraphiteReporter.forRegistry(registry)
|
return GraphiteReporter.forRegistry(registry).prefixedWith(reporterPrefix)
|
||||||
.prefixedWith(reporterPrefix)
|
.convertRatesTo(TimeUnit.SECONDS).convertDurationsTo(TimeUnit.MILLISECONDS)
|
||||||
.convertRatesTo(TimeUnit.SECONDS)
|
.filter(MetricFilter.ALL).build(graphite);
|
||||||
.convertDurationsTo(TimeUnit.MILLISECONDS)
|
|
||||||
.filter(MetricFilter.ALL)
|
|
||||||
.build(graphite);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -28,8 +28,7 @@ public class MetricsReporterFactory {
|
|||||||
|
|
||||||
private static Logger logger = LogManager.getLogger(MetricsReporterFactory.class);
|
private static Logger logger = LogManager.getLogger(MetricsReporterFactory.class);
|
||||||
|
|
||||||
public static MetricsReporter createReporter(HoodieWriteConfig config,
|
public static MetricsReporter createReporter(HoodieWriteConfig config, MetricRegistry registry) {
|
||||||
MetricRegistry registry) {
|
|
||||||
MetricsReporterType type = config.getMetricsReporterType();
|
MetricsReporterType type = config.getMetricsReporterType();
|
||||||
MetricsReporter reporter = null;
|
MetricsReporter reporter = null;
|
||||||
switch (type) {
|
switch (type) {
|
||||||
|
|||||||
@@ -21,6 +21,5 @@ package com.uber.hoodie.metrics;
|
|||||||
* future.
|
* future.
|
||||||
*/
|
*/
|
||||||
public enum MetricsReporterType {
|
public enum MetricsReporterType {
|
||||||
GRAPHITE,
|
GRAPHITE, INMEMORY
|
||||||
INMEMORY
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -75,23 +75,425 @@ import scala.Tuple2;
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Implementation of a very heavily read-optimized Hoodie Table where
|
* Implementation of a very heavily read-optimized Hoodie Table where
|
||||||
*
|
* <p>
|
||||||
* INSERTS - Produce new files, block aligned to desired size (or) Merge with the smallest existing
|
* INSERTS - Produce new files, block aligned to desired size (or) Merge with the smallest existing
|
||||||
* file, to expand it
|
* file, to expand it
|
||||||
*
|
* <p>
|
||||||
* UPDATES - Produce a new version of the file, just replacing the updated records with new values
|
* UPDATES - Produce a new version of the file, just replacing the updated records with new values
|
||||||
*/
|
*/
|
||||||
public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends HoodieTable<T> {
|
public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends HoodieTable<T> {
|
||||||
|
|
||||||
|
private static Logger logger = LogManager.getLogger(HoodieCopyOnWriteTable.class);
|
||||||
|
|
||||||
public HoodieCopyOnWriteTable(HoodieWriteConfig config, HoodieTableMetaClient metaClient) {
|
public HoodieCopyOnWriteTable(HoodieWriteConfig config, HoodieTableMetaClient metaClient) {
|
||||||
super(config, metaClient);
|
super(config, metaClient);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Logger logger = LogManager.getLogger(HoodieCopyOnWriteTable.class);
|
private static PairFlatMapFunction<Iterator<Tuple2<String, String>>, String,
|
||||||
|
PartitionCleanStat> deleteFilesFunc(
|
||||||
|
HoodieTable table) {
|
||||||
|
return (PairFlatMapFunction<Iterator<Tuple2<String, String>>, String, PartitionCleanStat>)
|
||||||
|
iter -> {
|
||||||
|
Map<String, PartitionCleanStat> partitionCleanStatMap = new HashMap<>();
|
||||||
|
|
||||||
|
FileSystem fs = table.getMetaClient().getFs();
|
||||||
|
while (iter.hasNext()) {
|
||||||
|
Tuple2<String, String> partitionDelFileTuple = iter.next();
|
||||||
|
String partitionPath = partitionDelFileTuple._1();
|
||||||
|
String deletePathStr = partitionDelFileTuple._2();
|
||||||
|
Boolean deletedFileResult = deleteFileAndGetResult(fs, deletePathStr);
|
||||||
|
if (!partitionCleanStatMap.containsKey(partitionPath)) {
|
||||||
|
partitionCleanStatMap.put(partitionPath, new PartitionCleanStat(partitionPath));
|
||||||
|
}
|
||||||
|
PartitionCleanStat partitionCleanStat = partitionCleanStatMap.get(partitionPath);
|
||||||
|
partitionCleanStat.addDeleteFilePatterns(deletePathStr);
|
||||||
|
partitionCleanStat.addDeletedFileResult(deletePathStr, deletedFileResult);
|
||||||
|
}
|
||||||
|
|
||||||
|
return partitionCleanStatMap.entrySet().stream()
|
||||||
|
.map(e -> new Tuple2<>(e.getKey(), e.getValue()))
|
||||||
|
.collect(Collectors.toList()).iterator();
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
private static PairFlatMapFunction<String, String, String> getFilesToDeleteFunc(HoodieTable table,
|
||||||
|
HoodieWriteConfig config) {
|
||||||
|
return (PairFlatMapFunction<String, String, String>) partitionPathToClean -> {
|
||||||
|
HoodieCleanHelper cleaner = new HoodieCleanHelper(table, config);
|
||||||
|
return cleaner.getDeletePaths(partitionPathToClean).stream()
|
||||||
|
.map(deleteFile -> new Tuple2<>(partitionPathToClean, deleteFile.toString())).iterator();
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Boolean deleteFileAndGetResult(FileSystem fs, String deletePathStr)
|
||||||
|
throws IOException {
|
||||||
|
Path deletePath = new Path(deletePathStr);
|
||||||
|
logger.debug("Working on delete path :" + deletePath);
|
||||||
|
boolean deleteResult = fs.delete(deletePath, false);
|
||||||
|
if (deleteResult) {
|
||||||
|
logger.debug("Cleaned file at path :" + deletePath);
|
||||||
|
}
|
||||||
|
return deleteResult;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Partitioner getUpsertPartitioner(WorkloadProfile profile) {
|
||||||
|
if (profile == null) {
|
||||||
|
throw new HoodieUpsertException("Need workload profile to construct the upsert partitioner.");
|
||||||
|
}
|
||||||
|
return new UpsertPartitioner(profile);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Partitioner getInsertPartitioner(WorkloadProfile profile) {
|
||||||
|
return getUpsertPartitioner(profile);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean isWorkloadProfileNeeded() {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public JavaRDD<WriteStatus> compact(JavaSparkContext jsc, String commitTime) {
|
||||||
|
throw new HoodieNotSupportedException("Compaction is not supported from a CopyOnWrite table");
|
||||||
|
}
|
||||||
|
|
||||||
|
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileLoc,
|
||||||
|
Iterator<HoodieRecord<T>> recordItr) throws IOException {
|
||||||
|
// these are updates
|
||||||
|
HoodieMergeHandle upsertHandle = getUpdateHandle(commitTime, fileLoc, recordItr);
|
||||||
|
return handleUpdateInternal(upsertHandle, commitTime, fileLoc);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileLoc,
|
||||||
|
Map<String, HoodieRecord<T>> keyToNewRecords) throws IOException {
|
||||||
|
// these are updates
|
||||||
|
HoodieMergeHandle upsertHandle = getUpdateHandle(commitTime, fileLoc, keyToNewRecords);
|
||||||
|
return handleUpdateInternal(upsertHandle, commitTime, fileLoc);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected Iterator<List<WriteStatus>> handleUpdateInternal(HoodieMergeHandle upsertHandle,
|
||||||
|
String commitTime, String fileLoc) throws IOException {
|
||||||
|
if (upsertHandle.getOldFilePath() == null) {
|
||||||
|
throw new HoodieUpsertException(
|
||||||
|
"Error in finding the old file path at commit " + commitTime + " at fileLoc: " + fileLoc);
|
||||||
|
} else {
|
||||||
|
AvroReadSupport.setAvroReadSchema(getHadoopConf(), upsertHandle.getSchema());
|
||||||
|
ParquetReader<IndexedRecord> reader = AvroParquetReader.builder(upsertHandle.getOldFilePath())
|
||||||
|
.withConf(getHadoopConf()).build();
|
||||||
|
try {
|
||||||
|
IndexedRecord record;
|
||||||
|
while ((record = reader.read()) != null) {
|
||||||
|
// Two types of writes here (new record, and old record).
|
||||||
|
// We have already catch the exception during writing new records.
|
||||||
|
// But for old records, we should fail if any exception happens.
|
||||||
|
upsertHandle.write((GenericRecord) record);
|
||||||
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new HoodieUpsertException(
|
||||||
|
"Failed to read record from " + upsertHandle.getOldFilePath() + " with new Schema "
|
||||||
|
+ upsertHandle.getSchema(), e);
|
||||||
|
} finally {
|
||||||
|
reader.close();
|
||||||
|
upsertHandle.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
//TODO(vc): This needs to be revisited
|
||||||
|
if (upsertHandle.getWriteStatus().getPartitionPath() == null) {
|
||||||
|
logger.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath() + ", "
|
||||||
|
+ upsertHandle.getWriteStatus());
|
||||||
|
}
|
||||||
|
return Collections.singletonList(Collections.singletonList(upsertHandle.getWriteStatus()))
|
||||||
|
.iterator();
|
||||||
|
}
|
||||||
|
|
||||||
|
protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileLoc,
|
||||||
|
Iterator<HoodieRecord<T>> recordItr) {
|
||||||
|
return new HoodieMergeHandle<>(config, commitTime, this, recordItr, fileLoc);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileLoc,
|
||||||
|
Map<String, HoodieRecord<T>> keyToNewRecords) {
|
||||||
|
return new HoodieMergeHandle<>(config, commitTime, this, keyToNewRecords, fileLoc);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Iterator<List<WriteStatus>> handleInsert(String commitTime,
|
||||||
|
Iterator<HoodieRecord<T>> recordItr) throws Exception {
|
||||||
|
return new LazyInsertIterable<>(recordItr, config, commitTime, this);
|
||||||
|
}
|
||||||
|
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
@Override
|
||||||
|
public Iterator<List<WriteStatus>> handleUpsertPartition(String commitTime, Integer partition,
|
||||||
|
Iterator recordItr, Partitioner partitioner) {
|
||||||
|
UpsertPartitioner upsertPartitioner = (UpsertPartitioner) partitioner;
|
||||||
|
BucketInfo binfo = upsertPartitioner.getBucketInfo(partition);
|
||||||
|
BucketType btype = binfo.bucketType;
|
||||||
|
try {
|
||||||
|
if (btype.equals(BucketType.INSERT)) {
|
||||||
|
return handleInsert(commitTime, recordItr);
|
||||||
|
} else if (btype.equals(BucketType.UPDATE)) {
|
||||||
|
return handleUpdate(commitTime, binfo.fileLoc, recordItr);
|
||||||
|
} else {
|
||||||
|
throw new HoodieUpsertException(
|
||||||
|
"Unknown bucketType " + btype + " for partition :" + partition);
|
||||||
|
}
|
||||||
|
} catch (Throwable t) {
|
||||||
|
String msg = "Error upserting bucketType " + btype + " for partition :" + partition;
|
||||||
|
logger.error(msg, t);
|
||||||
|
throw new HoodieUpsertException(msg, t);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Iterator<List<WriteStatus>> handleInsertPartition(String commitTime, Integer partition,
|
||||||
|
Iterator recordItr, Partitioner partitioner) {
|
||||||
|
return handleUpsertPartition(commitTime, partition, recordItr, partitioner);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Performs cleaning of partition paths according to cleaning policy and returns the number of
|
||||||
|
* files cleaned. Handles skews in partitions to clean by making files to clean as the unit of
|
||||||
|
* task distribution.
|
||||||
|
*
|
||||||
|
* @throws IllegalArgumentException if unknown cleaning policy is provided
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public List<HoodieCleanStat> clean(JavaSparkContext jsc) {
|
||||||
|
try {
|
||||||
|
FileSystem fs = getMetaClient().getFs();
|
||||||
|
List<String> partitionsToClean = FSUtils
|
||||||
|
.getAllPartitionPaths(fs, getMetaClient().getBasePath(),
|
||||||
|
config.shouldAssumeDatePartitioning());
|
||||||
|
logger.info("Partitions to clean up : " + partitionsToClean + ", with policy " + config
|
||||||
|
.getCleanerPolicy());
|
||||||
|
if (partitionsToClean.isEmpty()) {
|
||||||
|
logger.info("Nothing to clean here mom. It is already clean");
|
||||||
|
return Collections.emptyList();
|
||||||
|
}
|
||||||
|
return cleanPartitionPaths(partitionsToClean, jsc);
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new HoodieIOException("Failed to clean up after commit", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Common method used for cleaning out parquet files under a partition path during rollback of a
|
||||||
|
* set of commits
|
||||||
|
*/
|
||||||
|
protected Map<FileStatus, Boolean> deleteCleanedFiles(String partitionPath, List<String> commits)
|
||||||
|
throws IOException {
|
||||||
|
logger.info("Cleaning path " + partitionPath);
|
||||||
|
FileSystem fs = getMetaClient().getFs();
|
||||||
|
FileStatus[] toBeDeleted = fs
|
||||||
|
.listStatus(new Path(config.getBasePath(), partitionPath), path -> {
|
||||||
|
if (!path.toString().contains(".parquet")) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
String fileCommitTime = FSUtils.getCommitTime(path.getName());
|
||||||
|
return commits.contains(fileCommitTime);
|
||||||
|
});
|
||||||
|
Map<FileStatus, Boolean> results = Maps.newHashMap();
|
||||||
|
for (FileStatus file : toBeDeleted) {
|
||||||
|
boolean success = fs.delete(file.getPath(), false);
|
||||||
|
results.put(file, success);
|
||||||
|
logger.info("Delete file " + file.getPath() + "\t" + success);
|
||||||
|
}
|
||||||
|
return results;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<HoodieRollbackStat> rollback(JavaSparkContext jsc, List<String> commits)
|
||||||
|
throws IOException {
|
||||||
|
String actionType = this.getCommitActionType();
|
||||||
|
HoodieActiveTimeline activeTimeline = this.getActiveTimeline();
|
||||||
|
List<String> inflights = this.getInflightCommitTimeline().getInstants()
|
||||||
|
.map(HoodieInstant::getTimestamp).collect(Collectors.toList());
|
||||||
|
|
||||||
|
// Atomically unpublish all the commits
|
||||||
|
commits.stream().filter(s -> !inflights.contains(s))
|
||||||
|
.map(s -> new HoodieInstant(false, actionType, s))
|
||||||
|
.forEach(activeTimeline::revertToInflight);
|
||||||
|
logger.info("Unpublished " + commits);
|
||||||
|
|
||||||
|
// delete all the data files for all these commits
|
||||||
|
logger.info("Clean out all parquet files generated for commits: " + commits);
|
||||||
|
List<HoodieRollbackStat> stats = jsc.parallelize(FSUtils
|
||||||
|
.getAllPartitionPaths(metaClient.getFs(), getMetaClient().getBasePath(),
|
||||||
|
config.shouldAssumeDatePartitioning()))
|
||||||
|
.map((Function<String, HoodieRollbackStat>) partitionPath -> {
|
||||||
|
// Scan all partitions files with this commit time
|
||||||
|
Map<FileStatus, Boolean> results = deleteCleanedFiles(partitionPath, commits);
|
||||||
|
return HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
|
||||||
|
.withDeletedFileResults(results).build();
|
||||||
|
}).collect();
|
||||||
|
|
||||||
|
// clean temporary data files
|
||||||
|
cleanTemporaryDataFiles(jsc);
|
||||||
|
|
||||||
|
// Remove the rolled back inflight commits
|
||||||
|
commits.stream().map(s -> new HoodieInstant(true, actionType, s))
|
||||||
|
.forEach(activeTimeline::deleteInflight);
|
||||||
|
logger.info("Deleted inflight commits " + commits);
|
||||||
|
return stats;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Finalize the written data files
|
||||||
|
*
|
||||||
|
* @param writeStatuses List of WriteStatus
|
||||||
|
* @return number of files finalized
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
public Optional<Integer> finalizeWrite(JavaSparkContext jsc, List writeStatuses) {
|
||||||
|
if (!config.shouldUseTempFolderForCopyOnWrite()) {
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
// This is to rename each data file from temporary path to its final location
|
||||||
|
List<Tuple2<String, Boolean>> results = jsc
|
||||||
|
.parallelize(writeStatuses, config.getFinalizeWriteParallelism()).map(writeStatus -> {
|
||||||
|
Tuple2<String, HoodieWriteStat> writeStatTuple2 = (Tuple2<String, HoodieWriteStat>)
|
||||||
|
writeStatus;
|
||||||
|
HoodieWriteStat writeStat = writeStatTuple2._2();
|
||||||
|
final FileSystem fs = getMetaClient().getFs();
|
||||||
|
final Path finalPath = new Path(config.getBasePath(), writeStat.getPath());
|
||||||
|
|
||||||
|
if (writeStat.getTempPath() != null) {
|
||||||
|
final Path tempPath = new Path(config.getBasePath(), writeStat.getTempPath());
|
||||||
|
boolean success;
|
||||||
|
try {
|
||||||
|
logger.info("Renaming temporary file: " + tempPath + " to " + finalPath);
|
||||||
|
success = fs.rename(tempPath, finalPath);
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new HoodieIOException(
|
||||||
|
"Failed to rename file: " + tempPath + " to " + finalPath);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!success) {
|
||||||
|
throw new HoodieIOException(
|
||||||
|
"Failed to rename file: " + tempPath + " to " + finalPath);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return new Tuple2<>(writeStat.getPath(), true);
|
||||||
|
}).collect();
|
||||||
|
|
||||||
|
// clean temporary data files
|
||||||
|
cleanTemporaryDataFiles(jsc);
|
||||||
|
|
||||||
|
return Optional.of(results.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Clean temporary data files that are produced from previous failed commit or retried spark
|
||||||
|
* stages.
|
||||||
|
*/
|
||||||
|
private void cleanTemporaryDataFiles(JavaSparkContext jsc) {
|
||||||
|
if (!config.shouldUseTempFolderForCopyOnWrite()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
final FileSystem fs = getMetaClient().getFs();
|
||||||
|
final Path temporaryFolder = new Path(config.getBasePath(),
|
||||||
|
HoodieTableMetaClient.TEMPFOLDER_NAME);
|
||||||
|
try {
|
||||||
|
if (!fs.exists(temporaryFolder)) {
|
||||||
|
logger.info("Temporary folder does not exist: " + temporaryFolder);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
List<FileStatus> fileStatusesList = Arrays.asList(fs.listStatus(temporaryFolder));
|
||||||
|
List<Tuple2<String, Boolean>> results = jsc
|
||||||
|
.parallelize(fileStatusesList, config.getFinalizeWriteParallelism()).map(fileStatus -> {
|
||||||
|
FileSystem fs1 = getMetaClient().getFs();
|
||||||
|
boolean success = fs1.delete(fileStatus.getPath(), false);
|
||||||
|
logger
|
||||||
|
.info("Deleting file in temporary folder" + fileStatus.getPath() + "\t" + success);
|
||||||
|
return new Tuple2<>(fileStatus.getPath().toString(), success);
|
||||||
|
}).collect();
|
||||||
|
|
||||||
|
for (Tuple2<String, Boolean> result : results) {
|
||||||
|
if (!result._2()) {
|
||||||
|
logger.info("Failed to delete file: " + result._1());
|
||||||
|
throw new HoodieIOException("Failed to delete file in temporary folder: " + result._1());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new HoodieIOException(
|
||||||
|
"Failed to clean data files in temporary folder: " + temporaryFolder);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<HoodieCleanStat> cleanPartitionPaths(List<String> partitionsToClean,
|
||||||
|
JavaSparkContext jsc) {
|
||||||
|
int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism());
|
||||||
|
logger.info("Using cleanerParallelism: " + cleanerParallelism);
|
||||||
|
List<Tuple2<String, PartitionCleanStat>> partitionCleanStats = jsc
|
||||||
|
.parallelize(partitionsToClean, cleanerParallelism)
|
||||||
|
.flatMapToPair(getFilesToDeleteFunc(this, config))
|
||||||
|
.repartition(cleanerParallelism) // repartition to remove skews
|
||||||
|
.mapPartitionsToPair(deleteFilesFunc(this)).reduceByKey(
|
||||||
|
// merge partition level clean stats below
|
||||||
|
(Function2<PartitionCleanStat, PartitionCleanStat, PartitionCleanStat>) (e1, e2) -> e1
|
||||||
|
.merge(e2)).collect();
|
||||||
|
|
||||||
|
Map<String, PartitionCleanStat> partitionCleanStatsMap = partitionCleanStats.stream()
|
||||||
|
.collect(Collectors.toMap(e -> e._1(), e -> e._2()));
|
||||||
|
|
||||||
|
HoodieCleanHelper cleaner = new HoodieCleanHelper(this, config);
|
||||||
|
// Return PartitionCleanStat for each partition passed.
|
||||||
|
return partitionsToClean.stream().map(partitionPath -> {
|
||||||
|
PartitionCleanStat partitionCleanStat =
|
||||||
|
(partitionCleanStatsMap.containsKey(partitionPath)) ? partitionCleanStatsMap
|
||||||
|
.get(partitionPath) : new PartitionCleanStat(partitionPath);
|
||||||
|
return HoodieCleanStat.newBuilder().withPolicy(config.getCleanerPolicy())
|
||||||
|
.withPartitionPath(partitionPath)
|
||||||
|
.withEarliestCommitRetained(cleaner.getEarliestCommitToRetain())
|
||||||
|
.withDeletePathPattern(partitionCleanStat.deletePathPatterns)
|
||||||
|
.withSuccessfulDeletes(partitionCleanStat.successDeleteFiles)
|
||||||
|
.withFailedDeletes(partitionCleanStat.failedDeleteFiles).build();
|
||||||
|
}).collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
/**
 * Kind of work assigned to a bucket: {@code UPDATE} buckets are routed to the update path and
 * {@code INSERT} buckets to the insert path when a partition is processed.
 */
enum BucketType {
  UPDATE,
  INSERT
}
|
||||||
|
|
||||||
|
private static class PartitionCleanStat implements Serializable {
|
||||||
|
|
||||||
|
private final String partitionPath;
|
||||||
|
private final List<String> deletePathPatterns = new ArrayList<>();
|
||||||
|
private final List<String> successDeleteFiles = new ArrayList<>();
|
||||||
|
private final List<String> failedDeleteFiles = new ArrayList<>();
|
||||||
|
|
||||||
|
private PartitionCleanStat(String partitionPath) {
|
||||||
|
this.partitionPath = partitionPath;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void addDeletedFileResult(String deletePathStr, Boolean deletedFileResult) {
|
||||||
|
if (deletedFileResult) {
|
||||||
|
successDeleteFiles.add(deletePathStr);
|
||||||
|
} else {
|
||||||
|
failedDeleteFiles.add(deletePathStr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void addDeleteFilePatterns(String deletePathStr) {
|
||||||
|
deletePathPatterns.add(deletePathStr);
|
||||||
|
}
|
||||||
|
|
||||||
|
private PartitionCleanStat merge(PartitionCleanStat other) {
|
||||||
|
if (!this.partitionPath.equals(other.partitionPath)) {
|
||||||
|
throw new RuntimeException(String
|
||||||
|
.format("partitionPath is not a match: (%s, %s)", partitionPath, other.partitionPath));
|
||||||
|
}
|
||||||
|
successDeleteFiles.addAll(other.successDeleteFiles);
|
||||||
|
deletePathPatterns.addAll(other.deletePathPatterns);
|
||||||
|
failedDeleteFiles.addAll(other.failedDeleteFiles);
|
||||||
|
return this;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -150,45 +552,37 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Packs incoming records to be upserted, into buckets (1 bucket = 1 RDD partition)
|
* Packs incoming records to be upserted, into buckets (1 bucket = 1 RDD partition)
|
||||||
*/
|
*/
|
||||||
class UpsertPartitioner extends Partitioner {
|
class UpsertPartitioner extends Partitioner {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* List of all small files to be corrected
|
||||||
|
*/
|
||||||
|
List<SmallFile> smallFiles = new ArrayList<SmallFile>();
|
||||||
/**
|
/**
|
||||||
* Total number of RDD partitions, is determined by total buckets we want to pack the incoming
|
* Total number of RDD partitions, is determined by total buckets we want to pack the incoming
|
||||||
* workload into
|
* workload into
|
||||||
*/
|
*/
|
||||||
private int totalBuckets = 0;
|
private int totalBuckets = 0;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Stat for the current workload. Helps in determining total inserts, upserts etc.
|
* Stat for the current workload. Helps in determining total inserts, upserts etc.
|
||||||
*/
|
*/
|
||||||
private WorkloadStat globalStat;
|
private WorkloadStat globalStat;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Helps decide which bucket an incoming update should go to.
|
* Helps decide which bucket an incoming update should go to.
|
||||||
*/
|
*/
|
||||||
private HashMap<String, Integer> updateLocationToBucket;
|
private HashMap<String, Integer> updateLocationToBucket;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Helps us pack inserts into 1 or more buckets depending on number of incoming records.
|
* Helps us pack inserts into 1 or more buckets depending on number of incoming records.
|
||||||
*/
|
*/
|
||||||
private HashMap<String, List<InsertBucket>> partitionPathToInsertBuckets;
|
private HashMap<String, List<InsertBucket>> partitionPathToInsertBuckets;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Remembers what type each bucket is for later.
|
* Remembers what type each bucket is for later.
|
||||||
*/
|
*/
|
||||||
private HashMap<Integer, BucketInfo> bucketInfoMap;
|
private HashMap<Integer, BucketInfo> bucketInfoMap;
|
||||||
|
|
||||||
/**
|
|
||||||
* List of all small files to be corrected
|
|
||||||
*/
|
|
||||||
List<SmallFile> smallFiles = new ArrayList<SmallFile>();
|
|
||||||
|
|
||||||
UpsertPartitioner(WorkloadProfile profile) {
|
UpsertPartitioner(WorkloadProfile profile) {
|
||||||
updateLocationToBucket = new HashMap<>();
|
updateLocationToBucket = new HashMap<>();
|
||||||
partitionPathToInsertBuckets = new HashMap<>();
|
partitionPathToInsertBuckets = new HashMap<>();
|
||||||
@@ -198,16 +592,17 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
assignUpdates(profile);
|
assignUpdates(profile);
|
||||||
assignInserts(profile);
|
assignInserts(profile);
|
||||||
|
|
||||||
logger.info("Total Buckets :" + totalBuckets + ", " +
|
logger.info(
|
||||||
"buckets info => " + bucketInfoMap + ", \n" +
|
"Total Buckets :" + totalBuckets + ", " + "buckets info => " + bucketInfoMap + ", \n"
|
||||||
"Partition to insert buckets => " + partitionPathToInsertBuckets + ", \n" +
|
+ "Partition to insert buckets => " + partitionPathToInsertBuckets + ", \n"
|
||||||
"UpdateLocations mapped to buckets =>" + updateLocationToBucket);
|
+ "UpdateLocations mapped to buckets =>" + updateLocationToBucket);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void assignUpdates(WorkloadProfile profile) {
|
private void assignUpdates(WorkloadProfile profile) {
|
||||||
// each update location gets a partition
|
// each update location gets a partition
|
||||||
WorkloadStat gStat = profile.getGlobalStat();
|
WorkloadStat gStat = profile.getGlobalStat();
|
||||||
for (Map.Entry<String, Pair<String, Long>> updateLocEntry : gStat.getUpdateLocationToCount().entrySet()) {
|
for (Map.Entry<String, Pair<String, Long>> updateLocEntry : gStat.getUpdateLocationToCount()
|
||||||
|
.entrySet()) {
|
||||||
addUpdateBucket(updateLocEntry.getKey());
|
addUpdateBucket(updateLocEntry.getKey());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -270,10 +665,10 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
}
|
}
|
||||||
|
|
||||||
int insertBuckets = (int) Math.max(totalUnassignedInserts / insertRecordsPerBucket, 1L);
|
int insertBuckets = (int) Math.max(totalUnassignedInserts / insertRecordsPerBucket, 1L);
|
||||||
logger
|
logger.info(
|
||||||
.info("After small file assignment: unassignedInserts => " + totalUnassignedInserts
|
"After small file assignment: unassignedInserts => " + totalUnassignedInserts
|
||||||
+ ", totalInsertBuckets => " + insertBuckets
|
+ ", totalInsertBuckets => " + insertBuckets + ", recordsPerBucket => "
|
||||||
+ ", recordsPerBucket => " + insertRecordsPerBucket);
|
+ insertRecordsPerBucket);
|
||||||
for (int b = 0; b < insertBuckets; b++) {
|
for (int b = 0; b < insertBuckets; b++) {
|
||||||
bucketNumbers.add(totalBuckets);
|
bucketNumbers.add(totalBuckets);
|
||||||
recordsPerBucket.add(totalUnassignedInserts / insertBuckets);
|
recordsPerBucket.add(totalUnassignedInserts / insertBuckets);
|
||||||
@@ -339,8 +734,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
*/
|
*/
|
||||||
private long averageBytesPerRecord() {
|
private long averageBytesPerRecord() {
|
||||||
long avgSize = 0L;
|
long avgSize = 0L;
|
||||||
HoodieTimeline commitTimeline =
|
HoodieTimeline commitTimeline = metaClient.getActiveTimeline().getCommitTimeline()
|
||||||
metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants();
|
.filterCompletedInstants();
|
||||||
try {
|
try {
|
||||||
if (!commitTimeline.empty()) {
|
if (!commitTimeline.empty()) {
|
||||||
HoodieInstant latestCommitTime = commitTimeline.lastInstant().get();
|
HoodieInstant latestCommitTime = commitTimeline.lastInstant().get();
|
||||||
@@ -372,7 +767,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int getPartition(Object key) {
|
public int getPartition(Object key) {
|
||||||
Tuple2<HoodieKey, Option<HoodieRecordLocation>> keyLocation = (Tuple2<HoodieKey, Option<HoodieRecordLocation>>) key;
|
Tuple2<HoodieKey, Option<HoodieRecordLocation>> keyLocation = (Tuple2<HoodieKey,
|
||||||
|
Option<HoodieRecordLocation>>) key;
|
||||||
if (keyLocation._2().isDefined()) {
|
if (keyLocation._2().isDefined()) {
|
||||||
HoodieRecordLocation location = keyLocation._2().get();
|
HoodieRecordLocation location = keyLocation._2().get();
|
||||||
return updateLocationToBucket.get(location.getFileId());
|
return updateLocationToBucket.get(location.getFileId());
|
||||||
@@ -396,420 +792,4 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Partitioner getUpsertPartitioner(WorkloadProfile profile) {
|
|
||||||
if (profile == null) {
|
|
||||||
throw new HoodieUpsertException("Need workload profile to construct the upsert partitioner.");
|
|
||||||
}
|
|
||||||
return new UpsertPartitioner(profile);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Partitioner getInsertPartitioner(WorkloadProfile profile) {
|
|
||||||
return getUpsertPartitioner(profile);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean isWorkloadProfileNeeded() {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public JavaRDD<WriteStatus> compact(JavaSparkContext jsc, String commitTime) {
|
|
||||||
throw new HoodieNotSupportedException("Compaction is not supported from a CopyOnWrite table");
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileLoc,
|
|
||||||
Iterator<HoodieRecord<T>> recordItr)
|
|
||||||
throws IOException {
|
|
||||||
// these are updates
|
|
||||||
HoodieMergeHandle upsertHandle = getUpdateHandle(commitTime, fileLoc, recordItr);
|
|
||||||
return handleUpdateInternal(upsertHandle, commitTime, fileLoc);
|
|
||||||
}
|
|
||||||
|
|
||||||
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileLoc,
|
|
||||||
Map<String, HoodieRecord<T>> keyToNewRecords)
|
|
||||||
throws IOException {
|
|
||||||
// these are updates
|
|
||||||
HoodieMergeHandle upsertHandle = getUpdateHandle(commitTime, fileLoc, keyToNewRecords);
|
|
||||||
return handleUpdateInternal(upsertHandle, commitTime, fileLoc);
|
|
||||||
}
|
|
||||||
|
|
||||||
protected Iterator<List<WriteStatus>> handleUpdateInternal(HoodieMergeHandle upsertHandle, String commitTime, String fileLoc)
|
|
||||||
throws IOException {
|
|
||||||
if (upsertHandle.getOldFilePath() == null) {
|
|
||||||
throw new HoodieUpsertException("Error in finding the old file path at commit " +
|
|
||||||
commitTime + " at fileLoc: " + fileLoc);
|
|
||||||
} else {
|
|
||||||
AvroReadSupport.setAvroReadSchema(getHadoopConf(), upsertHandle.getSchema());
|
|
||||||
ParquetReader<IndexedRecord> reader =
|
|
||||||
AvroParquetReader.builder(upsertHandle.getOldFilePath()).withConf(getHadoopConf())
|
|
||||||
.build();
|
|
||||||
try {
|
|
||||||
IndexedRecord record;
|
|
||||||
while ((record = reader.read()) != null) {
|
|
||||||
// Two types of writes here (new record, and old record).
|
|
||||||
// We have already catch the exception during writing new records.
|
|
||||||
// But for old records, we should fail if any exception happens.
|
|
||||||
upsertHandle.write((GenericRecord) record);
|
|
||||||
}
|
|
||||||
} catch (IOException e) {
|
|
||||||
throw new HoodieUpsertException(
|
|
||||||
"Failed to read record from " + upsertHandle.getOldFilePath()
|
|
||||||
+ " with new Schema " + upsertHandle.getSchema(), e);
|
|
||||||
} finally {
|
|
||||||
reader.close();
|
|
||||||
upsertHandle.close();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
//TODO(vc): This needs to be revisited
|
|
||||||
if (upsertHandle.getWriteStatus().getPartitionPath() == null) {
|
|
||||||
logger.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath()
|
|
||||||
+ ", " + upsertHandle.getWriteStatus());
|
|
||||||
}
|
|
||||||
return Collections.singletonList(Collections.singletonList(upsertHandle.getWriteStatus()))
|
|
||||||
.iterator();
|
|
||||||
}
|
|
||||||
|
|
||||||
protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileLoc,
|
|
||||||
Iterator<HoodieRecord<T>> recordItr) {
|
|
||||||
return new HoodieMergeHandle<>(config, commitTime, this, recordItr, fileLoc);
|
|
||||||
}
|
|
||||||
|
|
||||||
protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileLoc,
|
|
||||||
Map<String, HoodieRecord<T>> keyToNewRecords) {
|
|
||||||
return new HoodieMergeHandle<>(config, commitTime, this, keyToNewRecords, fileLoc);
|
|
||||||
}
|
|
||||||
|
|
||||||
public Iterator<List<WriteStatus>> handleInsert(String commitTime,
|
|
||||||
Iterator<HoodieRecord<T>> recordItr) throws Exception {
|
|
||||||
return new LazyInsertIterable<>(recordItr, config, commitTime, this);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@SuppressWarnings("unchecked")
|
|
||||||
@Override
|
|
||||||
public Iterator<List<WriteStatus>> handleUpsertPartition(String commitTime, Integer partition,
|
|
||||||
Iterator recordItr, Partitioner partitioner) {
|
|
||||||
UpsertPartitioner upsertPartitioner = (UpsertPartitioner) partitioner;
|
|
||||||
BucketInfo binfo = upsertPartitioner.getBucketInfo(partition);
|
|
||||||
BucketType btype = binfo.bucketType;
|
|
||||||
try {
|
|
||||||
if (btype.equals(BucketType.INSERT)) {
|
|
||||||
return handleInsert(commitTime, recordItr);
|
|
||||||
} else if (btype.equals(BucketType.UPDATE)) {
|
|
||||||
return handleUpdate(commitTime, binfo.fileLoc, recordItr);
|
|
||||||
} else {
|
|
||||||
throw new HoodieUpsertException(
|
|
||||||
"Unknown bucketType " + btype + " for partition :" + partition);
|
|
||||||
}
|
|
||||||
} catch (Throwable t) {
|
|
||||||
String msg = "Error upserting bucketType " + btype + " for partition :" + partition;
|
|
||||||
logger.error(msg, t);
|
|
||||||
throw new HoodieUpsertException(msg, t);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Iterator<List<WriteStatus>> handleInsertPartition(String commitTime, Integer partition,
|
|
||||||
Iterator recordItr,
|
|
||||||
Partitioner partitioner) {
|
|
||||||
return handleUpsertPartition(commitTime, partition, recordItr, partitioner);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Performs cleaning of partition paths according to cleaning policy and returns the number of
|
|
||||||
* files cleaned. Handles skews in partitions to clean by making files to clean as the unit of
|
|
||||||
* task distribution.
|
|
||||||
*
|
|
||||||
* @throws IllegalArgumentException if unknown cleaning policy is provided
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public List<HoodieCleanStat> clean(JavaSparkContext jsc) {
|
|
||||||
try {
|
|
||||||
FileSystem fs = getMetaClient().getFs();
|
|
||||||
List<String> partitionsToClean =
|
|
||||||
FSUtils.getAllPartitionPaths(fs, getMetaClient().getBasePath(),
|
|
||||||
config.shouldAssumeDatePartitioning());
|
|
||||||
logger.info("Partitions to clean up : " + partitionsToClean + ", with policy " + config
|
|
||||||
.getCleanerPolicy());
|
|
||||||
if (partitionsToClean.isEmpty()) {
|
|
||||||
logger.info("Nothing to clean here mom. It is already clean");
|
|
||||||
return Collections.emptyList();
|
|
||||||
}
|
|
||||||
return cleanPartitionPaths(partitionsToClean, jsc);
|
|
||||||
} catch (IOException e) {
|
|
||||||
throw new HoodieIOException("Failed to clean up after commit", e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Common method used for cleaning out parquet files under a partition path during rollback of a
|
|
||||||
* set of commits
|
|
||||||
*/
|
|
||||||
protected Map<FileStatus, Boolean> deleteCleanedFiles(String partitionPath, List<String> commits)
|
|
||||||
throws IOException {
|
|
||||||
logger.info("Cleaning path " + partitionPath);
|
|
||||||
FileSystem fs = getMetaClient().getFs();
|
|
||||||
FileStatus[] toBeDeleted =
|
|
||||||
fs.listStatus(new Path(config.getBasePath(), partitionPath), path -> {
|
|
||||||
if (!path.toString().contains(".parquet")) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
String fileCommitTime = FSUtils.getCommitTime(path.getName());
|
|
||||||
return commits.contains(fileCommitTime);
|
|
||||||
});
|
|
||||||
Map<FileStatus, Boolean> results = Maps.newHashMap();
|
|
||||||
for (FileStatus file : toBeDeleted) {
|
|
||||||
boolean success = fs.delete(file.getPath(), false);
|
|
||||||
results.put(file, success);
|
|
||||||
logger.info("Delete file " + file.getPath() + "\t" + success);
|
|
||||||
}
|
|
||||||
return results;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public List<HoodieRollbackStat> rollback(JavaSparkContext jsc, List<String> commits)
|
|
||||||
throws IOException {
|
|
||||||
String actionType = this.getCommitActionType();
|
|
||||||
HoodieActiveTimeline activeTimeline = this.getActiveTimeline();
|
|
||||||
List<String> inflights = this.getInflightCommitTimeline().getInstants()
|
|
||||||
.map(HoodieInstant::getTimestamp)
|
|
||||||
.collect(Collectors.toList());
|
|
||||||
|
|
||||||
// Atomically unpublish all the commits
|
|
||||||
commits.stream().filter(s -> !inflights.contains(s))
|
|
||||||
.map(s -> new HoodieInstant(false, actionType, s))
|
|
||||||
.forEach(activeTimeline::revertToInflight);
|
|
||||||
logger.info("Unpublished " + commits);
|
|
||||||
|
|
||||||
// delete all the data files for all these commits
|
|
||||||
logger.info("Clean out all parquet files generated for commits: " + commits);
|
|
||||||
List<HoodieRollbackStat> stats = jsc.parallelize(
|
|
||||||
FSUtils.getAllPartitionPaths(metaClient.getFs(), getMetaClient().getBasePath(),
|
|
||||||
config.shouldAssumeDatePartitioning()))
|
|
||||||
.map((Function<String, HoodieRollbackStat>) partitionPath -> {
|
|
||||||
// Scan all partitions files with this commit time
|
|
||||||
Map<FileStatus, Boolean> results = deleteCleanedFiles(partitionPath, commits);
|
|
||||||
return HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
|
|
||||||
.withDeletedFileResults(results).build();
|
|
||||||
}).collect();
|
|
||||||
|
|
||||||
// clean temporary data files
|
|
||||||
cleanTemporaryDataFiles(jsc);
|
|
||||||
|
|
||||||
// Remove the rolled back inflight commits
|
|
||||||
commits.stream().map(s -> new HoodieInstant(true, actionType, s))
|
|
||||||
.forEach(activeTimeline::deleteInflight);
|
|
||||||
logger.info("Deleted inflight commits " + commits);
|
|
||||||
return stats;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Finalize the written data files
|
|
||||||
*
|
|
||||||
* @param writeStatuses List of WriteStatus
|
|
||||||
* @return number of files finalized
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
@SuppressWarnings("unchecked")
|
|
||||||
public Optional<Integer> finalizeWrite(JavaSparkContext jsc, List writeStatuses) {
|
|
||||||
if (!config.shouldUseTempFolderForCopyOnWrite()) {
|
|
||||||
return Optional.empty();
|
|
||||||
}
|
|
||||||
|
|
||||||
// This is to rename each data file from temporary path to its final location
|
|
||||||
List<Tuple2<String, Boolean>> results = jsc.parallelize(writeStatuses, config.getFinalizeWriteParallelism())
|
|
||||||
.map(writeStatus -> {
|
|
||||||
Tuple2<String, HoodieWriteStat> writeStatTuple2 = (Tuple2<String, HoodieWriteStat>) writeStatus;
|
|
||||||
HoodieWriteStat writeStat = writeStatTuple2._2();
|
|
||||||
final FileSystem fs = getMetaClient().getFs();
|
|
||||||
final Path finalPath = new Path(config.getBasePath(), writeStat.getPath());
|
|
||||||
|
|
||||||
if (writeStat.getTempPath() != null) {
|
|
||||||
final Path tempPath = new Path(config.getBasePath(), writeStat.getTempPath());
|
|
||||||
boolean success;
|
|
||||||
try {
|
|
||||||
logger.info("Renaming temporary file: " + tempPath + " to " + finalPath);
|
|
||||||
success = fs.rename(tempPath, finalPath);
|
|
||||||
} catch (IOException e) {
|
|
||||||
throw new HoodieIOException("Failed to rename file: " + tempPath + " to " + finalPath);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!success) {
|
|
||||||
throw new HoodieIOException("Failed to rename file: " + tempPath + " to " + finalPath);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return new Tuple2<>(writeStat.getPath(), true);
|
|
||||||
}).collect();
|
|
||||||
|
|
||||||
// clean temporary data files
|
|
||||||
cleanTemporaryDataFiles(jsc);
|
|
||||||
|
|
||||||
return Optional.of(results.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Clean temporary data files that are produced from previous failed commit or retried spark
|
|
||||||
* stages.
|
|
||||||
*/
|
|
||||||
private void cleanTemporaryDataFiles(JavaSparkContext jsc) {
|
|
||||||
if (!config.shouldUseTempFolderForCopyOnWrite()) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
final FileSystem fs = getMetaClient().getFs();
|
|
||||||
final Path temporaryFolder = new Path(config.getBasePath(),
|
|
||||||
HoodieTableMetaClient.TEMPFOLDER_NAME);
|
|
||||||
try {
|
|
||||||
if (!fs.exists(temporaryFolder)) {
|
|
||||||
logger.info("Temporary folder does not exist: " + temporaryFolder);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
List<FileStatus> fileStatusesList = Arrays.asList(fs.listStatus(temporaryFolder));
|
|
||||||
List<Tuple2<String, Boolean>> results = jsc
|
|
||||||
.parallelize(fileStatusesList, config.getFinalizeWriteParallelism())
|
|
||||||
.map(fileStatus -> {
|
|
||||||
FileSystem fs1 = getMetaClient().getFs();
|
|
||||||
boolean success = fs1.delete(fileStatus.getPath(), false);
|
|
||||||
logger.info("Deleting file in temporary folder" + fileStatus.getPath() + "\t"
|
|
||||||
+ success);
|
|
||||||
return new Tuple2<>(fileStatus.getPath().toString(), success);
|
|
||||||
}).collect();
|
|
||||||
|
|
||||||
for (Tuple2<String, Boolean> result : results) {
|
|
||||||
if (!result._2()) {
|
|
||||||
logger.info("Failed to delete file: " + result._1());
|
|
||||||
throw new HoodieIOException(
|
|
||||||
"Failed to delete file in temporary folder: " + result._1());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (IOException e) {
|
|
||||||
throw new HoodieIOException(
|
|
||||||
"Failed to clean data files in temporary folder: " + temporaryFolder);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private static class PartitionCleanStat implements Serializable {
|
|
||||||
|
|
||||||
private final String partitionPath;
|
|
||||||
private final List<String> deletePathPatterns = new ArrayList<>();
|
|
||||||
private final List<String> successDeleteFiles = new ArrayList<>();
|
|
||||||
private final List<String> failedDeleteFiles = new ArrayList<>();
|
|
||||||
|
|
||||||
private PartitionCleanStat(String partitionPath) {
|
|
||||||
this.partitionPath = partitionPath;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void addDeletedFileResult(String deletePathStr, Boolean deletedFileResult) {
|
|
||||||
if (deletedFileResult) {
|
|
||||||
successDeleteFiles.add(deletePathStr);
|
|
||||||
} else {
|
|
||||||
failedDeleteFiles.add(deletePathStr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void addDeleteFilePatterns(String deletePathStr) {
|
|
||||||
deletePathPatterns.add(deletePathStr);
|
|
||||||
}
|
|
||||||
|
|
||||||
private PartitionCleanStat merge(PartitionCleanStat other) {
|
|
||||||
if (!this.partitionPath.equals(other.partitionPath)) {
|
|
||||||
throw new RuntimeException(String.format(
|
|
||||||
"partitionPath is not a match: (%s, %s)",
|
|
||||||
partitionPath, other.partitionPath));
|
|
||||||
}
|
|
||||||
successDeleteFiles.addAll(other.successDeleteFiles);
|
|
||||||
deletePathPatterns.addAll(other.deletePathPatterns);
|
|
||||||
failedDeleteFiles.addAll(other.failedDeleteFiles);
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private List<HoodieCleanStat> cleanPartitionPaths(List<String> partitionsToClean,
|
|
||||||
JavaSparkContext jsc) {
|
|
||||||
int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism());
|
|
||||||
logger.info("Using cleanerParallelism: " + cleanerParallelism);
|
|
||||||
List<Tuple2<String, PartitionCleanStat>> partitionCleanStats = jsc
|
|
||||||
.parallelize(partitionsToClean, cleanerParallelism)
|
|
||||||
.flatMapToPair(getFilesToDeleteFunc(this, config))
|
|
||||||
.repartition(cleanerParallelism) // repartition to remove skews
|
|
||||||
.mapPartitionsToPair(deleteFilesFunc(this))
|
|
||||||
.reduceByKey(
|
|
||||||
// merge partition level clean stats below
|
|
||||||
(Function2<PartitionCleanStat, PartitionCleanStat, PartitionCleanStat>) (e1, e2) -> e1
|
|
||||||
.merge(e2))
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
Map<String, PartitionCleanStat> partitionCleanStatsMap = partitionCleanStats
|
|
||||||
.stream().collect(Collectors.toMap(e -> e._1(), e -> e._2()));
|
|
||||||
|
|
||||||
HoodieCleanHelper cleaner = new HoodieCleanHelper(this, config);
|
|
||||||
// Return PartitionCleanStat for each partition passed.
|
|
||||||
return partitionsToClean.stream().map(partitionPath -> {
|
|
||||||
PartitionCleanStat partitionCleanStat =
|
|
||||||
(partitionCleanStatsMap.containsKey(partitionPath)) ?
|
|
||||||
partitionCleanStatsMap.get(partitionPath)
|
|
||||||
: new PartitionCleanStat(partitionPath);
|
|
||||||
return HoodieCleanStat.newBuilder()
|
|
||||||
.withPolicy(config.getCleanerPolicy())
|
|
||||||
.withPartitionPath(partitionPath)
|
|
||||||
.withEarliestCommitRetained(cleaner.getEarliestCommitToRetain())
|
|
||||||
.withDeletePathPattern(partitionCleanStat.deletePathPatterns)
|
|
||||||
.withSuccessfulDeletes(partitionCleanStat.successDeleteFiles)
|
|
||||||
.withFailedDeletes(partitionCleanStat.failedDeleteFiles)
|
|
||||||
.build();
|
|
||||||
}).collect(Collectors.toList());
|
|
||||||
}
|
|
||||||
|
|
||||||
private static PairFlatMapFunction<Iterator<Tuple2<String, String>>, String, PartitionCleanStat> deleteFilesFunc(
|
|
||||||
HoodieTable table) {
|
|
||||||
return (PairFlatMapFunction<Iterator<Tuple2<String, String>>, String, PartitionCleanStat>) iter -> {
|
|
||||||
Map<String, PartitionCleanStat> partitionCleanStatMap = new HashMap<>();
|
|
||||||
|
|
||||||
FileSystem fs = table.getMetaClient().getFs();
|
|
||||||
while (iter.hasNext()) {
|
|
||||||
Tuple2<String, String> partitionDelFileTuple = iter.next();
|
|
||||||
String partitionPath = partitionDelFileTuple._1();
|
|
||||||
String deletePathStr = partitionDelFileTuple._2();
|
|
||||||
Boolean deletedFileResult = deleteFileAndGetResult(fs, deletePathStr);
|
|
||||||
if (!partitionCleanStatMap.containsKey(partitionPath)) {
|
|
||||||
partitionCleanStatMap.put(partitionPath,
|
|
||||||
new PartitionCleanStat(partitionPath));
|
|
||||||
}
|
|
||||||
PartitionCleanStat partitionCleanStat = partitionCleanStatMap.get(partitionPath);
|
|
||||||
partitionCleanStat.addDeleteFilePatterns(deletePathStr);
|
|
||||||
partitionCleanStat.addDeletedFileResult(deletePathStr, deletedFileResult);
|
|
||||||
}
|
|
||||||
|
|
||||||
return partitionCleanStatMap.entrySet().stream()
|
|
||||||
.map(e -> new Tuple2<>(e.getKey(), e.getValue()))
|
|
||||||
.collect(Collectors.toList()).iterator();
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
private static PairFlatMapFunction<String, String, String> getFilesToDeleteFunc(
|
|
||||||
HoodieTable table, HoodieWriteConfig config) {
|
|
||||||
return (PairFlatMapFunction<String, String, String>) partitionPathToClean -> {
|
|
||||||
HoodieCleanHelper cleaner = new HoodieCleanHelper(table, config);
|
|
||||||
return cleaner.getDeletePaths(partitionPathToClean).stream()
|
|
||||||
.map(deleteFile -> new Tuple2<>(partitionPathToClean, deleteFile.toString()))
|
|
||||||
.iterator();
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
private static Boolean deleteFileAndGetResult(FileSystem fs, String deletePathStr)
|
|
||||||
throws IOException {
|
|
||||||
Path deletePath = new Path(deletePathStr);
|
|
||||||
logger.debug("Working on delete path :" + deletePath);
|
|
||||||
boolean deleteResult = fs.delete(deletePath, false);
|
|
||||||
if (deleteResult) {
|
|
||||||
logger.debug("Cleaned file at path :" + deletePath);
|
|
||||||
}
|
|
||||||
return deleteResult;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -62,19 +62,12 @@ import org.apache.spark.api.java.JavaSparkContext;
|
|||||||
import org.apache.spark.api.java.function.Function;
|
import org.apache.spark.api.java.function.Function;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Implementation of a more real-time read-optimized Hoodie Table where
|
* Implementation of a more real-time read-optimized Hoodie Table where <p> INSERTS - Same as
|
||||||
* <p>
|
* HoodieCopyOnWriteTable - Produce new files, block aligned to desired size (or) Merge with the
|
||||||
* INSERTS - Same as HoodieCopyOnWriteTable - Produce new files, block aligned to desired size (or)
|
* smallest existing file, to expand it </p> <p> UPDATES - Appends the changes to a rolling log file
|
||||||
* Merge with the smallest existing file, to expand it
|
* maintained per file Id. Compaction merges the log file into the base file. </p> <p> WARNING - MOR
|
||||||
* </p>
|
* table type does not support nested rollbacks, every rollback must be followed by an attempted
|
||||||
* <p>
|
* commit action </p>
|
||||||
* UPDATES - Appends the changes to a rolling log file maintained per file Id. Compaction merges the
|
|
||||||
* log file into the base file.
|
|
||||||
* </p>
|
|
||||||
* <p>
|
|
||||||
* WARNING - MOR table type does not support nested rollbacks, every rollback must be followed by an
|
|
||||||
* attempted commit action
|
|
||||||
* </p>
|
|
||||||
*/
|
*/
|
||||||
public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
|
public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
|
||||||
HoodieCopyOnWriteTable<T> {
|
HoodieCopyOnWriteTable<T> {
|
||||||
@@ -88,57 +81,6 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
|
|||||||
super(config, metaClient);
|
super(config, metaClient);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* UpsertPartitioner for MergeOnRead table type, this allows auto correction of small parquet
|
|
||||||
* files to larger ones without the need for an index in the logFile.
|
|
||||||
*/
|
|
||||||
class MergeOnReadUpsertPartitioner extends HoodieCopyOnWriteTable.UpsertPartitioner {
|
|
||||||
|
|
||||||
MergeOnReadUpsertPartitioner(WorkloadProfile profile) {
|
|
||||||
super(profile);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected List<SmallFile> getSmallFiles(String partitionPath) {
|
|
||||||
|
|
||||||
// smallFiles only for partitionPath
|
|
||||||
List<SmallFile> smallFileLocations = new ArrayList<>();
|
|
||||||
|
|
||||||
// Init here since this class (and member variables) might not have been initialized
|
|
||||||
HoodieTimeline commitTimeline = getCompletedCommitTimeline();
|
|
||||||
|
|
||||||
if (!commitTimeline.empty()) {
|
|
||||||
HoodieInstant latestCommitTime = commitTimeline.lastInstant().get();
|
|
||||||
// find smallest file in partition and append to it
|
|
||||||
Optional<FileSlice> smallFileSlice = getRTFileSystemView()
|
|
||||||
.getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp())
|
|
||||||
.filter(fileSlice -> fileSlice.getLogFiles().count() < 1 &&
|
|
||||||
fileSlice.getDataFile().get().getFileSize() < config.getParquetSmallFileLimit())
|
|
||||||
.sorted((FileSlice left, FileSlice right) ->
|
|
||||||
left.getDataFile().get().getFileSize() < right.getDataFile().get().getFileSize() ? -1 : 1)
|
|
||||||
.findFirst();
|
|
||||||
|
|
||||||
if(smallFileSlice.isPresent()) {
|
|
||||||
String filename = smallFileSlice.get().getDataFile().get().getFileName();
|
|
||||||
SmallFile sf = new SmallFile();
|
|
||||||
sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename),
|
|
||||||
FSUtils.getFileId(filename));
|
|
||||||
sf.sizeBytes = smallFileSlice.get().getDataFile().get().getFileSize();
|
|
||||||
smallFileLocations.add(sf);
|
|
||||||
// Update the global small files list
|
|
||||||
smallFiles.add(sf);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return smallFileLocations;
|
|
||||||
}
|
|
||||||
|
|
||||||
public List<String> getSmallFileIds() {
|
|
||||||
return (List<String>) smallFiles.stream().map(smallFile -> ((SmallFile) smallFile).location.getFileId())
|
|
||||||
.collect(Collectors.toList());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Partitioner getUpsertPartitioner(WorkloadProfile profile) {
|
public Partitioner getUpsertPartitioner(WorkloadProfile profile) {
|
||||||
if (profile == null) {
|
if (profile == null) {
|
||||||
@@ -154,11 +96,12 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
|
|||||||
logger.info("Merging updates for commit " + commitTime + " for file " + fileId);
|
logger.info("Merging updates for commit " + commitTime + " for file " + fileId);
|
||||||
|
|
||||||
if (mergeOnReadUpsertPartitioner.getSmallFileIds().contains(fileId)) {
|
if (mergeOnReadUpsertPartitioner.getSmallFileIds().contains(fileId)) {
|
||||||
logger.info("Small file corrections for updates for commit " + commitTime + " for file " + fileId);
|
logger.info(
|
||||||
|
"Small file corrections for updates for commit " + commitTime + " for file " + fileId);
|
||||||
return super.handleUpdate(commitTime, fileId, recordItr);
|
return super.handleUpdate(commitTime, fileId, recordItr);
|
||||||
} else {
|
} else {
|
||||||
HoodieAppendHandle<T> appendHandle =
|
HoodieAppendHandle<T> appendHandle = new HoodieAppendHandle<>(config, commitTime, this,
|
||||||
new HoodieAppendHandle<>(config, commitTime, this, fileId, recordItr);
|
fileId, recordItr);
|
||||||
appendHandle.doAppend();
|
appendHandle.doAppend();
|
||||||
appendHandle.close();
|
appendHandle.close();
|
||||||
return Collections.singletonList(Collections.singletonList(appendHandle.getWriteStatus()))
|
return Collections.singletonList(Collections.singletonList(appendHandle.getWriteStatus()))
|
||||||
@@ -202,11 +145,9 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
|
|||||||
if (commits.size() > 1) {
|
if (commits.size() > 1) {
|
||||||
throw new UnsupportedOperationException("Nested Rollbacks are not supported");
|
throw new UnsupportedOperationException("Nested Rollbacks are not supported");
|
||||||
}
|
}
|
||||||
Map<String, HoodieInstant> commitsAndCompactions =
|
Map<String, HoodieInstant> commitsAndCompactions = this.getActiveTimeline()
|
||||||
this.getActiveTimeline()
|
|
||||||
.getTimelineOfActions(Sets.newHashSet(HoodieActiveTimeline.COMMIT_ACTION,
|
.getTimelineOfActions(Sets.newHashSet(HoodieActiveTimeline.COMMIT_ACTION,
|
||||||
HoodieActiveTimeline.DELTA_COMMIT_ACTION))
|
HoodieActiveTimeline.DELTA_COMMIT_ACTION)).getInstants()
|
||||||
.getInstants()
|
|
||||||
.filter(i -> commits.contains(i.getTimestamp()))
|
.filter(i -> commits.contains(i.getTimestamp()))
|
||||||
.collect(Collectors.toMap(i -> i.getTimestamp(), i -> i));
|
.collect(Collectors.toMap(i -> i.getTimestamp(), i -> i));
|
||||||
|
|
||||||
@@ -218,9 +159,9 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
|
|||||||
|
|
||||||
Long startTime = System.currentTimeMillis();
|
Long startTime = System.currentTimeMillis();
|
||||||
|
|
||||||
List<HoodieRollbackStat> allRollbackStats = jsc.parallelize
|
List<HoodieRollbackStat> allRollbackStats = jsc.parallelize(FSUtils
|
||||||
(FSUtils.getAllPartitionPaths(this.metaClient.getFs(),
|
.getAllPartitionPaths(this.metaClient.getFs(), this.getMetaClient().getBasePath(),
|
||||||
this.getMetaClient().getBasePath(), config.shouldAssumeDatePartitioning()))
|
config.shouldAssumeDatePartitioning()))
|
||||||
.map((Function<String, List<HoodieRollbackStat>>) partitionPath -> {
|
.map((Function<String, List<HoodieRollbackStat>>) partitionPath -> {
|
||||||
return commits.stream().map(commit -> {
|
return commits.stream().map(commit -> {
|
||||||
HoodieInstant instant = commitsAndCompactions.get(commit);
|
HoodieInstant instant = commitsAndCompactions.get(commit);
|
||||||
@@ -228,23 +169,27 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
|
|||||||
switch (instant.getAction()) {
|
switch (instant.getAction()) {
|
||||||
case HoodieTimeline.COMMIT_ACTION:
|
case HoodieTimeline.COMMIT_ACTION:
|
||||||
try {
|
try {
|
||||||
Map<FileStatus, Boolean> results = super.deleteCleanedFiles(partitionPath, Arrays.asList(commit));
|
Map<FileStatus, Boolean> results = super
|
||||||
hoodieRollbackStats = HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
|
.deleteCleanedFiles(partitionPath, Arrays.asList(commit));
|
||||||
.withDeletedFileResults(results).build();
|
hoodieRollbackStats = HoodieRollbackStat.newBuilder()
|
||||||
|
.withPartitionPath(partitionPath).withDeletedFileResults(results).build();
|
||||||
break;
|
break;
|
||||||
} catch (IOException io) {
|
} catch (IOException io) {
|
||||||
throw new UncheckedIOException("Failed to rollback for commit " + commit, io);
|
throw new UncheckedIOException("Failed to rollback for commit " + commit, io);
|
||||||
}
|
}
|
||||||
case HoodieTimeline.DELTA_COMMIT_ACTION:
|
case HoodieTimeline.DELTA_COMMIT_ACTION:
|
||||||
try {
|
try {
|
||||||
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
|
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(
|
||||||
.fromBytes(this.getCommitTimeline().getInstantDetails(new HoodieInstant(true, instant.getAction(), instant.getTimestamp())).get());
|
this.getCommitTimeline().getInstantDetails(
|
||||||
|
new HoodieInstant(true, instant.getAction(), instant.getTimestamp()))
|
||||||
|
.get());
|
||||||
|
|
||||||
// read commit file and (either append delete blocks or delete file)
|
// read commit file and (either append delete blocks or delete file)
|
||||||
Map<FileStatus, Boolean> filesToDeletedStatus = new HashMap<>();
|
Map<FileStatus, Boolean> filesToDeletedStatus = new HashMap<>();
|
||||||
Map<FileStatus, Long> filesToNumBlocksRollback = new HashMap<>();
|
Map<FileStatus, Long> filesToNumBlocksRollback = new HashMap<>();
|
||||||
|
|
||||||
// we do not know fileIds for inserts (first inserts are parquet files), delete all parquet files for the corresponding failed commit, if present (same as COW)
|
// we do not know fileIds for inserts (first inserts are parquet files), delete
|
||||||
|
// all parquet files for the corresponding failed commit, if present (same as COW)
|
||||||
filesToDeletedStatus = super
|
filesToDeletedStatus = super
|
||||||
.deleteCleanedFiles(partitionPath, Arrays.asList(commit));
|
.deleteCleanedFiles(partitionPath, Arrays.asList(commit));
|
||||||
|
|
||||||
@@ -252,32 +197,35 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
|
|||||||
if (commitMetadata.getPartitionToWriteStats().containsKey(partitionPath)) {
|
if (commitMetadata.getPartitionToWriteStats().containsKey(partitionPath)) {
|
||||||
commitMetadata.getPartitionToWriteStats().get(partitionPath).stream()
|
commitMetadata.getPartitionToWriteStats().get(partitionPath).stream()
|
||||||
.filter(wStat -> {
|
.filter(wStat -> {
|
||||||
return wStat != null && wStat.getPrevCommit() != HoodieWriteStat.NULL_COMMIT
|
return wStat != null
|
||||||
|
&& wStat.getPrevCommit() != HoodieWriteStat.NULL_COMMIT
|
||||||
&& wStat.getPrevCommit() != null;
|
&& wStat.getPrevCommit() != null;
|
||||||
})
|
}).forEach(wStat -> {
|
||||||
.forEach(wStat -> {
|
|
||||||
HoodieLogFormat.Writer writer = null;
|
HoodieLogFormat.Writer writer = null;
|
||||||
try {
|
try {
|
||||||
writer = HoodieLogFormat.newWriterBuilder()
|
writer = HoodieLogFormat.newWriterBuilder().onParentPath(
|
||||||
.onParentPath(new Path(this.getMetaClient().getBasePath(), partitionPath))
|
new Path(this.getMetaClient().getBasePath(), partitionPath))
|
||||||
.withFileId(wStat.getFileId()).overBaseCommit(wStat.getPrevCommit())
|
.withFileId(wStat.getFileId()).overBaseCommit(wStat.getPrevCommit())
|
||||||
.withFs(this.metaClient.getFs())
|
.withFs(this.metaClient.getFs())
|
||||||
.withFileExtension(HoodieLogFile.DELTA_EXTENSION).build();
|
.withFileExtension(HoodieLogFile.DELTA_EXTENSION).build();
|
||||||
Long numRollbackBlocks = 0L;
|
Long numRollbackBlocks = 0L;
|
||||||
// generate metadata
|
// generate metadata
|
||||||
Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap();
|
Map<HoodieLogBlock.HeaderMetadataType, String> header =
|
||||||
|
Maps.newHashMap();
|
||||||
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME,
|
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME,
|
||||||
metaClient.getActiveTimeline().lastInstant().get().getTimestamp());
|
metaClient.getActiveTimeline().lastInstant().get().getTimestamp());
|
||||||
header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, commit);
|
header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME,
|
||||||
header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE,
|
commit);
|
||||||
String.valueOf(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal()));
|
header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE, String
|
||||||
|
.valueOf(
|
||||||
|
HoodieCommandBlock.HoodieCommandBlockTypeEnum
|
||||||
|
.ROLLBACK_PREVIOUS_BLOCK
|
||||||
|
.ordinal()));
|
||||||
// if update belongs to an existing log file
|
// if update belongs to an existing log file
|
||||||
writer = writer.appendBlock(new HoodieCommandBlock(
|
writer = writer.appendBlock(new HoodieCommandBlock(header));
|
||||||
header));
|
|
||||||
numRollbackBlocks++;
|
numRollbackBlocks++;
|
||||||
filesToNumBlocksRollback
|
filesToNumBlocksRollback.put(this.getMetaClient().getFs()
|
||||||
.put(this.getMetaClient().getFs().getFileStatus(writer.getLogFile().getPath()),
|
.getFileStatus(writer.getLogFile().getPath()), numRollbackBlocks);
|
||||||
numRollbackBlocks);
|
|
||||||
} catch (IOException | InterruptedException io) {
|
} catch (IOException | InterruptedException io) {
|
||||||
throw new HoodieRollbackException(
|
throw new HoodieRollbackException(
|
||||||
"Failed to rollback for commit " + commit, io);
|
"Failed to rollback for commit " + commit, io);
|
||||||
@@ -289,7 +237,8 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
hoodieRollbackStats = HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
|
hoodieRollbackStats = HoodieRollbackStat.newBuilder()
|
||||||
|
.withPartitionPath(partitionPath)
|
||||||
.withDeletedFileResults(filesToDeletedStatus)
|
.withDeletedFileResults(filesToDeletedStatus)
|
||||||
.withRollbackBlockAppendResults(filesToNumBlocksRollback).build();
|
.withRollbackBlockAppendResults(filesToNumBlocksRollback).build();
|
||||||
}
|
}
|
||||||
@@ -297,17 +246,19 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
|
|||||||
} catch (IOException io) {
|
} catch (IOException io) {
|
||||||
throw new UncheckedIOException("Failed to rollback for commit " + commit, io);
|
throw new UncheckedIOException("Failed to rollback for commit " + commit, io);
|
||||||
}
|
}
|
||||||
|
default:
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
return hoodieRollbackStats;
|
return hoodieRollbackStats;
|
||||||
}).collect(Collectors.toList());
|
}).collect(Collectors.toList());
|
||||||
}).flatMap(x -> x.iterator()).filter(x -> x != null).collect();
|
}).flatMap(x -> x.iterator()).filter(x -> x != null).collect();
|
||||||
|
|
||||||
commitsAndCompactions.entrySet().stream()
|
commitsAndCompactions.entrySet().stream().map(
|
||||||
.map(entry -> new HoodieInstant(true, entry.getValue().getAction(),
|
entry -> new HoodieInstant(true, entry.getValue().getAction(),
|
||||||
entry.getValue().getTimestamp()))
|
entry.getValue().getTimestamp())).forEach(this.getActiveTimeline()::deleteInflight);
|
||||||
.forEach(this.getActiveTimeline()::deleteInflight);
|
|
||||||
|
|
||||||
logger.debug("Time(in ms) taken to finish rollback " + (System.currentTimeMillis() - startTime));
|
logger
|
||||||
|
.debug("Time(in ms) taken to finish rollback " + (System.currentTimeMillis() - startTime));
|
||||||
|
|
||||||
return allRollbackStats;
|
return allRollbackStats;
|
||||||
}
|
}
|
||||||
@@ -317,4 +268,56 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
|
|||||||
// do nothing for MOR tables
|
// do nothing for MOR tables
|
||||||
return Optional.empty();
|
return Optional.empty();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* UpsertPartitioner for MergeOnRead table type, this allows auto correction of small parquet
|
||||||
|
* files to larger ones without the need for an index in the logFile.
|
||||||
|
*/
|
||||||
|
class MergeOnReadUpsertPartitioner extends HoodieCopyOnWriteTable.UpsertPartitioner {
|
||||||
|
|
||||||
|
MergeOnReadUpsertPartitioner(WorkloadProfile profile) {
|
||||||
|
super(profile);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<SmallFile> getSmallFiles(String partitionPath) {
|
||||||
|
|
||||||
|
// smallFiles only for partitionPath
|
||||||
|
List<SmallFile> smallFileLocations = new ArrayList<>();
|
||||||
|
|
||||||
|
// Init here since this class (and member variables) might not have been initialized
|
||||||
|
HoodieTimeline commitTimeline = getCompletedCommitTimeline();
|
||||||
|
|
||||||
|
if (!commitTimeline.empty()) {
|
||||||
|
HoodieInstant latestCommitTime = commitTimeline.lastInstant().get();
|
||||||
|
// find smallest file in partition and append to it
|
||||||
|
Optional<FileSlice> smallFileSlice = getRTFileSystemView()
|
||||||
|
.getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp()).filter(
|
||||||
|
fileSlice -> fileSlice.getLogFiles().count() < 1
|
||||||
|
&& fileSlice.getDataFile().get().getFileSize() < config
|
||||||
|
.getParquetSmallFileLimit()).sorted((FileSlice left, FileSlice right) ->
|
||||||
|
left.getDataFile().get().getFileSize() < right.getDataFile().get().getFileSize()
|
||||||
|
? -1 : 1).findFirst();
|
||||||
|
|
||||||
|
if (smallFileSlice.isPresent()) {
|
||||||
|
String filename = smallFileSlice.get().getDataFile().get().getFileName();
|
||||||
|
SmallFile sf = new SmallFile();
|
||||||
|
sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename),
|
||||||
|
FSUtils.getFileId(filename));
|
||||||
|
sf.sizeBytes = smallFileSlice.get().getDataFile().get().getFileSize();
|
||||||
|
smallFileLocations.add(sf);
|
||||||
|
// Update the global small files list
|
||||||
|
smallFiles.add(sf);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return smallFileLocations;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getSmallFileIds() {
|
||||||
|
return (List<String>) smallFiles.stream()
|
||||||
|
.map(smallFile -> ((SmallFile) smallFile).location.getFileId())
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -60,18 +60,28 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
this.metaClient = metaClient;
|
this.metaClient = metaClient;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static <T extends HoodieRecordPayload> HoodieTable<T> getHoodieTable(
|
||||||
|
HoodieTableMetaClient metaClient, HoodieWriteConfig config) {
|
||||||
|
switch (metaClient.getTableType()) {
|
||||||
|
case COPY_ON_WRITE:
|
||||||
|
return new HoodieCopyOnWriteTable<>(config, metaClient);
|
||||||
|
case MERGE_ON_READ:
|
||||||
|
return new HoodieMergeOnReadTable<>(config, metaClient);
|
||||||
|
default:
|
||||||
|
throw new HoodieException("Unsupported table type :" + metaClient.getTableType());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Provides a partitioner to perform the upsert operation, based on the workload profile
|
* Provides a partitioner to perform the upsert operation, based on the workload profile
|
||||||
*/
|
*/
|
||||||
public abstract Partitioner getUpsertPartitioner(WorkloadProfile profile);
|
public abstract Partitioner getUpsertPartitioner(WorkloadProfile profile);
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Provides a partitioner to perform the insert operation, based on the workload profile
|
* Provides a partitioner to perform the insert operation, based on the workload profile
|
||||||
*/
|
*/
|
||||||
public abstract Partitioner getInsertPartitioner(WorkloadProfile profile);
|
public abstract Partitioner getInsertPartitioner(WorkloadProfile profile);
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return whether this HoodieTable implementation can benefit from workload profiling
|
* Return whether this HoodieTable implementation can benefit from workload profiling
|
||||||
*/
|
*/
|
||||||
@@ -131,7 +141,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
return getCommitsTimeline().filterInflights();
|
return getCommitsTimeline().filterInflights();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get only the completed (no-inflights) clean timeline
|
* Get only the completed (no-inflights) clean timeline
|
||||||
*/
|
*/
|
||||||
@@ -162,12 +171,12 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
throw new HoodieSavepointException(
|
throw new HoodieSavepointException(
|
||||||
"Could not get data files for savepoint " + savepointTime + ". No such savepoint.");
|
"Could not get data files for savepoint " + savepointTime + ". No such savepoint.");
|
||||||
}
|
}
|
||||||
HoodieInstant instant =
|
HoodieInstant instant = new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION,
|
||||||
new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION, savepointTime);
|
savepointTime);
|
||||||
HoodieSavepointMetadata metadata = null;
|
HoodieSavepointMetadata metadata = null;
|
||||||
try {
|
try {
|
||||||
metadata = AvroUtils.deserializeHoodieSavepointMetadata(
|
metadata = AvroUtils
|
||||||
getActiveTimeline().getInstantDetails(instant).get());
|
.deserializeHoodieSavepointMetadata(getActiveTimeline().getInstantDetails(instant).get());
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
throw new HoodieSavepointException(
|
throw new HoodieSavepointException(
|
||||||
"Could not get savepointed data files for savepoint " + savepointTime, e);
|
"Could not get savepointed data files for savepoint " + savepointTime, e);
|
||||||
@@ -189,7 +198,8 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
return getActiveTimeline().getCommitTimeline();
|
return getActiveTimeline().getCommitTimeline();
|
||||||
case MERGE_ON_READ:
|
case MERGE_ON_READ:
|
||||||
// We need to include the parquet files written out in delta commits
|
// We need to include the parquet files written out in delta commits
|
||||||
// Include commit action to be able to start doing a MOR over a COW dataset - no migration required
|
// Include commit action to be able to start doing a MOR over a COW dataset - no
|
||||||
|
// migration required
|
||||||
return getActiveTimeline().getCommitsTimeline();
|
return getActiveTimeline().getCommitsTimeline();
|
||||||
default:
|
default:
|
||||||
throw new HoodieException("Unsupported table type :" + metaClient.getTableType());
|
throw new HoodieException("Unsupported table type :" + metaClient.getTableType());
|
||||||
@@ -219,10 +229,11 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
return HoodieActiveTimeline.COMMIT_ACTION;
|
return HoodieActiveTimeline.COMMIT_ACTION;
|
||||||
case MERGE_ON_READ:
|
case MERGE_ON_READ:
|
||||||
return HoodieActiveTimeline.DELTA_COMMIT_ACTION;
|
return HoodieActiveTimeline.DELTA_COMMIT_ACTION;
|
||||||
}
|
default:
|
||||||
throw new HoodieCommitException(
|
throw new HoodieCommitException(
|
||||||
"Could not commit on unknown storage type " + metaClient.getTableType());
|
"Could not commit on unknown storage type " + metaClient.getTableType());
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Perform the ultimate IO for a given upserted (RDD) partition
|
* Perform the ultimate IO for a given upserted (RDD) partition
|
||||||
@@ -236,21 +247,9 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
public abstract Iterator<List<WriteStatus>> handleInsertPartition(String commitTime,
|
public abstract Iterator<List<WriteStatus>> handleInsertPartition(String commitTime,
|
||||||
Integer partition, Iterator<HoodieRecord<T>> recordIterator, Partitioner partitioner);
|
Integer partition, Iterator<HoodieRecord<T>> recordIterator, Partitioner partitioner);
|
||||||
|
|
||||||
public static <T extends HoodieRecordPayload> HoodieTable<T> getHoodieTable(
|
|
||||||
HoodieTableMetaClient metaClient, HoodieWriteConfig config) {
|
|
||||||
switch (metaClient.getTableType()) {
|
|
||||||
case COPY_ON_WRITE:
|
|
||||||
return new HoodieCopyOnWriteTable<>(config, metaClient);
|
|
||||||
case MERGE_ON_READ:
|
|
||||||
return new HoodieMergeOnReadTable<>(config, metaClient);
|
|
||||||
default:
|
|
||||||
throw new HoodieException("Unsupported table type :" + metaClient.getTableType());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Run Compaction on the table.
|
* Run Compaction on the table. Compaction arranges the data so that it is optimized for data
|
||||||
* Compaction arranges the data so that it is optimized for data access
|
* access
|
||||||
*/
|
*/
|
||||||
public abstract JavaRDD<WriteStatus> compact(JavaSparkContext jsc, String commitTime);
|
public abstract JavaRDD<WriteStatus> compact(JavaSparkContext jsc, String commitTime);
|
||||||
|
|
||||||
|
|||||||
@@ -13,6 +13,7 @@
|
|||||||
* See the License for the specific language governing permissions and
|
* See the License for the specific language governing permissions and
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
package com.uber.hoodie.table;
|
package com.uber.hoodie.table;
|
||||||
|
|
||||||
import com.uber.hoodie.common.model.HoodieRecord;
|
import com.uber.hoodie.common.model.HoodieRecord;
|
||||||
|
|||||||
@@ -16,7 +16,6 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.table;
|
package com.uber.hoodie.table;
|
||||||
|
|
||||||
|
|
||||||
import com.uber.hoodie.common.model.HoodieRecord;
|
import com.uber.hoodie.common.model.HoodieRecord;
|
||||||
import com.uber.hoodie.common.model.HoodieRecordLocation;
|
import com.uber.hoodie.common.model.HoodieRecordLocation;
|
||||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||||
@@ -31,7 +30,7 @@ import scala.Tuple2;
|
|||||||
/**
|
/**
|
||||||
* Information about incoming records for upsert/insert obtained either via sampling or
|
* Information about incoming records for upsert/insert obtained either via sampling or
|
||||||
* introspecting the data fully
|
* introspecting the data fully
|
||||||
*
|
* <p>
|
||||||
* TODO(vc): Think about obtaining this directly from index.tagLocation
|
* TODO(vc): Think about obtaining this directly from index.tagLocation
|
||||||
*/
|
*/
|
||||||
public class WorkloadProfile<T extends HoodieRecordPayload> implements Serializable {
|
public class WorkloadProfile<T extends HoodieRecordPayload> implements Serializable {
|
||||||
@@ -60,11 +59,9 @@ public class WorkloadProfile<T extends HoodieRecordPayload> implements Serializa
|
|||||||
private void buildProfile() {
|
private void buildProfile() {
|
||||||
|
|
||||||
Map<Tuple2<String, Option<HoodieRecordLocation>>, Long> partitionLocationCounts = taggedRecords
|
Map<Tuple2<String, Option<HoodieRecordLocation>>, Long> partitionLocationCounts = taggedRecords
|
||||||
.mapToPair(record ->
|
.mapToPair(record -> new Tuple2<>(
|
||||||
new Tuple2<>(
|
|
||||||
new Tuple2<>(record.getPartitionPath(), Option.apply(record.getCurrentLocation())),
|
new Tuple2<>(record.getPartitionPath(), Option.apply(record.getCurrentLocation())),
|
||||||
record))
|
record)).countByKey();
|
||||||
.countByKey();
|
|
||||||
|
|
||||||
for (Map.Entry<Tuple2<String, Option<HoodieRecordLocation>>, Long> e : partitionLocationCounts
|
for (Map.Entry<Tuple2<String, Option<HoodieRecordLocation>>, Long> e : partitionLocationCounts
|
||||||
.entrySet()) {
|
.entrySet()) {
|
||||||
|
|||||||
@@ -17,10 +17,9 @@
|
|||||||
package com.uber.hoodie.table;
|
package com.uber.hoodie.table;
|
||||||
|
|
||||||
import com.uber.hoodie.common.model.HoodieRecordLocation;
|
import com.uber.hoodie.common.model.HoodieRecordLocation;
|
||||||
import org.apache.commons.lang3.tuple.Pair;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
import org.apache.commons.lang3.tuple.Pair;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Wraps stats about a single partition path.
|
* Wraps stats about a single partition path.
|
||||||
|
|||||||
@@ -14,7 +14,6 @@
|
|||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
import com.beust.jcommander.JCommander;
|
import com.beust.jcommander.JCommander;
|
||||||
import com.beust.jcommander.Parameter;
|
import com.beust.jcommander.Parameter;
|
||||||
import com.uber.hoodie.HoodieWriteClient;
|
import com.uber.hoodie.HoodieWriteClient;
|
||||||
@@ -38,24 +37,19 @@ import org.apache.spark.api.java.JavaRDD;
|
|||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Driver program that uses the Hoodie client with synthetic workload, and performs basic
|
* Driver program that uses the Hoodie client with synthetic workload, and performs basic operations. <p>
|
||||||
* operations. <p>
|
|
||||||
*/
|
*/
|
||||||
public class HoodieClientExample {
|
public class HoodieClientExample {
|
||||||
|
|
||||||
@Parameter(names = {"--table-path", "-p"}, description = "path for Hoodie sample table")
|
private static Logger logger = LogManager.getLogger(HoodieClientExample.class);
|
||||||
private String tablePath = "file:///tmp/hoodie/sample-table";
|
|
||||||
|
|
||||||
@Parameter(names = {"--table-name", "-n"}, description = "table name for Hoodie sample table")
|
|
||||||
private String tableName = "hoodie_rt";
|
|
||||||
|
|
||||||
@Parameter(names = {"--table-type", "-t"}, description = "One of COPY_ON_WRITE or MERGE_ON_READ")
|
|
||||||
private String tableType = HoodieTableType.COPY_ON_WRITE.name();
|
|
||||||
|
|
||||||
@Parameter(names = {"--help", "-h"}, help = true)
|
@Parameter(names = {"--help", "-h"}, help = true)
|
||||||
public Boolean help = false;
|
public Boolean help = false;
|
||||||
|
@Parameter(names = {"--table-path", "-p"}, description = "path for Hoodie sample table")
|
||||||
private static Logger logger = LogManager.getLogger(HoodieClientExample.class);
|
private String tablePath = "file:///tmp/hoodie/sample-table";
|
||||||
|
@Parameter(names = {"--table-name", "-n"}, description = "table name for Hoodie sample table")
|
||||||
|
private String tableName = "hoodie_rt";
|
||||||
|
@Parameter(names = {"--table-type", "-t"}, description = "One of COPY_ON_WRITE or MERGE_ON_READ")
|
||||||
|
private String tableType = HoodieTableType.COPY_ON_WRITE.name();
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
public static void main(String[] args) throws Exception {
|
||||||
HoodieClientExample cli = new HoodieClientExample();
|
HoodieClientExample cli = new HoodieClientExample();
|
||||||
@@ -92,10 +86,10 @@ public class HoodieClientExample {
|
|||||||
// Create the write client to write some records in
|
// Create the write client to write some records in
|
||||||
HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(tablePath)
|
HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(tablePath)
|
||||||
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
|
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
|
||||||
.forTable(tableName).withIndexConfig(
|
.forTable(tableName)
|
||||||
HoodieIndexConfig.newBuilder().withIndexType(IndexType.BLOOM).build())
|
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(IndexType.BLOOM).build())
|
||||||
.withCompactionConfig(HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 3).build())
|
.withCompactionConfig(
|
||||||
.build();
|
HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 3).build()).build();
|
||||||
HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
|
HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -16,7 +16,6 @@
|
|||||||
|
|
||||||
package com.uber.hoodie;
|
package com.uber.hoodie;
|
||||||
|
|
||||||
|
|
||||||
import static org.junit.Assert.assertEquals;
|
import static org.junit.Assert.assertEquals;
|
||||||
|
|
||||||
import com.uber.hoodie.common.HoodieClientTestUtils;
|
import com.uber.hoodie.common.HoodieClientTestUtils;
|
||||||
@@ -58,11 +57,11 @@ public class TestMultiFS implements Serializable {
|
|||||||
private static MiniDFSCluster dfsCluster;
|
private static MiniDFSCluster dfsCluster;
|
||||||
private static DistributedFileSystem dfs;
|
private static DistributedFileSystem dfs;
|
||||||
private static Logger logger = LogManager.getLogger(TestMultiFS.class);
|
private static Logger logger = LogManager.getLogger(TestMultiFS.class);
|
||||||
|
private static JavaSparkContext jsc;
|
||||||
|
private static SQLContext sqlContext;
|
||||||
private String tablePath = "file:///tmp/hoodie/sample-table";
|
private String tablePath = "file:///tmp/hoodie/sample-table";
|
||||||
private String tableName = "hoodie_rt";
|
private String tableName = "hoodie_rt";
|
||||||
private String tableType = HoodieTableType.COPY_ON_WRITE.name();
|
private String tableType = HoodieTableType.COPY_ON_WRITE.name();
|
||||||
private static JavaSparkContext jsc;
|
|
||||||
private static SQLContext sqlContext;
|
|
||||||
|
|
||||||
@BeforeClass
|
@BeforeClass
|
||||||
public static void initClass() throws Exception {
|
public static void initClass() throws Exception {
|
||||||
@@ -92,7 +91,8 @@ public class TestMultiFS implements Serializable {
|
|||||||
hdfsTestService.stop();
|
hdfsTestService.stop();
|
||||||
dfsCluster.shutdown();
|
dfsCluster.shutdown();
|
||||||
}
|
}
|
||||||
// Need to closeAll to clear FileSystem.Cache, required because DFS and LocalFS used in the same JVM
|
// Need to closeAll to clear FileSystem.Cache, required because DFS and LocalFS used in the
|
||||||
|
// same JVM
|
||||||
FileSystem.closeAll();
|
FileSystem.closeAll();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -111,8 +111,7 @@ public class TestMultiFS implements Serializable {
|
|||||||
HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(dfsBasePath)
|
HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(dfsBasePath)
|
||||||
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
|
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
|
||||||
.forTable(tableName).withIndexConfig(
|
.forTable(tableName).withIndexConfig(
|
||||||
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
|
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build();
|
||||||
.build();
|
|
||||||
HoodieWriteClient hdfsWriteClient = new HoodieWriteClient(jsc, cfg);
|
HoodieWriteClient hdfsWriteClient = new HoodieWriteClient(jsc, cfg);
|
||||||
|
|
||||||
// Write generated data to hdfs (only inserts)
|
// Write generated data to hdfs (only inserts)
|
||||||
@@ -125,10 +124,8 @@ public class TestMultiFS implements Serializable {
|
|||||||
// Read from hdfs
|
// Read from hdfs
|
||||||
FileSystem fs = FSUtils.getFs(dfsBasePath, HoodieTestUtils.getDefaultHadoopConf());
|
FileSystem fs = FSUtils.getFs(dfsBasePath, HoodieTestUtils.getDefaultHadoopConf());
|
||||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), dfsBasePath);
|
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), dfsBasePath);
|
||||||
HoodieTimeline timeline = new HoodieActiveTimeline(metaClient)
|
HoodieTimeline timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
|
||||||
.getCommitTimeline();
|
Dataset<Row> readRecords = HoodieClientTestUtils.readCommit(dfsBasePath, sqlContext, timeline, readCommitTime);
|
||||||
Dataset<Row> readRecords = HoodieClientTestUtils
|
|
||||||
.readCommit(dfsBasePath, sqlContext, timeline, readCommitTime);
|
|
||||||
assertEquals("Should contain 100 records", readRecords.count(), records.size());
|
assertEquals("Should contain 100 records", readRecords.count(), records.size());
|
||||||
|
|
||||||
// Write to local
|
// Write to local
|
||||||
@@ -138,8 +135,7 @@ public class TestMultiFS implements Serializable {
|
|||||||
HoodieWriteConfig localConfig = HoodieWriteConfig.newBuilder().withPath(tablePath)
|
HoodieWriteConfig localConfig = HoodieWriteConfig.newBuilder().withPath(tablePath)
|
||||||
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
|
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
|
||||||
.forTable(tableName).withIndexConfig(
|
.forTable(tableName).withIndexConfig(
|
||||||
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
|
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build();
|
||||||
.build();
|
|
||||||
HoodieWriteClient localWriteClient = new HoodieWriteClient(jsc, localConfig);
|
HoodieWriteClient localWriteClient = new HoodieWriteClient(jsc, localConfig);
|
||||||
|
|
||||||
String writeCommitTime = localWriteClient.startCommit();
|
String writeCommitTime = localWriteClient.startCommit();
|
||||||
@@ -153,8 +149,7 @@ public class TestMultiFS implements Serializable {
|
|||||||
fs = FSUtils.getFs(tablePath, HoodieTestUtils.getDefaultHadoopConf());
|
fs = FSUtils.getFs(tablePath, HoodieTestUtils.getDefaultHadoopConf());
|
||||||
metaClient = new HoodieTableMetaClient(fs.getConf(), tablePath);
|
metaClient = new HoodieTableMetaClient(fs.getConf(), tablePath);
|
||||||
timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
|
timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
|
||||||
Dataset<Row> localReadRecords = HoodieClientTestUtils
|
Dataset<Row> localReadRecords = HoodieClientTestUtils.readCommit(tablePath, sqlContext, timeline, writeCommitTime);
|
||||||
.readCommit(tablePath, sqlContext, timeline, writeCommitTime);
|
|
||||||
assertEquals("Should contain 100 records", localReadRecords.count(), localRecords.size());
|
assertEquals("Should contain 100 records", localReadRecords.count(), localRecords.size());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -69,8 +69,7 @@ public class HoodieClientTestUtils {
|
|||||||
return keys;
|
return keys;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void fakeMetaFile(String basePath, String commitTime, String suffix)
|
private static void fakeMetaFile(String basePath, String commitTime, String suffix) throws IOException {
|
||||||
throws IOException {
|
|
||||||
String parentPath = basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME;
|
String parentPath = basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME;
|
||||||
new File(parentPath).mkdirs();
|
new File(parentPath).mkdirs();
|
||||||
new File(parentPath + "/" + commitTime + suffix).createNewFile();
|
new File(parentPath + "/" + commitTime + suffix).createNewFile();
|
||||||
@@ -85,55 +84,48 @@ public class HoodieClientTestUtils {
|
|||||||
fakeMetaFile(basePath, commitTime, HoodieTimeline.INFLIGHT_EXTENSION);
|
fakeMetaFile(basePath, commitTime, HoodieTimeline.INFLIGHT_EXTENSION);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void fakeDataFile(String basePath, String partitionPath, String commitTime,
|
public static void fakeDataFile(String basePath, String partitionPath, String commitTime, String fileId)
|
||||||
String fileId) throws Exception {
|
throws Exception {
|
||||||
fakeDataFile(basePath, partitionPath, commitTime, fileId, 0);
|
fakeDataFile(basePath, partitionPath, commitTime, fileId, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void fakeDataFile(String basePath, String partitionPath, String commitTime,
|
public static void fakeDataFile(String basePath, String partitionPath, String commitTime, String fileId, long length)
|
||||||
String fileId, long length) throws Exception {
|
throws Exception {
|
||||||
String parentPath = String.format("%s/%s", basePath, partitionPath);
|
String parentPath = String.format("%s/%s", basePath, partitionPath);
|
||||||
new File(parentPath).mkdirs();
|
new File(parentPath).mkdirs();
|
||||||
String path = String
|
String path = String.format("%s/%s", parentPath, FSUtils.makeDataFileName(commitTime, 0, fileId));
|
||||||
.format("%s/%s", parentPath, FSUtils.makeDataFileName(commitTime, 0, fileId));
|
|
||||||
new File(path).createNewFile();
|
new File(path).createNewFile();
|
||||||
new RandomAccessFile(path, "rw").setLength(length);
|
new RandomAccessFile(path, "rw").setLength(length);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static SparkConf getSparkConfForTest(String appName) {
|
public static SparkConf getSparkConfForTest(String appName) {
|
||||||
SparkConf sparkConf = new SparkConf()
|
System.out.println("HIII" + "HII2");
|
||||||
.setAppName(appName)
|
SparkConf sparkConf = new SparkConf().setAppName(appName)
|
||||||
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
|
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
|
||||||
.setMaster("local[1]");
|
.setMaster("local[1]");
|
||||||
return HoodieReadClient.addHoodieSupport(sparkConf);
|
return HoodieReadClient.addHoodieSupport(sparkConf);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static HashMap<String, String> getLatestFileIDsToFullPath(String basePath,
|
public static HashMap<String, String> getLatestFileIDsToFullPath(String basePath, HoodieTimeline commitTimeline,
|
||||||
HoodieTimeline commitTimeline,
|
|
||||||
List<HoodieInstant> commitsToReturn) throws IOException {
|
List<HoodieInstant> commitsToReturn) throws IOException {
|
||||||
HashMap<String, String> fileIdToFullPath = new HashMap<>();
|
HashMap<String, String> fileIdToFullPath = new HashMap<>();
|
||||||
for (HoodieInstant commit : commitsToReturn) {
|
for (HoodieInstant commit : commitsToReturn) {
|
||||||
HoodieCommitMetadata metadata =
|
HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes(commitTimeline.getInstantDetails(commit).get());
|
||||||
HoodieCommitMetadata.fromBytes(commitTimeline.getInstantDetails(commit).get());
|
|
||||||
fileIdToFullPath.putAll(metadata.getFileIdAndFullPaths(basePath));
|
fileIdToFullPath.putAll(metadata.getFileIdAndFullPaths(basePath));
|
||||||
}
|
}
|
||||||
return fileIdToFullPath;
|
return fileIdToFullPath;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Dataset<Row> readCommit(String basePath,
|
public static Dataset<Row> readCommit(String basePath, SQLContext sqlContext, HoodieTimeline commitTimeline,
|
||||||
SQLContext sqlContext,
|
|
||||||
HoodieTimeline commitTimeline,
|
|
||||||
String commitTime) {
|
String commitTime) {
|
||||||
HoodieInstant commitInstant =
|
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
|
||||||
new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
|
|
||||||
if (!commitTimeline.containsInstant(commitInstant)) {
|
if (!commitTimeline.containsInstant(commitInstant)) {
|
||||||
new HoodieException("No commit exists at " + commitTime);
|
new HoodieException("No commit exists at " + commitTime);
|
||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
HashMap<String, String> paths = getLatestFileIDsToFullPath(basePath, commitTimeline,
|
HashMap<String, String> paths = getLatestFileIDsToFullPath(basePath, commitTimeline,
|
||||||
Arrays.asList(commitInstant));
|
Arrays.asList(commitInstant));
|
||||||
return sqlContext.read()
|
return sqlContext.read().parquet(paths.values().toArray(new String[paths.size()]))
|
||||||
.parquet(paths.values().toArray(new String[paths.size()]))
|
|
||||||
.filter(String.format("%s ='%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitTime));
|
.filter(String.format("%s ='%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitTime));
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
throw new HoodieException("Error reading commit " + commitTime, e);
|
throw new HoodieException("Error reading commit " + commitTime, e);
|
||||||
@@ -143,50 +135,37 @@ public class HoodieClientTestUtils {
|
|||||||
/**
|
/**
|
||||||
* Obtain all new data written into the Hoodie dataset since the given timestamp.
|
* Obtain all new data written into the Hoodie dataset since the given timestamp.
|
||||||
*/
|
*/
|
||||||
public static Dataset<Row> readSince(String basePath,
|
public static Dataset<Row> readSince(String basePath, SQLContext sqlContext, HoodieTimeline commitTimeline,
|
||||||
SQLContext sqlContext,
|
|
||||||
HoodieTimeline commitTimeline,
|
|
||||||
String lastCommitTime) {
|
String lastCommitTime) {
|
||||||
List<HoodieInstant> commitsToReturn =
|
List<HoodieInstant> commitsToReturn = commitTimeline.findInstantsAfter(lastCommitTime, Integer.MAX_VALUE)
|
||||||
commitTimeline.findInstantsAfter(lastCommitTime, Integer.MAX_VALUE)
|
|
||||||
.getInstants().collect(Collectors.toList());
|
.getInstants().collect(Collectors.toList());
|
||||||
try {
|
try {
|
||||||
// Go over the commit metadata, and obtain the new files that need to be read.
|
// Go over the commit metadata, and obtain the new files that need to be read.
|
||||||
HashMap<String, String> fileIdToFullPath = getLatestFileIDsToFullPath(basePath,
|
HashMap<String, String> fileIdToFullPath = getLatestFileIDsToFullPath(basePath, commitTimeline, commitsToReturn);
|
||||||
commitTimeline, commitsToReturn);
|
return sqlContext.read().parquet(fileIdToFullPath.values().toArray(new String[fileIdToFullPath.size()]))
|
||||||
return sqlContext.read()
|
.filter(String.format("%s >'%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTime));
|
||||||
.parquet(fileIdToFullPath.values().toArray(new String[fileIdToFullPath.size()]))
|
|
||||||
.filter(
|
|
||||||
String.format("%s >'%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTime));
|
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
throw new HoodieException(
|
throw new HoodieException("Error pulling data incrementally from commitTimestamp :" + lastCommitTime, e);
|
||||||
"Error pulling data incrementally from commitTimestamp :" + lastCommitTime, e);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Reads the paths under the a hoodie dataset out as a DataFrame
|
* Reads the paths under the a hoodie dataset out as a DataFrame
|
||||||
*/
|
*/
|
||||||
public static Dataset<Row> read(String basePath,
|
public static Dataset<Row> read(String basePath, SQLContext sqlContext, FileSystem fs, String... paths) {
|
||||||
SQLContext sqlContext,
|
|
||||||
FileSystem fs,
|
|
||||||
String... paths) {
|
|
||||||
List<String> filteredPaths = new ArrayList<>();
|
List<String> filteredPaths = new ArrayList<>();
|
||||||
try {
|
try {
|
||||||
HoodieTable hoodieTable = HoodieTable
|
HoodieTable hoodieTable = HoodieTable
|
||||||
.getHoodieTable(new HoodieTableMetaClient(fs.getConf(), basePath, true), null);
|
.getHoodieTable(new HoodieTableMetaClient(fs.getConf(), basePath, true), null);
|
||||||
for (String path : paths) {
|
for (String path : paths) {
|
||||||
TableFileSystemView.ReadOptimizedView fileSystemView = new HoodieTableFileSystemView(
|
TableFileSystemView.ReadOptimizedView fileSystemView = new HoodieTableFileSystemView(
|
||||||
hoodieTable.getMetaClient(),
|
hoodieTable.getMetaClient(), hoodieTable.getCompletedCommitTimeline(), fs.globStatus(new Path(path)));
|
||||||
hoodieTable.getCompletedCommitTimeline(), fs.globStatus(new Path(path)));
|
List<HoodieDataFile> latestFiles = fileSystemView.getLatestDataFiles().collect(Collectors.toList());
|
||||||
List<HoodieDataFile> latestFiles = fileSystemView.getLatestDataFiles().collect(
|
|
||||||
Collectors.toList());
|
|
||||||
for (HoodieDataFile file : latestFiles) {
|
for (HoodieDataFile file : latestFiles) {
|
||||||
filteredPaths.add(file.getPath());
|
filteredPaths.add(file.getPath());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return sqlContext.read()
|
return sqlContext.read().parquet(filteredPaths.toArray(new String[filteredPaths.size()]));
|
||||||
.parquet(filteredPaths.toArray(new String[filteredPaths.size()]));
|
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
throw new HoodieException("Error reading hoodie dataset as a dataframe", e);
|
throw new HoodieException("Error reading hoodie dataset as a dataframe", e);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -42,8 +42,7 @@ import org.apache.hadoop.mapred.RecordReader;
|
|||||||
*/
|
*/
|
||||||
public class HoodieMergeOnReadTestUtils {
|
public class HoodieMergeOnReadTestUtils {
|
||||||
|
|
||||||
public static List<GenericRecord> getRecordsUsingInputFormat(List<String> inputPaths,
|
public static List<GenericRecord> getRecordsUsingInputFormat(List<String> inputPaths, String basePath)
|
||||||
String basePath)
|
|
||||||
throws IOException {
|
throws IOException {
|
||||||
JobConf jobConf = new JobConf();
|
JobConf jobConf = new JobConf();
|
||||||
Schema schema = HoodieAvroUtils.addMetadataFields(Schema.parse(TRIP_EXAMPLE_SCHEMA));
|
Schema schema = HoodieAvroUtils.addMetadataFields(Schema.parse(TRIP_EXAMPLE_SCHEMA));
|
||||||
@@ -59,7 +58,8 @@ public class HoodieMergeOnReadTestUtils {
|
|||||||
ArrayWritable writable = (ArrayWritable) recordReader.createValue();
|
ArrayWritable writable = (ArrayWritable) recordReader.createValue();
|
||||||
while (recordReader.next(key, writable)) {
|
while (recordReader.next(key, writable)) {
|
||||||
GenericRecordBuilder newRecord = new GenericRecordBuilder(schema);
|
GenericRecordBuilder newRecord = new GenericRecordBuilder(schema);
|
||||||
// writable returns an array with [field1, field2, _hoodie_commit_time, _hoodie_commit_seqno]
|
// writable returns an array with [field1, field2, _hoodie_commit_time,
|
||||||
|
// _hoodie_commit_seqno]
|
||||||
Writable[] values = writable.get();
|
Writable[] values = writable.get();
|
||||||
schema.getFields().forEach(field -> {
|
schema.getFields().forEach(field -> {
|
||||||
newRecord.set(field, values[2]);
|
newRecord.set(field, values[2]);
|
||||||
@@ -76,12 +76,11 @@ public class HoodieMergeOnReadTestUtils {
|
|||||||
}).get();
|
}).get();
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void setPropsForInputFormat(HoodieRealtimeInputFormat inputFormat, JobConf jobConf,
|
private static void setPropsForInputFormat(HoodieRealtimeInputFormat inputFormat, JobConf jobConf, Schema schema,
|
||||||
Schema schema, String basePath) {
|
String basePath) {
|
||||||
List<Schema.Field> fields = schema.getFields();
|
List<Schema.Field> fields = schema.getFields();
|
||||||
String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
|
String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
|
||||||
String postions = fields.stream().map(f -> String.valueOf(f.pos()))
|
String postions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
|
||||||
.collect(Collectors.joining(","));
|
|
||||||
Configuration conf = HoodieTestUtils.getDefaultHadoopConf();
|
Configuration conf = HoodieTestUtils.getDefaultHadoopConf();
|
||||||
jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
|
jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
|
||||||
jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);
|
jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);
|
||||||
|
|||||||
@@ -41,20 +41,15 @@ import org.apache.hadoop.fs.Path;
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Class to be used in tests to keep generating test inserts and updates against a corpus.
|
* Class to be used in tests to keep generating test inserts and updates against a corpus.
|
||||||
*
|
* <p>
|
||||||
* Test data uses a toy Uber trips, data model.
|
* Test data uses a toy Uber trips, data model.
|
||||||
*/
|
*/
|
||||||
public class HoodieTestDataGenerator {
|
public class HoodieTestDataGenerator {
|
||||||
|
|
||||||
static class KeyPartition {
|
// based on examination of sample file, the schema produces the following per record size
|
||||||
|
public static final int SIZE_PER_RECORD = 50 * 1024;
|
||||||
HoodieKey key;
|
public static final String[] DEFAULT_PARTITION_PATHS = {"2016/03/15", "2015/03/16", "2015/03/17"};
|
||||||
String partitionPath;
|
public static String TRIP_EXAMPLE_SCHEMA = "{\"type\": \"record\"," + "\"name\": \"triprec\"," + "\"fields\": [ "
|
||||||
}
|
|
||||||
|
|
||||||
public static String TRIP_EXAMPLE_SCHEMA = "{\"type\": \"record\","
|
|
||||||
+ "\"name\": \"triprec\","
|
|
||||||
+ "\"fields\": [ "
|
|
||||||
+ "{\"name\": \"timestamp\",\"type\": \"double\"},"
|
+ "{\"name\": \"timestamp\",\"type\": \"double\"},"
|
||||||
+ "{\"name\": \"_row_key\", \"type\": \"string\"},"
|
+ "{\"name\": \"_row_key\", \"type\": \"string\"},"
|
||||||
+ "{\"name\": \"rider\", \"type\": \"string\"},"
|
+ "{\"name\": \"rider\", \"type\": \"string\"},"
|
||||||
@@ -64,25 +59,9 @@ public class HoodieTestDataGenerator {
|
|||||||
+ "{\"name\": \"end_lat\", \"type\": \"double\"},"
|
+ "{\"name\": \"end_lat\", \"type\": \"double\"},"
|
||||||
+ "{\"name\": \"end_lon\", \"type\": \"double\"},"
|
+ "{\"name\": \"end_lon\", \"type\": \"double\"},"
|
||||||
+ "{\"name\":\"fare\",\"type\": \"double\"}]}";
|
+ "{\"name\":\"fare\",\"type\": \"double\"}]}";
|
||||||
|
public static Schema avroSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA));
|
||||||
// based on examination of sample file, the schema produces the following per record size
|
|
||||||
public static final int SIZE_PER_RECORD = 50 * 1024;
|
|
||||||
|
|
||||||
public static final String[] DEFAULT_PARTITION_PATHS = {"2016/03/15", "2015/03/16", "2015/03/17"};
|
|
||||||
|
|
||||||
|
|
||||||
public static void writePartitionMetadata(FileSystem fs, String[] partitionPaths,
|
|
||||||
String basePath) {
|
|
||||||
for (String partitionPath : partitionPaths) {
|
|
||||||
new HoodiePartitionMetadata(fs, "000", new Path(basePath), new Path(basePath, partitionPath))
|
|
||||||
.trySave(0);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private List<KeyPartition> existingKeysList = new ArrayList<>();
|
|
||||||
public static Schema avroSchema = HoodieAvroUtils
|
|
||||||
.addMetadataFields(new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA));
|
|
||||||
private static Random rand = new Random(46474747);
|
private static Random rand = new Random(46474747);
|
||||||
|
private List<KeyPartition> existingKeysList = new ArrayList<>();
|
||||||
private String[] partitionPaths = DEFAULT_PARTITION_PATHS;
|
private String[] partitionPaths = DEFAULT_PARTITION_PATHS;
|
||||||
|
|
||||||
public HoodieTestDataGenerator(String[] partitionPaths) {
|
public HoodieTestDataGenerator(String[] partitionPaths) {
|
||||||
@@ -93,10 +72,66 @@ public class HoodieTestDataGenerator {
|
|||||||
this(new String[] {"2016/03/15", "2015/03/16", "2015/03/17"});
|
this(new String[] {"2016/03/15", "2015/03/16", "2015/03/17"});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static void writePartitionMetadata(FileSystem fs, String[] partitionPaths, String basePath) {
|
||||||
|
for (String partitionPath : partitionPaths) {
|
||||||
|
new HoodiePartitionMetadata(fs, "000", new Path(basePath), new Path(basePath, partitionPath)).trySave(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generates new inserts, uniformly across the partition paths above. It also updates the list of
|
* Generates a new avro record of the above schema format, retaining the key if optionally provided.
|
||||||
* existing keys.
|
*/
|
||||||
|
public static TestRawTripPayload generateRandomValue(HoodieKey key, String commitTime) throws IOException {
|
||||||
|
GenericRecord rec = generateGenericRecord(key.getRecordKey(), "rider-" + commitTime, "driver-" + commitTime, 0.0);
|
||||||
|
HoodieAvroUtils.addCommitMetadataToRecord(rec, commitTime, "-1");
|
||||||
|
return new TestRawTripPayload(rec.toString(), key.getRecordKey(), key.getPartitionPath(), TRIP_EXAMPLE_SCHEMA);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static GenericRecord generateGenericRecord(String rowKey, String riderName, String driverName,
|
||||||
|
double timestamp) {
|
||||||
|
GenericRecord rec = new GenericData.Record(avroSchema);
|
||||||
|
rec.put("_row_key", rowKey);
|
||||||
|
rec.put("timestamp", timestamp);
|
||||||
|
rec.put("rider", riderName);
|
||||||
|
rec.put("driver", driverName);
|
||||||
|
rec.put("begin_lat", rand.nextDouble());
|
||||||
|
rec.put("begin_lon", rand.nextDouble());
|
||||||
|
rec.put("end_lat", rand.nextDouble());
|
||||||
|
rec.put("end_lon", rand.nextDouble());
|
||||||
|
rec.put("fare", rand.nextDouble() * 100);
|
||||||
|
return rec;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void createCommitFile(String basePath, String commitTime) throws IOException {
|
||||||
|
Path commitFile = new Path(
|
||||||
|
basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline.makeCommitFileName(commitTime));
|
||||||
|
FileSystem fs = FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf());
|
||||||
|
FSDataOutputStream os = fs.create(commitFile, true);
|
||||||
|
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
|
||||||
|
try {
|
||||||
|
// Write empty commit metadata
|
||||||
|
os.writeBytes(new String(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
|
||||||
|
} finally {
|
||||||
|
os.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void createSavepointFile(String basePath, String commitTime) throws IOException {
|
||||||
|
Path commitFile = new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME
|
||||||
|
+ "/" + HoodieTimeline.makeSavePointFileName(commitTime));
|
||||||
|
FileSystem fs = FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf());
|
||||||
|
FSDataOutputStream os = fs.create(commitFile, true);
|
||||||
|
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
|
||||||
|
try {
|
||||||
|
// Write empty commit metadata
|
||||||
|
os.writeBytes(new String(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
|
||||||
|
} finally {
|
||||||
|
os.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generates new inserts, uniformly across the partition paths above. It also updates the list of existing keys.
|
||||||
*/
|
*/
|
||||||
public List<HoodieRecord> generateInserts(String commitTime, int n) throws IOException {
|
public List<HoodieRecord> generateInserts(String commitTime, int n) throws IOException {
|
||||||
List<HoodieRecord> inserts = new ArrayList<>();
|
List<HoodieRecord> inserts = new ArrayList<>();
|
||||||
@@ -119,8 +154,7 @@ public class HoodieTestDataGenerator {
|
|||||||
return generateDeletesFromExistingRecords(inserts);
|
return generateDeletesFromExistingRecords(inserts);
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<HoodieRecord> generateDeletesFromExistingRecords(List<HoodieRecord> existingRecords)
|
public List<HoodieRecord> generateDeletesFromExistingRecords(List<HoodieRecord> existingRecords) throws IOException {
|
||||||
throws IOException {
|
|
||||||
List<HoodieRecord> deletes = new ArrayList<>();
|
List<HoodieRecord> deletes = new ArrayList<>();
|
||||||
for (HoodieRecord existingRecord : existingRecords) {
|
for (HoodieRecord existingRecord : existingRecords) {
|
||||||
HoodieRecord record = generateDeleteRecord(existingRecord);
|
HoodieRecord record = generateDeleteRecord(existingRecord);
|
||||||
@@ -132,17 +166,15 @@ public class HoodieTestDataGenerator {
|
|||||||
|
|
||||||
public HoodieRecord generateDeleteRecord(HoodieRecord existingRecord) throws IOException {
|
public HoodieRecord generateDeleteRecord(HoodieRecord existingRecord) throws IOException {
|
||||||
HoodieKey key = existingRecord.getKey();
|
HoodieKey key = existingRecord.getKey();
|
||||||
TestRawTripPayload payload = new TestRawTripPayload(Optional.empty(), key.getRecordKey(),
|
TestRawTripPayload payload = new TestRawTripPayload(Optional.empty(), key.getRecordKey(), key.getPartitionPath(),
|
||||||
key.getPartitionPath(), null, true);
|
null, true);
|
||||||
return new HoodieRecord(key, payload);
|
return new HoodieRecord(key, payload);
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<HoodieRecord> generateUpdates(String commitTime, List<HoodieRecord> baseRecords)
|
public List<HoodieRecord> generateUpdates(String commitTime, List<HoodieRecord> baseRecords) throws IOException {
|
||||||
throws IOException {
|
|
||||||
List<HoodieRecord> updates = new ArrayList<>();
|
List<HoodieRecord> updates = new ArrayList<>();
|
||||||
for (HoodieRecord baseRecord : baseRecords) {
|
for (HoodieRecord baseRecord : baseRecords) {
|
||||||
HoodieRecord record = new HoodieRecord(baseRecord.getKey(),
|
HoodieRecord record = new HoodieRecord(baseRecord.getKey(), generateRandomValue(baseRecord.getKey(), commitTime));
|
||||||
generateRandomValue(baseRecord.getKey(), commitTime));
|
|
||||||
updates.add(record);
|
updates.add(record);
|
||||||
}
|
}
|
||||||
return updates;
|
return updates;
|
||||||
@@ -161,68 +193,13 @@ public class HoodieTestDataGenerator {
|
|||||||
return updates;
|
return updates;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Generates a new avro record of the above schema format, retaining the key if optionally
|
|
||||||
* provided.
|
|
||||||
*/
|
|
||||||
public static TestRawTripPayload generateRandomValue(HoodieKey key, String commitTime)
|
|
||||||
throws IOException {
|
|
||||||
GenericRecord rec = generateGenericRecord(key.getRecordKey(), "rider-" + commitTime,
|
|
||||||
"driver-" + commitTime, 0.0);
|
|
||||||
HoodieAvroUtils.addCommitMetadataToRecord(rec, commitTime, "-1");
|
|
||||||
return new TestRawTripPayload(rec.toString(), key.getRecordKey(), key.getPartitionPath(),
|
|
||||||
TRIP_EXAMPLE_SCHEMA);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static GenericRecord generateGenericRecord(String rowKey, String riderName,
|
|
||||||
String driverName, double timestamp) {
|
|
||||||
GenericRecord rec = new GenericData.Record(avroSchema);
|
|
||||||
rec.put("_row_key", rowKey);
|
|
||||||
rec.put("timestamp", timestamp);
|
|
||||||
rec.put("rider", riderName);
|
|
||||||
rec.put("driver", driverName);
|
|
||||||
rec.put("begin_lat", rand.nextDouble());
|
|
||||||
rec.put("begin_lon", rand.nextDouble());
|
|
||||||
rec.put("end_lat", rand.nextDouble());
|
|
||||||
rec.put("end_lon", rand.nextDouble());
|
|
||||||
rec.put("fare", rand.nextDouble() * 100);
|
|
||||||
return rec;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static void createCommitFile(String basePath, String commitTime) throws IOException {
|
|
||||||
Path commitFile =
|
|
||||||
new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline
|
|
||||||
.makeCommitFileName(commitTime));
|
|
||||||
FileSystem fs = FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf());
|
|
||||||
FSDataOutputStream os = fs.create(commitFile, true);
|
|
||||||
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
|
|
||||||
try {
|
|
||||||
// Write empty commit metadata
|
|
||||||
os.writeBytes(new String(commitMetadata.toJsonString().getBytes(
|
|
||||||
StandardCharsets.UTF_8)));
|
|
||||||
} finally {
|
|
||||||
os.close();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public static void createSavepointFile(String basePath, String commitTime) throws IOException {
|
|
||||||
Path commitFile =
|
|
||||||
new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline
|
|
||||||
.makeSavePointFileName(commitTime));
|
|
||||||
FileSystem fs = FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf());
|
|
||||||
FSDataOutputStream os = fs.create(commitFile, true);
|
|
||||||
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
|
|
||||||
try {
|
|
||||||
// Write empty commit metadata
|
|
||||||
os.writeBytes(new String(commitMetadata.toJsonString().getBytes(
|
|
||||||
StandardCharsets.UTF_8)));
|
|
||||||
} finally {
|
|
||||||
os.close();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public String[] getPartitionPaths() {
|
public String[] getPartitionPaths() {
|
||||||
return partitionPaths;
|
return partitionPaths;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static class KeyPartition {
|
||||||
|
|
||||||
|
HoodieKey key;
|
||||||
|
String partitionPath;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -43,15 +43,15 @@ import org.apache.commons.io.IOUtils;
|
|||||||
*/
|
*/
|
||||||
public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayload> {
|
public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayload> {
|
||||||
|
|
||||||
private transient static final ObjectMapper mapper = new ObjectMapper();
|
private static final transient ObjectMapper mapper = new ObjectMapper();
|
||||||
private String partitionPath;
|
private String partitionPath;
|
||||||
private String rowKey;
|
private String rowKey;
|
||||||
private byte[] jsonDataCompressed;
|
private byte[] jsonDataCompressed;
|
||||||
private int dataSize;
|
private int dataSize;
|
||||||
private boolean isDeleted;
|
private boolean isDeleted;
|
||||||
|
|
||||||
public TestRawTripPayload(Optional<String> jsonData, String rowKey, String partitionPath,
|
public TestRawTripPayload(Optional<String> jsonData, String rowKey, String partitionPath, String schemaStr,
|
||||||
String schemaStr, Boolean isDeleted) throws IOException {
|
Boolean isDeleted) throws IOException {
|
||||||
if (jsonData.isPresent()) {
|
if (jsonData.isPresent()) {
|
||||||
this.jsonDataCompressed = compressData(jsonData.get());
|
this.jsonDataCompressed = compressData(jsonData.get());
|
||||||
this.dataSize = jsonData.get().length();
|
this.dataSize = jsonData.get().length();
|
||||||
@@ -61,8 +61,7 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
|
|||||||
this.isDeleted = isDeleted;
|
this.isDeleted = isDeleted;
|
||||||
}
|
}
|
||||||
|
|
||||||
public TestRawTripPayload(String jsonData, String rowKey, String partitionPath,
|
public TestRawTripPayload(String jsonData, String rowKey, String partitionPath, String schemaStr) throws IOException {
|
||||||
String schemaStr) throws IOException {
|
|
||||||
this(Optional.of(jsonData), rowKey, partitionPath, schemaStr, false);
|
this(Optional.of(jsonData), rowKey, partitionPath, schemaStr, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -86,8 +85,7 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Optional<IndexedRecord> combineAndGetUpdateValue(IndexedRecord oldRec, Schema schema)
|
public Optional<IndexedRecord> combineAndGetUpdateValue(IndexedRecord oldRec, Schema schema) throws IOException {
|
||||||
throws IOException {
|
|
||||||
return this.getInsertValue(schema);
|
return this.getInsertValue(schema);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -120,8 +118,7 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
|
|||||||
|
|
||||||
private byte[] compressData(String jsonData) throws IOException {
|
private byte[] compressData(String jsonData) throws IOException {
|
||||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||||
DeflaterOutputStream dos =
|
DeflaterOutputStream dos = new DeflaterOutputStream(baos, new Deflater(Deflater.BEST_COMPRESSION), true);
|
||||||
new DeflaterOutputStream(baos, new Deflater(Deflater.BEST_COMPRESSION), true);
|
|
||||||
try {
|
try {
|
||||||
dos.write(jsonData.getBytes());
|
dos.write(jsonData.getBytes());
|
||||||
} finally {
|
} finally {
|
||||||
@@ -140,13 +137,36 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A custom {@link WriteStatus} that merges passed metadata key value map to {@code
|
* A custom {@link WriteStatus} that merges passed metadata key value map to {@code WriteStatus.markSuccess()} and
|
||||||
* WriteStatus.markSuccess()} and {@code WriteStatus.markFailure()}.
|
* {@code WriteStatus.markFailure()}.
|
||||||
*/
|
*/
|
||||||
public static class MetadataMergeWriteStatus extends WriteStatus {
|
public static class MetadataMergeWriteStatus extends WriteStatus {
|
||||||
|
|
||||||
private Map<String, String> mergedMetadataMap = new HashMap<>();
|
private Map<String, String> mergedMetadataMap = new HashMap<>();
|
||||||
|
|
||||||
|
public static Map<String, String> mergeMetadataForWriteStatuses(List<WriteStatus> writeStatuses) {
|
||||||
|
Map<String, String> allWriteStatusMergedMetadataMap = new HashMap<>();
|
||||||
|
for (WriteStatus writeStatus : writeStatuses) {
|
||||||
|
MetadataMergeWriteStatus.mergeMetadataMaps(((MetadataMergeWriteStatus) writeStatus).getMergedMetadataMap(),
|
||||||
|
allWriteStatusMergedMetadataMap);
|
||||||
|
}
|
||||||
|
return allWriteStatusMergedMetadataMap;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void mergeMetadataMaps(Map<String, String> mergeFromMap, Map<String, String> mergeToMap) {
|
||||||
|
for (Entry<String, String> entry : mergeFromMap.entrySet()) {
|
||||||
|
String key = entry.getKey();
|
||||||
|
if (!mergeToMap.containsKey(key)) {
|
||||||
|
mergeToMap.put(key, "0");
|
||||||
|
}
|
||||||
|
mergeToMap.put(key, addStrsAsInt(entry.getValue(), mergeToMap.get(key)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String addStrsAsInt(String a, String b) {
|
||||||
|
return String.valueOf(Integer.parseInt(a) + Integer.parseInt(b));
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void markSuccess(HoodieRecord record, Optional<Map<String, String>> recordMetadata) {
|
public void markSuccess(HoodieRecord record, Optional<Map<String, String>> recordMetadata) {
|
||||||
super.markSuccess(record, recordMetadata);
|
super.markSuccess(record, recordMetadata);
|
||||||
@@ -156,43 +176,15 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void markFailure(HoodieRecord record, Throwable t,
|
public void markFailure(HoodieRecord record, Throwable t, Optional<Map<String, String>> recordMetadata) {
|
||||||
Optional<Map<String, String>> recordMetadata) {
|
|
||||||
super.markFailure(record, t, recordMetadata);
|
super.markFailure(record, t, recordMetadata);
|
||||||
if (recordMetadata.isPresent()) {
|
if (recordMetadata.isPresent()) {
|
||||||
mergeMetadataMaps(recordMetadata.get(), mergedMetadataMap);
|
mergeMetadataMaps(recordMetadata.get(), mergedMetadataMap);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Map<String, String> mergeMetadataForWriteStatuses(
|
|
||||||
List<WriteStatus> writeStatuses) {
|
|
||||||
Map<String, String> allWriteStatusMergedMetadataMap = new HashMap<>();
|
|
||||||
for (WriteStatus writeStatus : writeStatuses) {
|
|
||||||
MetadataMergeWriteStatus.mergeMetadataMaps(
|
|
||||||
((MetadataMergeWriteStatus) writeStatus).getMergedMetadataMap(),
|
|
||||||
allWriteStatusMergedMetadataMap);
|
|
||||||
}
|
|
||||||
return allWriteStatusMergedMetadataMap;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static void mergeMetadataMaps(Map<String, String> mergeFromMap,
|
|
||||||
Map<String, String> mergeToMap) {
|
|
||||||
for (Entry<String, String> entry : mergeFromMap.entrySet()) {
|
|
||||||
String key = entry.getKey();
|
|
||||||
if (!mergeToMap.containsKey(key)) {
|
|
||||||
mergeToMap.put(key, "0");
|
|
||||||
}
|
|
||||||
mergeToMap
|
|
||||||
.put(key, addStrsAsInt(entry.getValue(), mergeToMap.get(key)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private Map<String, String> getMergedMetadataMap() {
|
private Map<String, String> getMergedMetadataMap() {
|
||||||
return mergedMetadataMap;
|
return mergedMetadataMap;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String addStrsAsInt(String a, String b) {
|
|
||||||
return String.valueOf(Integer.parseInt(a) + Integer.parseInt(b));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -49,8 +49,7 @@ public class HoodieWriteConfigTest {
|
|||||||
assertEquals(config.getMinCommitsToKeep(), 2);
|
assertEquals(config.getMinCommitsToKeep(), 2);
|
||||||
}
|
}
|
||||||
|
|
||||||
private ByteArrayOutputStream saveParamsIntoOutputStream(Map<String, String> params)
|
private ByteArrayOutputStream saveParamsIntoOutputStream(Map<String, String> params) throws IOException {
|
||||||
throws IOException {
|
|
||||||
Properties properties = new Properties();
|
Properties properties = new Properties();
|
||||||
properties.putAll(params);
|
properties.putAll(params);
|
||||||
ByteArrayOutputStream outStream = new ByteArrayOutputStream();
|
ByteArrayOutputStream outStream = new ByteArrayOutputStream();
|
||||||
|
|||||||
@@ -16,18 +16,13 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.func;
|
package com.uber.hoodie.func;
|
||||||
|
|
||||||
|
import static org.mockito.Mockito.mock;
|
||||||
|
import static org.mockito.Mockito.when;
|
||||||
|
|
||||||
import com.uber.hoodie.common.HoodieTestDataGenerator;
|
import com.uber.hoodie.common.HoodieTestDataGenerator;
|
||||||
import com.uber.hoodie.common.model.HoodieRecord;
|
import com.uber.hoodie.common.model.HoodieRecord;
|
||||||
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
|
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
|
||||||
import com.uber.hoodie.exception.HoodieException;
|
import com.uber.hoodie.exception.HoodieException;
|
||||||
import org.apache.avro.generic.IndexedRecord;
|
|
||||||
import org.apache.commons.io.FileUtils;
|
|
||||||
import org.apache.spark.util.SizeEstimator;
|
|
||||||
import org.junit.After;
|
|
||||||
import org.junit.Assert;
|
|
||||||
import org.junit.Before;
|
|
||||||
import org.junit.Test;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@@ -37,9 +32,13 @@ import java.util.concurrent.ExecutorService;
|
|||||||
import java.util.concurrent.Executors;
|
import java.util.concurrent.Executors;
|
||||||
import java.util.concurrent.Future;
|
import java.util.concurrent.Future;
|
||||||
import java.util.concurrent.Semaphore;
|
import java.util.concurrent.Semaphore;
|
||||||
|
import org.apache.avro.generic.IndexedRecord;
|
||||||
import static org.mockito.Mockito.mock;
|
import org.apache.commons.io.FileUtils;
|
||||||
import static org.mockito.Mockito.when;
|
import org.apache.spark.util.SizeEstimator;
|
||||||
|
import org.junit.After;
|
||||||
|
import org.junit.Assert;
|
||||||
|
import org.junit.Before;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
public class TestBufferedIterator {
|
public class TestBufferedIterator {
|
||||||
|
|
||||||
@@ -60,26 +59,24 @@ public class TestBufferedIterator {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Test to ensure that we are reading all records from buffered iterator in the same order without any exceptions.
|
// Test to ensure that we are reading all records from buffered iterator in the same order
|
||||||
|
// without any exceptions.
|
||||||
@Test(timeout = 60000)
|
@Test(timeout = 60000)
|
||||||
public void testRecordReading() throws IOException, ExecutionException, InterruptedException {
|
public void testRecordReading() throws IOException, ExecutionException, InterruptedException {
|
||||||
final int numRecords = 128;
|
final int numRecords = 128;
|
||||||
final List<HoodieRecord> hoodieRecords = hoodieTestDataGenerator.generateInserts(commitTime, numRecords);
|
final List<HoodieRecord> hoodieRecords = hoodieTestDataGenerator.generateInserts(commitTime, numRecords);
|
||||||
final BufferedIterator bufferedIterator =
|
final BufferedIterator bufferedIterator = new BufferedIterator(hoodieRecords.iterator(), FileUtils.ONE_KB,
|
||||||
new BufferedIterator(hoodieRecords.iterator(), FileUtils.ONE_KB, HoodieTestDataGenerator.avroSchema);
|
HoodieTestDataGenerator.avroSchema);
|
||||||
Future<Boolean> result =
|
Future<Boolean> result = recordReader.submit(() -> {
|
||||||
recordReader.submit(
|
|
||||||
() -> {
|
|
||||||
bufferedIterator.startBuffering();
|
bufferedIterator.startBuffering();
|
||||||
return true;
|
return true;
|
||||||
}
|
});
|
||||||
);
|
|
||||||
final Iterator<HoodieRecord> originalRecordIterator = hoodieRecords.iterator();
|
final Iterator<HoodieRecord> originalRecordIterator = hoodieRecords.iterator();
|
||||||
int recordsRead = 0;
|
int recordsRead = 0;
|
||||||
while (bufferedIterator.hasNext()) {
|
while (bufferedIterator.hasNext()) {
|
||||||
final HoodieRecord originalRecord = originalRecordIterator.next();
|
final HoodieRecord originalRecord = originalRecordIterator.next();
|
||||||
final Optional<IndexedRecord> originalInsertValue =
|
final Optional<IndexedRecord> originalInsertValue = originalRecord.getData()
|
||||||
originalRecord.getData().getInsertValue(HoodieTestDataGenerator.avroSchema);
|
.getInsertValue(HoodieTestDataGenerator.avroSchema);
|
||||||
final BufferedIterator.BufferedIteratorPayload payload = bufferedIterator.next();
|
final BufferedIterator.BufferedIteratorPayload payload = bufferedIterator.next();
|
||||||
// Ensure that record ordering is guaranteed.
|
// Ensure that record ordering is guaranteed.
|
||||||
Assert.assertEquals(originalRecord, payload.record);
|
Assert.assertEquals(originalRecord, payload.record);
|
||||||
@@ -102,15 +99,12 @@ public class TestBufferedIterator {
|
|||||||
// maximum number of records to keep in memory.
|
// maximum number of records to keep in memory.
|
||||||
final int recordLimit = 5;
|
final int recordLimit = 5;
|
||||||
final long memoryLimitInBytes = recordLimit * SizeEstimator.estimate(hoodieRecords.get(0));
|
final long memoryLimitInBytes = recordLimit * SizeEstimator.estimate(hoodieRecords.get(0));
|
||||||
final BufferedIterator bufferedIterator =
|
final BufferedIterator bufferedIterator = new BufferedIterator(hoodieRecords.iterator(), memoryLimitInBytes,
|
||||||
new BufferedIterator(hoodieRecords.iterator(), memoryLimitInBytes, HoodieTestDataGenerator.avroSchema);
|
HoodieTestDataGenerator.avroSchema);
|
||||||
Future<Boolean> result =
|
Future<Boolean> result = recordReader.submit(() -> {
|
||||||
recordReader.submit(
|
|
||||||
() -> {
|
|
||||||
bufferedIterator.startBuffering();
|
bufferedIterator.startBuffering();
|
||||||
return true;
|
return true;
|
||||||
}
|
});
|
||||||
);
|
|
||||||
// waiting for permits to expire.
|
// waiting for permits to expire.
|
||||||
while (!isQueueFull(bufferedIterator.rateLimiter)) {
|
while (!isQueueFull(bufferedIterator.rateLimiter)) {
|
||||||
Thread.sleep(10);
|
Thread.sleep(10);
|
||||||
@@ -128,7 +122,8 @@ public class TestBufferedIterator {
|
|||||||
while (!isQueueFull(bufferedIterator.rateLimiter)) {
|
while (!isQueueFull(bufferedIterator.rateLimiter)) {
|
||||||
Thread.sleep(10);
|
Thread.sleep(10);
|
||||||
}
|
}
|
||||||
// No change is expected in rate limit or number of buffered records. We only expect buffering thread to read
|
// No change is expected in rate limit or number of buffered records. We only expect
|
||||||
|
// buffering thread to read
|
||||||
// 2 more records into the buffer.
|
// 2 more records into the buffer.
|
||||||
Assert.assertEquals(0, bufferedIterator.rateLimiter.availablePermits());
|
Assert.assertEquals(0, bufferedIterator.rateLimiter.availablePermits());
|
||||||
Assert.assertEquals(recordLimit, bufferedIterator.currentRateLimit);
|
Assert.assertEquals(recordLimit, bufferedIterator.currentRateLimit);
|
||||||
@@ -136,7 +131,8 @@ public class TestBufferedIterator {
|
|||||||
Assert.assertEquals(recordLimit - 1 + 2, bufferedIterator.samplingRecordCounter.get());
|
Assert.assertEquals(recordLimit - 1 + 2, bufferedIterator.samplingRecordCounter.get());
|
||||||
}
|
}
|
||||||
|
|
||||||
// Test to ensure that exception in either buffering thread or BufferedIterator-reader thread is propagated to
|
// Test to ensure that exception in either buffering thread or BufferedIterator-reader thread
|
||||||
|
// is propagated to
|
||||||
// another thread.
|
// another thread.
|
||||||
@Test(timeout = 60000)
|
@Test(timeout = 60000)
|
||||||
public void testException() throws IOException, InterruptedException {
|
public void testException() throws IOException, InterruptedException {
|
||||||
@@ -145,17 +141,15 @@ public class TestBufferedIterator {
|
|||||||
// buffer memory limit
|
// buffer memory limit
|
||||||
final long memoryLimitInBytes = 4 * SizeEstimator.estimate(hoodieRecords.get(0));
|
final long memoryLimitInBytes = 4 * SizeEstimator.estimate(hoodieRecords.get(0));
|
||||||
|
|
||||||
// first let us throw exception from bufferIterator reader and test that buffering thread stops and throws
|
// first let us throw exception from bufferIterator reader and test that buffering thread
|
||||||
|
// stops and throws
|
||||||
// correct exception back.
|
// correct exception back.
|
||||||
BufferedIterator bufferedIterator1 =
|
BufferedIterator bufferedIterator1 = new BufferedIterator(hoodieRecords.iterator(), memoryLimitInBytes,
|
||||||
new BufferedIterator(hoodieRecords.iterator(), memoryLimitInBytes, HoodieTestDataGenerator.avroSchema);
|
HoodieTestDataGenerator.avroSchema);
|
||||||
Future<Boolean> result =
|
Future<Boolean> result = recordReader.submit(() -> {
|
||||||
recordReader.submit(
|
|
||||||
() -> {
|
|
||||||
bufferedIterator1.startBuffering();
|
bufferedIterator1.startBuffering();
|
||||||
return true;
|
return true;
|
||||||
}
|
});
|
||||||
);
|
|
||||||
// waiting for permits to expire.
|
// waiting for permits to expire.
|
||||||
while (!isQueueFull(bufferedIterator1.rateLimiter)) {
|
while (!isQueueFull(bufferedIterator1.rateLimiter)) {
|
||||||
Thread.sleep(10);
|
Thread.sleep(10);
|
||||||
@@ -171,21 +165,19 @@ public class TestBufferedIterator {
|
|||||||
Assert.assertEquals(e, e1.getCause().getCause());
|
Assert.assertEquals(e, e1.getCause().getCause());
|
||||||
}
|
}
|
||||||
|
|
||||||
// second let us raise an exception while doing record buffering. this exception should get propagated to
|
// second let us raise an exception while doing record buffering. this exception should get
|
||||||
|
// propagated to
|
||||||
// buffered iterator reader.
|
// buffered iterator reader.
|
||||||
final RuntimeException expectedException = new RuntimeException("failing record reading");
|
final RuntimeException expectedException = new RuntimeException("failing record reading");
|
||||||
final Iterator<HoodieRecord> mockHoodieRecordsIterator = mock(Iterator.class);
|
final Iterator<HoodieRecord> mockHoodieRecordsIterator = mock(Iterator.class);
|
||||||
when(mockHoodieRecordsIterator.hasNext()).thenReturn(true);
|
when(mockHoodieRecordsIterator.hasNext()).thenReturn(true);
|
||||||
when(mockHoodieRecordsIterator.next()).thenThrow(expectedException);
|
when(mockHoodieRecordsIterator.next()).thenThrow(expectedException);
|
||||||
BufferedIterator bufferedIterator2 =
|
BufferedIterator bufferedIterator2 = new BufferedIterator(mockHoodieRecordsIterator, memoryLimitInBytes,
|
||||||
new BufferedIterator(mockHoodieRecordsIterator, memoryLimitInBytes, HoodieTestDataGenerator.avroSchema);
|
HoodieTestDataGenerator.avroSchema);
|
||||||
Future<Boolean> result2 =
|
Future<Boolean> result2 = recordReader.submit(() -> {
|
||||||
recordReader.submit(
|
|
||||||
() -> {
|
|
||||||
bufferedIterator2.startBuffering();
|
bufferedIterator2.startBuffering();
|
||||||
return true;
|
return true;
|
||||||
}
|
});
|
||||||
);
|
|
||||||
try {
|
try {
|
||||||
bufferedIterator2.hasNext();
|
bufferedIterator2.hasNext();
|
||||||
Assert.fail("exception is expected");
|
Assert.fail("exception is expected");
|
||||||
|
|||||||
@@ -55,32 +55,24 @@ public class TestUpdateMapFunction {
|
|||||||
public void testSchemaEvolutionOnUpdate() throws Exception {
|
public void testSchemaEvolutionOnUpdate() throws Exception {
|
||||||
// Create a bunch of records with an old version of the schema
|
// Create a bunch of records with an old version of the schema
|
||||||
HoodieWriteConfig config = makeHoodieClientConfig("/exampleSchema.txt");
|
HoodieWriteConfig config = makeHoodieClientConfig("/exampleSchema.txt");
|
||||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(
|
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(HoodieTestUtils.getDefaultHadoopConf(), basePath);
|
||||||
HoodieTestUtils.getDefaultHadoopConf(), basePath);
|
|
||||||
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metaClient);
|
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metaClient);
|
||||||
|
|
||||||
String recordStr1 =
|
String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
|
||||||
"{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
||||||
String recordStr2 =
|
String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
|
||||||
"{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
|
+ "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
|
||||||
String recordStr3 =
|
String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
|
||||||
"{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
||||||
List<HoodieRecord> records = new ArrayList<>();
|
List<HoodieRecord> records = new ArrayList<>();
|
||||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||||
records.add(
|
records.add(new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1));
|
||||||
new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
|
|
||||||
rowChange1));
|
|
||||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
||||||
records.add(
|
records.add(new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2));
|
||||||
new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
|
|
||||||
rowChange2));
|
|
||||||
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
||||||
records.add(
|
records.add(new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));
|
||||||
new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
|
|
||||||
rowChange3));
|
|
||||||
Iterator<List<WriteStatus>> insertResult = table.handleInsert("100", records.iterator());
|
Iterator<List<WriteStatus>> insertResult = table.handleInsert("100", records.iterator());
|
||||||
Path commitFile =
|
Path commitFile = new Path(config.getBasePath() + "/.hoodie/" + HoodieTimeline.makeCommitFileName("100"));
|
||||||
new Path(config.getBasePath() + "/.hoodie/" + HoodieTimeline.makeCommitFileName("100"));
|
|
||||||
FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf()).create(commitFile);
|
FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf()).create(commitFile);
|
||||||
|
|
||||||
// Now try an update with an evolved schema
|
// Now try an update with an evolved schema
|
||||||
@@ -92,12 +84,11 @@ public class TestUpdateMapFunction {
|
|||||||
|
|
||||||
table = new HoodieCopyOnWriteTable(config, metaClient);
|
table = new HoodieCopyOnWriteTable(config, metaClient);
|
||||||
// New content with values for the newly added field
|
// New content with values for the newly added field
|
||||||
recordStr1 =
|
recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
|
||||||
"{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12,\"added_field\":1}";
|
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12,\"added_field\":1}";
|
||||||
records = new ArrayList<>();
|
records = new ArrayList<>();
|
||||||
rowChange1 = new TestRawTripPayload(recordStr1);
|
rowChange1 = new TestRawTripPayload(recordStr1);
|
||||||
HoodieRecord record1 =
|
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
|
||||||
new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
|
|
||||||
rowChange1);
|
rowChange1);
|
||||||
record1.setCurrentLocation(new HoodieRecordLocation("100", fileId));
|
record1.setCurrentLocation(new HoodieRecordLocation("100", fileId));
|
||||||
records.add(record1);
|
records.add(record1);
|
||||||
@@ -105,8 +96,8 @@ public class TestUpdateMapFunction {
|
|||||||
try {
|
try {
|
||||||
table.handleUpdate("101", fileId, records.iterator());
|
table.handleUpdate("101", fileId, records.iterator());
|
||||||
} catch (ClassCastException e) {
|
} catch (ClassCastException e) {
|
||||||
fail(
|
fail("UpdateFunction could not read records written with exampleSchema.txt using the "
|
||||||
"UpdateFunction could not read records written with exampleSchema.txt using the exampleEvolvedSchema.txt");
|
+ "exampleEvolvedSchema.txt");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -16,6 +16,12 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.index;
|
package com.uber.hoodie.index;
|
||||||
|
|
||||||
|
import static org.junit.Assert.assertFalse;
|
||||||
|
import static org.junit.Assert.assertTrue;
|
||||||
|
import static org.mockito.Matchers.anyObject;
|
||||||
|
import static org.mockito.Mockito.atMost;
|
||||||
|
import static org.mockito.Mockito.times;
|
||||||
|
|
||||||
import com.uber.hoodie.HoodieWriteClient;
|
import com.uber.hoodie.HoodieWriteClient;
|
||||||
import com.uber.hoodie.WriteStatus;
|
import com.uber.hoodie.WriteStatus;
|
||||||
import com.uber.hoodie.common.HoodieTestDataGenerator;
|
import com.uber.hoodie.common.HoodieTestDataGenerator;
|
||||||
@@ -23,19 +29,16 @@ import com.uber.hoodie.common.model.HoodieRecord;
|
|||||||
import com.uber.hoodie.common.model.HoodieTableType;
|
import com.uber.hoodie.common.model.HoodieTableType;
|
||||||
import com.uber.hoodie.common.table.HoodieTableConfig;
|
import com.uber.hoodie.common.table.HoodieTableConfig;
|
||||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||||
import com.uber.hoodie.common.table.TableFileSystemView;
|
|
||||||
import com.uber.hoodie.common.table.view.HoodieTableFileSystemView;
|
|
||||||
import com.uber.hoodie.common.util.FSUtils;
|
|
||||||
import com.uber.hoodie.config.HoodieCompactionConfig;
|
import com.uber.hoodie.config.HoodieCompactionConfig;
|
||||||
import com.uber.hoodie.config.HoodieIndexConfig;
|
import com.uber.hoodie.config.HoodieIndexConfig;
|
||||||
import com.uber.hoodie.config.HoodieStorageConfig;
|
import com.uber.hoodie.config.HoodieStorageConfig;
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.index.hbase.HBaseIndex;
|
import com.uber.hoodie.index.hbase.HBaseIndex;
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
|
import java.io.File;
|
||||||
|
import java.util.List;
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.fs.FileStatus;
|
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
import org.apache.hadoop.fs.Path;
|
|
||||||
import org.apache.hadoop.hbase.HBaseTestingUtility;
|
import org.apache.hadoop.hbase.HBaseTestingUtility;
|
||||||
import org.apache.hadoop.hbase.TableName;
|
import org.apache.hadoop.hbase.TableName;
|
||||||
import org.apache.hadoop.hbase.client.Connection;
|
import org.apache.hadoop.hbase.client.Connection;
|
||||||
@@ -56,37 +59,26 @@ import org.junit.Test;
|
|||||||
import org.junit.rules.TemporaryFolder;
|
import org.junit.rules.TemporaryFolder;
|
||||||
import org.junit.runners.MethodSorters;
|
import org.junit.runners.MethodSorters;
|
||||||
import org.mockito.Mockito;
|
import org.mockito.Mockito;
|
||||||
import scala.Tuple2;
|
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import static org.junit.Assert.assertEquals;
|
|
||||||
import static org.junit.Assert.assertFalse;
|
|
||||||
import static org.junit.Assert.assertTrue;
|
|
||||||
import static org.mockito.Matchers.anyObject;
|
|
||||||
import static org.mockito.Mockito.atLeast;
|
|
||||||
import static org.mockito.Mockito.atMost;
|
|
||||||
import static org.mockito.Mockito.times;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Note :: HBaseTestingUtility is really flaky with issues where the HbaseMiniCluster fails to shutdown
|
* Note :: HBaseTestingUtility is really flaky with issues where the HbaseMiniCluster fails to shutdown across tests,
|
||||||
* across tests, (see one problem here : https://issues.apache.org/jira/browse/HBASE-15835).
|
* (see one problem here : https://issues.apache.org/jira/browse/HBASE-15835). Hence, the need to use
|
||||||
* Hence, the need to use MethodSorters.NAME_ASCENDING to make sure the tests run in order. Please alter
|
* MethodSorters.NAME_ASCENDING to make sure the tests run in order. Please alter the order of tests running carefully.
|
||||||
* the order of tests running carefully.
|
|
||||||
*/
|
*/
|
||||||
@FixMethodOrder(MethodSorters.NAME_ASCENDING)
|
@FixMethodOrder(MethodSorters.NAME_ASCENDING)
|
||||||
public class TestHbaseIndex {
|
public class TestHbaseIndex {
|
||||||
|
|
||||||
private static JavaSparkContext jsc = null;
|
private static JavaSparkContext jsc = null;
|
||||||
private String basePath = null;
|
|
||||||
private transient FileSystem fs;
|
|
||||||
private static HBaseTestingUtility utility;
|
private static HBaseTestingUtility utility;
|
||||||
private static Configuration hbaseConfig;
|
private static Configuration hbaseConfig;
|
||||||
private static String tableName = "test_table";
|
private static String tableName = "test_table";
|
||||||
|
private String basePath = null;
|
||||||
|
private transient FileSystem fs;
|
||||||
private HoodieTableMetaClient metaClient;
|
private HoodieTableMetaClient metaClient;
|
||||||
|
|
||||||
|
public TestHbaseIndex() throws Exception {
|
||||||
|
}
|
||||||
|
|
||||||
@AfterClass
|
@AfterClass
|
||||||
public static void clean() throws Exception {
|
public static void clean() throws Exception {
|
||||||
if (jsc != null) {
|
if (jsc != null) {
|
||||||
@@ -97,6 +89,20 @@ public class TestHbaseIndex {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@BeforeClass
|
||||||
|
public static void init() throws Exception {
|
||||||
|
|
||||||
|
// Initialize HbaseMiniCluster
|
||||||
|
utility = new HBaseTestingUtility();
|
||||||
|
utility.startMiniCluster();
|
||||||
|
hbaseConfig = utility.getConnection().getConfiguration();
|
||||||
|
utility.createTable(TableName.valueOf(tableName), Bytes.toBytes("_s"));
|
||||||
|
// Initialize a local spark env
|
||||||
|
SparkConf sparkConf = new SparkConf().setAppName("TestHbaseIndex").setMaster("local[1]");
|
||||||
|
jsc = new JavaSparkContext(sparkConf);
|
||||||
|
jsc.hadoopConfiguration().addResource(utility.getConfiguration());
|
||||||
|
}
|
||||||
|
|
||||||
@After
|
@After
|
||||||
public void clear() throws Exception {
|
public void clear() throws Exception {
|
||||||
if (basePath != null) {
|
if (basePath != null) {
|
||||||
@@ -112,25 +118,8 @@ public class TestHbaseIndex {
|
|||||||
basePath = folder.getRoot().getAbsolutePath();
|
basePath = folder.getRoot().getAbsolutePath();
|
||||||
// Initialize table
|
// Initialize table
|
||||||
metaClient = HoodieTableMetaClient
|
metaClient = HoodieTableMetaClient
|
||||||
.initTableType(utility.getConfiguration(), basePath, HoodieTableType.COPY_ON_WRITE,
|
.initTableType(utility.getConfiguration(), basePath, HoodieTableType.COPY_ON_WRITE, tableName,
|
||||||
tableName, HoodieTableConfig.DEFAULT_PAYLOAD_CLASS);
|
HoodieTableConfig.DEFAULT_PAYLOAD_CLASS);
|
||||||
}
|
|
||||||
|
|
||||||
public TestHbaseIndex() throws Exception {
|
|
||||||
}
|
|
||||||
|
|
||||||
@BeforeClass
|
|
||||||
public static void init() throws Exception {
|
|
||||||
|
|
||||||
// Initialize HbaseMiniCluster
|
|
||||||
utility = new HBaseTestingUtility();
|
|
||||||
utility.startMiniCluster();
|
|
||||||
hbaseConfig = utility.getConnection().getConfiguration();
|
|
||||||
utility.createTable(TableName.valueOf(tableName), Bytes.toBytes("_s"));
|
|
||||||
// Initialize a local spark env
|
|
||||||
SparkConf sparkConf = new SparkConf().setAppName("TestHbaseIndex").setMaster("local[1]");
|
|
||||||
jsc = new JavaSparkContext(sparkConf);
|
|
||||||
jsc.hadoopConfiguration().addResource(utility.getConfiguration());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@@ -156,7 +145,8 @@ public class TestHbaseIndex {
|
|||||||
JavaRDD<WriteStatus> writeStatues = writeClient.upsert(writeRecords, newCommitTime);
|
JavaRDD<WriteStatus> writeStatues = writeClient.upsert(writeRecords, newCommitTime);
|
||||||
assertNoWriteErrors(writeStatues.collect());
|
assertNoWriteErrors(writeStatues.collect());
|
||||||
|
|
||||||
// Now tagLocation for these records, hbaseIndex should not tag them since it was a failed commit
|
// Now tagLocation for these records, hbaseIndex should not tag them since it was a failed
|
||||||
|
// commit
|
||||||
javaRDD = index.tagLocation(writeRecords, hoodieTable);
|
javaRDD = index.tagLocation(writeRecords, hoodieTable);
|
||||||
assert (javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 0);
|
assert (javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 0);
|
||||||
|
|
||||||
@@ -167,8 +157,9 @@ public class TestHbaseIndex {
|
|||||||
javaRDD = index.tagLocation(writeRecords, hoodieTable);
|
javaRDD = index.tagLocation(writeRecords, hoodieTable);
|
||||||
assertTrue(javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 200);
|
assertTrue(javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 200);
|
||||||
assertTrue(javaRDD.map(record -> record.getKey().getRecordKey()).distinct().count() == 200);
|
assertTrue(javaRDD.map(record -> record.getKey().getRecordKey()).distinct().count() == 200);
|
||||||
assertTrue(javaRDD.filter(record -> (record.getCurrentLocation() != null
|
assertTrue(javaRDD.filter(
|
||||||
&& record.getCurrentLocation().getCommitTime().equals(newCommitTime))).distinct().count() == 200);
|
record -> (record.getCurrentLocation() != null && record.getCurrentLocation().getCommitTime()
|
||||||
|
.equals(newCommitTime))).distinct().count() == 200);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -208,7 +199,8 @@ public class TestHbaseIndex {
|
|||||||
// Rollback the last commit
|
// Rollback the last commit
|
||||||
writeClient.rollback(newCommitTime);
|
writeClient.rollback(newCommitTime);
|
||||||
|
|
||||||
// Now tagLocation for these records, hbaseIndex should not tag them since it was a rolled back commit
|
// Now tagLocation for these records, hbaseIndex should not tag them since it was a rolled
|
||||||
|
// back commit
|
||||||
javaRDD = index.tagLocation(writeRecords, hoodieTable);
|
javaRDD = index.tagLocation(writeRecords, hoodieTable);
|
||||||
assert (javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 0);
|
assert (javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 0);
|
||||||
assert (javaRDD.filter(record -> record.getCurrentLocation() != null).collect().size() == 0);
|
assert (javaRDD.filter(record -> record.getCurrentLocation() != null).collect().size() == 0);
|
||||||
@@ -302,12 +294,10 @@ public class TestHbaseIndex {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private HoodieWriteConfig.Builder getConfigBuilder() {
|
private HoodieWriteConfig.Builder getConfigBuilder() {
|
||||||
return HoodieWriteConfig.newBuilder().withPath(basePath)
|
return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA)
|
||||||
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(1, 1)
|
.withParallelism(1, 1).withCompactionConfig(
|
||||||
.withCompactionConfig(
|
HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).withInlineCompaction(false)
|
||||||
HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024)
|
.build()).withAutoCommit(false)
|
||||||
.withInlineCompaction(false).build())
|
|
||||||
.withAutoCommit(false)
|
|
||||||
.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build())
|
.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build())
|
||||||
.forTable("test-trip-table").withIndexConfig(
|
.forTable("test-trip-table").withIndexConfig(
|
||||||
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.HBASE)
|
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.HBASE)
|
||||||
|
|||||||
@@ -31,16 +31,14 @@ public class TestHoodieIndex {
|
|||||||
HoodieWriteConfig.Builder clientConfigBuilder = HoodieWriteConfig.newBuilder();
|
HoodieWriteConfig.Builder clientConfigBuilder = HoodieWriteConfig.newBuilder();
|
||||||
HoodieIndexConfig.Builder indexConfigBuilder = HoodieIndexConfig.newBuilder();
|
HoodieIndexConfig.Builder indexConfigBuilder = HoodieIndexConfig.newBuilder();
|
||||||
// Different types
|
// Different types
|
||||||
HoodieWriteConfig config = clientConfigBuilder.withPath("")
|
HoodieWriteConfig config = clientConfigBuilder.withPath("").withIndexConfig(
|
||||||
.withIndexConfig(indexConfigBuilder.withIndexType(HoodieIndex.IndexType.HBASE).build())
|
indexConfigBuilder.withIndexType(HoodieIndex.IndexType.HBASE).build()).build();
|
||||||
.build();
|
|
||||||
assertTrue(HoodieIndex.createIndex(config, null) instanceof HBaseIndex);
|
assertTrue(HoodieIndex.createIndex(config, null) instanceof HBaseIndex);
|
||||||
config = clientConfigBuilder.withPath("").withIndexConfig(
|
config = clientConfigBuilder.withPath("")
|
||||||
indexConfigBuilder.withIndexType(HoodieIndex.IndexType.INMEMORY).build()).build();
|
.withIndexConfig(indexConfigBuilder.withIndexType(HoodieIndex.IndexType.INMEMORY).build()).build();
|
||||||
assertTrue(HoodieIndex.createIndex(config, null) instanceof InMemoryHashIndex);
|
assertTrue(HoodieIndex.createIndex(config, null) instanceof InMemoryHashIndex);
|
||||||
config = clientConfigBuilder.withPath("")
|
config = clientConfigBuilder.withPath("")
|
||||||
.withIndexConfig(indexConfigBuilder.withIndexType(HoodieIndex.IndexType.BLOOM).build())
|
.withIndexConfig(indexConfigBuilder.withIndexType(HoodieIndex.IndexType.BLOOM).build()).build();
|
||||||
.build();
|
|
||||||
assertTrue(HoodieIndex.createIndex(config, null) instanceof HoodieBloomIndex);
|
assertTrue(HoodieIndex.createIndex(config, null) instanceof HoodieBloomIndex);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -98,31 +98,33 @@ public class TestHoodieBloomIndex {
|
|||||||
@Test
|
@Test
|
||||||
public void testLoadUUIDsInMemory() throws IOException {
|
public void testLoadUUIDsInMemory() throws IOException {
|
||||||
// Create one RDD of hoodie record
|
// Create one RDD of hoodie record
|
||||||
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
|
||||||
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
|
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
||||||
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
|
||||||
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
|
+ "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
|
||||||
|
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
|
||||||
|
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
||||||
|
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
|
||||||
|
+ "\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
|
||||||
|
|
||||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||||
HoodieRecord record1 = new HoodieRecord(
|
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
|
||||||
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
rowChange1);
|
||||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
||||||
HoodieRecord record2 = new HoodieRecord(
|
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
|
||||||
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
rowChange2);
|
||||||
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
||||||
HoodieRecord record3 = new HoodieRecord(
|
HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
|
||||||
new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
|
rowChange3);
|
||||||
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
|
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
|
||||||
HoodieRecord record4 = new HoodieRecord(
|
HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()),
|
||||||
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
|
rowChange4);
|
||||||
|
|
||||||
JavaRDD<HoodieRecord> recordRDD = jsc
|
JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4));
|
||||||
.parallelize(Arrays.asList(record1, record2, record3, record4));
|
|
||||||
|
|
||||||
// Load to memory
|
// Load to memory
|
||||||
Map<String, Iterable<String>> map = recordRDD
|
Map<String, Iterable<String>> map = recordRDD.mapToPair(
|
||||||
.mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey()))
|
record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey())).groupByKey().collectAsMap();
|
||||||
.groupByKey().collectAsMap();
|
|
||||||
assertEquals(map.size(), 2);
|
assertEquals(map.size(), 2);
|
||||||
List<String> list1 = Lists.newArrayList(map.get("2016/01/31"));
|
List<String> list1 = Lists.newArrayList(map.get("2016/01/31"));
|
||||||
List<String> list2 = Lists.newArrayList(map.get("2015/01/31"));
|
List<String> list2 = Lists.newArrayList(map.get("2015/01/31"));
|
||||||
@@ -132,44 +134,40 @@ public class TestHoodieBloomIndex {
|
|||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testLoadInvolvedFiles() throws IOException {
|
public void testLoadInvolvedFiles() throws IOException {
|
||||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
|
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
|
||||||
.withPath(basePath)
|
|
||||||
.build();
|
|
||||||
HoodieBloomIndex index = new HoodieBloomIndex(config, jsc);
|
HoodieBloomIndex index = new HoodieBloomIndex(config, jsc);
|
||||||
|
|
||||||
// Create some partitions, and put some files
|
// Create some partitions, and put some files
|
||||||
// "2016/01/21": 0 file
|
// "2016/01/21": 0 file
|
||||||
// "2016/04/01": 1 file (2_0_20160401010101.parquet)
|
// "2016/04/01": 1 file (2_0_20160401010101.parquet)
|
||||||
// "2015/03/12": 3 files (1_0_20150312101010.parquet, 3_0_20150312101010.parquet, 4_0_20150312101010.parquet)
|
// "2015/03/12": 3 files (1_0_20150312101010.parquet, 3_0_20150312101010.parquet,
|
||||||
|
// 4_0_20150312101010.parquet)
|
||||||
new File(basePath + "/2016/01/21").mkdirs();
|
new File(basePath + "/2016/01/21").mkdirs();
|
||||||
new File(basePath + "/2016/04/01").mkdirs();
|
new File(basePath + "/2016/04/01").mkdirs();
|
||||||
new File(basePath + "/2015/03/12").mkdirs();
|
new File(basePath + "/2015/03/12").mkdirs();
|
||||||
|
|
||||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(
|
TestRawTripPayload rowChange1 = new TestRawTripPayload(
|
||||||
"{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
|
"{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
|
||||||
HoodieRecord record1 = new HoodieRecord(
|
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
|
||||||
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
rowChange1);
|
||||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(
|
TestRawTripPayload rowChange2 = new TestRawTripPayload(
|
||||||
"{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
|
"{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
|
||||||
HoodieRecord record2 = new HoodieRecord(
|
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
|
||||||
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
rowChange2);
|
||||||
TestRawTripPayload rowChange3 = new TestRawTripPayload(
|
TestRawTripPayload rowChange3 = new TestRawTripPayload(
|
||||||
"{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
|
"{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
|
||||||
HoodieRecord record3 = new HoodieRecord(
|
HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
|
||||||
new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
|
rowChange3);
|
||||||
TestRawTripPayload rowChange4 = new TestRawTripPayload(
|
TestRawTripPayload rowChange4 = new TestRawTripPayload(
|
||||||
"{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
|
"{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
|
||||||
HoodieRecord record4 = new HoodieRecord(
|
HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()),
|
||||||
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
|
rowChange4);
|
||||||
|
|
||||||
writeParquetFile("2016/04/01", "2_0_20160401010101.parquet", Lists.newArrayList(), schema, null,
|
writeParquetFile("2016/04/01", "2_0_20160401010101.parquet", Lists.newArrayList(), schema, null, false);
|
||||||
|
writeParquetFile("2015/03/12", "1_0_20150312101010.parquet", Lists.newArrayList(), schema, null, false);
|
||||||
|
writeParquetFile("2015/03/12", "3_0_20150312101010.parquet", Arrays.asList(record1), schema, null, false);
|
||||||
|
writeParquetFile("2015/03/12", "4_0_20150312101010.parquet", Arrays.asList(record2, record3, record4), schema, null,
|
||||||
false);
|
false);
|
||||||
writeParquetFile("2015/03/12", "1_0_20150312101010.parquet", Lists.newArrayList(), schema, null,
|
|
||||||
false);
|
|
||||||
writeParquetFile("2015/03/12", "3_0_20150312101010.parquet", Arrays.asList(record1), schema,
|
|
||||||
null, false);
|
|
||||||
writeParquetFile("2015/03/12", "4_0_20150312101010.parquet",
|
|
||||||
Arrays.asList(record2, record3, record4), schema, null, false);
|
|
||||||
|
|
||||||
List<String> partitions = Arrays.asList("2016/01/21", "2016/04/01", "2015/03/12");
|
List<String> partitions = Arrays.asList("2016/01/21", "2016/04/01", "2015/03/12");
|
||||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
||||||
@@ -198,51 +196,32 @@ public class TestHoodieBloomIndex {
|
|||||||
List<Tuple2<String, BloomIndexFileInfo>> expected = Arrays.asList(
|
List<Tuple2<String, BloomIndexFileInfo>> expected = Arrays.asList(
|
||||||
new Tuple2<>("2016/04/01", new BloomIndexFileInfo("2_0_20160401010101.parquet")),
|
new Tuple2<>("2016/04/01", new BloomIndexFileInfo("2_0_20160401010101.parquet")),
|
||||||
new Tuple2<>("2015/03/12", new BloomIndexFileInfo("1_0_20150312101010.parquet")),
|
new Tuple2<>("2015/03/12", new BloomIndexFileInfo("1_0_20150312101010.parquet")),
|
||||||
new Tuple2<>("2015/03/12",
|
new Tuple2<>("2015/03/12", new BloomIndexFileInfo("3_0_20150312101010.parquet", "000", "000")),
|
||||||
new BloomIndexFileInfo("3_0_20150312101010.parquet", "000", "000")),
|
new Tuple2<>("2015/03/12", new BloomIndexFileInfo("4_0_20150312101010.parquet", "001", "003")));
|
||||||
new Tuple2<>("2015/03/12",
|
|
||||||
new BloomIndexFileInfo("4_0_20150312101010.parquet", "001", "003"))
|
|
||||||
);
|
|
||||||
assertEquals(expected, filesList);
|
assertEquals(expected, filesList);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testRangePruning() {
|
public void testRangePruning() {
|
||||||
|
|
||||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
|
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
|
||||||
.withPath(basePath)
|
|
||||||
.build();
|
|
||||||
HoodieBloomIndex index = new HoodieBloomIndex(config, jsc);
|
HoodieBloomIndex index = new HoodieBloomIndex(config, jsc);
|
||||||
|
|
||||||
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo = new HashMap<>();
|
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo = new HashMap<>();
|
||||||
partitionToFileIndexInfo.put("2017/10/22", Arrays.asList(
|
partitionToFileIndexInfo.put("2017/10/22", Arrays.asList(new BloomIndexFileInfo("f1"),
|
||||||
new BloomIndexFileInfo("f1"),
|
new BloomIndexFileInfo("f2", "000", "000"), new BloomIndexFileInfo("f3", "001", "003"),
|
||||||
new BloomIndexFileInfo("f2", "000", "000"),
|
new BloomIndexFileInfo("f4", "002", "007"), new BloomIndexFileInfo("f5", "009", "010")));
|
||||||
new BloomIndexFileInfo("f3", "001", "003"),
|
|
||||||
new BloomIndexFileInfo("f4", "002", "007"),
|
|
||||||
new BloomIndexFileInfo("f5", "009", "010")
|
|
||||||
));
|
|
||||||
|
|
||||||
JavaPairRDD<String, String> partitionRecordKeyPairRDD = jsc
|
JavaPairRDD<String, String> partitionRecordKeyPairRDD = jsc.parallelize(Arrays.asList(
|
||||||
.parallelize(Arrays.asList(
|
new Tuple2<>("2017/10/22", "003"), new Tuple2<>("2017/10/22", "002"), new Tuple2<>("2017/10/22", "005"),
|
||||||
new Tuple2<>("2017/10/22", "003"),
|
new Tuple2<>("2017/10/22", "004"))).mapToPair(t -> t);
|
||||||
new Tuple2<>("2017/10/22", "002"),
|
|
||||||
new Tuple2<>("2017/10/22", "005"),
|
|
||||||
new Tuple2<>("2017/10/22", "004")
|
|
||||||
))
|
|
||||||
.mapToPair(t -> t);
|
|
||||||
|
|
||||||
List<Tuple2<String, Tuple2<String, HoodieKey>>> comparisonKeyList = index
|
List<Tuple2<String, Tuple2<String, HoodieKey>>> comparisonKeyList = index.explodeRecordRDDWithFileComparisons(
|
||||||
.explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD)
|
partitionToFileIndexInfo, partitionRecordKeyPairRDD).collect();
|
||||||
.collect();
|
|
||||||
|
|
||||||
assertEquals(10, comparisonKeyList.size());
|
assertEquals(10, comparisonKeyList.size());
|
||||||
Map<String, List<String>> recordKeyToFileComps = comparisonKeyList.stream()
|
Map<String, List<String>> recordKeyToFileComps = comparisonKeyList.stream().collect(Collectors.groupingBy(
|
||||||
.collect(Collectors.groupingBy(
|
t -> t._2()._2().getRecordKey(), Collectors.mapping(t -> t._2()._1().split("#")[0], Collectors.toList())));
|
||||||
t -> t._2()._2().getRecordKey(),
|
|
||||||
Collectors.mapping(t -> t._2()._1().split("#")[0], Collectors.toList()
|
|
||||||
)
|
|
||||||
));
|
|
||||||
|
|
||||||
assertEquals(4, recordKeyToFileComps.size());
|
assertEquals(4, recordKeyToFileComps.size());
|
||||||
assertEquals(Arrays.asList("f1", "f3", "f4"), recordKeyToFileComps.get("002"));
|
assertEquals(Arrays.asList("f1", "f3", "f4"), recordKeyToFileComps.get("002"));
|
||||||
@@ -252,32 +231,35 @@ public class TestHoodieBloomIndex {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testCheckUUIDsAgainstOneFile()
|
public void testCheckUUIDsAgainstOneFile() throws IOException, InterruptedException, ClassNotFoundException {
|
||||||
throws IOException, InterruptedException, ClassNotFoundException {
|
|
||||||
|
|
||||||
// Create some records to use
|
// Create some records to use
|
||||||
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
|
||||||
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
|
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
||||||
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
|
||||||
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":32}";
|
+ "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
|
||||||
|
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
|
||||||
|
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
||||||
|
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
|
||||||
|
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":32}";
|
||||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||||
HoodieRecord record1 = new HoodieRecord(
|
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
|
||||||
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
rowChange1);
|
||||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
||||||
HoodieRecord record2 = new HoodieRecord(
|
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
|
||||||
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
rowChange2);
|
||||||
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
||||||
HoodieRecord record3 = new HoodieRecord(
|
HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
|
||||||
new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
|
rowChange3);
|
||||||
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
|
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
|
||||||
HoodieRecord record4 = new HoodieRecord(
|
HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()),
|
||||||
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
|
rowChange4);
|
||||||
|
|
||||||
// We write record1, record2 to a parquet file, but the bloom filter contains (record1, record2, record3).
|
// We write record1, record2 to a parquet file, but the bloom filter contains (record1,
|
||||||
|
// record2, record3).
|
||||||
BloomFilter filter = new BloomFilter(10000, 0.0000001);
|
BloomFilter filter = new BloomFilter(10000, 0.0000001);
|
||||||
filter.add(record3.getRecordKey());
|
filter.add(record3.getRecordKey());
|
||||||
String filename = writeParquetFile("2016/01/31", Arrays.asList(record1, record2), schema,
|
String filename = writeParquetFile("2016/01/31", Arrays.asList(record1, record2), schema, filter, true);
|
||||||
filter, true);
|
|
||||||
|
|
||||||
// The bloom filter contains 3 records
|
// The bloom filter contains 3 records
|
||||||
assertTrue(filter.mightContain(record1.getRecordKey()));
|
assertTrue(filter.mightContain(record1.getRecordKey()));
|
||||||
@@ -286,17 +268,16 @@ public class TestHoodieBloomIndex {
|
|||||||
assertFalse(filter.mightContain(record4.getRecordKey()));
|
assertFalse(filter.mightContain(record4.getRecordKey()));
|
||||||
|
|
||||||
// Compare with file
|
// Compare with file
|
||||||
List<String> uuids = Arrays.asList(record1.getRecordKey(), record2.getRecordKey(),
|
List<String> uuids = Arrays.asList(record1.getRecordKey(), record2.getRecordKey(), record3.getRecordKey(),
|
||||||
record3.getRecordKey(), record4.getRecordKey());
|
record4.getRecordKey());
|
||||||
|
|
||||||
List<String> results = HoodieBloomIndexCheckFunction
|
List<String> results = HoodieBloomIndexCheckFunction.checkCandidatesAgainstFile(jsc.hadoopConfiguration(), uuids,
|
||||||
.checkCandidatesAgainstFile(jsc.hadoopConfiguration(), uuids,
|
|
||||||
new Path(basePath + "/2016/01/31/" + filename));
|
new Path(basePath + "/2016/01/31/" + filename));
|
||||||
assertEquals(results.size(), 2);
|
assertEquals(results.size(), 2);
|
||||||
assertTrue(results.get(0).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")
|
assertTrue(results.get(0).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0") || results.get(1).equals(
|
||||||
|| results.get(1).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0"));
|
"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0"));
|
||||||
assertTrue(results.get(0).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")
|
assertTrue(results.get(0).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0") || results.get(1).equals(
|
||||||
|| results.get(1).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0"));
|
"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0"));
|
||||||
// TODO(vc): Need more coverage on actual filenames
|
// TODO(vc): Need more coverage on actual filenames
|
||||||
//assertTrue(results.get(0)._2().equals(filename));
|
//assertTrue(results.get(0)._2().equals(filename));
|
||||||
//assertTrue(results.get(1)._2().equals(filename));
|
//assertTrue(results.get(1)._2().equals(filename));
|
||||||
@@ -317,8 +298,7 @@ public class TestHoodieBloomIndex {
|
|||||||
try {
|
try {
|
||||||
bloomIndex.tagLocation(recordRDD, table);
|
bloomIndex.tagLocation(recordRDD, table);
|
||||||
} catch (IllegalArgumentException e) {
|
} catch (IllegalArgumentException e) {
|
||||||
fail(
|
fail("EmptyRDD should not result in IllegalArgumentException: Positive number of slices " + "required");
|
||||||
"EmptyRDD should not result in IllegalArgumentException: Positive number of slices required");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -327,24 +307,27 @@ public class TestHoodieBloomIndex {
|
|||||||
public void testTagLocation() throws Exception {
|
public void testTagLocation() throws Exception {
|
||||||
// We have some records to be tagged (two different partitions)
|
// We have some records to be tagged (two different partitions)
|
||||||
|
|
||||||
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
|
||||||
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
|
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
||||||
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
|
||||||
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
|
+ "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
|
||||||
|
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
|
||||||
|
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
||||||
|
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
|
||||||
|
+ "\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
|
||||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||||
HoodieRecord record1 = new HoodieRecord(
|
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
|
||||||
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
rowChange1);
|
||||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
||||||
HoodieRecord record2 = new HoodieRecord(
|
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
|
||||||
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
rowChange2);
|
||||||
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
||||||
HoodieRecord record3 = new HoodieRecord(
|
HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
|
||||||
new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
|
rowChange3);
|
||||||
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
|
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
|
||||||
HoodieRecord record4 = new HoodieRecord(
|
HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()),
|
||||||
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
|
rowChange4);
|
||||||
JavaRDD<HoodieRecord> recordRDD = jsc
|
JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4));
|
||||||
.parallelize(Arrays.asList(record1, record2, record3, record4));
|
|
||||||
|
|
||||||
// Also create the metadata and config
|
// Also create the metadata and config
|
||||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
||||||
@@ -389,10 +372,14 @@ public class TestHoodieBloomIndex {
|
|||||||
public void testCheckExists() throws Exception {
|
public void testCheckExists() throws Exception {
|
||||||
// We have some records to be tagged (two different partitions)
|
// We have some records to be tagged (two different partitions)
|
||||||
|
|
||||||
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
|
||||||
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
|
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
||||||
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
|
||||||
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
|
+ "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
|
||||||
|
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
|
||||||
|
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
||||||
|
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
|
||||||
|
+ "\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
|
||||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||||
HoodieKey key1 = new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath());
|
HoodieKey key1 = new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath());
|
||||||
HoodieRecord record1 = new HoodieRecord(key1, rowChange1);
|
HoodieRecord record1 = new HoodieRecord(key1, rowChange1);
|
||||||
@@ -414,8 +401,7 @@ public class TestHoodieBloomIndex {
|
|||||||
|
|
||||||
// Let's tag
|
// Let's tag
|
||||||
HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc);
|
HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc);
|
||||||
JavaPairRDD<HoodieKey, Optional<String>> taggedRecordRDD = bloomIndex
|
JavaPairRDD<HoodieKey, Optional<String>> taggedRecordRDD = bloomIndex.fetchRecordLocation(keysRDD, table);
|
||||||
.fetchRecordLocation(keysRDD, table);
|
|
||||||
|
|
||||||
// Should not find any files
|
// Should not find any files
|
||||||
for (Tuple2<HoodieKey, Optional<String>> record : taggedRecordRDD.collect()) {
|
for (Tuple2<HoodieKey, Optional<String>> record : taggedRecordRDD.collect()) {
|
||||||
@@ -456,16 +442,18 @@ public class TestHoodieBloomIndex {
|
|||||||
@Test
|
@Test
|
||||||
public void testBloomFilterFalseError() throws IOException, InterruptedException {
|
public void testBloomFilterFalseError() throws IOException, InterruptedException {
|
||||||
// We have two hoodie records
|
// We have two hoodie records
|
||||||
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
|
||||||
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
|
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
||||||
|
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
|
||||||
|
+ "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
|
||||||
|
|
||||||
// We write record1 to a parquet file, using a bloom filter having both records
|
// We write record1 to a parquet file, using a bloom filter having both records
|
||||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||||
HoodieRecord record1 = new HoodieRecord(
|
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
|
||||||
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
rowChange1);
|
||||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
||||||
HoodieRecord record2 = new HoodieRecord(
|
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
|
||||||
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
rowChange2);
|
||||||
|
|
||||||
BloomFilter filter = new BloomFilter(10000, 0.0000001);
|
BloomFilter filter = new BloomFilter(10000, 0.0000001);
|
||||||
filter.add(record2.getRecordKey());
|
filter.add(record2.getRecordKey());
|
||||||
@@ -492,8 +480,8 @@ public class TestHoodieBloomIndex {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private String writeParquetFile(String partitionPath, List<HoodieRecord> records, Schema schema,
|
private String writeParquetFile(String partitionPath, List<HoodieRecord> records, Schema schema, BloomFilter filter,
|
||||||
BloomFilter filter, boolean createCommitTime) throws IOException, InterruptedException {
|
boolean createCommitTime) throws IOException, InterruptedException {
|
||||||
Thread.sleep(1000);
|
Thread.sleep(1000);
|
||||||
String commitTime = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date());
|
String commitTime = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date());
|
||||||
String fileId = UUID.randomUUID().toString();
|
String fileId = UUID.randomUUID().toString();
|
||||||
@@ -502,15 +490,14 @@ public class TestHoodieBloomIndex {
|
|||||||
return writeParquetFile(partitionPath, filename, records, schema, filter, createCommitTime);
|
return writeParquetFile(partitionPath, filename, records, schema, filter, createCommitTime);
|
||||||
}
|
}
|
||||||
|
|
||||||
private String writeParquetFile(String partitionPath, String filename, List<HoodieRecord> records,
|
private String writeParquetFile(String partitionPath, String filename, List<HoodieRecord> records, Schema schema,
|
||||||
Schema schema,
|
|
||||||
BloomFilter filter, boolean createCommitTime) throws IOException {
|
BloomFilter filter, boolean createCommitTime) throws IOException {
|
||||||
|
|
||||||
if (filter == null) {
|
if (filter == null) {
|
||||||
filter = new BloomFilter(10000, 0.0000001);
|
filter = new BloomFilter(10000, 0.0000001);
|
||||||
}
|
}
|
||||||
HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(
|
HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema,
|
||||||
new AvroSchemaConverter().convert(schema), schema, filter);
|
filter);
|
||||||
String commitTime = FSUtils.getCommitTime(filename);
|
String commitTime = FSUtils.getCommitTime(filename);
|
||||||
HoodieParquetConfig config = new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP,
|
HoodieParquetConfig config = new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP,
|
||||||
ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024,
|
ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024,
|
||||||
@@ -525,9 +512,7 @@ public class TestHoodieBloomIndex {
|
|||||||
for (HoodieRecord record : records) {
|
for (HoodieRecord record : records) {
|
||||||
GenericRecord avroRecord = (GenericRecord) record.getData().getInsertValue(schema).get();
|
GenericRecord avroRecord = (GenericRecord) record.getData().getInsertValue(schema).get();
|
||||||
HoodieAvroUtils.addCommitMetadataToRecord(avroRecord, commitTime, "" + seqId++);
|
HoodieAvroUtils.addCommitMetadataToRecord(avroRecord, commitTime, "" + seqId++);
|
||||||
HoodieAvroUtils
|
HoodieAvroUtils.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(), filename);
|
||||||
.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(),
|
|
||||||
filename);
|
|
||||||
writer.writeAvro(record.getRecordKey(), avroRecord);
|
writer.writeAvro(record.getRecordKey(), avroRecord);
|
||||||
filter.add(record.getRecordKey());
|
filter.add(record.getRecordKey());
|
||||||
}
|
}
|
||||||
@@ -536,9 +521,7 @@ public class TestHoodieBloomIndex {
|
|||||||
if (createCommitTime) {
|
if (createCommitTime) {
|
||||||
// Also make sure the commit is valid
|
// Also make sure the commit is valid
|
||||||
new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME).mkdirs();
|
new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME).mkdirs();
|
||||||
new File(
|
new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitTime + ".commit").createNewFile();
|
||||||
basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitTime + ".commit")
|
|
||||||
.createNewFile();
|
|
||||||
}
|
}
|
||||||
return filename;
|
return filename;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -77,7 +77,8 @@ public class TestHoodieCommitArchiveLog {
|
|||||||
public void testArchiveDatasetWithArchival() throws IOException {
|
public void testArchiveDatasetWithArchival() throws IOException {
|
||||||
HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath)
|
HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath)
|
||||||
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
|
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
|
||||||
.withCompactionConfig(HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 4).build())
|
.withCompactionConfig(
|
||||||
|
HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 4).build())
|
||||||
.forTable("test-trip-table").build();
|
.forTable("test-trip-table").build();
|
||||||
HoodieTestUtils.init(hadoopConf, basePath);
|
HoodieTestUtils.init(hadoopConf, basePath);
|
||||||
HoodieTestDataGenerator.createCommitFile(basePath, "100");
|
HoodieTestDataGenerator.createCommitFile(basePath, "100");
|
||||||
@@ -88,8 +89,7 @@ public class TestHoodieCommitArchiveLog {
|
|||||||
HoodieTestDataGenerator.createCommitFile(basePath, "105");
|
HoodieTestDataGenerator.createCommitFile(basePath, "105");
|
||||||
|
|
||||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), basePath);
|
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), basePath);
|
||||||
HoodieTimeline timeline =
|
HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
|
||||||
metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
|
|
||||||
|
|
||||||
assertEquals("Loaded 6 commits and the count should match", 6, timeline.countInstants());
|
assertEquals("Loaded 6 commits and the count should match", 6, timeline.countInstants());
|
||||||
|
|
||||||
@@ -103,8 +103,7 @@ public class TestHoodieCommitArchiveLog {
|
|||||||
HoodieTestUtils.createInflightCleanFiles(basePath, "106", "107");
|
HoodieTestUtils.createInflightCleanFiles(basePath, "106", "107");
|
||||||
|
|
||||||
//reload the timeline and get all the commits before archive
|
//reload the timeline and get all the commits before archive
|
||||||
timeline = metaClient.getActiveTimeline().reload().getAllCommitsTimeline()
|
timeline = metaClient.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants();
|
||||||
.filterCompletedInstants();
|
|
||||||
List<HoodieInstant> originalCommits = timeline.getInstants().collect(Collectors.toList());
|
List<HoodieInstant> originalCommits = timeline.getInstants().collect(Collectors.toList());
|
||||||
|
|
||||||
assertEquals("Loaded 6 commits and the count should match", 12, timeline.countInstants());
|
assertEquals("Loaded 6 commits and the count should match", 12, timeline.countInstants());
|
||||||
@@ -118,13 +117,12 @@ public class TestHoodieCommitArchiveLog {
|
|||||||
assertTrue(archiveLog.archiveIfRequired());
|
assertTrue(archiveLog.archiveIfRequired());
|
||||||
|
|
||||||
//reload the timeline and remove the remaining commits
|
//reload the timeline and remove the remaining commits
|
||||||
timeline = metaClient.getActiveTimeline().reload().getAllCommitsTimeline()
|
timeline = metaClient.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants();
|
||||||
.filterCompletedInstants();
|
|
||||||
originalCommits.removeAll(timeline.getInstants().collect(Collectors.toList()));
|
originalCommits.removeAll(timeline.getInstants().collect(Collectors.toList()));
|
||||||
|
|
||||||
//read the file
|
//read the file
|
||||||
HoodieLogFormat.Reader reader = HoodieLogFormat
|
HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(fs,
|
||||||
.newReader(fs, new HoodieLogFile(new Path(basePath + "/.hoodie/.commits_.archive.1")),
|
new HoodieLogFile(new Path(basePath + "/.hoodie/.commits_.archive.1")),
|
||||||
HoodieArchivedMetaEntry.getClassSchema());
|
HoodieArchivedMetaEntry.getClassSchema());
|
||||||
|
|
||||||
int archivedRecordsCount = 0;
|
int archivedRecordsCount = 0;
|
||||||
@@ -137,8 +135,7 @@ public class TestHoodieCommitArchiveLog {
|
|||||||
assertEquals("Archived and read records for each block are same", 8, records.size());
|
assertEquals("Archived and read records for each block are same", 8, records.size());
|
||||||
archivedRecordsCount += records.size();
|
archivedRecordsCount += records.size();
|
||||||
}
|
}
|
||||||
assertEquals("Total archived records and total read records are the same count", 8,
|
assertEquals("Total archived records and total read records are the same count", 8, archivedRecordsCount);
|
||||||
archivedRecordsCount);
|
|
||||||
|
|
||||||
//make sure the archived commits are the same as the (originalcommits - commitsleft)
|
//make sure the archived commits are the same as the (originalcommits - commitsleft)
|
||||||
List<String> readCommits = readRecords.stream().map(r -> (GenericRecord) r).map(r -> {
|
List<String> readCommits = readRecords.stream().map(r -> (GenericRecord) r).map(r -> {
|
||||||
@@ -146,10 +143,8 @@ public class TestHoodieCommitArchiveLog {
|
|||||||
}).collect(Collectors.toList());
|
}).collect(Collectors.toList());
|
||||||
Collections.sort(readCommits);
|
Collections.sort(readCommits);
|
||||||
|
|
||||||
assertEquals(
|
assertEquals("Read commits map should match the originalCommits - commitsLoadedFromArchival",
|
||||||
"Read commits map should match the originalCommits - commitsLoadedFromArchival",
|
originalCommits.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList()), readCommits);
|
||||||
originalCommits.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList()),
|
|
||||||
readCommits);
|
|
||||||
|
|
||||||
// verify in-flight instants after archive
|
// verify in-flight instants after archive
|
||||||
verifyInflightInstants(metaClient, 3);
|
verifyInflightInstants(metaClient, 3);
|
||||||
@@ -168,15 +163,12 @@ public class TestHoodieCommitArchiveLog {
|
|||||||
HoodieTestDataGenerator.createCommitFile(basePath, "102");
|
HoodieTestDataGenerator.createCommitFile(basePath, "102");
|
||||||
HoodieTestDataGenerator.createCommitFile(basePath, "103");
|
HoodieTestDataGenerator.createCommitFile(basePath, "103");
|
||||||
|
|
||||||
HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline()
|
HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
|
||||||
.filterCompletedInstants();
|
|
||||||
assertEquals("Loaded 4 commits and the count should match", 4, timeline.countInstants());
|
assertEquals("Loaded 4 commits and the count should match", 4, timeline.countInstants());
|
||||||
boolean result = archiveLog.archiveIfRequired();
|
boolean result = archiveLog.archiveIfRequired();
|
||||||
assertTrue(result);
|
assertTrue(result);
|
||||||
timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline()
|
timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants();
|
||||||
.filterCompletedInstants();
|
assertEquals("Should not archive commits when maxCommitsToKeep is 5", 4, timeline.countInstants());
|
||||||
assertEquals("Should not archive commits when maxCommitsToKeep is 5", 4,
|
|
||||||
timeline.countInstants());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@@ -194,21 +186,15 @@ public class TestHoodieCommitArchiveLog {
|
|||||||
HoodieTestDataGenerator.createCommitFile(basePath, "104");
|
HoodieTestDataGenerator.createCommitFile(basePath, "104");
|
||||||
HoodieTestDataGenerator.createCommitFile(basePath, "105");
|
HoodieTestDataGenerator.createCommitFile(basePath, "105");
|
||||||
|
|
||||||
HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline()
|
HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
|
||||||
.filterCompletedInstants();
|
|
||||||
assertEquals("Loaded 6 commits and the count should match", 6, timeline.countInstants());
|
assertEquals("Loaded 6 commits and the count should match", 6, timeline.countInstants());
|
||||||
boolean result = archiveLog.archiveIfRequired();
|
boolean result = archiveLog.archiveIfRequired();
|
||||||
assertTrue(result);
|
assertTrue(result);
|
||||||
timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline()
|
timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants();
|
||||||
.filterCompletedInstants();
|
assertTrue("Archived commits should always be safe", timeline.containsOrBeforeTimelineStarts("100"));
|
||||||
assertTrue("Archived commits should always be safe",
|
assertTrue("Archived commits should always be safe", timeline.containsOrBeforeTimelineStarts("101"));
|
||||||
timeline.containsOrBeforeTimelineStarts("100"));
|
assertTrue("Archived commits should always be safe", timeline.containsOrBeforeTimelineStarts("102"));
|
||||||
assertTrue("Archived commits should always be safe",
|
assertTrue("Archived commits should always be safe", timeline.containsOrBeforeTimelineStarts("103"));
|
||||||
timeline.containsOrBeforeTimelineStarts("101"));
|
|
||||||
assertTrue("Archived commits should always be safe",
|
|
||||||
timeline.containsOrBeforeTimelineStarts("102"));
|
|
||||||
assertTrue("Archived commits should always be safe",
|
|
||||||
timeline.containsOrBeforeTimelineStarts("103"));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@@ -227,16 +213,14 @@ public class TestHoodieCommitArchiveLog {
|
|||||||
HoodieTestDataGenerator.createCommitFile(basePath, "104");
|
HoodieTestDataGenerator.createCommitFile(basePath, "104");
|
||||||
HoodieTestDataGenerator.createCommitFile(basePath, "105");
|
HoodieTestDataGenerator.createCommitFile(basePath, "105");
|
||||||
|
|
||||||
HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline()
|
HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
|
||||||
.filterCompletedInstants();
|
|
||||||
assertEquals("Loaded 6 commits and the count should match", 6, timeline.countInstants());
|
assertEquals("Loaded 6 commits and the count should match", 6, timeline.countInstants());
|
||||||
boolean result = archiveLog.archiveIfRequired();
|
boolean result = archiveLog.archiveIfRequired();
|
||||||
assertTrue(result);
|
assertTrue(result);
|
||||||
timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline()
|
timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants();
|
||||||
.filterCompletedInstants();
|
|
||||||
assertEquals(
|
assertEquals(
|
||||||
"Since we have a savepoint at 101, we should never archive any commit after 101 (we only archive 100)",
|
"Since we have a savepoint at 101, we should never archive any commit after 101 (we only " + "archive 100)", 5,
|
||||||
5, timeline.countInstants());
|
timeline.countInstants());
|
||||||
assertTrue("Archived commits should always be safe",
|
assertTrue("Archived commits should always be safe",
|
||||||
timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "101")));
|
timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "101")));
|
||||||
assertTrue("Archived commits should always be safe",
|
assertTrue("Archived commits should always be safe",
|
||||||
@@ -248,7 +232,7 @@ public class TestHoodieCommitArchiveLog {
|
|||||||
private void verifyInflightInstants(HoodieTableMetaClient metaClient, int expectedTotalInstants) {
|
private void verifyInflightInstants(HoodieTableMetaClient metaClient, int expectedTotalInstants) {
|
||||||
HoodieTimeline timeline = metaClient.getActiveTimeline().reload()
|
HoodieTimeline timeline = metaClient.getActiveTimeline().reload()
|
||||||
.getTimelineOfActions(Sets.newHashSet(HoodieTimeline.CLEAN_ACTION)).filterInflights();
|
.getTimelineOfActions(Sets.newHashSet(HoodieTimeline.CLEAN_ACTION)).filterInflights();
|
||||||
assertEquals("Loaded inflight clean actions and the count should match",
|
assertEquals("Loaded inflight clean actions and the count should match", expectedTotalInstants,
|
||||||
expectedTotalInstants, timeline.countInstants());
|
timeline.countInstants());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -93,32 +93,27 @@ public class TestHoodieCompactor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private HoodieWriteConfig.Builder getConfigBuilder() {
|
private HoodieWriteConfig.Builder getConfigBuilder() {
|
||||||
return HoodieWriteConfig.newBuilder().withPath(basePath)
|
return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA)
|
||||||
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
|
.withParallelism(2, 2).withCompactionConfig(
|
||||||
.withCompactionConfig(
|
HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).withInlineCompaction(false)
|
||||||
HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024)
|
.build()).withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build())
|
||||||
.withInlineCompaction(false).build())
|
.forTable("test-trip-table")
|
||||||
.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build())
|
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build());
|
||||||
.forTable("test-trip-table").withIndexConfig(
|
|
||||||
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test(expected = IllegalArgumentException.class)
|
@Test(expected = IllegalArgumentException.class)
|
||||||
public void testCompactionOnCopyOnWriteFail() throws Exception {
|
public void testCompactionOnCopyOnWriteFail() throws Exception {
|
||||||
HoodieTestUtils.initTableType(hadoopConf, basePath, HoodieTableType.COPY_ON_WRITE);
|
HoodieTestUtils.initTableType(hadoopConf, basePath, HoodieTableType.COPY_ON_WRITE);
|
||||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
|
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
||||||
basePath);
|
|
||||||
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
|
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
|
||||||
compactor.compact(jsc, getConfig(), table, HoodieActiveTimeline.createNewCommitTime());
|
compactor.compact(jsc, getConfig(), table, HoodieActiveTimeline.createNewCommitTime());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testCompactionEmpty() throws Exception {
|
public void testCompactionEmpty() throws Exception {
|
||||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
|
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
||||||
basePath);
|
|
||||||
HoodieWriteConfig config = getConfig();
|
HoodieWriteConfig config = getConfig();
|
||||||
HoodieTable table = HoodieTable
|
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config);
|
||||||
.getHoodieTable(metaClient, config);
|
|
||||||
HoodieWriteClient writeClient = new HoodieWriteClient(jsc, config);
|
HoodieWriteClient writeClient = new HoodieWriteClient(jsc, config);
|
||||||
|
|
||||||
String newCommitTime = writeClient.startCommit();
|
String newCommitTime = writeClient.startCommit();
|
||||||
@@ -126,10 +121,9 @@ public class TestHoodieCompactor {
|
|||||||
JavaRDD<HoodieRecord> recordsRDD = jsc.parallelize(records, 1);
|
JavaRDD<HoodieRecord> recordsRDD = jsc.parallelize(records, 1);
|
||||||
writeClient.insert(recordsRDD, newCommitTime).collect();
|
writeClient.insert(recordsRDD, newCommitTime).collect();
|
||||||
|
|
||||||
JavaRDD<WriteStatus> result =
|
JavaRDD<WriteStatus> result = compactor
|
||||||
compactor.compact(jsc, getConfig(), table, HoodieActiveTimeline.createNewCommitTime());
|
.compact(jsc, getConfig(), table, HoodieActiveTimeline.createNewCommitTime());
|
||||||
assertTrue("If there is nothing to compact, result will be empty",
|
assertTrue("If there is nothing to compact, result will be empty", result.isEmpty());
|
||||||
result.isEmpty());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@@ -145,8 +139,7 @@ public class TestHoodieCompactor {
|
|||||||
List<WriteStatus> statuses = writeClient.insert(recordsRDD, newCommitTime).collect();
|
List<WriteStatus> statuses = writeClient.insert(recordsRDD, newCommitTime).collect();
|
||||||
|
|
||||||
// Update all the 100 records
|
// Update all the 100 records
|
||||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
|
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
||||||
basePath);
|
|
||||||
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config);
|
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config);
|
||||||
|
|
||||||
newCommitTime = "101";
|
newCommitTime = "101";
|
||||||
@@ -159,19 +152,16 @@ public class TestHoodieCompactor {
|
|||||||
|
|
||||||
// Write them to corresponding avro logfiles
|
// Write them to corresponding avro logfiles
|
||||||
HoodieTestUtils
|
HoodieTestUtils
|
||||||
.writeRecordsToLogFiles(fs, metaClient.getBasePath(), HoodieTestDataGenerator.avroSchema,
|
.writeRecordsToLogFiles(fs, metaClient.getBasePath(), HoodieTestDataGenerator.avroSchema, updatedRecords);
|
||||||
updatedRecords);
|
|
||||||
|
|
||||||
// Verify that all data file has one log file
|
// Verify that all data file has one log file
|
||||||
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
||||||
table = HoodieTable.getHoodieTable(metaClient, config);
|
table = HoodieTable.getHoodieTable(metaClient, config);
|
||||||
for (String partitionPath : dataGen.getPartitionPaths()) {
|
for (String partitionPath : dataGen.getPartitionPaths()) {
|
||||||
List<FileSlice> groupedLogFiles =
|
List<FileSlice> groupedLogFiles = table.getRTFileSystemView().getLatestFileSlices(partitionPath)
|
||||||
table.getRTFileSystemView().getLatestFileSlices(partitionPath)
|
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
for (FileSlice fileSlice : groupedLogFiles) {
|
for (FileSlice fileSlice : groupedLogFiles) {
|
||||||
assertEquals("There should be 1 log file written for every data file", 1,
|
assertEquals("There should be 1 log file written for every data file", 1, fileSlice.getLogFiles().count());
|
||||||
fileSlice.getLogFiles().count());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -179,18 +169,19 @@ public class TestHoodieCompactor {
|
|||||||
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
||||||
table = HoodieTable.getHoodieTable(metaClient, config);
|
table = HoodieTable.getHoodieTable(metaClient, config);
|
||||||
|
|
||||||
JavaRDD<WriteStatus> result =
|
JavaRDD<WriteStatus> result = compactor
|
||||||
compactor.compact(jsc, getConfig(), table, HoodieActiveTimeline.createNewCommitTime());
|
.compact(jsc, getConfig(), table, HoodieActiveTimeline.createNewCommitTime());
|
||||||
|
|
||||||
// Verify that all partition paths are present in the WriteStatus result
|
// Verify that all partition paths are present in the WriteStatus result
|
||||||
for (String partitionPath : dataGen.getPartitionPaths()) {
|
for (String partitionPath : dataGen.getPartitionPaths()) {
|
||||||
List<WriteStatus> writeStatuses = result.collect();
|
List<WriteStatus> writeStatuses = result.collect();
|
||||||
assertTrue(writeStatuses.stream()
|
assertTrue(writeStatuses.stream()
|
||||||
.filter(writeStatus -> writeStatus.getStat().getPartitionPath()
|
.filter(writeStatus -> writeStatus.getStat().getPartitionPath().contentEquals(partitionPath))
|
||||||
.contentEquals(partitionPath)).count() > 0);
|
.count() > 0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO - after modifying HoodieReadClient to support realtime tables - add more tests to make sure the data read is the updated data (compaction correctness)
|
// TODO - after modifying HoodieReadClient to support realtime tables - add more tests to make
|
||||||
|
// sure the data read is the updated data (compaction correctness)
|
||||||
// TODO - add more test cases for compactions after a failed commit/compaction
|
// TODO - add more test cases for compactions after a failed commit/compaction
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -16,6 +16,9 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.io.strategy;
|
package com.uber.hoodie.io.strategy;
|
||||||
|
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
|
import static org.junit.Assert.assertTrue;
|
||||||
|
|
||||||
import com.beust.jcommander.internal.Lists;
|
import com.beust.jcommander.internal.Lists;
|
||||||
import com.google.common.collect.Maps;
|
import com.google.common.collect.Maps;
|
||||||
import com.uber.hoodie.config.HoodieCompactionConfig;
|
import com.uber.hoodie.config.HoodieCompactionConfig;
|
||||||
@@ -25,15 +28,11 @@ import com.uber.hoodie.io.compact.strategy.BoundedIOCompactionStrategy;
|
|||||||
import com.uber.hoodie.io.compact.strategy.DayBasedCompactionStrategy;
|
import com.uber.hoodie.io.compact.strategy.DayBasedCompactionStrategy;
|
||||||
import com.uber.hoodie.io.compact.strategy.LogFileSizeBasedCompactionStrategy;
|
import com.uber.hoodie.io.compact.strategy.LogFileSizeBasedCompactionStrategy;
|
||||||
import com.uber.hoodie.io.compact.strategy.UnBoundedCompactionStrategy;
|
import com.uber.hoodie.io.compact.strategy.UnBoundedCompactionStrategy;
|
||||||
import org.junit.Test;
|
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Random;
|
import java.util.Random;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
import org.junit.Test;
|
||||||
import static org.junit.Assert.assertEquals;
|
|
||||||
import static org.junit.Assert.assertTrue;
|
|
||||||
|
|
||||||
public class TestHoodieCompactionStrategy {
|
public class TestHoodieCompactionStrategy {
|
||||||
|
|
||||||
@@ -48,8 +47,7 @@ public class TestHoodieCompactionStrategy {
|
|||||||
sizesMap.put(100 * MB, Lists.newArrayList(MB));
|
sizesMap.put(100 * MB, Lists.newArrayList(MB));
|
||||||
sizesMap.put(90 * MB, Lists.newArrayList(1024 * MB));
|
sizesMap.put(90 * MB, Lists.newArrayList(1024 * MB));
|
||||||
UnBoundedCompactionStrategy strategy = new UnBoundedCompactionStrategy();
|
UnBoundedCompactionStrategy strategy = new UnBoundedCompactionStrategy();
|
||||||
HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp")
|
HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp").withCompactionConfig(
|
||||||
.withCompactionConfig(
|
|
||||||
HoodieCompactionConfig.newBuilder().withCompactionStrategy(strategy).build()).build();
|
HoodieCompactionConfig.newBuilder().withCompactionStrategy(strategy).build()).build();
|
||||||
List<CompactionOperation> operations = createCompactionOperations(writeConfig, sizesMap);
|
List<CompactionOperation> operations = createCompactionOperations(writeConfig, sizesMap);
|
||||||
List<CompactionOperation> returned = strategy.orderAndFilter(writeConfig, operations);
|
List<CompactionOperation> returned = strategy.orderAndFilter(writeConfig, operations);
|
||||||
@@ -64,23 +62,19 @@ public class TestHoodieCompactionStrategy {
|
|||||||
sizesMap.put(100 * MB, Lists.newArrayList(MB));
|
sizesMap.put(100 * MB, Lists.newArrayList(MB));
|
||||||
sizesMap.put(90 * MB, Lists.newArrayList(1024 * MB));
|
sizesMap.put(90 * MB, Lists.newArrayList(1024 * MB));
|
||||||
BoundedIOCompactionStrategy strategy = new BoundedIOCompactionStrategy();
|
BoundedIOCompactionStrategy strategy = new BoundedIOCompactionStrategy();
|
||||||
HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp")
|
HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp").withCompactionConfig(
|
||||||
.withCompactionConfig(
|
HoodieCompactionConfig.newBuilder().withCompactionStrategy(strategy).withTargetIOPerCompactionInMB(400).build())
|
||||||
HoodieCompactionConfig.newBuilder().withCompactionStrategy(strategy)
|
.build();
|
||||||
.withTargetIOPerCompactionInMB(400).build()).build();
|
|
||||||
List<CompactionOperation> operations = createCompactionOperations(writeConfig, sizesMap);
|
List<CompactionOperation> operations = createCompactionOperations(writeConfig, sizesMap);
|
||||||
List<CompactionOperation> returned = strategy.orderAndFilter(writeConfig, operations);
|
List<CompactionOperation> returned = strategy.orderAndFilter(writeConfig, operations);
|
||||||
|
|
||||||
assertTrue("BoundedIOCompaction should have resulted in fewer compactions",
|
assertTrue("BoundedIOCompaction should have resulted in fewer compactions", returned.size() < operations.size());
|
||||||
returned.size() < operations.size());
|
assertEquals("BoundedIOCompaction should have resulted in 2 compactions being chosen", 2, returned.size());
|
||||||
assertEquals("BoundedIOCompaction should have resulted in 2 compactions being chosen",
|
|
||||||
2, returned.size());
|
|
||||||
// Total size of all the log files
|
// Total size of all the log files
|
||||||
Long returnedSize = returned.stream()
|
Long returnedSize = returned.stream().map(s -> s.getMetrics().get(BoundedIOCompactionStrategy.TOTAL_IO_MB))
|
||||||
.map(s -> s.getMetrics().get(BoundedIOCompactionStrategy.TOTAL_IO_MB)).map(s -> (Long) s)
|
.map(s -> (Long) s).reduce((size1, size2) -> size1 + size2).orElse(0L);
|
||||||
.reduce((size1, size2) -> size1 + size2).orElse(0L);
|
assertEquals("Should chose the first 2 compactions which should result in a total IO of 690 MB", 610,
|
||||||
assertEquals("Should chose the first 2 compactions which should result in a total IO of 690 MB",
|
(long) returnedSize);
|
||||||
610, (long) returnedSize);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@@ -91,23 +85,20 @@ public class TestHoodieCompactionStrategy {
|
|||||||
sizesMap.put(100 * MB, Lists.newArrayList(MB));
|
sizesMap.put(100 * MB, Lists.newArrayList(MB));
|
||||||
sizesMap.put(90 * MB, Lists.newArrayList(1024 * MB));
|
sizesMap.put(90 * MB, Lists.newArrayList(1024 * MB));
|
||||||
LogFileSizeBasedCompactionStrategy strategy = new LogFileSizeBasedCompactionStrategy();
|
LogFileSizeBasedCompactionStrategy strategy = new LogFileSizeBasedCompactionStrategy();
|
||||||
HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp")
|
HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp").withCompactionConfig(
|
||||||
.withCompactionConfig(
|
HoodieCompactionConfig.newBuilder().withCompactionStrategy(strategy).withTargetIOPerCompactionInMB(400).build())
|
||||||
HoodieCompactionConfig.newBuilder().withCompactionStrategy(strategy)
|
.build();
|
||||||
.withTargetIOPerCompactionInMB(400).build()).build();
|
|
||||||
List<CompactionOperation> operations = createCompactionOperations(writeConfig, sizesMap);
|
List<CompactionOperation> operations = createCompactionOperations(writeConfig, sizesMap);
|
||||||
List<CompactionOperation> returned = strategy.orderAndFilter(writeConfig, operations);
|
List<CompactionOperation> returned = strategy.orderAndFilter(writeConfig, operations);
|
||||||
|
|
||||||
assertTrue("LogFileSizeBasedCompactionStrategy should have resulted in fewer compactions",
|
assertTrue("LogFileSizeBasedCompactionStrategy should have resulted in fewer compactions",
|
||||||
returned.size() < operations.size());
|
returned.size() < operations.size());
|
||||||
assertEquals("LogFileSizeBasedCompactionStrategy should have resulted in 1 compaction",
|
assertEquals("LogFileSizeBasedCompactionStrategy should have resulted in 1 compaction", 1, returned.size());
|
||||||
1, returned.size());
|
|
||||||
// Total size of all the log files
|
// Total size of all the log files
|
||||||
Long returnedSize = returned.stream()
|
Long returnedSize = returned.stream().map(s -> s.getMetrics().get(BoundedIOCompactionStrategy.TOTAL_IO_MB))
|
||||||
.map(s -> s.getMetrics().get(BoundedIOCompactionStrategy.TOTAL_IO_MB)).map(s -> (Long) s)
|
.map(s -> (Long) s).reduce((size1, size2) -> size1 + size2).orElse(0L);
|
||||||
.reduce((size1, size2) -> size1 + size2).orElse(0L);
|
assertEquals("Should chose the first 2 compactions which should result in a total IO of 690 MB", 1204,
|
||||||
assertEquals("Should chose the first 2 compactions which should result in a total IO of 690 MB",
|
(long) returnedSize);
|
||||||
1204, (long) returnedSize);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@@ -118,10 +109,9 @@ public class TestHoodieCompactionStrategy {
|
|||||||
sizesMap.put(100 * MB, Lists.newArrayList(MB));
|
sizesMap.put(100 * MB, Lists.newArrayList(MB));
|
||||||
sizesMap.put(90 * MB, Lists.newArrayList(1024 * MB));
|
sizesMap.put(90 * MB, Lists.newArrayList(1024 * MB));
|
||||||
DayBasedCompactionStrategy strategy = new DayBasedCompactionStrategy();
|
DayBasedCompactionStrategy strategy = new DayBasedCompactionStrategy();
|
||||||
HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp")
|
HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp").withCompactionConfig(
|
||||||
.withCompactionConfig(
|
HoodieCompactionConfig.newBuilder().withCompactionStrategy(strategy).withTargetIOPerCompactionInMB(400).build())
|
||||||
HoodieCompactionConfig.newBuilder().withCompactionStrategy(strategy)
|
.build();
|
||||||
.withTargetIOPerCompactionInMB(400).build()).build();
|
|
||||||
List<CompactionOperation> operations = createCompactionOperations(writeConfig, sizesMap);
|
List<CompactionOperation> operations = createCompactionOperations(writeConfig, sizesMap);
|
||||||
List<CompactionOperation> returned = strategy.orderAndFilter(writeConfig, operations);
|
List<CompactionOperation> returned = strategy.orderAndFilter(writeConfig, operations);
|
||||||
|
|
||||||
@@ -130,8 +120,7 @@ public class TestHoodieCompactionStrategy {
|
|||||||
|
|
||||||
int comparision = strategy.getComparator().compare(returned.get(returned.size() - 1), returned.get(0));
|
int comparision = strategy.getComparator().compare(returned.get(returned.size() - 1), returned.get(0));
|
||||||
// Either the partition paths are sorted in descending order or they are equal
|
// Either the partition paths are sorted in descending order or they are equal
|
||||||
assertTrue("DayBasedCompactionStrategy should sort partitions in descending order",
|
assertTrue("DayBasedCompactionStrategy should sort partitions in descending order", comparision >= 0);
|
||||||
comparision >= 0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<CompactionOperation> createCompactionOperations(HoodieWriteConfig config,
|
private List<CompactionOperation> createCompactionOperations(HoodieWriteConfig config,
|
||||||
@@ -140,8 +129,7 @@ public class TestHoodieCompactionStrategy {
|
|||||||
sizesMap.forEach((k, v) -> {
|
sizesMap.forEach((k, v) -> {
|
||||||
operations.add(new CompactionOperation(TestHoodieDataFile.newDataFile(k),
|
operations.add(new CompactionOperation(TestHoodieDataFile.newDataFile(k),
|
||||||
partitionPaths[new Random().nextInt(partitionPaths.length - 1)],
|
partitionPaths[new Random().nextInt(partitionPaths.length - 1)],
|
||||||
v.stream().map(TestHoodieLogFile::newLogFile).collect(
|
v.stream().map(TestHoodieLogFile::newLogFile).collect(Collectors.toList()), config));
|
||||||
Collectors.toList()), config));
|
|
||||||
});
|
});
|
||||||
return operations;
|
return operations;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -28,6 +28,10 @@ public class TestHoodieDataFile extends HoodieDataFile {
|
|||||||
this.size = size;
|
this.size = size;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static HoodieDataFile newDataFile(long size) {
|
||||||
|
return new TestHoodieDataFile(size);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String getPath() {
|
public String getPath() {
|
||||||
return "/tmp/test";
|
return "/tmp/test";
|
||||||
@@ -43,13 +47,8 @@ public class TestHoodieDataFile extends HoodieDataFile {
|
|||||||
return "100";
|
return "100";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public long getFileSize() {
|
public long getFileSize() {
|
||||||
return size;
|
return size;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static HoodieDataFile newDataFile(long size) {
|
|
||||||
return new TestHoodieDataFile(size);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -29,6 +29,10 @@ public class TestHoodieLogFile extends HoodieLogFile {
|
|||||||
this.size = size;
|
this.size = size;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static HoodieLogFile newLogFile(long size) {
|
||||||
|
return new TestHoodieLogFile(size);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Path getPath() {
|
public Path getPath() {
|
||||||
return new Path("/tmp/test-log");
|
return new Path("/tmp/test-log");
|
||||||
@@ -38,8 +42,4 @@ public class TestHoodieLogFile extends HoodieLogFile {
|
|||||||
public Optional<Long> getFileSize() {
|
public Optional<Long> getFileSize() {
|
||||||
return Optional.of(size);
|
return Optional.of(size);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static HoodieLogFile newLogFile(long size) {
|
|
||||||
return new TestHoodieLogFile(size);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -40,7 +40,6 @@ public class TestHoodieMetrics {
|
|||||||
@Test
|
@Test
|
||||||
public void testRegisterGauge() {
|
public void testRegisterGauge() {
|
||||||
metrics.registerGauge("metric1", 123L);
|
metrics.registerGauge("metric1", 123L);
|
||||||
assertTrue(Metrics.getInstance().getRegistry().getGauges().get("metric1").getValue().toString()
|
assertTrue(Metrics.getInstance().getRegistry().getGauges().get("metric1").getValue().toString().equals("123"));
|
||||||
.equals("123"));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -89,14 +89,13 @@ public class TestCopyOnWriteTable {
|
|||||||
|
|
||||||
String commitTime = HoodieTestUtils.makeNewCommitTime();
|
String commitTime = HoodieTestUtils.makeNewCommitTime();
|
||||||
HoodieWriteConfig config = makeHoodieClientConfig();
|
HoodieWriteConfig config = makeHoodieClientConfig();
|
||||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
|
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
||||||
basePath);
|
|
||||||
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config);
|
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config);
|
||||||
|
|
||||||
HoodieCreateHandle io = new HoodieCreateHandle(config, commitTime, table, partitionPath);
|
HoodieCreateHandle io = new HoodieCreateHandle(config, commitTime, table, partitionPath);
|
||||||
Path newPath = io.makeNewPath(record.getPartitionPath(), unitNumber, fileName);
|
Path newPath = io.makeNewPath(record.getPartitionPath(), unitNumber, fileName);
|
||||||
assertTrue(newPath.toString().equals(this.basePath + "/" + partitionPath + "/" + FSUtils
|
assertTrue(newPath.toString().equals(
|
||||||
.makeDataFileName(commitTime, unitNumber, fileName)));
|
this.basePath + "/" + partitionPath + "/" + FSUtils.makeDataFileName(commitTime, unitNumber, fileName)));
|
||||||
}
|
}
|
||||||
|
|
||||||
private HoodieWriteConfig makeHoodieClientConfig() throws Exception {
|
private HoodieWriteConfig makeHoodieClientConfig() throws Exception {
|
||||||
@@ -105,8 +104,7 @@ public class TestCopyOnWriteTable {
|
|||||||
|
|
||||||
private HoodieWriteConfig.Builder makeHoodieClientConfigBuilder() throws Exception {
|
private HoodieWriteConfig.Builder makeHoodieClientConfigBuilder() throws Exception {
|
||||||
// Prepare the AvroParquetIO
|
// Prepare the AvroParquetIO
|
||||||
String schemaStr = IOUtils
|
String schemaStr = IOUtils.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8");
|
||||||
.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8");
|
|
||||||
return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(schemaStr);
|
return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(schemaStr);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -122,28 +120,27 @@ public class TestCopyOnWriteTable {
|
|||||||
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
|
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
|
||||||
|
|
||||||
// Get some records belong to the same partition (2016/01/31)
|
// Get some records belong to the same partition (2016/01/31)
|
||||||
String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
|
||||||
String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
|
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
||||||
String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
|
||||||
String recordStr4 = "{\"_row_key\":\"8eb5b87d-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":51}";
|
+ "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
|
||||||
|
String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
|
||||||
|
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
||||||
|
String recordStr4 = "{\"_row_key\":\"8eb5b87d-1fej-4edd-87b4-6ec96dc405a0\","
|
||||||
|
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":51}";
|
||||||
|
|
||||||
List<HoodieRecord> records = new ArrayList<>();
|
List<HoodieRecord> records = new ArrayList<>();
|
||||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||||
records.add(
|
records.add(new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1));
|
||||||
new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
|
|
||||||
rowChange1));
|
|
||||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
||||||
records.add(
|
records.add(new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2));
|
||||||
new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
|
|
||||||
rowChange2));
|
|
||||||
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
||||||
records.add(
|
records.add(new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));
|
||||||
new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
|
|
||||||
rowChange3));
|
|
||||||
|
|
||||||
// Insert new records
|
// Insert new records
|
||||||
HoodieClientTestUtils.collectStatuses(table.handleInsert(firstCommitTime, records.iterator()));
|
HoodieClientTestUtils.collectStatuses(table.handleInsert(firstCommitTime, records.iterator()));
|
||||||
// We should have a parquet file generated (TODO: better control # files after we revise AvroParquetIO)
|
// We should have a parquet file generated (TODO: better control # files after we revise
|
||||||
|
// AvroParquetIO)
|
||||||
File parquetFile = null;
|
File parquetFile = null;
|
||||||
for (File file : new File(this.basePath + partitionPath).listFiles()) {
|
for (File file : new File(this.basePath + partitionPath).listFiles()) {
|
||||||
if (file.getName().endsWith(".parquet")) {
|
if (file.getName().endsWith(".parquet")) {
|
||||||
@@ -155,18 +152,17 @@ public class TestCopyOnWriteTable {
|
|||||||
|
|
||||||
// Read out the bloom filter and make sure filter can answer record exist or not
|
// Read out the bloom filter and make sure filter can answer record exist or not
|
||||||
Path parquetFilePath = new Path(parquetFile.getAbsolutePath());
|
Path parquetFilePath = new Path(parquetFile.getAbsolutePath());
|
||||||
BloomFilter filter = ParquetUtils
|
BloomFilter filter = ParquetUtils.readBloomFilterFromParquetMetadata(jsc.hadoopConfiguration(), parquetFilePath);
|
||||||
.readBloomFilterFromParquetMetadata(jsc.hadoopConfiguration(), parquetFilePath);
|
|
||||||
for (HoodieRecord record : records) {
|
for (HoodieRecord record : records) {
|
||||||
assertTrue(filter.mightContain(record.getRecordKey()));
|
assertTrue(filter.mightContain(record.getRecordKey()));
|
||||||
}
|
}
|
||||||
// Create a commit file
|
// Create a commit file
|
||||||
new File(this.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/"
|
new File(
|
||||||
+ FSUtils.getCommitTime(parquetFile.getName()) + ".commit").createNewFile();
|
this.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + FSUtils.getCommitTime(parquetFile.getName())
|
||||||
|
+ ".commit").createNewFile();
|
||||||
|
|
||||||
// Read the parquet file, check the record content
|
// Read the parquet file, check the record content
|
||||||
List<GenericRecord> fileRecords = ParquetUtils
|
List<GenericRecord> fileRecords = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), parquetFilePath);
|
||||||
.readAvroRecords(jsc.hadoopConfiguration(), parquetFilePath);
|
|
||||||
GenericRecord newRecord;
|
GenericRecord newRecord;
|
||||||
int index = 0;
|
int index = 0;
|
||||||
for (GenericRecord record : fileRecords) {
|
for (GenericRecord record : fileRecords) {
|
||||||
@@ -175,13 +171,12 @@ public class TestCopyOnWriteTable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// We update the 1st record & add a new record
|
// We update the 1st record & add a new record
|
||||||
String updateRecordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
String updateRecordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
|
||||||
|
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
||||||
TestRawTripPayload updateRowChanges1 = new TestRawTripPayload(updateRecordStr1);
|
TestRawTripPayload updateRowChanges1 = new TestRawTripPayload(updateRecordStr1);
|
||||||
HoodieRecord updatedRecord1 = new HoodieRecord(
|
HoodieRecord updatedRecord1 = new HoodieRecord(
|
||||||
new HoodieKey(updateRowChanges1.getRowKey(), updateRowChanges1.getPartitionPath()),
|
new HoodieKey(updateRowChanges1.getRowKey(), updateRowChanges1.getPartitionPath()), updateRowChanges1);
|
||||||
updateRowChanges1);
|
updatedRecord1.setCurrentLocation(new HoodieRecordLocation(null, FSUtils.getFileId(parquetFile.getName())));
|
||||||
updatedRecord1.setCurrentLocation(
|
|
||||||
new HoodieRecordLocation(null, FSUtils.getFileId(parquetFile.getName())));
|
|
||||||
|
|
||||||
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
|
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
|
||||||
HoodieRecord insertedRecord1 = new HoodieRecord(
|
HoodieRecord insertedRecord1 = new HoodieRecord(
|
||||||
@@ -201,9 +196,8 @@ public class TestCopyOnWriteTable {
|
|||||||
File updatedParquetFile = null;
|
File updatedParquetFile = null;
|
||||||
for (File file : new File(basePath + "/2016/01/31").listFiles()) {
|
for (File file : new File(basePath + "/2016/01/31").listFiles()) {
|
||||||
if (file.getName().endsWith(".parquet")) {
|
if (file.getName().endsWith(".parquet")) {
|
||||||
if (FSUtils.getFileId(file.getName())
|
if (FSUtils.getFileId(file.getName()).equals(FSUtils.getFileId(parquetFile.getName()))
|
||||||
.equals(FSUtils.getFileId(parquetFile.getName())) &&
|
&& HoodieTimeline.compareTimestamps(FSUtils.getCommitTime(file.getName()),
|
||||||
HoodieTimeline.compareTimestamps(FSUtils.getCommitTime(file.getName()),
|
|
||||||
FSUtils.getCommitTime(parquetFile.getName()), HoodieTimeline.GREATER)) {
|
FSUtils.getCommitTime(parquetFile.getName()), HoodieTimeline.GREATER)) {
|
||||||
updatedParquetFile = file;
|
updatedParquetFile = file;
|
||||||
break;
|
break;
|
||||||
@@ -213,8 +207,8 @@ public class TestCopyOnWriteTable {
|
|||||||
assertTrue(updatedParquetFile != null);
|
assertTrue(updatedParquetFile != null);
|
||||||
// Check whether the record has been updated
|
// Check whether the record has been updated
|
||||||
Path updatedParquetFilePath = new Path(updatedParquetFile.getAbsolutePath());
|
Path updatedParquetFilePath = new Path(updatedParquetFile.getAbsolutePath());
|
||||||
BloomFilter updatedFilter = ParquetUtils
|
BloomFilter updatedFilter = ParquetUtils.readBloomFilterFromParquetMetadata(jsc.hadoopConfiguration(),
|
||||||
.readBloomFilterFromParquetMetadata(jsc.hadoopConfiguration(), updatedParquetFilePath);
|
updatedParquetFilePath);
|
||||||
for (HoodieRecord record : records) {
|
for (HoodieRecord record : records) {
|
||||||
// No change to the _row_key
|
// No change to the _row_key
|
||||||
assertTrue(updatedFilter.mightContain(record.getRecordKey()));
|
assertTrue(updatedFilter.mightContain(record.getRecordKey()));
|
||||||
@@ -223,8 +217,7 @@ public class TestCopyOnWriteTable {
|
|||||||
assertTrue(updatedFilter.mightContain(insertedRecord1.getRecordKey()));
|
assertTrue(updatedFilter.mightContain(insertedRecord1.getRecordKey()));
|
||||||
records.add(insertedRecord1);// add this so it can further check below
|
records.add(insertedRecord1);// add this so it can further check below
|
||||||
|
|
||||||
ParquetReader updatedReader = ParquetReader
|
ParquetReader updatedReader = ParquetReader.builder(new AvroReadSupport<>(), updatedParquetFilePath).build();
|
||||||
.builder(new AvroReadSupport<>(), updatedParquetFilePath).build();
|
|
||||||
index = 0;
|
index = 0;
|
||||||
while ((newRecord = (GenericRecord) updatedReader.read()) != null) {
|
while ((newRecord = (GenericRecord) updatedReader.read()) != null) {
|
||||||
assertTrue(newRecord.get("_row_key").toString().equals(records.get(index).getRecordKey()));
|
assertTrue(newRecord.get("_row_key").toString().equals(records.get(index).getRecordKey()));
|
||||||
@@ -246,13 +239,9 @@ public class TestCopyOnWriteTable {
|
|||||||
List<HoodieRecord> records = new ArrayList<>();
|
List<HoodieRecord> records = new ArrayList<>();
|
||||||
for (int i = 0; i < n; i++) {
|
for (int i = 0; i < n; i++) {
|
||||||
String recordStr = String.format("{\"_row_key\":\"%s\",\"time\":\"%s\",\"number\":%d}",
|
String recordStr = String.format("{\"_row_key\":\"%s\",\"time\":\"%s\",\"number\":%d}",
|
||||||
UUID.randomUUID().toString(),
|
UUID.randomUUID().toString(), time, i);
|
||||||
time,
|
|
||||||
i);
|
|
||||||
TestRawTripPayload rowChange = new TestRawTripPayload(recordStr);
|
TestRawTripPayload rowChange = new TestRawTripPayload(recordStr);
|
||||||
records.add(new HoodieRecord(
|
records.add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange));
|
||||||
new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()),
|
|
||||||
rowChange));
|
|
||||||
}
|
}
|
||||||
return records;
|
return records;
|
||||||
}
|
}
|
||||||
@@ -261,31 +250,28 @@ public class TestCopyOnWriteTable {
|
|||||||
@Test
|
@Test
|
||||||
public void testMetadataAggregateFromWriteStatus() throws Exception {
|
public void testMetadataAggregateFromWriteStatus() throws Exception {
|
||||||
// Prepare the AvroParquetIO
|
// Prepare the AvroParquetIO
|
||||||
HoodieWriteConfig config = makeHoodieClientConfigBuilder()
|
HoodieWriteConfig config = makeHoodieClientConfigBuilder().withWriteStatusClass(MetadataMergeWriteStatus.class)
|
||||||
.withWriteStatusClass(MetadataMergeWriteStatus.class).build();
|
.build();
|
||||||
String firstCommitTime = HoodieTestUtils.makeNewCommitTime();
|
String firstCommitTime = HoodieTestUtils.makeNewCommitTime();
|
||||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
||||||
|
|
||||||
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
|
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
|
||||||
|
|
||||||
// Get some records belong to the same partition (2016/01/31)
|
// Get some records belong to the same partition (2016/01/31)
|
||||||
String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
|
||||||
String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
|
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
||||||
String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
|
||||||
|
+ "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
|
||||||
|
String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
|
||||||
|
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
||||||
|
|
||||||
List<HoodieRecord> records = new ArrayList<>();
|
List<HoodieRecord> records = new ArrayList<>();
|
||||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||||
records.add(
|
records.add(new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1));
|
||||||
new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
|
|
||||||
rowChange1));
|
|
||||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
||||||
records.add(
|
records.add(new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2));
|
||||||
new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
|
|
||||||
rowChange2));
|
|
||||||
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
||||||
records.add(
|
records.add(new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));
|
||||||
new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
|
|
||||||
rowChange3));
|
|
||||||
|
|
||||||
// Insert new records
|
// Insert new records
|
||||||
List<WriteStatus> writeStatuses = HoodieClientTestUtils
|
List<WriteStatus> writeStatuses = HoodieClientTestUtils
|
||||||
@@ -293,7 +279,8 @@ public class TestCopyOnWriteTable {
|
|||||||
Map<String, String> allWriteStatusMergedMetadataMap = MetadataMergeWriteStatus
|
Map<String, String> allWriteStatusMergedMetadataMap = MetadataMergeWriteStatus
|
||||||
.mergeMetadataForWriteStatuses(writeStatuses);
|
.mergeMetadataForWriteStatuses(writeStatuses);
|
||||||
assertTrue(allWriteStatusMergedMetadataMap.containsKey("InputRecordCount_1506582000"));
|
assertTrue(allWriteStatusMergedMetadataMap.containsKey("InputRecordCount_1506582000"));
|
||||||
// For metadata key InputRecordCount_1506582000, value is 2 for each record. So sum of this should be 2 * 3
|
// For metadata key InputRecordCount_1506582000, value is 2 for each record. So sum of this
|
||||||
|
// should be 2 * 3
|
||||||
assertEquals("6", allWriteStatusMergedMetadataMap.get("InputRecordCount_1506582000"));
|
assertEquals("6", allWriteStatusMergedMetadataMap.get("InputRecordCount_1506582000"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -314,26 +301,19 @@ public class TestCopyOnWriteTable {
|
|||||||
List<WriteStatus> statuses = HoodieClientTestUtils
|
List<WriteStatus> statuses = HoodieClientTestUtils
|
||||||
.collectStatuses(table.handleInsert(commitTime, records.iterator()));
|
.collectStatuses(table.handleInsert(commitTime, records.iterator()));
|
||||||
WriteStatus status = statuses.get(0);
|
WriteStatus status = statuses.get(0);
|
||||||
Path partialFile = new Path(String.format("%s/%s/%s",
|
Path partialFile = new Path(String.format("%s/%s/%s", basePath, status.getPartitionPath(),
|
||||||
basePath,
|
FSUtils.makeDataFileName(commitTime, 0, status.getFileId())));
|
||||||
status.getPartitionPath(),
|
|
||||||
FSUtils.makeDataFileName(commitTime, 0, status.getFileId()))
|
|
||||||
);
|
|
||||||
assertTrue(fs.exists(partialFile));
|
assertTrue(fs.exists(partialFile));
|
||||||
|
|
||||||
// When we retry
|
// When we retry
|
||||||
records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z");
|
records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z");
|
||||||
records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z"));
|
records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z"));
|
||||||
|
|
||||||
statuses = HoodieClientTestUtils
|
statuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator()));
|
||||||
.collectStatuses(table.handleInsert(commitTime, records.iterator()));
|
|
||||||
status = statuses.get(0);
|
status = statuses.get(0);
|
||||||
|
|
||||||
Path retriedFIle = new Path(String.format("%s/%s/%s",
|
Path retriedFIle = new Path(String.format("%s/%s/%s", basePath, status.getPartitionPath(),
|
||||||
basePath,
|
FSUtils.makeDataFileName(commitTime, 0, status.getFileId())));
|
||||||
status.getPartitionPath(),
|
|
||||||
FSUtils.makeDataFileName(commitTime, 0, status.getFileId()))
|
|
||||||
);
|
|
||||||
assertTrue(fs.exists(retriedFIle));
|
assertTrue(fs.exists(retriedFIle));
|
||||||
assertFalse(fs.exists(partialFile));
|
assertFalse(fs.exists(partialFile));
|
||||||
}
|
}
|
||||||
@@ -371,8 +351,7 @@ public class TestCopyOnWriteTable {
|
|||||||
records.addAll(newHoodieRecords(1, "2016-02-02T03:16:41.415Z"));
|
records.addAll(newHoodieRecords(1, "2016-02-02T03:16:41.415Z"));
|
||||||
|
|
||||||
// Insert new records
|
// Insert new records
|
||||||
returnedStatuses = HoodieClientTestUtils
|
returnedStatuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator()));
|
||||||
.collectStatuses(table.handleInsert(commitTime, records.iterator()));
|
|
||||||
|
|
||||||
assertEquals(3, returnedStatuses.size());
|
assertEquals(3, returnedStatuses.size());
|
||||||
assertEquals("2016/01/31", returnedStatuses.get(0).getPartitionPath());
|
assertEquals("2016/01/31", returnedStatuses.get(0).getPartitionPath());
|
||||||
@@ -389,8 +368,8 @@ public class TestCopyOnWriteTable {
|
|||||||
@Test
|
@Test
|
||||||
public void testFileSizeUpsertRecords() throws Exception {
|
public void testFileSizeUpsertRecords() throws Exception {
|
||||||
HoodieWriteConfig config = makeHoodieClientConfigBuilder().withStorageConfig(
|
HoodieWriteConfig config = makeHoodieClientConfigBuilder().withStorageConfig(
|
||||||
HoodieStorageConfig.newBuilder().limitFileSize(64 * 1024).parquetBlockSize(64 * 1024)
|
HoodieStorageConfig.newBuilder().limitFileSize(64 * 1024).parquetBlockSize(64 * 1024).parquetPageSize(64 * 1024)
|
||||||
.parquetPageSize(64 * 1024).build()).build();
|
.build()).build();
|
||||||
String commitTime = HoodieTestUtils.makeNewCommitTime();
|
String commitTime = HoodieTestUtils.makeNewCommitTime();
|
||||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
||||||
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
|
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
|
||||||
@@ -398,12 +377,11 @@ public class TestCopyOnWriteTable {
|
|||||||
List<HoodieRecord> records = new ArrayList<>();
|
List<HoodieRecord> records = new ArrayList<>();
|
||||||
// Approx 1150 records are written for block size of 64KB
|
// Approx 1150 records are written for block size of 64KB
|
||||||
for (int i = 0; i < 2000; i++) {
|
for (int i = 0; i < 2000; i++) {
|
||||||
String recordStr = "{\"_row_key\":\"" + UUID.randomUUID().toString()
|
String recordStr =
|
||||||
+ "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":" + i + "}";
|
"{\"_row_key\":\"" + UUID.randomUUID().toString() + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":" + i
|
||||||
|
+ "}";
|
||||||
TestRawTripPayload rowChange = new TestRawTripPayload(recordStr);
|
TestRawTripPayload rowChange = new TestRawTripPayload(recordStr);
|
||||||
records
|
records.add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange));
|
||||||
.add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()),
|
|
||||||
rowChange));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Insert new records
|
// Insert new records
|
||||||
@@ -412,39 +390,30 @@ public class TestCopyOnWriteTable {
|
|||||||
// Check the updated file
|
// Check the updated file
|
||||||
int counts = 0;
|
int counts = 0;
|
||||||
for (File file : new File(basePath + "/2016/01/31").listFiles()) {
|
for (File file : new File(basePath + "/2016/01/31").listFiles()) {
|
||||||
if (file.getName().endsWith(".parquet") && FSUtils.getCommitTime(file.getName())
|
if (file.getName().endsWith(".parquet") && FSUtils.getCommitTime(file.getName()).equals(commitTime)) {
|
||||||
.equals(commitTime)) {
|
|
||||||
System.out.println(file.getName() + "-" + file.length());
|
System.out.println(file.getName() + "-" + file.length());
|
||||||
counts++;
|
counts++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
assertEquals(
|
assertEquals("If the number of records are more than 1150, then there should be a new file", 3, counts);
|
||||||
"If the number of records are more than 1150, then there should be a new file", 3,
|
|
||||||
counts);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private List<HoodieCopyOnWriteTable.InsertBucket> testUpsertPartitioner(int smallFileSize,
|
private List<HoodieCopyOnWriteTable.InsertBucket> testUpsertPartitioner(int smallFileSize, int numInserts,
|
||||||
int numInserts,
|
int numUpdates, int fileSize, boolean autoSplitInserts) throws Exception {
|
||||||
int numUpdates,
|
final String testPartitionPath = "2016/09/26";
|
||||||
int fileSize,
|
HoodieWriteConfig config = makeHoodieClientConfigBuilder().withCompactionConfig(
|
||||||
boolean autoSplitInserts) throws Exception {
|
HoodieCompactionConfig.newBuilder().compactionSmallFileSize(smallFileSize).insertSplitSize(100)
|
||||||
final String TEST_PARTITION_PATH = "2016/09/26";
|
.autoTuneInsertSplits(autoSplitInserts).build()).withStorageConfig(
|
||||||
HoodieWriteConfig config = makeHoodieClientConfigBuilder()
|
HoodieStorageConfig.newBuilder().limitFileSize(1000 * 1024).build()).build();
|
||||||
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
|
|
||||||
.compactionSmallFileSize(smallFileSize).insertSplitSize(100)
|
|
||||||
.autoTuneInsertSplits(autoSplitInserts).build())
|
|
||||||
.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1000 * 1024).build())
|
|
||||||
.build();
|
|
||||||
|
|
||||||
HoodieClientTestUtils.fakeCommitFile(basePath, "001");
|
HoodieClientTestUtils.fakeCommitFile(basePath, "001");
|
||||||
HoodieClientTestUtils.fakeDataFile(basePath, TEST_PARTITION_PATH, "001", "file1", fileSize);
|
HoodieClientTestUtils.fakeDataFile(basePath, testPartitionPath, "001", "file1", fileSize);
|
||||||
|
|
||||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
||||||
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
|
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
|
||||||
|
|
||||||
HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(
|
HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[] {testPartitionPath});
|
||||||
new String[]{TEST_PARTITION_PATH});
|
|
||||||
List<HoodieRecord> insertRecords = dataGenerator.generateInserts("001", numInserts);
|
List<HoodieRecord> insertRecords = dataGenerator.generateInserts("001", numInserts);
|
||||||
List<HoodieRecord> updateRecords = dataGenerator.generateUpdates("001", numUpdates);
|
List<HoodieRecord> updateRecords = dataGenerator.generateUpdates("001", numUpdates);
|
||||||
for (HoodieRecord updateRec : updateRecords) {
|
for (HoodieRecord updateRec : updateRecords) {
|
||||||
@@ -454,8 +423,8 @@ public class TestCopyOnWriteTable {
|
|||||||
records.addAll(insertRecords);
|
records.addAll(insertRecords);
|
||||||
records.addAll(updateRecords);
|
records.addAll(updateRecords);
|
||||||
WorkloadProfile profile = new WorkloadProfile(jsc.parallelize(records));
|
WorkloadProfile profile = new WorkloadProfile(jsc.parallelize(records));
|
||||||
HoodieCopyOnWriteTable.UpsertPartitioner partitioner = (HoodieCopyOnWriteTable.UpsertPartitioner)
|
HoodieCopyOnWriteTable.UpsertPartitioner partitioner =
|
||||||
table.getUpsertPartitioner(profile);
|
(HoodieCopyOnWriteTable.UpsertPartitioner) table.getUpsertPartitioner(profile);
|
||||||
|
|
||||||
assertEquals("Should have 3 partitions", 3, partitioner.numPartitions());
|
assertEquals("Should have 3 partitions", 3, partitioner.numPartitions());
|
||||||
assertEquals("Bucket 0 is UPDATE", HoodieCopyOnWriteTable.BucketType.UPDATE,
|
assertEquals("Bucket 0 is UPDATE", HoodieCopyOnWriteTable.BucketType.UPDATE,
|
||||||
@@ -464,40 +433,35 @@ public class TestCopyOnWriteTable {
|
|||||||
partitioner.getBucketInfo(1).bucketType);
|
partitioner.getBucketInfo(1).bucketType);
|
||||||
assertEquals("Bucket 2 is INSERT", HoodieCopyOnWriteTable.BucketType.INSERT,
|
assertEquals("Bucket 2 is INSERT", HoodieCopyOnWriteTable.BucketType.INSERT,
|
||||||
partitioner.getBucketInfo(2).bucketType);
|
partitioner.getBucketInfo(2).bucketType);
|
||||||
assertEquals("Update record should have gone to the 1 update partiton", 0,
|
assertEquals("Update record should have gone to the 1 update partiton", 0, partitioner.getPartition(
|
||||||
partitioner.getPartition(new Tuple2<>(updateRecords.get(0).getKey(),
|
new Tuple2<>(updateRecords.get(0).getKey(), Option.apply(updateRecords.get(0).getCurrentLocation()))));
|
||||||
Option.apply(updateRecords.get(0).getCurrentLocation()))));
|
return partitioner.getInsertBuckets(testPartitionPath);
|
||||||
return partitioner.getInsertBuckets(TEST_PARTITION_PATH);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testUpsertPartitioner() throws Exception {
|
public void testUpsertPartitioner() throws Exception {
|
||||||
// Inserts + Updates... Check all updates go together & inserts subsplit
|
// Inserts + Updates... Check all updates go together & inserts subsplit
|
||||||
List<HoodieCopyOnWriteTable.InsertBucket> insertBuckets = testUpsertPartitioner(0, 200, 100,
|
List<HoodieCopyOnWriteTable.InsertBucket> insertBuckets = testUpsertPartitioner(0, 200, 100, 1024, false);
|
||||||
1024, false);
|
|
||||||
assertEquals("Total of 2 insert buckets", 2, insertBuckets.size());
|
assertEquals("Total of 2 insert buckets", 2, insertBuckets.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testUpsertPartitionerWithSmallInsertHandling() throws Exception {
|
public void testUpsertPartitionerWithSmallInsertHandling() throws Exception {
|
||||||
// Inserts + Updates .. Check updates go together & inserts subsplit, after expanding smallest file
|
// Inserts + Updates .. Check updates go together & inserts subsplit, after expanding
|
||||||
List<HoodieCopyOnWriteTable.InsertBucket> insertBuckets = testUpsertPartitioner(1000 * 1024,
|
// smallest file
|
||||||
400, 100, 800 * 1024, false);
|
List<HoodieCopyOnWriteTable.InsertBucket> insertBuckets = testUpsertPartitioner(1000 * 1024, 400, 100, 800 * 1024,
|
||||||
|
false);
|
||||||
assertEquals("Total of 3 insert buckets", 3, insertBuckets.size());
|
assertEquals("Total of 3 insert buckets", 3, insertBuckets.size());
|
||||||
assertEquals("First insert bucket must be same as update bucket", 0,
|
assertEquals("First insert bucket must be same as update bucket", 0, insertBuckets.get(0).bucketNumber);
|
||||||
insertBuckets.get(0).bucketNumber);
|
assertEquals("First insert bucket should have weight 0.5", 0.5, insertBuckets.get(0).weight, 0.01);
|
||||||
assertEquals("First insert bucket should have weight 0.5", 0.5, insertBuckets.get(0).weight,
|
|
||||||
0.01);
|
|
||||||
|
|
||||||
// Now with insert split size auto tuned
|
// Now with insert split size auto tuned
|
||||||
insertBuckets = testUpsertPartitioner(1000 * 1024, 2400, 100, 800 * 1024, true);
|
insertBuckets = testUpsertPartitioner(1000 * 1024, 2400, 100, 800 * 1024, true);
|
||||||
assertEquals("Total of 3 insert buckets", 3, insertBuckets.size());
|
assertEquals("Total of 3 insert buckets", 3, insertBuckets.size());
|
||||||
assertEquals("First insert bucket must be same as update bucket", 0,
|
assertEquals("First insert bucket must be same as update bucket", 0, insertBuckets.get(0).bucketNumber);
|
||||||
insertBuckets.get(0).bucketNumber);
|
assertEquals("First insert bucket should have weight 0.5", 200.0 / 2400, insertBuckets.get(0).weight, 0.01);
|
||||||
assertEquals("First insert bucket should have weight 0.5", 200.0 / 2400,
|
|
||||||
insertBuckets.get(0).weight, 0.01);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@After
|
@After
|
||||||
|
|||||||
@@ -18,7 +18,6 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.table;
|
package com.uber.hoodie.table;
|
||||||
|
|
||||||
|
|
||||||
import static com.uber.hoodie.common.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA;
|
import static com.uber.hoodie.common.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA;
|
||||||
import static org.junit.Assert.assertEquals;
|
import static org.junit.Assert.assertEquals;
|
||||||
import static org.junit.Assert.assertFalse;
|
import static org.junit.Assert.assertFalse;
|
||||||
@@ -76,15 +75,14 @@ import org.junit.rules.TemporaryFolder;
|
|||||||
|
|
||||||
public class TestMergeOnReadTable {
|
public class TestMergeOnReadTable {
|
||||||
|
|
||||||
private transient JavaSparkContext jsc = null;
|
|
||||||
private transient SQLContext sqlContext;
|
|
||||||
private static String basePath = null;
|
private static String basePath = null;
|
||||||
|
|
||||||
//NOTE : Be careful in using DFS (FileSystem.class) vs LocalFs(RawLocalFileSystem.class)
|
//NOTE : Be careful in using DFS (FileSystem.class) vs LocalFs(RawLocalFileSystem.class)
|
||||||
//The implementation and gurantees of many API's differ, for example check rename(src,dst)
|
//The implementation and gurantees of many API's differ, for example check rename(src,dst)
|
||||||
private static MiniDFSCluster dfsCluster;
|
private static MiniDFSCluster dfsCluster;
|
||||||
private static DistributedFileSystem dfs;
|
private static DistributedFileSystem dfs;
|
||||||
private static HdfsTestService hdfsTestService;
|
private static HdfsTestService hdfsTestService;
|
||||||
|
private transient JavaSparkContext jsc = null;
|
||||||
|
private transient SQLContext sqlContext;
|
||||||
|
|
||||||
@AfterClass
|
@AfterClass
|
||||||
public static void cleanUp() throws Exception {
|
public static void cleanUp() throws Exception {
|
||||||
@@ -92,13 +90,15 @@ public class TestMergeOnReadTable {
|
|||||||
hdfsTestService.stop();
|
hdfsTestService.stop();
|
||||||
dfsCluster.shutdown();
|
dfsCluster.shutdown();
|
||||||
}
|
}
|
||||||
// Need to closeAll to clear FileSystem.Cache, required because DFS and LocalFS used in the same JVM
|
// Need to closeAll to clear FileSystem.Cache, required because DFS and LocalFS used in the
|
||||||
|
// same JVM
|
||||||
FileSystem.closeAll();
|
FileSystem.closeAll();
|
||||||
}
|
}
|
||||||
|
|
||||||
@BeforeClass
|
@BeforeClass
|
||||||
public static void setUpDFS() throws IOException {
|
public static void setUpDFS() throws IOException {
|
||||||
// Need to closeAll to clear FileSystem.Cache, required because DFS and LocalFS used in the same JVM
|
// Need to closeAll to clear FileSystem.Cache, required because DFS and LocalFS used in the
|
||||||
|
// same JVM
|
||||||
FileSystem.closeAll();
|
FileSystem.closeAll();
|
||||||
if (hdfsTestService == null) {
|
if (hdfsTestService == null) {
|
||||||
hdfsTestService = new HdfsTestService();
|
hdfsTestService = new HdfsTestService();
|
||||||
@@ -111,8 +111,7 @@ public class TestMergeOnReadTable {
|
|||||||
@Before
|
@Before
|
||||||
public void init() throws IOException {
|
public void init() throws IOException {
|
||||||
// Initialize a local spark env
|
// Initialize a local spark env
|
||||||
jsc = new JavaSparkContext(
|
jsc = new JavaSparkContext(HoodieClientTestUtils.getSparkConfForTest("TestHoodieMergeOnReadTable"));
|
||||||
HoodieClientTestUtils.getSparkConfForTest("TestHoodieMergeOnReadTable"));
|
|
||||||
|
|
||||||
// Create a temp folder as the base path
|
// Create a temp folder as the base path
|
||||||
TemporaryFolder folder = new TemporaryFolder();
|
TemporaryFolder folder = new TemporaryFolder();
|
||||||
@@ -154,28 +153,23 @@ public class TestMergeOnReadTable {
|
|||||||
List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect();
|
List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect();
|
||||||
assertNoWriteErrors(statuses);
|
assertNoWriteErrors(statuses);
|
||||||
|
|
||||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
|
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
||||||
cfg.getBasePath());
|
|
||||||
HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
|
HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
|
||||||
|
|
||||||
Optional<HoodieInstant> deltaCommit =
|
Optional<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
|
||||||
metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
|
|
||||||
assertTrue(deltaCommit.isPresent());
|
assertTrue(deltaCommit.isPresent());
|
||||||
assertEquals("Delta commit should be 001", "001", deltaCommit.get().getTimestamp());
|
assertEquals("Delta commit should be 001", "001", deltaCommit.get().getTimestamp());
|
||||||
|
|
||||||
Optional<HoodieInstant> commit =
|
Optional<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
|
||||||
metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
|
|
||||||
assertFalse(commit.isPresent());
|
assertFalse(commit.isPresent());
|
||||||
|
|
||||||
FileStatus[] allFiles = HoodieTestUtils
|
FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
|
||||||
.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
|
|
||||||
TableFileSystemView.ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient,
|
TableFileSystemView.ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient,
|
||||||
hoodieTable.getCommitTimeline().filterCompletedInstants(), allFiles);
|
hoodieTable.getCommitTimeline().filterCompletedInstants(), allFiles);
|
||||||
Stream<HoodieDataFile> dataFilesToRead = roView.getLatestDataFiles();
|
Stream<HoodieDataFile> dataFilesToRead = roView.getLatestDataFiles();
|
||||||
assertTrue(!dataFilesToRead.findAny().isPresent());
|
assertTrue(!dataFilesToRead.findAny().isPresent());
|
||||||
|
|
||||||
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(),
|
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), allFiles);
|
||||||
allFiles);
|
|
||||||
dataFilesToRead = roView.getLatestDataFiles();
|
dataFilesToRead = roView.getLatestDataFiles();
|
||||||
assertTrue("RealtimeTableView should list the parquet files we wrote in the delta commit",
|
assertTrue("RealtimeTableView should list the parquet files we wrote in the delta commit",
|
||||||
dataFilesToRead.findAny().isPresent());
|
dataFilesToRead.findAny().isPresent());
|
||||||
@@ -209,21 +203,17 @@ public class TestMergeOnReadTable {
|
|||||||
client.compact(compactionCommitTime);
|
client.compact(compactionCommitTime);
|
||||||
|
|
||||||
allFiles = HoodieTestUtils.listAllDataFilesInPath(dfs, cfg.getBasePath());
|
allFiles = HoodieTestUtils.listAllDataFilesInPath(dfs, cfg.getBasePath());
|
||||||
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(),
|
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), allFiles);
|
||||||
allFiles);
|
|
||||||
dataFilesToRead = roView.getLatestDataFiles();
|
dataFilesToRead = roView.getLatestDataFiles();
|
||||||
assertTrue(dataFilesToRead.findAny().isPresent());
|
assertTrue(dataFilesToRead.findAny().isPresent());
|
||||||
|
|
||||||
// verify that there is a commit
|
// verify that there is a commit
|
||||||
HoodieTable table = HoodieTable.getHoodieTable(
|
HoodieTable table = HoodieTable.getHoodieTable(
|
||||||
new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath(), true),
|
new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath(), true), getConfig(false));
|
||||||
getConfig(false));
|
|
||||||
HoodieTimeline timeline = table.getCommitTimeline().filterCompletedInstants();
|
HoodieTimeline timeline = table.getCommitTimeline().filterCompletedInstants();
|
||||||
assertEquals("Expecting a single commit.", 1,
|
assertEquals("Expecting a single commit.", 1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
|
||||||
timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
|
|
||||||
String latestCompactionCommitTime = timeline.lastInstant().get().getTimestamp();
|
String latestCompactionCommitTime = timeline.lastInstant().get().getTimestamp();
|
||||||
assertTrue(HoodieTimeline
|
assertTrue(HoodieTimeline.compareTimestamps("000", latestCompactionCommitTime, HoodieTimeline.LESSER));
|
||||||
.compareTimestamps("000", latestCompactionCommitTime, HoodieTimeline.LESSER));
|
|
||||||
|
|
||||||
assertEquals("Must contain 200 records", 200,
|
assertEquals("Must contain 200 records", 200,
|
||||||
HoodieClientTestUtils.readSince(basePath, sqlContext, timeline, "000").count());
|
HoodieClientTestUtils.readSince(basePath, sqlContext, timeline, "000").count());
|
||||||
@@ -232,8 +222,7 @@ public class TestMergeOnReadTable {
|
|||||||
// Check if record level metadata is aggregated properly at the end of write.
|
// Check if record level metadata is aggregated properly at the end of write.
|
||||||
@Test
|
@Test
|
||||||
public void testMetadataAggregateFromWriteStatus() throws Exception {
|
public void testMetadataAggregateFromWriteStatus() throws Exception {
|
||||||
HoodieWriteConfig cfg = getConfigBuilder(false)
|
HoodieWriteConfig cfg = getConfigBuilder(false).withWriteStatusClass(MetadataMergeWriteStatus.class).build();
|
||||||
.withWriteStatusClass(MetadataMergeWriteStatus.class).build();
|
|
||||||
HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
|
HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
|
||||||
|
|
||||||
String newCommitTime = "001";
|
String newCommitTime = "001";
|
||||||
@@ -248,7 +237,8 @@ public class TestMergeOnReadTable {
|
|||||||
Map<String, String> allWriteStatusMergedMetadataMap = MetadataMergeWriteStatus
|
Map<String, String> allWriteStatusMergedMetadataMap = MetadataMergeWriteStatus
|
||||||
.mergeMetadataForWriteStatuses(statuses);
|
.mergeMetadataForWriteStatuses(statuses);
|
||||||
assertTrue(allWriteStatusMergedMetadataMap.containsKey("InputRecordCount_1506582000"));
|
assertTrue(allWriteStatusMergedMetadataMap.containsKey("InputRecordCount_1506582000"));
|
||||||
// For metadata key InputRecordCount_1506582000, value is 2 for each record. So sum of this should be 2 * records.size()
|
// For metadata key InputRecordCount_1506582000, value is 2 for each record. So sum of this
|
||||||
|
// should be 2 * records.size()
|
||||||
assertEquals(String.valueOf(2 * records.size()),
|
assertEquals(String.valueOf(2 * records.size()),
|
||||||
allWriteStatusMergedMetadataMap.get("InputRecordCount_1506582000"));
|
allWriteStatusMergedMetadataMap.get("InputRecordCount_1506582000"));
|
||||||
}
|
}
|
||||||
@@ -271,28 +261,23 @@ public class TestMergeOnReadTable {
|
|||||||
List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect();
|
List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect();
|
||||||
assertNoWriteErrors(statuses);
|
assertNoWriteErrors(statuses);
|
||||||
|
|
||||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
|
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
||||||
cfg.getBasePath());
|
|
||||||
HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
|
HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
|
||||||
|
|
||||||
Optional<HoodieInstant> deltaCommit =
|
Optional<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
|
||||||
metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
|
|
||||||
assertTrue(deltaCommit.isPresent());
|
assertTrue(deltaCommit.isPresent());
|
||||||
assertEquals("Delta commit should be 001", "001", deltaCommit.get().getTimestamp());
|
assertEquals("Delta commit should be 001", "001", deltaCommit.get().getTimestamp());
|
||||||
|
|
||||||
Optional<HoodieInstant> commit =
|
Optional<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
|
||||||
metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
|
|
||||||
assertFalse(commit.isPresent());
|
assertFalse(commit.isPresent());
|
||||||
|
|
||||||
FileStatus[] allFiles = HoodieTestUtils
|
FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
|
||||||
.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
|
|
||||||
TableFileSystemView.ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient,
|
TableFileSystemView.ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient,
|
||||||
hoodieTable.getCommitTimeline().filterCompletedInstants(), allFiles);
|
hoodieTable.getCommitTimeline().filterCompletedInstants(), allFiles);
|
||||||
Stream<HoodieDataFile> dataFilesToRead = roView.getLatestDataFiles();
|
Stream<HoodieDataFile> dataFilesToRead = roView.getLatestDataFiles();
|
||||||
assertTrue(!dataFilesToRead.findAny().isPresent());
|
assertTrue(!dataFilesToRead.findAny().isPresent());
|
||||||
|
|
||||||
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(),
|
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), allFiles);
|
||||||
allFiles);
|
|
||||||
dataFilesToRead = roView.getLatestDataFiles();
|
dataFilesToRead = roView.getLatestDataFiles();
|
||||||
assertTrue("RealtimeTableView should list the parquet files we wrote in the delta commit",
|
assertTrue("RealtimeTableView should list the parquet files we wrote in the delta commit",
|
||||||
dataFilesToRead.findAny().isPresent());
|
dataFilesToRead.findAny().isPresent());
|
||||||
@@ -329,15 +314,12 @@ public class TestMergeOnReadTable {
|
|||||||
assertFalse(commit.isPresent());
|
assertFalse(commit.isPresent());
|
||||||
|
|
||||||
allFiles = HoodieTestUtils.listAllDataFilesInPath(dfs, cfg.getBasePath());
|
allFiles = HoodieTestUtils.listAllDataFilesInPath(dfs, cfg.getBasePath());
|
||||||
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(),
|
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), allFiles);
|
||||||
allFiles);
|
|
||||||
dataFilesToRead = roView.getLatestDataFiles();
|
dataFilesToRead = roView.getLatestDataFiles();
|
||||||
assertTrue(dataFilesToRead.findAny().isPresent());
|
assertTrue(dataFilesToRead.findAny().isPresent());
|
||||||
|
|
||||||
List<String> dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath())
|
List<String> dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()).collect(Collectors.toList());
|
||||||
.collect(Collectors.toList());
|
List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles, basePath);
|
||||||
List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils
|
|
||||||
.getRecordsUsingInputFormat(dataFiles, basePath);
|
|
||||||
//Wrote 40 records and deleted 20 records, so remaining 40-20 = 20
|
//Wrote 40 records and deleted 20 records, so remaining 40-20 = 20
|
||||||
assertEquals("Must contain 20 records", 20, recordsRead.size());
|
assertEquals("Must contain 20 records", 20, recordsRead.size());
|
||||||
}
|
}
|
||||||
@@ -365,10 +347,8 @@ public class TestMergeOnReadTable {
|
|||||||
//verify there are no errors
|
//verify there are no errors
|
||||||
assertNoWriteErrors(statuses);
|
assertNoWriteErrors(statuses);
|
||||||
|
|
||||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
|
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
||||||
cfg.getBasePath());
|
Optional<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
|
||||||
Optional<HoodieInstant> commit =
|
|
||||||
metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
|
|
||||||
assertTrue(commit.isPresent());
|
assertTrue(commit.isPresent());
|
||||||
assertEquals("commit should be 001", "001", commit.get().getTimestamp());
|
assertEquals("commit should be 001", "001", commit.get().getTimestamp());
|
||||||
|
|
||||||
@@ -391,10 +371,8 @@ public class TestMergeOnReadTable {
|
|||||||
client.rollback(newCommitTime);
|
client.rollback(newCommitTime);
|
||||||
|
|
||||||
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
||||||
HoodieTable hoodieTable = HoodieTable
|
HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
|
||||||
.getHoodieTable(metaClient, cfg);
|
FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
|
||||||
FileStatus[] allFiles = HoodieTestUtils
|
|
||||||
.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
|
|
||||||
HoodieTableFileSystemView roView = new HoodieTableFileSystemView(metaClient,
|
HoodieTableFileSystemView roView = new HoodieTableFileSystemView(metaClient,
|
||||||
hoodieTable.getCompletedCommitTimeline(), allFiles);
|
hoodieTable.getCompletedCommitTimeline(), allFiles);
|
||||||
|
|
||||||
@@ -428,28 +406,23 @@ public class TestMergeOnReadTable {
|
|||||||
List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect();
|
List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect();
|
||||||
assertNoWriteErrors(statuses);
|
assertNoWriteErrors(statuses);
|
||||||
|
|
||||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
|
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
||||||
cfg.getBasePath());
|
|
||||||
HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
|
HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
|
||||||
|
|
||||||
Optional<HoodieInstant> deltaCommit =
|
Optional<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
|
||||||
metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
|
|
||||||
assertTrue(deltaCommit.isPresent());
|
assertTrue(deltaCommit.isPresent());
|
||||||
assertEquals("Delta commit should be 001", "001", deltaCommit.get().getTimestamp());
|
assertEquals("Delta commit should be 001", "001", deltaCommit.get().getTimestamp());
|
||||||
|
|
||||||
Optional<HoodieInstant> commit =
|
Optional<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
|
||||||
metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
|
|
||||||
assertFalse(commit.isPresent());
|
assertFalse(commit.isPresent());
|
||||||
|
|
||||||
FileStatus[] allFiles = HoodieTestUtils
|
FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
|
||||||
.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
|
|
||||||
TableFileSystemView.ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient,
|
TableFileSystemView.ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient,
|
||||||
hoodieTable.getCommitTimeline().filterCompletedInstants(), allFiles);
|
hoodieTable.getCommitTimeline().filterCompletedInstants(), allFiles);
|
||||||
Stream<HoodieDataFile> dataFilesToRead = roView.getLatestDataFiles();
|
Stream<HoodieDataFile> dataFilesToRead = roView.getLatestDataFiles();
|
||||||
assertTrue(!dataFilesToRead.findAny().isPresent());
|
assertTrue(!dataFilesToRead.findAny().isPresent());
|
||||||
|
|
||||||
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(),
|
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), allFiles);
|
||||||
allFiles);
|
|
||||||
dataFilesToRead = roView.getLatestDataFiles();
|
dataFilesToRead = roView.getLatestDataFiles();
|
||||||
assertTrue("RealtimeTableView should list the parquet files we wrote in the delta commit",
|
assertTrue("RealtimeTableView should list the parquet files we wrote in the delta commit",
|
||||||
dataFilesToRead.findAny().isPresent());
|
dataFilesToRead.findAny().isPresent());
|
||||||
@@ -473,10 +446,8 @@ public class TestMergeOnReadTable {
|
|||||||
commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
|
commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
|
||||||
assertFalse(commit.isPresent());
|
assertFalse(commit.isPresent());
|
||||||
|
|
||||||
List<String> dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath())
|
List<String> dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()).collect(Collectors.toList());
|
||||||
.collect(Collectors.toList());
|
List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles, basePath);
|
||||||
List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils
|
|
||||||
.getRecordsUsingInputFormat(dataFiles, basePath);
|
|
||||||
|
|
||||||
assertEquals(recordsRead.size(), 200);
|
assertEquals(recordsRead.size(), 200);
|
||||||
|
|
||||||
@@ -485,8 +456,7 @@ public class TestMergeOnReadTable {
|
|||||||
|
|
||||||
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
||||||
hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
|
hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
|
||||||
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(),
|
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), allFiles);
|
||||||
allFiles);
|
|
||||||
dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()).collect(Collectors.toList());
|
dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()).collect(Collectors.toList());
|
||||||
recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles, basePath);
|
recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles, basePath);
|
||||||
|
|
||||||
@@ -512,11 +482,10 @@ public class TestMergeOnReadTable {
|
|||||||
allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
|
allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
|
||||||
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
||||||
hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
|
hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
|
||||||
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCommitsTimeline(),
|
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCommitsTimeline(), allFiles);
|
||||||
allFiles);
|
|
||||||
|
|
||||||
final String compactedCommitTime = metaClient.getActiveTimeline().reload()
|
final String compactedCommitTime = metaClient.getActiveTimeline().reload().getCommitsTimeline().lastInstant().get()
|
||||||
.getCommitsTimeline().lastInstant().get().getTimestamp();
|
.getTimestamp();
|
||||||
|
|
||||||
assertTrue(roView.getLatestDataFiles().filter(file -> {
|
assertTrue(roView.getLatestDataFiles().filter(file -> {
|
||||||
if (compactedCommitTime.equals(file.getCommitTime())) {
|
if (compactedCommitTime.equals(file.getCommitTime())) {
|
||||||
@@ -531,8 +500,7 @@ public class TestMergeOnReadTable {
|
|||||||
allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
|
allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
|
||||||
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
||||||
hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
|
hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
|
||||||
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCommitsTimeline(),
|
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCommitsTimeline(), allFiles);
|
||||||
allFiles);
|
|
||||||
|
|
||||||
assertFalse(roView.getLatestDataFiles().filter(file -> {
|
assertFalse(roView.getLatestDataFiles().filter(file -> {
|
||||||
if (compactedCommitTime.equals(file.getCommitTime())) {
|
if (compactedCommitTime.equals(file.getCommitTime())) {
|
||||||
@@ -564,30 +532,28 @@ public class TestMergeOnReadTable {
|
|||||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
||||||
HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
|
HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
|
||||||
|
|
||||||
Optional<HoodieInstant> deltaCommit =
|
Optional<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
|
||||||
metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
|
|
||||||
assertTrue(deltaCommit.isPresent());
|
assertTrue(deltaCommit.isPresent());
|
||||||
assertEquals("Delta commit should be 001", "001", deltaCommit.get().getTimestamp());
|
assertEquals("Delta commit should be 001", "001", deltaCommit.get().getTimestamp());
|
||||||
|
|
||||||
Optional<HoodieInstant> commit =
|
Optional<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
|
||||||
metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
|
|
||||||
assertFalse(commit.isPresent());
|
assertFalse(commit.isPresent());
|
||||||
|
|
||||||
FileStatus[] allFiles = HoodieTestUtils
|
FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
|
||||||
.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
|
|
||||||
TableFileSystemView.ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient,
|
TableFileSystemView.ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient,
|
||||||
hoodieTable.getCommitsTimeline().filterCompletedInstants(), allFiles);
|
hoodieTable.getCommitsTimeline().filterCompletedInstants(), allFiles);
|
||||||
Stream<HoodieDataFile> dataFilesToRead = roView.getLatestDataFiles();
|
Stream<HoodieDataFile> dataFilesToRead = roView.getLatestDataFiles();
|
||||||
Map<String, Long> parquetFileIdToSize = dataFilesToRead.collect(Collectors.toMap(HoodieDataFile::getFileId, HoodieDataFile::getFileSize));
|
Map<String, Long> parquetFileIdToSize = dataFilesToRead.collect(
|
||||||
|
Collectors.toMap(HoodieDataFile::getFileId, HoodieDataFile::getFileSize));
|
||||||
|
|
||||||
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(),
|
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), allFiles);
|
||||||
allFiles);
|
|
||||||
dataFilesToRead = roView.getLatestDataFiles();
|
dataFilesToRead = roView.getLatestDataFiles();
|
||||||
assertTrue("RealtimeTableView should list the parquet files we wrote in the delta commit",
|
assertTrue("RealtimeTableView should list the parquet files we wrote in the delta commit",
|
||||||
dataFilesToRead.findAny().isPresent());
|
dataFilesToRead.findAny().isPresent());
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Write 2 (only updates + inserts, written to .log file + correction of existing parquet file size)
|
* Write 2 (only updates + inserts, written to .log file + correction of existing parquet
|
||||||
|
* file size)
|
||||||
*/
|
*/
|
||||||
newCommitTime = "002";
|
newCommitTime = "002";
|
||||||
client.startCommitWithTime(newCommitTime);
|
client.startCommitWithTime(newCommitTime);
|
||||||
@@ -608,18 +574,17 @@ public class TestMergeOnReadTable {
|
|||||||
assertFalse(commit.isPresent());
|
assertFalse(commit.isPresent());
|
||||||
|
|
||||||
allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
|
allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
|
||||||
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getActiveTimeline().reload()
|
roView = new HoodieTableFileSystemView(metaClient,
|
||||||
.getCommitsTimeline().filterCompletedInstants(), allFiles);
|
hoodieTable.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants(), allFiles);
|
||||||
dataFilesToRead = roView.getLatestDataFiles();
|
dataFilesToRead = roView.getLatestDataFiles();
|
||||||
Map<String, Long> parquetFileIdToNewSize = dataFilesToRead.collect(Collectors.toMap(HoodieDataFile::getFileId, HoodieDataFile::getFileSize));
|
Map<String, Long> parquetFileIdToNewSize = dataFilesToRead.collect(
|
||||||
|
Collectors.toMap(HoodieDataFile::getFileId, HoodieDataFile::getFileSize));
|
||||||
|
|
||||||
assertTrue(parquetFileIdToNewSize.entrySet().stream()
|
assertTrue(parquetFileIdToNewSize.entrySet().stream()
|
||||||
.filter(entry -> parquetFileIdToSize.get(entry.getKey()) < entry.getValue()).count() > 0);
|
.filter(entry -> parquetFileIdToSize.get(entry.getKey()) < entry.getValue()).count() > 0);
|
||||||
|
|
||||||
List<String> dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath())
|
List<String> dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()).collect(Collectors.toList());
|
||||||
.collect(Collectors.toList());
|
List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles, basePath);
|
||||||
List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils
|
|
||||||
.getRecordsUsingInputFormat(dataFiles, basePath);
|
|
||||||
//Wrote 20 records in 2 batches
|
//Wrote 20 records in 2 batches
|
||||||
assertEquals("Must contain 40 records", 40, recordsRead.size());
|
assertEquals("Must contain 40 records", 40, recordsRead.size());
|
||||||
}
|
}
|
||||||
@@ -639,8 +604,7 @@ public class TestMergeOnReadTable {
|
|||||||
List<WriteStatus> statuses = writeClient.insert(recordsRDD, newCommitTime).collect();
|
List<WriteStatus> statuses = writeClient.insert(recordsRDD, newCommitTime).collect();
|
||||||
|
|
||||||
// Update all the 100 records
|
// Update all the 100 records
|
||||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
|
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
||||||
basePath);
|
|
||||||
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config);
|
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config);
|
||||||
|
|
||||||
newCommitTime = "101";
|
newCommitTime = "101";
|
||||||
@@ -653,19 +617,17 @@ public class TestMergeOnReadTable {
|
|||||||
|
|
||||||
// Write them to corresponding avro logfiles
|
// Write them to corresponding avro logfiles
|
||||||
HoodieTestUtils
|
HoodieTestUtils
|
||||||
.writeRecordsToLogFiles(metaClient.getFs(), metaClient.getBasePath(),
|
.writeRecordsToLogFiles(metaClient.getFs(), metaClient.getBasePath(), HoodieTestDataGenerator.avroSchema,
|
||||||
HoodieTestDataGenerator.avroSchema, updatedRecords);
|
updatedRecords);
|
||||||
|
|
||||||
// Verify that all data file has one log file
|
// Verify that all data file has one log file
|
||||||
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
||||||
table = HoodieTable.getHoodieTable(metaClient, config);
|
table = HoodieTable.getHoodieTable(metaClient, config);
|
||||||
for (String partitionPath : dataGen.getPartitionPaths()) {
|
for (String partitionPath : dataGen.getPartitionPaths()) {
|
||||||
List<FileSlice> groupedLogFiles =
|
List<FileSlice> groupedLogFiles = table.getRTFileSystemView().getLatestFileSlices(partitionPath)
|
||||||
table.getRTFileSystemView().getLatestFileSlices(partitionPath)
|
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
for (FileSlice fileSlice : groupedLogFiles) {
|
for (FileSlice fileSlice : groupedLogFiles) {
|
||||||
assertEquals("There should be 1 log file written for every data file", 1,
|
assertEquals("There should be 1 log file written for every data file", 1, fileSlice.getLogFiles().count());
|
||||||
fileSlice.getLogFiles().count());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -674,31 +636,27 @@ public class TestMergeOnReadTable {
|
|||||||
table = HoodieTable.getHoodieTable(metaClient, config);
|
table = HoodieTable.getHoodieTable(metaClient, config);
|
||||||
|
|
||||||
String commitTime = writeClient.startCompaction();
|
String commitTime = writeClient.startCompaction();
|
||||||
JavaRDD<WriteStatus> result =
|
JavaRDD<WriteStatus> result = writeClient.compact(commitTime);
|
||||||
writeClient.compact(commitTime);
|
|
||||||
|
|
||||||
// Verify that recently written compacted data file has no log file
|
// Verify that recently written compacted data file has no log file
|
||||||
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
||||||
table = HoodieTable.getHoodieTable(metaClient, config);
|
table = HoodieTable.getHoodieTable(metaClient, config);
|
||||||
HoodieActiveTimeline timeline = metaClient.getActiveTimeline();
|
HoodieActiveTimeline timeline = metaClient.getActiveTimeline();
|
||||||
|
|
||||||
assertTrue("Compaction commit should be > than last insert",
|
assertTrue("Compaction commit should be > than last insert", HoodieTimeline.compareTimestamps(
|
||||||
HoodieTimeline.compareTimestamps(timeline.lastInstant().get().getTimestamp(), newCommitTime,
|
timeline.lastInstant().get().getTimestamp(), newCommitTime, HoodieTimeline.GREATER));
|
||||||
HoodieTimeline.GREATER));
|
|
||||||
|
|
||||||
for (String partitionPath : dataGen.getPartitionPaths()) {
|
for (String partitionPath : dataGen.getPartitionPaths()) {
|
||||||
List<FileSlice> groupedLogFiles = table.getRTFileSystemView()
|
List<FileSlice> groupedLogFiles = table.getRTFileSystemView().getLatestFileSlices(partitionPath)
|
||||||
.getLatestFileSlices(partitionPath)
|
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
for (FileSlice slice : groupedLogFiles) {
|
for (FileSlice slice : groupedLogFiles) {
|
||||||
assertTrue(
|
assertTrue("After compaction there should be no log files visiable on a Realtime view",
|
||||||
"After compaction there should be no log files visiable on a Realtime view",
|
|
||||||
slice.getLogFiles().collect(Collectors.toList()).isEmpty());
|
slice.getLogFiles().collect(Collectors.toList()).isEmpty());
|
||||||
}
|
}
|
||||||
List<WriteStatus> writeStatuses = result.collect();
|
List<WriteStatus> writeStatuses = result.collect();
|
||||||
assertTrue(writeStatuses.stream()
|
assertTrue(writeStatuses.stream()
|
||||||
.filter(writeStatus -> writeStatus.getStat().getPartitionPath()
|
.filter(writeStatus -> writeStatus.getStat().getPartitionPath().contentEquals(partitionPath))
|
||||||
.contentEquals(partitionPath)).count() > 0);
|
.count() > 0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -707,16 +665,13 @@ public class TestMergeOnReadTable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private HoodieWriteConfig.Builder getConfigBuilder(Boolean autoCommit) {
|
private HoodieWriteConfig.Builder getConfigBuilder(Boolean autoCommit) {
|
||||||
return HoodieWriteConfig.newBuilder().withPath(basePath)
|
return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
|
||||||
.withSchema(TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
|
.withAutoCommit(autoCommit).withAssumeDatePartitioning(true).withCompactionConfig(
|
||||||
.withAutoCommit(autoCommit)
|
HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024 * 1024).withInlineCompaction(false)
|
||||||
.withAssumeDatePartitioning(true)
|
.withMaxNumDeltaCommitsBeforeCompaction(1).build())
|
||||||
.withCompactionConfig(
|
|
||||||
HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024 * 1024)
|
|
||||||
.withInlineCompaction(false).withMaxNumDeltaCommitsBeforeCompaction(1).build())
|
|
||||||
.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024 * 1024).build())
|
.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024 * 1024).build())
|
||||||
.forTable("test-trip-table").withIndexConfig(
|
.forTable("test-trip-table")
|
||||||
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build());
|
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build());
|
||||||
}
|
}
|
||||||
|
|
||||||
private void assertNoWriteErrors(List<WriteStatus> statuses) {
|
private void assertNoWriteErrors(List<WriteStatus> statuses) {
|
||||||
|
|||||||
@@ -33,10 +33,10 @@ public class HoodieAvroWriteSupport extends AvroWriteSupport {
|
|||||||
private String maxRecordKey;
|
private String maxRecordKey;
|
||||||
|
|
||||||
|
|
||||||
public final static String HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY =
|
public static final String HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY =
|
||||||
"com.uber.hoodie.bloomfilter";
|
"com.uber.hoodie.bloomfilter";
|
||||||
public final static String HOODIE_MIN_RECORD_KEY_FOOTER = "hoodie_min_record_key";
|
public static final String HOODIE_MIN_RECORD_KEY_FOOTER = "hoodie_min_record_key";
|
||||||
public final static String HOODIE_MAX_RECORD_KEY_FOOTER = "hoodie_max_record_key";
|
public static final String HOODIE_MAX_RECORD_KEY_FOOTER = "hoodie_max_record_key";
|
||||||
|
|
||||||
|
|
||||||
public HoodieAvroWriteSupport(MessageType schema, Schema avroSchema, BloomFilter bloomFilter) {
|
public HoodieAvroWriteSupport(MessageType schema, Schema avroSchema, BloomFilter bloomFilter) {
|
||||||
|
|||||||
@@ -16,6 +16,7 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.avro;
|
package com.uber.hoodie.avro;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
@@ -24,11 +25,11 @@ import java.util.Map;
|
|||||||
import org.apache.avro.Schema;
|
import org.apache.avro.Schema;
|
||||||
import org.apache.avro.generic.GenericData;
|
import org.apache.avro.generic.GenericData;
|
||||||
import org.apache.avro.generic.GenericRecord;
|
import org.apache.avro.generic.GenericRecord;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Marjority of this is copied from https://github.com/jwills/avro-json/blob/master/src/main/java/com/cloudera/science/avro/common/JsonConverter.java
|
* Marjority of this is copied from
|
||||||
* Adjusted for expected behavior of our use cases
|
* https://github.com/jwills/avro-json/blob/master/src/main/java/com/cloudera/science/avro/
|
||||||
|
* common/JsonConverter.java Adjusted for expected behavior of our use cases
|
||||||
*/
|
*/
|
||||||
public class MercifulJsonConverter {
|
public class MercifulJsonConverter {
|
||||||
|
|
||||||
@@ -132,10 +133,10 @@ public class MercifulJsonConverter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private boolean isOptional(Schema schema) {
|
private boolean isOptional(Schema schema) {
|
||||||
return schema.getType().equals(Schema.Type.UNION) &&
|
return schema.getType().equals(Schema.Type.UNION)
|
||||||
schema.getTypes().size() == 2 &&
|
&& schema.getTypes().size() == 2
|
||||||
(schema.getTypes().get(0).getType().equals(Schema.Type.NULL) ||
|
&& (schema.getTypes().get(0).getType().equals(Schema.Type.NULL)
|
||||||
schema.getTypes().get(1).getType().equals(Schema.Type.NULL));
|
|| schema.getTypes().get(1).getType().equals(Schema.Type.NULL));
|
||||||
}
|
}
|
||||||
|
|
||||||
private Schema getNonNull(Schema schema) {
|
private Schema getNonNull(Schema schema) {
|
||||||
|
|||||||
@@ -113,9 +113,8 @@ public class HoodieCleanStat implements Serializable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public Builder withEarliestCommitRetained(Optional<HoodieInstant> earliestCommitToRetain) {
|
public Builder withEarliestCommitRetained(Optional<HoodieInstant> earliestCommitToRetain) {
|
||||||
this.earliestCommitToRetain = (earliestCommitToRetain.isPresent()) ?
|
this.earliestCommitToRetain = (earliestCommitToRetain.isPresent())
|
||||||
earliestCommitToRetain.get().getTimestamp() :
|
? earliestCommitToRetain.get().getTimestamp() : "-1";
|
||||||
"-1";
|
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -210,12 +210,18 @@ public class HoodieCommitMetadata implements Serializable {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean equals(Object o) {
|
public boolean equals(Object o) {
|
||||||
if (this == o) return true;
|
if (this == o) {
|
||||||
if (o == null || getClass() != o.getClass()) return false;
|
return true;
|
||||||
|
}
|
||||||
|
if (o == null || getClass() != o.getClass()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
HoodieCommitMetadata that = (HoodieCommitMetadata) o;
|
HoodieCommitMetadata that = (HoodieCommitMetadata) o;
|
||||||
|
|
||||||
if (!partitionToWriteStats.equals(that.partitionToWriteStats)) return false;
|
if (!partitionToWriteStats.equals(that.partitionToWriteStats)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
return compacted.equals(that.compacted);
|
return compacted.equals(that.compacted);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -39,7 +39,6 @@ public class HoodieFileGroup implements Serializable {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Partition containing the file group.
|
* Partition containing the file group.
|
||||||
*/
|
*/
|
||||||
@@ -107,8 +106,8 @@ public class HoodieFileGroup implements Serializable {
|
|||||||
*/
|
*/
|
||||||
private boolean isFileSliceCommitted(FileSlice slice) {
|
private boolean isFileSliceCommitted(FileSlice slice) {
|
||||||
String maxCommitTime = lastInstant.get().getTimestamp();
|
String maxCommitTime = lastInstant.get().getTimestamp();
|
||||||
return timeline.containsOrBeforeTimelineStarts(slice.getBaseCommitTime()) &&
|
return timeline.containsOrBeforeTimelineStarts(slice.getBaseCommitTime())
|
||||||
HoodieTimeline.compareTimestamps(slice.getBaseCommitTime(),
|
&& HoodieTimeline.compareTimestamps(slice.getBaseCommitTime(),
|
||||||
maxCommitTime,
|
maxCommitTime,
|
||||||
HoodieTimeline.LESSER_OR_EQUAL);
|
HoodieTimeline.LESSER_OR_EQUAL);
|
||||||
|
|
||||||
@@ -128,7 +127,7 @@ public class HoodieFileGroup implements Serializable {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Gets the latest slice - this can contain either
|
* Gets the latest slice - this can contain either
|
||||||
*
|
* <p>
|
||||||
* - just the log files without data file - (or) data file with 0 or more log files
|
* - just the log files without data file - (or) data file with 0 or more log files
|
||||||
*/
|
*/
|
||||||
public Optional<FileSlice> getLatestFileSlice() {
|
public Optional<FileSlice> getLatestFileSlice() {
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ import java.io.Serializable;
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* HoodieKey consists of
|
* HoodieKey consists of
|
||||||
*
|
* <p>
|
||||||
* - recordKey : a recordKey that acts as primary key for a record - partitionPath : path to the
|
* - recordKey : a recordKey that acts as primary key for a record - partitionPath : path to the
|
||||||
* partition that contains the record
|
* partition that contains the record
|
||||||
*/
|
*/
|
||||||
@@ -54,8 +54,8 @@ public class HoodieKey implements Serializable {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
HoodieKey otherKey = (HoodieKey) o;
|
HoodieKey otherKey = (HoodieKey) o;
|
||||||
return Objects.equal(recordKey, otherKey.recordKey) &&
|
return Objects.equal(recordKey, otherKey.recordKey)
|
||||||
Objects.equal(partitionPath, otherKey.partitionPath);
|
&& Objects.equal(partitionPath, otherKey.partitionPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|||||||
@@ -30,7 +30,7 @@ import org.apache.hadoop.fs.Path;
|
|||||||
/**
|
/**
|
||||||
* Abstracts a single log file. Contains methods to extract metadata like the fileId, version and
|
* Abstracts a single log file. Contains methods to extract metadata like the fileId, version and
|
||||||
* extension from the log file path.
|
* extension from the log file path.
|
||||||
*
|
* <p>
|
||||||
* Also contains logic to roll-over the log file
|
* Also contains logic to roll-over the log file
|
||||||
*/
|
*/
|
||||||
public class HoodieLogFile implements Serializable {
|
public class HoodieLogFile implements Serializable {
|
||||||
@@ -103,8 +103,12 @@ public class HoodieLogFile implements Serializable {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean equals(Object o) {
|
public boolean equals(Object o) {
|
||||||
if (this == o) return true;
|
if (this == o) {
|
||||||
if (o == null || getClass() != o.getClass()) return false;
|
return true;
|
||||||
|
}
|
||||||
|
if (o == null || getClass() != o.getClass()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
HoodieLogFile that = (HoodieLogFile) o;
|
HoodieLogFile that = (HoodieLogFile) o;
|
||||||
return path != null ? path.equals(that.path) : that.path == null;
|
return path != null ? path.equals(that.path) : that.path == null;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -101,9 +101,8 @@ public class HoodiePartitionMetadata {
|
|||||||
}
|
}
|
||||||
} catch (IOException ioe) {
|
} catch (IOException ioe) {
|
||||||
log.warn(
|
log.warn(
|
||||||
"Error trying to save partition metadata (this is okay, as long as atleast 1 of these succced), "
|
"Error trying to save partition metadata (this is okay, as long as "
|
||||||
+
|
+ "atleast 1 of these succced), " + partitionPath, ioe);
|
||||||
partitionPath, ioe);
|
|
||||||
} finally {
|
} finally {
|
||||||
if (!metafileExists) {
|
if (!metafileExists) {
|
||||||
try {
|
try {
|
||||||
|
|||||||
@@ -118,10 +118,10 @@ public class HoodieRecord<T extends HoodieRecordPayload> implements Serializable
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
HoodieRecord that = (HoodieRecord) o;
|
HoodieRecord that = (HoodieRecord) o;
|
||||||
return Objects.equal(key, that.key) &&
|
return Objects.equal(key, that.key)
|
||||||
Objects.equal(data, that.data) &&
|
&& Objects.equal(data, that.data)
|
||||||
Objects.equal(currentLocation, that.currentLocation) &&
|
&& Objects.equal(currentLocation, that.currentLocation)
|
||||||
Objects.equal(newLocation, that.newLocation);
|
&& Objects.equal(newLocation, that.newLocation);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|||||||
@@ -42,8 +42,8 @@ public class HoodieRecordLocation implements Serializable {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
HoodieRecordLocation otherLoc = (HoodieRecordLocation) o;
|
HoodieRecordLocation otherLoc = (HoodieRecordLocation) o;
|
||||||
return Objects.equal(commitTime, otherLoc.commitTime) &&
|
return Objects.equal(commitTime, otherLoc.commitTime)
|
||||||
Objects.equal(fileId, otherLoc.fileId);
|
&& Objects.equal(fileId, otherLoc.fileId);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|||||||
@@ -38,7 +38,7 @@ public interface HoodieRecordPayload<T extends HoodieRecordPayload> extends Seri
|
|||||||
/**
|
/**
|
||||||
* This methods lets you write custom merging/combining logic to produce new values as a function
|
* This methods lets you write custom merging/combining logic to produce new values as a function
|
||||||
* of current value on storage and whats contained in this object.
|
* of current value on storage and whats contained in this object.
|
||||||
*
|
* <p>
|
||||||
* eg: 1) You are updating counters, you may want to add counts to currentValue and write back
|
* eg: 1) You are updating counters, you may want to add counts to currentValue and write back
|
||||||
* updated counts 2) You may be reading DB redo logs, and merge them with current image for a
|
* updated counts 2) You may be reading DB redo logs, and merge them with current image for a
|
||||||
* database row on storage
|
* database row on storage
|
||||||
|
|||||||
@@ -18,16 +18,16 @@ package com.uber.hoodie.common.model;
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Type of the Hoodie Table.
|
* Type of the Hoodie Table.
|
||||||
*
|
* <p>
|
||||||
* Currently, 1 type is supported
|
* Currently, 1 type is supported
|
||||||
*
|
* <p>
|
||||||
* COPY_ON_WRITE - Performs upserts by versioning entire files, with later versions containing newer
|
* COPY_ON_WRITE - Performs upserts by versioning entire files, with later versions containing newer
|
||||||
* value of a record.
|
* value of a record.
|
||||||
*
|
* <p>
|
||||||
* In the future, following might be added.
|
* In the future, following might be added.
|
||||||
*
|
* <p>
|
||||||
* MERGE_ON_READ - Speeds up upserts, by delaying merge until enough work piles up.
|
* MERGE_ON_READ - Speeds up upserts, by delaying merge until enough work piles up.
|
||||||
*
|
* <p>
|
||||||
* SIMPLE_LSM - A simple 2 level LSM tree.
|
* SIMPLE_LSM - A simple 2 level LSM tree.
|
||||||
*/
|
*/
|
||||||
public enum HoodieTableType {
|
public enum HoodieTableType {
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user