
CodeStyle formatting to conform to basic Checkstyle rules.

The code-style rules follow the Google Java style, with two changes:

1. Increase the maximum line length from 100 to 120.
2. Disable the JavaDoc-related Checkstyle rules, since bringing JavaDoc into compliance needs more manual work.

Both source and test code are checked for code style.
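
The first change would typically be expressed by overriding the LineLength limit and the second by silencing the JavaDoc modules, roughly along these lines in a checkstyle.xml derived from google_checks.xml (a sketch only — the module selection and layout here are assumed for illustration, not copied from the checked-in style file):

<!-- Sketch only: illustrative Checkstyle overrides, not the actual checked-in configuration. -->
<module name="Checker">
  <module name="TreeWalker">
    <!-- Change 1: raise the Google default line length of 100 to 120. -->
    <module name="LineLength">
      <property name="max" value="120"/>
    </module>
    <!-- Change 2: turn off JavaDoc checks until the JavaDoc cleanup is done manually. -->
    <module name="JavadocMethod">
      <property name="severity" value="ignore"/>
    </module>
    <module name="JavadocType">
      <property name="severity" value="ignore"/>
    </module>
  </module>
</module>
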
Balaji Varadarajan authored 2018-03-20 16:29:20 -07:00, committed by vinoth chandar
parent 987f5d6b96
commit 788e4f2d2e
200 changed files with 6209 additions and 5975 deletions


@@ -7,8 +7,7 @@ permalink: dev_setup.html
 ### Code Style
-We have embraced the [Google Java code style](https://google.github.io/styleguide/javaguide.html). Please setup your IDE accordingly with style files from [here](https://github.com/google/styleguide/blob/gh-pages/intellij-java-google-style.xml)
-Also recommend setting up the [Save Action Plugin](https://plugins.jetbrains.com/plugin/7642-save-actions) to auto format & organize imports on save.
+We have embraced the code style largely based on [google format](https://google.github.io/styleguide/javaguide.html).
+Please setup your IDE with style files from [here](../style/)
+We also recommend setting up the [Save Action Plugin](https://plugins.jetbrains.com/plugin/7642-save-actions) to auto format & organize imports on save.
+The Maven Compilation life-cycle will fail if there are checkstyle violations.
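
A minimal sketch of how that build-time enforcement is usually wired up with the maven-checkstyle-plugin — the phase, configLocation and other values below are illustrative assumptions, not taken from this commit's pom.xml:

<!-- Sketch only: illustrative Maven wiring, not the project's actual pom.xml. -->
<plugin>
  <groupId>org.apache.maven.plugins</groupId>
  <artifactId>maven-checkstyle-plugin</artifactId>
  <configuration>
    <!-- Assumed location of the style file referenced above. -->
    <configLocation>style/checkstyle.xml</configLocation>
    <!-- Fail the build on any violation, covering test sources as well. -->
    <failOnViolation>true</failOnViolation>
    <includeTestSourceDirectory>true</includeTestSourceDirectory>
  </configuration>
  <executions>
    <execution>
      <goals>
        <goal>check</goal>
      </goals>
      <phase>compile</phase>
    </execution>
  </executions>
</plugin>
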


@@ -35,11 +35,11 @@ public class HoodiePrompt extends DefaultPromptProvider {
       case DATASET:
         return "hoodie:" + tableName + "->";
       case SYNC:
-        return "hoodie:" + tableName + " <==> "
-            + HoodieCLI.syncTableMetadata.getTableConfig().getTableName() + "->";
-      }
-      return "hoodie:" + tableName + "->";
-    }
+        return "hoodie:" + tableName + " <==> " + HoodieCLI.syncTableMetadata.getTableConfig().getTableName() + "->";
+      default:
+        return "hoodie:" + tableName + "->";
+      }
+    }
     return "hoodie->";
   }


@@ -24,22 +24,18 @@ import org.springframework.stereotype.Component;
 @Component
 @Order(Ordered.HIGHEST_PRECEDENCE)
-public class HoodieSplashScreen
-    extends DefaultBannerProvider {
-  private static String screen =
-      "============================================" + OsUtils.LINE_SEPARATOR +
-      "* *" + OsUtils.LINE_SEPARATOR +
-      "* _ _ _ _ *" + OsUtils.LINE_SEPARATOR +
-      "* | | | | | (_) *" + OsUtils.LINE_SEPARATOR +
-      "* | |__| | ___ ___ __| |_ ___ *" + OsUtils.LINE_SEPARATOR +
-      "* | __ |/ _ \\ / _ \\ / _` | |/ _ \\ *" +
-      OsUtils.LINE_SEPARATOR +
-      "* | | | | (_) | (_) | (_| | | __/ *" + OsUtils.LINE_SEPARATOR +
-      "* |_| |_|\\___/ \\___/ \\__,_|_|\\___| *" +
-      OsUtils.LINE_SEPARATOR +
-      "* *" + OsUtils.LINE_SEPARATOR +
-      "============================================" + OsUtils.LINE_SEPARATOR;
+public class HoodieSplashScreen extends DefaultBannerProvider {
+  private static String screen = "============================================" + OsUtils.LINE_SEPARATOR
+      + "* *" + OsUtils.LINE_SEPARATOR
+      + "* _ _ _ _ *" + OsUtils.LINE_SEPARATOR
+      + "* | | | | | (_) *" + OsUtils.LINE_SEPARATOR
+      + "* | |__| | ___ ___ __| |_ ___ *" + OsUtils.LINE_SEPARATOR
+      + "* | __ |/ _ \\ / _ \\ / _` | |/ _ \\ *" + OsUtils.LINE_SEPARATOR
+      + "* | | | | (_) | (_) | (_| | | __/ *" + OsUtils.LINE_SEPARATOR
+      + "* |_| |_|\\___/ \\___/ \\__,_|_|\\___| *" + OsUtils.LINE_SEPARATOR
+      + "* *" + OsUtils.LINE_SEPARATOR
+      + "============================================" + OsUtils.LINE_SEPARATOR;

   public String getBanner() {
     return screen;


@@ -22,8 +22,7 @@ import org.springframework.shell.Bootstrap;
 public class Main {
   /**
-   * Main class that delegates to Spring Shell's Bootstrap class in order to simplify debugging
-   * inside an IDE
+   * Main class that delegates to Spring Shell's Bootstrap class in order to simplify debugging inside an IDE
    */
   public static void main(String[] args) throws IOException {
     Bootstrap.main(args);


@@ -47,13 +47,11 @@ public class ArchivedCommitsCommand implements CommandMarker {
   }

   @CliCommand(value = "show archived commits", help = "Read commits from archived files and show details")
-  public String showCommits(
-      @CliOption(key = {
-          "limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10")
-      final Integer limit) throws IOException {
-    System.out
-        .println("===============> Showing only " + limit + " archived commits <===============");
+  public String showCommits(@CliOption(key = {
+      "limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10") final Integer limit)
+      throws IOException {
+    System.out.println("===============> Showing only " + limit + " archived commits <===============");
     String basePath = HoodieCLI.tableMetadata.getBasePath();
     FileStatus[] fsStatuses = FSUtils.getFs(basePath, HoodieCLI.conf)
         .globStatus(new Path(basePath + "/.hoodie/.commits_.archive*"));
@@ -61,8 +59,7 @@ public class ArchivedCommitsCommand implements CommandMarker {
     int commits = 0;
     for (FileStatus fs : fsStatuses) {
       //read the archived file
-      HoodieLogFormat.Reader reader = HoodieLogFormat
-          .newReader(FSUtils.getFs(basePath, HoodieCLI.conf),
+      HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(basePath, HoodieCLI.conf),
           new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema());
       List<IndexedRecord> readRecords = new ArrayList<>();
@@ -76,15 +73,14 @@ public class ArchivedCommitsCommand implements CommandMarker {
         }
         commits++;
       }
-      List<String[]> readCommits = readRecords.stream().map(r -> (GenericRecord) r)
-          .map(r -> readCommit(r)).collect(Collectors.toList());
+      List<String[]> readCommits = readRecords.stream().map(r -> (GenericRecord) r).map(r -> readCommit(r))
+          .collect(Collectors.toList());
       allCommits.addAll(readCommits);
       if (commits == limit) {
         break;
       }
     }
-    return HoodiePrintHelper.print(
-        new String[]{"CommitTime", "CommitType", "CommitDetails"},
+    return HoodiePrintHelper.print(new String[] {"CommitTime", "CommitType", "CommitDetails"},
         allCommits.toArray(new String[allCommits.size()][]));
   }
@@ -122,6 +118,8 @@ public class ArchivedCommitsCommand implements CommandMarker {
           commitDetails.add(record.get("hoodieSavePointMetadata").toString());
           break;
         }
+        default:
+          return commitDetails.toArray(new String[commitDetails.size()]);
       }
     } catch (Exception e) {
       e.printStackTrace();


@@ -13,6 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package com.uber.hoodie.cli.commands;

 import com.uber.hoodie.avro.model.HoodieCleanMetadata;
@@ -63,42 +64,37 @@ public class CleansCommand implements CommandMarker {
     Collections.reverse(cleans);
     for (int i = 0; i < cleans.size(); i++) {
       HoodieInstant clean = cleans.get(i);
-      HoodieCleanMetadata cleanMetadata =
-          AvroUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(clean).get());
+      HoodieCleanMetadata cleanMetadata = AvroUtils
+          .deserializeHoodieCleanMetadata(timeline.getInstantDetails(clean).get());
       rows[i] = new String[] {clean.getTimestamp(), cleanMetadata.getEarliestCommitToRetain(),
-          String.valueOf(cleanMetadata.getTotalFilesDeleted()),
-          String.valueOf(cleanMetadata.getTimeTakenInMillis())};
+          String.valueOf(cleanMetadata.getTotalFilesDeleted()), String.valueOf(cleanMetadata.getTimeTakenInMillis())};
     }
-    return HoodiePrintHelper.print(
-        new String[]{"CleanTime", "EarliestCommandRetained", "Total Files Deleted",
-            "Total Time Taken"}, rows);
+    return HoodiePrintHelper
+        .print(new String[] {"CleanTime", "EarliestCommandRetained", "Total Files Deleted", "Total Time Taken"},
+            rows);
   }

   @CliCommand(value = "cleans refresh", help = "Refresh the commits")
   public String refreshCleans() throws IOException {
-    HoodieTableMetaClient metadata =
-        new HoodieTableMetaClient(HoodieCLI.conf, HoodieCLI.tableMetadata.getBasePath());
+    HoodieTableMetaClient metadata = new HoodieTableMetaClient(HoodieCLI.conf, HoodieCLI.tableMetadata.getBasePath());
     HoodieCLI.setTableMetadata(metadata);
     return "Metadata for table " + metadata.getTableConfig().getTableName() + " refreshed.";
   }

   @CliCommand(value = "clean showpartitions", help = "Show partition level details of a clean")
-  public String showCleanPartitions(
-      @CliOption(key = {"clean"}, help = "clean to show")
-      final String commitTime) throws Exception {
+  public String showCleanPartitions(@CliOption(key = {"clean"}, help = "clean to show") final String commitTime)
+      throws Exception {
     HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
     HoodieTimeline timeline = activeTimeline.getCleanerTimeline().filterCompletedInstants();
-    HoodieInstant cleanInstant =
-        new HoodieInstant(false, HoodieTimeline.CLEAN_ACTION, commitTime);
+    HoodieInstant cleanInstant = new HoodieInstant(false, HoodieTimeline.CLEAN_ACTION, commitTime);
     if (!timeline.containsInstant(cleanInstant)) {
       return "Clean " + commitTime + " not found in metadata " + timeline;
     }
-    HoodieCleanMetadata cleanMetadata =
-        AvroUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(cleanInstant).get());
+    HoodieCleanMetadata cleanMetadata = AvroUtils.deserializeHoodieCleanMetadata(
+        timeline.getInstantDetails(cleanInstant).get());
     List<String[]> rows = new ArrayList<>();
-    for (Map.Entry<String, HoodieCleanPartitionMetadata> entry : cleanMetadata
-        .getPartitionMetadata().entrySet()) {
+    for (Map.Entry<String, HoodieCleanPartitionMetadata> entry : cleanMetadata.getPartitionMetadata().entrySet()) {
       String path = entry.getKey();
       HoodieCleanPartitionMetadata stats = entry.getValue();
       String policy = stats.getPolicy();


@@ -64,20 +64,17 @@ public class CommitsCommand implements CommandMarker {
   }

   @CliCommand(value = "commits show", help = "Show the commits")
-  public String showCommits(
-      @CliOption(key = {
-          "limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10")
-      final Integer limit) throws IOException {
+  public String showCommits(@CliOption(key = {
+      "limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10") final Integer limit)
+      throws IOException {
     HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
-    HoodieTimeline timeline = activeTimeline.getCommitsTimeline()
-        .filterCompletedInstants();
+    HoodieTimeline timeline = activeTimeline.getCommitsTimeline().filterCompletedInstants();
     List<HoodieInstant> commits = timeline.getInstants().collect(Collectors.toList());
     String[][] rows = new String[commits.size()][];
     Collections.reverse(commits);
     for (int i = 0; i < commits.size(); i++) {
       HoodieInstant commit = commits.get(i);
-      HoodieCommitMetadata commitMetadata =
-          HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(commit).get());
+      HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(commit).get());
       rows[i] = new String[] {commit.getTimestamp(),
           NumericUtils.humanReadableByteCount(commitMetadata.fetchTotalBytesWritten()),
           String.valueOf(commitMetadata.fetchTotalFilesInsert()),
@@ -88,39 +85,32 @@ public class CommitsCommand implements CommandMarker {
           String.valueOf(commitMetadata.fetchTotalWriteErrors())};
     }
     return HoodiePrintHelper.print(
-        new String[]{"CommitTime", "Total Written (B)", "Total Files Added",
-            "Total Files Updated", "Total Partitions Written", "Total Records Written",
-            "Total Update Records Written", "Total Errors"}, rows);
+        new String[] {"CommitTime", "Total Written (B)", "Total Files Added", "Total Files Updated",
+            "Total Partitions Written", "Total Records Written", "Total Update Records Written", "Total Errors"}, rows);
   }

   @CliCommand(value = "commits refresh", help = "Refresh the commits")
   public String refreshCommits() throws IOException {
-    HoodieTableMetaClient metadata =
-        new HoodieTableMetaClient(HoodieCLI.conf, HoodieCLI.tableMetadata.getBasePath());
+    HoodieTableMetaClient metadata = new HoodieTableMetaClient(HoodieCLI.conf, HoodieCLI.tableMetadata.getBasePath());
     HoodieCLI.setTableMetadata(metadata);
     return "Metadata for table " + metadata.getTableConfig().getTableName() + " refreshed.";
   }

   @CliCommand(value = "commit rollback", help = "Rollback a commit")
-  public String rollbackCommit(
-      @CliOption(key = {"commit"}, help = "Commit to rollback")
-      final String commitTime,
-      @CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path")
-      final String sparkPropertiesPath) throws Exception {
+  public String rollbackCommit(@CliOption(key = {"commit"}, help = "Commit to rollback") final String commitTime,
+      @CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path") final String sparkPropertiesPath)
+      throws Exception {
     HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
-    HoodieTimeline timeline = activeTimeline.getCommitsTimeline()
-        .filterCompletedInstants();
-    HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION,
-        commitTime);
+    HoodieTimeline timeline = activeTimeline.getCommitsTimeline().filterCompletedInstants();
+    HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
     if (!timeline.containsInstant(commitInstant)) {
       return "Commit " + commitTime + " not found in Commits " + timeline;
     }
     SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
-    sparkLauncher.addAppArgs(SparkMain.SparkCommand.ROLLBACK.toString(),
-        commitTime,
-        HoodieCLI.tableMetadata.getBasePath());
+    sparkLauncher
+        .addAppArgs(SparkMain.SparkCommand.ROLLBACK.toString(), commitTime, HoodieCLI.tableMetadata.getBasePath());
     Process process = sparkLauncher.launch();
     InputStreamConsumer.captureOutput(process);
     int exitCode = process.waitFor();
@@ -133,23 +123,18 @@ public class CommitsCommand implements CommandMarker {
   }

   @CliCommand(value = "commit showpartitions", help = "Show partition level details of a commit")
-  public String showCommitPartitions(
-      @CliOption(key = {"commit"}, help = "Commit to show")
-      final String commitTime) throws Exception {
+  public String showCommitPartitions(@CliOption(key = {"commit"}, help = "Commit to show") final String commitTime)
+      throws Exception {
     HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
-    HoodieTimeline timeline = activeTimeline.getCommitsTimeline()
-        .filterCompletedInstants();
-    HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION,
-        commitTime);
+    HoodieTimeline timeline = activeTimeline.getCommitsTimeline().filterCompletedInstants();
+    HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
     if (!timeline.containsInstant(commitInstant)) {
       return "Commit " + commitTime + " not found in Commits " + timeline;
     }
-    HoodieCommitMetadata meta =
-        HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(commitInstant).get());
+    HoodieCommitMetadata meta = HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(commitInstant).get());
     List<String[]> rows = new ArrayList<String[]>();
-    for (Map.Entry<String, List<HoodieWriteStat>> entry : meta.getPartitionToWriteStats()
-        .entrySet()) {
+    for (Map.Entry<String, List<HoodieWriteStat>> entry : meta.getPartitionToWriteStats().entrySet()) {
       String path = entry.getKey();
       List<HoodieWriteStat> stats = entry.getValue();
       long totalFilesAdded = 0;
@@ -169,50 +154,40 @@ public class CommitsCommand implements CommandMarker {
         totalBytesWritten += stat.getTotalWriteBytes();
         totalWriteErrors += stat.getTotalWriteErrors();
       }
-      rows.add(new String[]{path, String.valueOf(totalFilesAdded),
-          String.valueOf(totalFilesUpdated), String.valueOf(totalRecordsInserted),
-          String.valueOf(totalRecordsUpdated),
-          NumericUtils.humanReadableByteCount(totalBytesWritten),
-          String.valueOf(totalWriteErrors)});
+      rows.add(new String[] {path, String.valueOf(totalFilesAdded), String.valueOf(totalFilesUpdated),
+          String.valueOf(totalRecordsInserted), String.valueOf(totalRecordsUpdated),
+          NumericUtils.humanReadableByteCount(totalBytesWritten), String.valueOf(totalWriteErrors)});
     }
     return HoodiePrintHelper.print(
-        new String[]{"Partition Path", "Total Files Added", "Total Files Updated",
-            "Total Records Inserted", "Total Records Updated", "Total Bytes Written",
-            "Total Errors"}, rows.toArray(new String[rows.size()][]));
+        new String[] {"Partition Path", "Total Files Added", "Total Files Updated", "Total Records Inserted",
+            "Total Records Updated", "Total Bytes Written", "Total Errors"}, rows.toArray(new String[rows.size()][]));
   }

   @CliCommand(value = "commit showfiles", help = "Show file level details of a commit")
-  public String showCommitFiles(
-      @CliOption(key = {"commit"}, help = "Commit to show")
-      final String commitTime) throws Exception {
+  public String showCommitFiles(@CliOption(key = {"commit"}, help = "Commit to show") final String commitTime)
+      throws Exception {
     HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
-    HoodieTimeline timeline = activeTimeline.getCommitsTimeline()
-        .filterCompletedInstants();
-    HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION,
-        commitTime);
+    HoodieTimeline timeline = activeTimeline.getCommitsTimeline().filterCompletedInstants();
+    HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
     if (!timeline.containsInstant(commitInstant)) {
       return "Commit " + commitTime + " not found in Commits " + timeline;
     }
-    HoodieCommitMetadata meta =
-        HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(commitInstant).get());
+    HoodieCommitMetadata meta = HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(commitInstant).get());
     List<String[]> rows = new ArrayList<String[]>();
-    for (Map.Entry<String, List<HoodieWriteStat>> entry : meta.getPartitionToWriteStats()
-        .entrySet()) {
+    for (Map.Entry<String, List<HoodieWriteStat>> entry : meta.getPartitionToWriteStats().entrySet()) {
       String path = entry.getKey();
       List<HoodieWriteStat> stats = entry.getValue();
       for (HoodieWriteStat stat : stats) {
-        rows.add(new String[]{path, stat.getFileId(), stat.getPrevCommit(),
-            String.valueOf(stat.getNumUpdateWrites()), String.valueOf(stat.getNumWrites()),
-            String.valueOf(stat.getTotalWriteBytes()),
+        rows.add(new String[] {path, stat.getFileId(), stat.getPrevCommit(), String.valueOf(stat.getNumUpdateWrites()),
+            String.valueOf(stat.getNumWrites()), String.valueOf(stat.getTotalWriteBytes()),
             String.valueOf(stat.getTotalWriteErrors())});
       }
     }
     return HoodiePrintHelper.print(
-        new String[]{"Partition Path", "File ID", "Previous Commit", "Total Records Updated",
-            "Total Records Written", "Total Bytes Written", "Total Errors"},
-        rows.toArray(new String[rows.size()][]));
+        new String[] {"Partition Path", "File ID", "Previous Commit", "Total Records Updated", "Total Records Written",
+            "Total Bytes Written", "Total Errors"}, rows.toArray(new String[rows.size()][]));
   }

   @CliAvailabilityIndicator({"commits compare"})
@@ -221,38 +196,30 @@ public class CommitsCommand implements CommandMarker {
   }

   @CliCommand(value = "commits compare", help = "Compare commits with another Hoodie dataset")
-  public String compareCommits(
-      @CliOption(key = {"path"}, help = "Path of the dataset to compare to")
-      final String path) throws Exception {
+  public String compareCommits(@CliOption(key = {"path"}, help = "Path of the dataset to compare to") final String path)
+      throws Exception {
     HoodieTableMetaClient target = new HoodieTableMetaClient(HoodieCLI.conf, path);
-    HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsTimeline()
-        .filterCompletedInstants();
+    HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
     HoodieTableMetaClient source = HoodieCLI.tableMetadata;
-    HoodieTimeline sourceTimeline = source.getActiveTimeline().getCommitsTimeline()
-        .filterCompletedInstants();
+    HoodieTimeline sourceTimeline = source.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
     String targetLatestCommit =
-        targetTimeline.getInstants().iterator().hasNext() ? "0"
-            : targetTimeline.lastInstant().get().getTimestamp();
+        targetTimeline.getInstants().iterator().hasNext() ? "0" : targetTimeline.lastInstant().get().getTimestamp();
     String sourceLatestCommit =
-        sourceTimeline.getInstants().iterator().hasNext() ? "0"
-            : sourceTimeline.lastInstant().get().getTimestamp();
+        sourceTimeline.getInstants().iterator().hasNext() ? "0" : sourceTimeline.lastInstant().get().getTimestamp();
-    if (sourceLatestCommit != null &&
-        HoodieTimeline
-            .compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) {
+    if (sourceLatestCommit != null && HoodieTimeline.compareTimestamps(targetLatestCommit, sourceLatestCommit,
+        HoodieTimeline.GREATER)) {
       // source is behind the target
-      List<String> commitsToCatchup =
-          targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE)
-              .getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList());
-      return "Source " + source.getTableConfig().getTableName() + " is behind by "
-          + commitsToCatchup.size() + " commits. Commits to catch up - " + commitsToCatchup;
+      List<String> commitsToCatchup = targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE)
+          .getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList());
+      return "Source " + source.getTableConfig().getTableName() + " is behind by " + commitsToCatchup.size()
+          + " commits. Commits to catch up - " + commitsToCatchup;
     } else {
-      List<String> commitsToCatchup =
-          sourceTimeline.findInstantsAfter(targetLatestCommit, Integer.MAX_VALUE)
-              .getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList());
-      return "Source " + source.getTableConfig().getTableName() + " is ahead by "
-          + commitsToCatchup.size() + " commits. Commits to catch up - " + commitsToCatchup;
+      List<String> commitsToCatchup = sourceTimeline.findInstantsAfter(targetLatestCommit, Integer.MAX_VALUE)
+          .getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList());
+      return "Source " + source.getTableConfig().getTableName() + " is ahead by " + commitsToCatchup.size()
+          + " commits. Commits to catch up - " + commitsToCatchup;
     }
   }
@@ -262,13 +229,12 @@ public class CommitsCommand implements CommandMarker {
   }

   @CliCommand(value = "commits sync", help = "Compare commits with another Hoodie dataset")
-  public String syncCommits(
-      @CliOption(key = {"path"}, help = "Path of the dataset to compare to")
-      final String path) throws Exception {
+  public String syncCommits(@CliOption(key = {"path"}, help = "Path of the dataset to compare to") final String path)
+      throws Exception {
     HoodieCLI.syncTableMetadata = new HoodieTableMetaClient(HoodieCLI.conf, path);
     HoodieCLI.state = HoodieCLI.CLIState.SYNC;
-    return "Load sync state between " + HoodieCLI.tableMetadata.getTableConfig().getTableName()
-        + " and " + HoodieCLI.syncTableMetadata.getTableConfig().getTableName();
+    return "Load sync state between " + HoodieCLI.tableMetadata.getTableConfig().getTableName() + " and "
+        + HoodieCLI.syncTableMetadata.getTableConfig().getTableName();
   }
 }


@@ -29,13 +29,12 @@ public class DatasetsCommand implements CommandMarker {
@CliCommand(value = "connect", help = "Connect to a hoodie dataset") @CliCommand(value = "connect", help = "Connect to a hoodie dataset")
public String connect( public String connect(
@CliOption(key = {"path"}, mandatory = true, help = "Base Path of the dataset") @CliOption(key = {"path"}, mandatory = true, help = "Base Path of the dataset") final String path)
final String path) throws IOException { throws IOException {
boolean initialized = HoodieCLI.initConf(); boolean initialized = HoodieCLI.initConf();
HoodieCLI.initFS(initialized); HoodieCLI.initFS(initialized);
HoodieCLI.setTableMetadata(new HoodieTableMetaClient(HoodieCLI.conf, path)); HoodieCLI.setTableMetadata(new HoodieTableMetaClient(HoodieCLI.conf, path));
HoodieCLI.state = HoodieCLI.CLIState.DATASET; HoodieCLI.state = HoodieCLI.CLIState.DATASET;
return "Metadata for table " + HoodieCLI.tableMetadata.getTableConfig().getTableName() return "Metadata for table " + HoodieCLI.tableMetadata.getTableConfig().getTableName() + " loaded";
+ " loaded";
} }
} }


@@ -37,44 +37,33 @@ public class HDFSParquetImportCommand implements CommandMarker {
@CliCommand(value = "hdfsparquetimport", help = "Imports hdfs dataset to a hoodie dataset") @CliCommand(value = "hdfsparquetimport", help = "Imports hdfs dataset to a hoodie dataset")
public String convert( public String convert(
@CliOption(key = "srcPath", mandatory = true, help = "Base path for the input dataset") @CliOption(key = "srcPath", mandatory = true, help = "Base path for the input dataset") final String srcPath,
final String srcPath, @CliOption(key = "srcType", mandatory = true, help = "Source type for the input dataset") final String srcType,
@CliOption(key = "srcType", mandatory = true, help = "Source type for the input dataset") @CliOption(key = "targetPath", mandatory = true, help = "Base path for the target hoodie dataset") final String
final String srcType, targetPath,
@CliOption(key = "targetPath", mandatory = true, help = "Base path for the target hoodie dataset") @CliOption(key = "tableName", mandatory = true, help = "Table name") final String tableName,
final String targetPath, @CliOption(key = "tableType", mandatory = true, help = "Table type") final String tableType,
@CliOption(key = "tableName", mandatory = true, help = "Table name") @CliOption(key = "rowKeyField", mandatory = true, help = "Row key field name") final String rowKeyField,
final String tableName, @CliOption(key = "partitionPathField", mandatory = true, help = "Partition path field name") final String
@CliOption(key = "tableType", mandatory = true, help = "Table type") partitionPathField,
final String tableType, @CliOption(key = {
@CliOption(key = "rowKeyField", mandatory = true, help = "Row key field name") "parallelism"}, mandatory = true, help = "Parallelism for hoodie insert") final String parallelism,
final String rowKeyField, @CliOption(key = "schemaFilePath", mandatory = true, help = "Path for Avro schema file") final String
@CliOption(key = "partitionPathField", mandatory = true, help = "Partition path field name") schemaFilePath,
final String partitionPathField, @CliOption(key = "format", mandatory = true, help = "Format for the input data") final String format,
@CliOption(key = {"parallelism"}, mandatory = true, help = "Parallelism for hoodie insert") @CliOption(key = "sparkMemory", mandatory = true, help = "Spark executor memory") final String sparkMemory,
final String parallelism, @CliOption(key = "retry", mandatory = true, help = "Number of retries") final String retry) throws Exception {
@CliOption(key = "schemaFilePath", mandatory = true, help = "Path for Avro schema file")
final String schemaFilePath,
@CliOption(key = "format", mandatory = true, help = "Format for the input data")
final String format,
@CliOption(key = "sparkMemory", mandatory = true, help = "Spark executor memory")
final String sparkMemory,
@CliOption(key = "retry", mandatory = true, help = "Number of retries")
final String retry)
throws Exception {
validate(format, srcType); validate(format, srcType);
boolean initialized = HoodieCLI.initConf(); boolean initialized = HoodieCLI.initConf();
HoodieCLI.initFS(initialized); HoodieCLI.initFS(initialized);
String sparkPropertiesPath = Utils String sparkPropertiesPath = Utils.getDefaultPropertiesFile(
.getDefaultPropertiesFile(
scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())); scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
sparkLauncher.addAppArgs(SparkCommand.IMPORT.toString(), srcPath, targetPath, tableName, sparkLauncher.addAppArgs(SparkCommand.IMPORT.toString(), srcPath, targetPath, tableName, tableType, rowKeyField,
tableType, rowKeyField, partitionPathField, parallelism, schemaFilePath, sparkMemory, partitionPathField, parallelism, schemaFilePath, sparkMemory, retry);
retry);
Process process = sparkLauncher.launch(); Process process = sparkLauncher.launch();
InputStreamConsumer.captureOutput(process); InputStreamConsumer.captureOutput(process);
int exitCode = process.waitFor(); int exitCode = process.waitFor();


@@ -64,25 +64,25 @@ public class HoodieLogFileCommand implements CommandMarker {
@CliCommand(value = "show logfile metadata", help = "Read commit metadata from log files") @CliCommand(value = "show logfile metadata", help = "Read commit metadata from log files")
public String showLogFileCommits( public String showLogFileCommits(
@CliOption(key = "logFilePathPattern", mandatory = true, help = "Fully qualified path for the log file") @CliOption(key = "logFilePathPattern", mandatory = true, help = "Fully qualified path for the log file") final
final String logFilePathPattern) throws IOException { String logFilePathPattern)
throws IOException {
FileSystem fs = HoodieCLI.tableMetadata.getFs(); FileSystem fs = HoodieCLI.tableMetadata.getFs();
List<String> logFilePaths = Arrays.stream(fs.globStatus(new Path(logFilePathPattern))) List<String> logFilePaths = Arrays.stream(fs.globStatus(new Path(logFilePathPattern)))
.map(status -> status.getPath().toString()).collect(Collectors.toList()); .map(status -> status.getPath().toString()).collect(Collectors.toList());
Map<String, List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer>>> commitCountAndMetadata = Maps Map<String, List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType,
.newHashMap(); String>>, Integer>>>
commitCountAndMetadata = Maps.newHashMap();
int totalEntries = 0; int totalEntries = 0;
int numCorruptBlocks = 0; int numCorruptBlocks = 0;
for (String logFilePath : logFilePaths) { for (String logFilePath : logFilePaths) {
FileStatus[] fsStatus = fs.listStatus( FileStatus[] fsStatus = fs.listStatus(new Path(logFilePath));
new Path(logFilePath)); Schema writerSchema = new AvroSchemaConverter().convert(
Schema writerSchema = new AvroSchemaConverter() SchemaUtil.readSchemaFromLogFile(HoodieCLI.tableMetadata.getFs(), new Path(logFilePath)));
.convert(SchemaUtil HoodieLogFormat.Reader reader = HoodieLogFormat
.readSchemaFromLogFile(HoodieCLI.tableMetadata.getFs(), new Path(logFilePath))); .newReader(fs, new HoodieLogFile(fsStatus[0].getPath()), writerSchema);
HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(fs,
new HoodieLogFile(fsStatus[0].getPath()), writerSchema);
// read the avro blocks // read the avro blocks
while (reader.hasNext()) { while (reader.hasNext()) {
@@ -104,15 +104,14 @@ public class HoodieLogFileCommand implements CommandMarker {
         }
       }
       if (commitCountAndMetadata.containsKey(instantTime)) {
-        commitCountAndMetadata.get(instantTime)
-            .add(new Tuple3<>(n.getBlockType(),
-                new Tuple2<>(n.getLogBlockHeader(), n.getLogBlockFooter()), recordCount));
+        commitCountAndMetadata.get(instantTime).add(
+            new Tuple3<>(n.getBlockType(), new Tuple2<>(n.getLogBlockHeader(), n.getLogBlockFooter()), recordCount));
         totalEntries++;
       } else {
-        List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer>> list
-            = new ArrayList<>();
-        list.add(new Tuple3<>(n.getBlockType(),
-            new Tuple2<>(n.getLogBlockHeader(), n.getLogBlockFooter()), recordCount));
+        List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>,
+            Integer>> list = new ArrayList<>();
+        list.add(
+            new Tuple3<>(n.getBlockType(), new Tuple2<>(n.getLogBlockHeader(), n.getLogBlockFooter()), recordCount));
         commitCountAndMetadata.put(instantTime, list);
         totalEntries++;
       }
@@ -121,11 +120,12 @@ public class HoodieLogFileCommand implements CommandMarker {
     String[][] rows = new String[totalEntries + 1][];
     int i = 0;
     ObjectMapper objectMapper = new ObjectMapper();
-    for (Map.Entry<String, List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer>>> entry : commitCountAndMetadata
-        .entrySet()) {
+    for (Map.Entry<String, List<Tuple3<HoodieLogBlockType,
+        Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer>>> entry
+        : commitCountAndMetadata.entrySet()) {
       String instantTime = entry.getKey().toString();
-      for (Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer> tuple3 : entry
-          .getValue()) {
+      for (Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>,
+          Map<HeaderMetadataType, String>>, Integer> tuple3 : entry.getValue()) {
         String[] output = new String[5];
         output[0] = instantTime;
         output[1] = String.valueOf(tuple3._3());
@@ -136,24 +136,21 @@ public class HoodieLogFileCommand implements CommandMarker {
         i++;
       }
     }
-    return HoodiePrintHelper.print(
-        new String[]{"InstantTime", "RecordCount", "BlockType", "HeaderMetadata", "FooterMetadata"},
-        rows);
+    return HoodiePrintHelper
+        .print(new String[] {"InstantTime", "RecordCount", "BlockType", "HeaderMetadata", "FooterMetadata"},
+            rows);
   }

   @CliCommand(value = "show logfile records", help = "Read records from log files")
-  public String showLogFileRecords(
-      @CliOption(key = {
-          "limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10")
-      final Integer limit,
+  public String showLogFileRecords(@CliOption(key = {
+      "limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10") final Integer limit,
       @CliOption(key = "logFilePathPattern", mandatory = true, help = "Fully qualified paths for the log files")
       final String logFilePathPattern,
       @CliOption(key = "mergeRecords", mandatory = false, help = "If the records in the log files should be merged",
-          unspecifiedDefaultValue = "false")
-      final Boolean shouldMerge) throws IOException {
-    System.out
-        .println("===============> Showing only " + limit + " records <===============");
+          unspecifiedDefaultValue = "false") final Boolean shouldMerge)
+      throws IOException {
+    System.out.println("===============> Showing only " + limit + " records <===============");
     FileSystem fs = HoodieCLI.tableMetadata.getFs();
     List<String> logFilePaths = Arrays.stream(fs.globStatus(new Path(logFilePathPattern)))
@@ -162,9 +159,8 @@ public class HoodieLogFileCommand implements CommandMarker {
     // TODO : readerSchema can change across blocks/log files, fix this inside Scanner
     AvroSchemaConverter converter = new AvroSchemaConverter();
     // get schema from last log file
-    Schema readerSchema = converter
-        .convert(SchemaUtil
-            .readSchemaFromLogFile(fs, new Path(logFilePaths.get(logFilePaths.size() - 1))));
+    Schema readerSchema = converter.convert(
+        SchemaUtil.readSchemaFromLogFile(fs, new Path(logFilePaths.get(logFilePaths.size() - 1))));
     List<IndexedRecord> allRecords = new ArrayList<>();
@@ -186,11 +182,10 @@ public class HoodieLogFileCommand implements CommandMarker {
       }
     } else {
       for (String logFile : logFilePaths) {
-        Schema writerSchema = new AvroSchemaConverter()
-            .convert(SchemaUtil
-                .readSchemaFromLogFile(HoodieCLI.tableMetadata.getFs(), new Path(logFile)));
-        HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(fs,
-            new HoodieLogFile(new Path(logFile)), writerSchema);
+        Schema writerSchema = new AvroSchemaConverter().convert(
+            SchemaUtil.readSchemaFromLogFile(HoodieCLI.tableMetadata.getFs(), new Path(logFile)));
+        HoodieLogFormat.Reader reader = HoodieLogFormat
+            .newReader(fs, new HoodieLogFile(new Path(logFile)), writerSchema);
         // read the avro blocks
         while (reader.hasNext()) {
           HoodieLogBlock n = reader.next();
@@ -216,7 +211,6 @@ public class HoodieLogFileCommand implements CommandMarker {
       rows[i] = data;
       i++;
     }
-    return HoodiePrintHelper.print(
-        new String[]{"Records"}, rows);
+    return HoodiePrintHelper.print(new String[] {"Records"}, rows);
   }
 }


@@ -40,26 +40,22 @@ public class HoodieSyncCommand implements CommandMarker {
@CliCommand(value = "sync validate", help = "Validate the sync by counting the number of records") @CliCommand(value = "sync validate", help = "Validate the sync by counting the number of records")
public String validateSync( public String validateSync(
@CliOption(key = {"mode"}, unspecifiedDefaultValue = "complete", help = "Check mode") @CliOption(key = {"mode"}, unspecifiedDefaultValue = "complete", help = "Check mode") final String mode,
final String mode, @CliOption(key = {"sourceDb"}, unspecifiedDefaultValue = "rawdata", help = "source database") final String srcDb,
@CliOption(key = { @CliOption(key = {
"sourceDb"}, unspecifiedDefaultValue = "rawdata", help = "source database") "targetDb"}, unspecifiedDefaultValue = "dwh_hoodie", help = "target database") final String tgtDb,
final String srcDb,
@CliOption(key = {
"targetDb"}, unspecifiedDefaultValue = "dwh_hoodie", help = "target database")
final String tgtDb,
@CliOption(key = { @CliOption(key = {
"partitionCount"}, unspecifiedDefaultValue = "5", help = "total number of recent partitions to validate") "partitionCount"}, unspecifiedDefaultValue = "5", help = "total number of recent partitions to validate")
final int partitionCount, final int partitionCount,
@CliOption(key = { @CliOption(key = {
"hiveServerUrl"}, mandatory = true, help = "hiveServerURL to connect to") "hiveServerUrl"}, mandatory = true, help = "hiveServerURL to connect to") final String hiveServerUrl,
final String hiveServerUrl,
@CliOption(key = { @CliOption(key = {
"hiveUser"}, mandatory = false, unspecifiedDefaultValue = "", help = "hive username to connect to") "hiveUser"}, mandatory = false, unspecifiedDefaultValue = "", help = "hive username to connect to") final
final String hiveUser, String hiveUser,
@CliOption(key = { @CliOption(key = {
"hivePass"}, mandatory = true, unspecifiedDefaultValue = "", help = "hive password to connect to") "hivePass"}, mandatory = true, unspecifiedDefaultValue = "", help = "hive password to connect to") final
final String hivePass) throws Exception { String hivePass)
throws Exception {
HoodieTableMetaClient target = HoodieCLI.syncTableMetadata; HoodieTableMetaClient target = HoodieCLI.syncTableMetadata;
HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsTimeline(); HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsTimeline();
HoodieTableMetaClient source = HoodieCLI.tableMetadata; HoodieTableMetaClient source = HoodieCLI.tableMetadata;
@@ -70,52 +66,42 @@ public class HoodieSyncCommand implements CommandMarker {
       sourceCount = HiveUtil.countRecords(hiveServerUrl, source, srcDb, hiveUser, hivePass);
       targetCount = HiveUtil.countRecords(hiveServerUrl, target, tgtDb, hiveUser, hivePass);
     } else if ("latestPartitions".equals(mode)) {
-      sourceCount = HiveUtil
-          .countRecords(hiveServerUrl, source, srcDb, partitionCount, hiveUser, hivePass);
-      targetCount = HiveUtil
-          .countRecords(hiveServerUrl, target, tgtDb, partitionCount, hiveUser, hivePass);
+      sourceCount = HiveUtil.countRecords(hiveServerUrl, source, srcDb, partitionCount, hiveUser, hivePass);
+      targetCount = HiveUtil.countRecords(hiveServerUrl, target, tgtDb, partitionCount, hiveUser, hivePass);
     }
     String targetLatestCommit =
-        targetTimeline.getInstants().iterator().hasNext() ? "0"
-            : targetTimeline.lastInstant().get().getTimestamp();
+        targetTimeline.getInstants().iterator().hasNext() ? "0" : targetTimeline.lastInstant().get().getTimestamp();
     String sourceLatestCommit =
-        sourceTimeline.getInstants().iterator().hasNext() ? "0"
-            : sourceTimeline.lastInstant().get().getTimestamp();
+        sourceTimeline.getInstants().iterator().hasNext() ? "0" : sourceTimeline.lastInstant().get().getTimestamp();
-    if (sourceLatestCommit != null && HoodieTimeline
-        .compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) {
+    if (sourceLatestCommit != null && HoodieTimeline.compareTimestamps(targetLatestCommit, sourceLatestCommit,
+        HoodieTimeline.GREATER)) {
       // source is behind the target
-      List<HoodieInstant> commitsToCatchup =
-          targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE).getInstants()
-              .collect(Collectors.toList());
+      List<HoodieInstant> commitsToCatchup = targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE)
+          .getInstants().collect(Collectors.toList());
       if (commitsToCatchup.isEmpty()) {
-        return "Count difference now is (count(" + target.getTableConfig().getTableName()
-            + ") - count(" + source.getTableConfig().getTableName() + ") == " + (targetCount
-            - sourceCount);
+        return "Count difference now is (count(" + target.getTableConfig().getTableName() + ") - count("
+            + source.getTableConfig().getTableName() + ") == " + (targetCount - sourceCount);
       } else {
         long newInserts = CommitUtil.countNewRecords(target,
-            commitsToCatchup.stream().map(HoodieInstant::getTimestamp)
-                .collect(Collectors.toList()));
-        return "Count difference now is (count(" + target.getTableConfig().getTableName()
-            + ") - count(" + source.getTableConfig().getTableName() + ") == " + (targetCount
-            - sourceCount) + ". Catch up count is " + newInserts;
+            commitsToCatchup.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList()));
+        return "Count difference now is (count(" + target.getTableConfig().getTableName() + ") - count("
+            + source.getTableConfig().getTableName()
+            + ") == " + (targetCount - sourceCount) + ". Catch up count is " + newInserts;
       }
     } else {
-      List<HoodieInstant> commitsToCatchup =
-          sourceTimeline.findInstantsAfter(targetLatestCommit, Integer.MAX_VALUE).getInstants()
-              .collect(Collectors.toList());
+      List<HoodieInstant> commitsToCatchup = sourceTimeline.findInstantsAfter(targetLatestCommit, Integer.MAX_VALUE)
+          .getInstants().collect(Collectors.toList());
       if (commitsToCatchup.isEmpty()) {
-        return "Count difference now is (count(" + source.getTableConfig().getTableName()
-            + ") - count(" + target.getTableConfig().getTableName() + ") == " + (sourceCount
-            - targetCount);
+        return "Count difference now is (count(" + source.getTableConfig().getTableName() + ") - count("
+            + target.getTableConfig().getTableName() + ") == " + (sourceCount - targetCount);
       } else {
         long newInserts = CommitUtil.countNewRecords(source,
-            commitsToCatchup.stream().map(HoodieInstant::getTimestamp)
-                .collect(Collectors.toList()));
-        return "Count difference now is (count(" + source.getTableConfig().getTableName()
-            + ") - count(" + target.getTableConfig().getTableName() + ") == " + (sourceCount
-            - targetCount) + ". Catch up count is " + newInserts;
+            commitsToCatchup.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList()));
+        return "Count difference now is (count(" + source.getTableConfig().getTableName() + ") - count("
+            + target.getTableConfig().getTableName()
+            + ") == " + (sourceCount - targetCount) + ". Catch up count is " + newInserts;
       }
     }


@@ -45,20 +45,20 @@ public class RepairsCommand implements CommandMarker {
     return HoodieCLI.tableMetadata != null;
   }

-  @CliCommand(value = "repair deduplicate", help = "De-duplicate a partition path contains duplicates & produce repaired files to replace with")
-  public String deduplicate(
-      @CliOption(key = {
-          "duplicatedPartitionPath"}, help = "Partition Path containing the duplicates", mandatory = true)
-      final String duplicatedPartitionPath,
-      @CliOption(key = {
-          "repairedOutputPath"}, help = "Location to place the repaired files", mandatory = true)
-      final String repairedOutputPath,
-      @CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path", mandatory = true)
-      final String sparkPropertiesPath) throws Exception {
+  @CliCommand(value = "repair deduplicate", help = "De-duplicate a partition path contains duplicates & produce "
+      + "repaired files to replace with")
+  public String deduplicate(@CliOption(key = {
+      "duplicatedPartitionPath"}, help = "Partition Path containing the duplicates", mandatory = true) final String
+      duplicatedPartitionPath,
+      @CliOption(key = {
+          "repairedOutputPath"}, help = "Location to place the repaired files", mandatory = true) final String
+          repairedOutputPath,
+      @CliOption(key = {
+          "sparkProperties"}, help = "Spark Properites File Path", mandatory = true) final String sparkPropertiesPath)
+      throws Exception {
     SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
-    sparkLauncher
-        .addAppArgs(SparkMain.SparkCommand.DEDUPLICATE.toString(), duplicatedPartitionPath,
-            repairedOutputPath, HoodieCLI.tableMetadata.getBasePath());
+    sparkLauncher.addAppArgs(SparkMain.SparkCommand.DEDUPLICATE.toString(), duplicatedPartitionPath, repairedOutputPath,
+        HoodieCLI.tableMetadata.getBasePath());
     Process process = sparkLauncher.launch();
     InputStreamConsumer.captureOutput(process);
     int exitCode = process.waitFor();
@@ -71,14 +71,12 @@ public class RepairsCommand implements CommandMarker {
@CliCommand(value = "repair addpartitionmeta", help = "Add partition metadata to a dataset, if not present") @CliCommand(value = "repair addpartitionmeta", help = "Add partition metadata to a dataset, if not present")
public String addPartitionMeta( public String addPartitionMeta(@CliOption(key = {
@CliOption(key = {"dryrun"}, "dryrun"}, help = "Should we actually add or just print what would be done", unspecifiedDefaultValue = "true")
help = "Should we actually add or just print what would be done",
unspecifiedDefaultValue = "true")
final boolean dryRun) throws IOException { final boolean dryRun) throws IOException {
String latestCommit = HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline() String latestCommit = HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline().lastInstant().get()
.lastInstant().get().getTimestamp(); .getTimestamp();
List<String> partitionPaths = FSUtils.getAllFoldersThreeLevelsDown(HoodieCLI.fs, List<String> partitionPaths = FSUtils.getAllFoldersThreeLevelsDown(HoodieCLI.fs,
HoodieCLI.tableMetadata.getBasePath()); HoodieCLI.tableMetadata.getBasePath());
Path basePath = new Path(HoodieCLI.tableMetadata.getBasePath()); Path basePath = new Path(HoodieCLI.tableMetadata.getBasePath());
@@ -94,10 +92,7 @@ public class RepairsCommand implements CommandMarker {
       if (!HoodiePartitionMetadata.hasPartitionMetadata(HoodieCLI.fs, partitionPath)) {
         row[1] = "No";
         if (!dryRun) {
-          HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(
-              HoodieCLI.fs,
-              latestCommit,
-              basePath,
+          HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(HoodieCLI.fs, latestCommit, basePath,
               partitionPath);
           partitionMetadata.trySave(0);
         }
@@ -105,7 +100,6 @@ public class RepairsCommand implements CommandMarker {
       rows[ind++] = row;
     }
-    return HoodiePrintHelper.print(
-        new String[]{"Partition Path", "Metadata Present?", "Action"}, rows);
+    return HoodiePrintHelper.print(new String[] {"Partition Path", "Metadata Present?", "Action"}, rows);
   }
 }


@@ -13,6 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package com.uber.hoodie.cli.commands;

 import com.uber.hoodie.HoodieWriteClient;
@@ -60,8 +61,8 @@ public class SavepointsCommand implements CommandMarker {
   @CliAvailabilityIndicator({"savepoint rollback"})
   public boolean isRollbackToSavepointAvailable() {
-    return HoodieCLI.tableMetadata != null && !HoodieCLI.tableMetadata.getActiveTimeline()
-        .getSavePointTimeline().filterCompletedInstants().empty();
+    return HoodieCLI.tableMetadata != null && !HoodieCLI.tableMetadata.getActiveTimeline().getSavePointTimeline()
+        .filterCompletedInstants().empty();
   }

   @CliCommand(value = "savepoints show", help = "Show the savepoints")
@@ -79,17 +80,13 @@ public class SavepointsCommand implements CommandMarker {
   }

   @CliCommand(value = "savepoint create", help = "Savepoint a commit")
-  public String savepoint(
-      @CliOption(key = {"commit"}, help = "Commit to savepoint")
-      final String commitTime,
-      @CliOption(key = {"user"}, help = "User who is creating the savepoint")
-      final String user,
-      @CliOption(key = {"comments"}, help = "Comments for creating the savepoint")
-      final String comments) throws Exception {
+  public String savepoint(@CliOption(key = {"commit"}, help = "Commit to savepoint") final String commitTime,
+      @CliOption(key = {"user"}, help = "User who is creating the savepoint") final String user,
+      @CliOption(key = {"comments"}, help = "Comments for creating the savepoint") final String comments)
+      throws Exception {
     HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
     HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants();
-    HoodieInstant
-        commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
+    HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
     if (!timeline.containsInstant(commitInstant)) {
       return "Commit " + commitTime + " not found in Commits " + timeline;
@@ -106,22 +103,19 @@ public class SavepointsCommand implements CommandMarker {
@CliCommand(value = "savepoint rollback", help = "Savepoint a commit") @CliCommand(value = "savepoint rollback", help = "Savepoint a commit")
public String rollbackToSavepoint( public String rollbackToSavepoint(
@CliOption(key = {"savepoint"}, help = "Savepoint to rollback") @CliOption(key = {"savepoint"}, help = "Savepoint to rollback") final String commitTime,
final String commitTime, @CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path") final String sparkPropertiesPath)
@CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path") throws Exception {
final String sparkPropertiesPath) throws Exception {
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline(); HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants(); HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants();
HoodieInstant HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
if (!timeline.containsInstant(commitInstant)) { if (!timeline.containsInstant(commitInstant)) {
return "Commit " + commitTime + " not found in Commits " + timeline; return "Commit " + commitTime + " not found in Commits " + timeline;
} }
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
sparkLauncher.addAppArgs(SparkMain.SparkCommand.ROLLBACK_TO_SAVEPOINT.toString(), sparkLauncher.addAppArgs(SparkMain.SparkCommand.ROLLBACK_TO_SAVEPOINT.toString(), commitTime,
commitTime,
HoodieCLI.tableMetadata.getBasePath()); HoodieCLI.tableMetadata.getBasePath());
Process process = sparkLauncher.launch(); Process process = sparkLauncher.launch();
InputStreamConsumer.captureOutput(process); InputStreamConsumer.captureOutput(process);
@@ -137,18 +131,14 @@ public class SavepointsCommand implements CommandMarker {
@CliCommand(value = "savepoints refresh", help = "Refresh the savepoints") @CliCommand(value = "savepoints refresh", help = "Refresh the savepoints")
public String refreshMetaClient() throws IOException { public String refreshMetaClient() throws IOException {
HoodieTableMetaClient metadata = HoodieTableMetaClient metadata = new HoodieTableMetaClient(HoodieCLI.conf, HoodieCLI.tableMetadata.getBasePath());
new HoodieTableMetaClient(HoodieCLI.conf, HoodieCLI.tableMetadata.getBasePath());
HoodieCLI.setTableMetadata(metadata); HoodieCLI.setTableMetadata(metadata);
return "Metadata for table " + metadata.getTableConfig().getTableName() + " refreshed."; return "Metadata for table " + metadata.getTableConfig().getTableName() + " refreshed.";
} }
private static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath) private static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath) throws Exception {
throws Exception { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withIndexConfig(
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build();
.withIndexConfig(
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
.build();
return new HoodieWriteClient(jsc, config, false); return new HoodieWriteClient(jsc, config, false);
} }
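
A minimal sketch of what these savepoint commands ultimately drive, assuming a local Spark context and an existing dataset at the illustrative path /tmp/hoodie/sample-table (the commit time, user and comment are likewise made up, and the com.uber.hoodie package layout is assumed for the config and index classes):

// Sketch only: build a write client the same way createHoodieClient above does,
// then savepoint a commit and roll the dataset back to it.
import com.uber.hoodie.HoodieWriteClient;
import com.uber.hoodie.config.HoodieIndexConfig;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.index.HoodieIndex;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class SavepointExample {
  public static void main(String[] args) throws Exception {
    JavaSparkContext jsc = new JavaSparkContext(
        new SparkConf().setMaster("local[2]").setAppName("savepoint-example"));
    HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath("/tmp/hoodie/sample-table")
        .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
        .build();
    HoodieWriteClient client = new HoodieWriteClient(jsc, config, false);
    client.savepoint("20180320162920", "data-eng", "snapshot before backfill"); // "savepoint create"
    client.rollbackToSavepoint("20180320162920");                               // "savepoint rollback"
    jsc.stop();
  }
}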


@@ -30,18 +30,14 @@ import org.apache.spark.sql.SQLContext;
public class SparkMain { public class SparkMain {
protected final static Logger LOG = Logger.getLogger(SparkMain.class); protected static final Logger LOG = Logger.getLogger(SparkMain.class);
/** /**
* Commands * Commands
*/ */
enum SparkCommand { enum SparkCommand {
ROLLBACK, ROLLBACK, DEDUPLICATE, ROLLBACK_TO_SAVEPOINT, SAVEPOINT, IMPORT
DEDUPLICATE,
ROLLBACK_TO_SAVEPOINT,
SAVEPOINT,
IMPORT
} }
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
@@ -67,18 +63,19 @@ public class SparkMain {
break; break;
case IMPORT: case IMPORT:
assert (args.length == 11); assert (args.length == 11);
returnCode = dataImport(jsc, args[1], args[2], args[3], args[4], args[5], args[6], returnCode = dataImport(jsc, args[1], args[2], args[3], args[4], args[5], args[6], Integer.parseInt(args[7]),
Integer.parseInt(args[7]), args[8], SparkUtil.DEFUALT_SPARK_MASTER, args[9], args[8], SparkUtil.DEFUALT_SPARK_MASTER, args[9], Integer.parseInt(args[10]));
Integer.parseInt(args[10])); break;
default:
break; break;
} }
System.exit(returnCode); System.exit(returnCode);
} }
private static int dataImport(JavaSparkContext jsc, String srcPath, String targetPath, private static int dataImport(JavaSparkContext jsc, String srcPath, String targetPath, String tableName,
String tableName, String tableType, String rowKey, String partitionKey, int parallelism, String tableType, String rowKey, String partitionKey, int parallelism, String schemaFile, String sparkMaster,
String schemaFile, String sparkMaster, String sparkMemory, int retry) throws Exception { String sparkMemory, int retry) throws Exception {
HDFSParquetImporter.Config cfg = new HDFSParquetImporter.Config(); HDFSParquetImporter.Config cfg = new HDFSParquetImporter.Config();
cfg.srcPath = srcPath; cfg.srcPath = srcPath;
cfg.targetPath = targetPath; cfg.targetPath = targetPath;
@@ -92,19 +89,15 @@ public class SparkMain {
return new HDFSParquetImporter(cfg).dataImport(jsc, retry); return new HDFSParquetImporter(cfg).dataImport(jsc, retry);
} }
private static int deduplicatePartitionPath(JavaSparkContext jsc, private static int deduplicatePartitionPath(JavaSparkContext jsc, String duplicatedPartitionPath,
String duplicatedPartitionPath, String repairedOutputPath, String basePath) throws Exception {
String repairedOutputPath, DedupeSparkJob job = new DedupeSparkJob(basePath, duplicatedPartitionPath, repairedOutputPath, new SQLContext(jsc),
String basePath) FSUtils.getFs(basePath, jsc.hadoopConfiguration()));
throws Exception {
DedupeSparkJob job = new DedupeSparkJob(basePath, duplicatedPartitionPath, repairedOutputPath,
new SQLContext(jsc), FSUtils.getFs(basePath, jsc.hadoopConfiguration()));
job.fixDuplicates(true); job.fixDuplicates(true);
return 0; return 0;
} }
private static int rollback(JavaSparkContext jsc, String commitTime, String basePath) private static int rollback(JavaSparkContext jsc, String commitTime, String basePath) throws Exception {
throws Exception {
HoodieWriteClient client = createHoodieClient(jsc, basePath); HoodieWriteClient client = createHoodieClient(jsc, basePath);
if (client.rollback(commitTime)) { if (client.rollback(commitTime)) {
LOG.info(String.format("The commit \"%s\" rolled back.", commitTime)); LOG.info(String.format("The commit \"%s\" rolled back.", commitTime));
@@ -115,9 +108,7 @@ public class SparkMain {
} }
} }
private static int rollbackToSavepoint(JavaSparkContext jsc, String savepointTime, private static int rollbackToSavepoint(JavaSparkContext jsc, String savepointTime, String basePath) throws Exception {
String basePath)
throws Exception {
HoodieWriteClient client = createHoodieClient(jsc, basePath); HoodieWriteClient client = createHoodieClient(jsc, basePath);
if (client.rollbackToSavepoint(savepointTime)) { if (client.rollbackToSavepoint(savepointTime)) {
LOG.info(String.format("The commit \"%s\" rolled back.", savepointTime)); LOG.info(String.format("The commit \"%s\" rolled back.", savepointTime));
@@ -128,12 +119,9 @@ public class SparkMain {
} }
} }
private static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath) private static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath) throws Exception {
throws Exception { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withIndexConfig(
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build();
.withIndexConfig(
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
.build();
return new HoodieWriteClient(jsc, config); return new HoodieWriteClient(jsc, config);
} }
} }
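
For reference, SparkMain's calling convention is that args[0] names the SparkCommand and the remaining positional arguments belong to that command, as the savepoint rollback launcher call above shows. A minimal sketch of an equivalent direct invocation (the timestamp and path are illustrative, and the example is assumed to live in SparkMain's own package so no import is needed):

// Sketch only: mirrors sparkLauncher.addAppArgs(ROLLBACK_TO_SAVEPOINT.toString(), commitTime, basePath).
public class SparkMainInvocationExample {
  public static void main(String[] args) throws Exception {
    SparkMain.main(new String[] {"ROLLBACK_TO_SAVEPOINT", "20180320162920", "/tmp/hoodie/sample-table"});
  }
}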


@@ -16,7 +16,6 @@
package com.uber.hoodie.cli.commands; package com.uber.hoodie.cli.commands;
import com.codahale.metrics.Histogram; import com.codahale.metrics.Histogram;
import com.codahale.metrics.Snapshot; import com.codahale.metrics.Snapshot;
import com.codahale.metrics.UniformReservoir; import com.codahale.metrics.UniformReservoir;
@@ -44,12 +43,15 @@ import org.springframework.stereotype.Component;
@Component @Component
public class StatsCommand implements CommandMarker { public class StatsCommand implements CommandMarker {
private static final int MAX_FILES = 1000000;
@CliAvailabilityIndicator({"stats wa"}) @CliAvailabilityIndicator({"stats wa"})
public boolean isWriteAmpAvailable() { public boolean isWriteAmpAvailable() {
return HoodieCLI.tableMetadata != null; return HoodieCLI.tableMetadata != null;
} }
@CliCommand(value = "stats wa", help = "Write Amplification. Ratio of how many records were upserted to how many records were actually written") @CliCommand(value = "stats wa", help = "Write Amplification. Ratio of how many records were upserted to how many "
+ "records were actually written")
public String writeAmplificationStats() throws IOException { public String writeAmplificationStats() throws IOException {
long totalRecordsUpserted = 0; long totalRecordsUpserted = 0;
long totalRecordsWritten = 0; long totalRecordsWritten = 0;
@@ -60,18 +62,13 @@ public class StatsCommand implements CommandMarker {
String[][] rows = new String[new Long(timeline.countInstants()).intValue() + 1][]; String[][] rows = new String[new Long(timeline.countInstants()).intValue() + 1][];
int i = 0; int i = 0;
DecimalFormat df = new DecimalFormat("#.00"); DecimalFormat df = new DecimalFormat("#.00");
for (HoodieInstant commitTime : timeline.getInstants().collect( for (HoodieInstant commitTime : timeline.getInstants().collect(Collectors.toList())) {
Collectors.toList())) {
String waf = "0"; String waf = "0";
HoodieCommitMetadata commit = HoodieCommitMetadata HoodieCommitMetadata commit = HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(commitTime).get());
.fromBytes(activeTimeline.getInstantDetails(commitTime).get());
if (commit.fetchTotalUpdateRecordsWritten() > 0) { if (commit.fetchTotalUpdateRecordsWritten() > 0) {
waf = df.format( waf = df.format((float) commit.fetchTotalRecordsWritten() / commit.fetchTotalUpdateRecordsWritten());
(float) commit.fetchTotalRecordsWritten() / commit
.fetchTotalUpdateRecordsWritten());
} }
rows[i++] = new String[]{commitTime.getTimestamp(), rows[i++] = new String[] {commitTime.getTimestamp(), String.valueOf(commit.fetchTotalUpdateRecordsWritten()),
String.valueOf(commit.fetchTotalUpdateRecordsWritten()),
String.valueOf(commit.fetchTotalRecordsWritten()), waf}; String.valueOf(commit.fetchTotalRecordsWritten()), waf};
totalRecordsUpserted += commit.fetchTotalUpdateRecordsWritten(); totalRecordsUpserted += commit.fetchTotalUpdateRecordsWritten();
totalRecordsWritten += commit.fetchTotalRecordsWritten(); totalRecordsWritten += commit.fetchTotalRecordsWritten();
@@ -80,43 +77,32 @@ public class StatsCommand implements CommandMarker {
if (totalRecordsUpserted > 0) { if (totalRecordsUpserted > 0) {
waf = df.format((float) totalRecordsWritten / totalRecordsUpserted); waf = df.format((float) totalRecordsWritten / totalRecordsUpserted);
} }
rows[i] = new String[]{"Total", String.valueOf(totalRecordsUpserted), rows[i] = new String[] {"Total", String.valueOf(totalRecordsUpserted), String.valueOf(totalRecordsWritten), waf};
String.valueOf(totalRecordsWritten), waf}; return HoodiePrintHelper
return HoodiePrintHelper.print( .print(new String[] {"CommitTime", "Total Upserted", "Total Written", "Write Amplifiation Factor"},
new String[]{"CommitTime", "Total Upserted", "Total Written", rows);
"Write Amplifiation Factor"}, rows);
} }
private String[] printFileSizeHistogram(String commitTime, Snapshot s) { private String[] printFileSizeHistogram(String commitTime, Snapshot s) {
return new String[]{ return new String[] {commitTime, NumericUtils.humanReadableByteCount(s.getMin()),
commitTime, NumericUtils.humanReadableByteCount(s.getValue(0.1)), NumericUtils.humanReadableByteCount(s.getMedian()),
NumericUtils.humanReadableByteCount(s.getMin()), NumericUtils.humanReadableByteCount(s.getMean()), NumericUtils.humanReadableByteCount(s.get95thPercentile()),
NumericUtils.humanReadableByteCount(s.getValue(0.1)), NumericUtils.humanReadableByteCount(s.getMax()), String.valueOf(s.size()),
NumericUtils.humanReadableByteCount(s.getMedian()), NumericUtils.humanReadableByteCount(s.getStdDev())};
NumericUtils.humanReadableByteCount(s.getMean()),
NumericUtils.humanReadableByteCount(s.get95thPercentile()),
NumericUtils.humanReadableByteCount(s.getMax()),
String.valueOf(s.size()),
NumericUtils.humanReadableByteCount(s.getStdDev())
};
} }
@CliCommand(value = "stats filesizes", help = "File Sizes. Display summary stats on sizes of files") @CliCommand(value = "stats filesizes", help = "File Sizes. Display summary stats on sizes of files")
public String fileSizeStats( public String fileSizeStats(@CliOption(key = {
@CliOption(key = { "partitionPath"}, help = "regex to select files, eg: 2016/08/02", unspecifiedDefaultValue = "*/*/*") final
"partitionPath"}, help = "regex to select files, eg: 2016/08/02", unspecifiedDefaultValue = "*/*/*") String globRegex) throws IOException {
final String globRegex) throws IOException {
FileSystem fs = HoodieCLI.fs; FileSystem fs = HoodieCLI.fs;
String globPath = String.format("%s/%s/*", String globPath = String.format("%s/%s/*", HoodieCLI.tableMetadata.getBasePath(), globRegex);
HoodieCLI.tableMetadata.getBasePath(),
globRegex);
FileStatus[] statuses = fs.globStatus(new Path(globPath)); FileStatus[] statuses = fs.globStatus(new Path(globPath));
// max, min, #small files < 10MB, 50th, avg, 95th // max, min, #small files < 10MB, 50th, avg, 95th
final int MAX_FILES = 1000000;
Histogram globalHistogram = new Histogram(new UniformReservoir(MAX_FILES)); Histogram globalHistogram = new Histogram(new UniformReservoir(MAX_FILES));
HashMap<String, Histogram> commitHistoMap = new HashMap<String, Histogram>(); HashMap<String, Histogram> commitHistoMap = new HashMap<String, Histogram>();
for (FileStatus fileStatus : statuses) { for (FileStatus fileStatus : statuses) {
@@ -138,8 +124,8 @@ public class StatsCommand implements CommandMarker {
Snapshot s = globalHistogram.getSnapshot(); Snapshot s = globalHistogram.getSnapshot();
rows[ind++] = printFileSizeHistogram("ALL", s); rows[ind++] = printFileSizeHistogram("ALL", s);
return HoodiePrintHelper.print( return HoodiePrintHelper
new String[]{"CommitTime", "Min", "10th", "50th", "avg", "95th", "Max", "NumFiles", .print(new String[] {"CommitTime", "Min", "10th", "50th", "avg", "95th", "Max", "NumFiles", "StdDev"},
"StdDev"}, rows); rows);
} }
} }
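
To make the "stats wa" output above concrete: the factor reported is total records written divided by total records upserted, so a commit that rewrote 10,000 rows to apply 1,000 updates reports 10.00. A standalone sketch of the same arithmetic (the counts are made up):

import java.text.DecimalFormat;

public class WriteAmplificationExample {
  public static void main(String[] args) {
    long totalRecordsUpserted = 1000L;  // updates requested (illustrative)
    long totalRecordsWritten = 10000L;  // records actually rewritten to apply them (illustrative)
    DecimalFormat df = new DecimalFormat("#.00");
    // Same formula StatsCommand uses: written / upserted
    String waf = df.format((float) totalRecordsWritten / totalRecordsUpserted);
    System.out.println("Write Amplification Factor = " + waf); // prints 10.00
  }
}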


@@ -25,9 +25,7 @@ import org.springframework.stereotype.Component;
public class UtilsCommand implements CommandMarker { public class UtilsCommand implements CommandMarker {
@CliCommand(value = "utils loadClass", help = "Load a class") @CliCommand(value = "utils loadClass", help = "Load a class")
public String loadClass( public String loadClass(@CliOption(key = {"class"}, help = "Check mode") final String clazz) throws Exception {
@CliOption(key = {"class"}, help = "Check mode") final String clazz
) throws Exception {
Class klass = Class.forName(clazz); Class klass = Class.forName(clazz);
return klass.getProtectionDomain().getCodeSource().getLocation().toExternalForm(); return klass.getProtectionDomain().getCodeSource().getLocation().toExternalForm();
} }


@@ -25,15 +25,12 @@ import java.util.List;
public class CommitUtil { public class CommitUtil {
public static long countNewRecords(HoodieTableMetaClient target, List<String> commitsToCatchup) public static long countNewRecords(HoodieTableMetaClient target, List<String> commitsToCatchup) throws IOException {
throws IOException {
long totalNew = 0; long totalNew = 0;
HoodieTimeline timeline = target.getActiveTimeline().reload().getCommitTimeline() HoodieTimeline timeline = target.getActiveTimeline().reload().getCommitTimeline().filterCompletedInstants();
.filterCompletedInstants();
for (String commit : commitsToCatchup) { for (String commit : commitsToCatchup) {
HoodieCommitMetadata c = HoodieCommitMetadata.fromBytes(timeline HoodieCommitMetadata c = HoodieCommitMetadata.fromBytes(
.getInstantDetails(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commit)) timeline.getInstantDetails(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commit)).get());
.get());
totalNew += c.fetchTotalRecordsWritten() - c.fetchTotalUpdateRecordsWritten(); totalNew += c.fetchTotalRecordsWritten() - c.fetchTotalUpdateRecordsWritten();
} }
return totalNew; return totalNew;


@@ -27,7 +27,7 @@ import org.joda.time.DateTime;
public class HiveUtil { public class HiveUtil {
private static String driverName = "org.apache.hive.jdbc.HiveDriver"; private static final String driverName = "org.apache.hive.jdbc.HiveDriver";
static { static {
try { try {
@@ -39,8 +39,7 @@ public class HiveUtil {
private static Connection connection; private static Connection connection;
private static Connection getConnection(String jdbcUrl, String user, String pass) private static Connection getConnection(String jdbcUrl, String user, String pass) throws SQLException {
throws SQLException {
DataSource ds = getDatasource(jdbcUrl, user, pass); DataSource ds = getDatasource(jdbcUrl, user, pass);
return ds.getConnection(); return ds.getConnection();
} }
@@ -54,8 +53,8 @@ public class HiveUtil {
return ds; return ds;
} }
public static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String dbName, public static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String dbName, String user, String pass)
String user, String pass) throws SQLException { throws SQLException {
Connection conn = HiveUtil.getConnection(jdbcUrl, user, pass); Connection conn = HiveUtil.getConnection(jdbcUrl, user, pass);
ResultSet rs = null; ResultSet rs = null;
Statement stmt = conn.createStatement(); Statement stmt = conn.createStatement();
@@ -64,15 +63,13 @@ public class HiveUtil {
stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat"); stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat");
stmt.execute("set hive.stats.autogather=false"); stmt.execute("set hive.stats.autogather=false");
rs = stmt.executeQuery( rs = stmt.executeQuery(
"select count(`_hoodie_commit_time`) as cnt from " + dbName + "." + source "select count(`_hoodie_commit_time`) as cnt from " + dbName + "."
.getTableConfig() + source.getTableConfig().getTableName());
.getTableName());
long count = -1; long count = -1;
if (rs.next()) { if (rs.next()) {
count = rs.getLong("cnt"); count = rs.getLong("cnt");
} }
System.out System.out.println("Total records in " + source.getTableConfig().getTableName() + " is " + count);
.println("Total records in " + source.getTableConfig().getTableName() + " is " + count);
return count; return count;
} finally { } finally {
if (rs != null) { if (rs != null) {
@@ -84,22 +81,19 @@ public class HiveUtil {
} }
} }
public static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String srcDb, public static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String srcDb, int partitions,
int partitions, String user, String pass) throws SQLException { String user, String pass) throws SQLException {
DateTime dateTime = DateTime.now(); DateTime dateTime = DateTime.now();
String endDateStr = String endDateStr = dateTime.getYear() + "-" + String.format("%02d", dateTime.getMonthOfYear()) + "-"
dateTime.getYear() + "-" + String.format("%02d", dateTime.getMonthOfYear()) + "-" + + String.format("%02d", dateTime.getDayOfMonth());
String.format("%02d", dateTime.getDayOfMonth());
dateTime = dateTime.minusDays(partitions); dateTime = dateTime.minusDays(partitions);
String startDateStr = String startDateStr = dateTime.getYear() + "-" + String.format("%02d", dateTime.getMonthOfYear()) + "-"
dateTime.getYear() + "-" + String.format("%02d", dateTime.getMonthOfYear()) + "-" + + String.format("%02d", dateTime.getDayOfMonth());
String.format("%02d", dateTime.getDayOfMonth());
System.out.println("Start date " + startDateStr + " and end date " + endDateStr); System.out.println("Start date " + startDateStr + " and end date " + endDateStr);
return countRecords(jdbcUrl, source, srcDb, startDateStr, endDateStr, user, pass); return countRecords(jdbcUrl, source, srcDb, startDateStr, endDateStr, user, pass);
} }
private static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String srcDb, private static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String srcDb, String startDateStr,
String startDateStr,
String endDateStr, String user, String pass) throws SQLException { String endDateStr, String user, String pass) throws SQLException {
Connection conn = HiveUtil.getConnection(jdbcUrl, user, pass); Connection conn = HiveUtil.getConnection(jdbcUrl, user, pass);
ResultSet rs = null; ResultSet rs = null;
@@ -109,9 +103,8 @@ public class HiveUtil {
stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat"); stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat");
stmt.execute("set hive.stats.autogather=false"); stmt.execute("set hive.stats.autogather=false");
rs = stmt.executeQuery( rs = stmt.executeQuery(
"select count(`_hoodie_commit_time`) as cnt from " + srcDb + "." + source.getTableConfig() "select count(`_hoodie_commit_time`) as cnt from " + srcDb + "." + source.getTableConfig().getTableName()
.getTableName() + " where datestr>'" + startDateStr + "' and datestr<='" + " where datestr>'" + startDateStr + "' and datestr<='" + endDateStr + "'");
+ endDateStr + "'");
if (rs.next()) { if (rs.next()) {
return rs.getLong("cnt"); return rs.getLong("cnt");
} }


@@ -24,7 +24,7 @@ import java.util.logging.Logger;
public class InputStreamConsumer extends Thread { public class InputStreamConsumer extends Thread {
protected final static Logger LOG = Logger.getLogger(InputStreamConsumer.class.getName()); protected static final Logger LOG = Logger.getLogger(InputStreamConsumer.class.getName());
private InputStream is; private InputStream is;
public InputStreamConsumer(InputStream is) { public InputStreamConsumer(InputStream is) {


@@ -35,12 +35,9 @@ public class SparkUtil {
* TODO: Need to fix a bunch of hardcoded stuff here eg: history server, spark distro * TODO: Need to fix a bunch of hardcoded stuff here eg: history server, spark distro
*/ */
public static SparkLauncher initLauncher(String propertiesFile) throws URISyntaxException { public static SparkLauncher initLauncher(String propertiesFile) throws URISyntaxException {
String currentJar = new File( String currentJar = new File(SparkUtil.class.getProtectionDomain().getCodeSource().getLocation().toURI().getPath())
SparkUtil.class.getProtectionDomain().getCodeSource().getLocation().toURI().getPath())
.getAbsolutePath(); .getAbsolutePath();
SparkLauncher sparkLauncher = SparkLauncher sparkLauncher = new SparkLauncher().setAppResource(currentJar).setMainClass(SparkMain.class.getName())
new SparkLauncher().setAppResource(currentJar)
.setMainClass(SparkMain.class.getName())
.setPropertiesFile(propertiesFile); .setPropertiesFile(propertiesFile);
File libDirectory = new File(new File(currentJar).getParent(), "lib"); File libDirectory = new File(new File(currentJar).getParent(), "lib");
for (String library : libDirectory.list()) { for (String library : libDirectory.list()) {
@@ -60,8 +57,7 @@ public class SparkUtil {
// Configure hadoop conf // Configure hadoop conf
sparkConf.set("spark.hadoop.mapred.output.compress", "true"); sparkConf.set("spark.hadoop.mapred.output.compress", "true");
sparkConf.set("spark.hadoop.mapred.output.compression.codec", "true"); sparkConf.set("spark.hadoop.mapred.output.compression.codec", "true");
sparkConf.set("spark.hadoop.mapred.output.compression.codec", sparkConf.set("spark.hadoop.mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
"org.apache.hadoop.io.compress.GzipCodec");
sparkConf.set("spark.hadoop.mapred.output.compression.type", "BLOCK"); sparkConf.set("spark.hadoop.mapred.output.compression.type", "BLOCK");
sparkConf = HoodieWriteClient.registerClasses(sparkConf); sparkConf = HoodieWriteClient.registerClasses(sparkConf);


@@ -50,17 +50,17 @@ import scala.Tuple2;
*/ */
public class HoodieReadClient<T extends HoodieRecordPayload> implements Serializable { public class HoodieReadClient<T extends HoodieRecordPayload> implements Serializable {
private static Logger logger = LogManager.getLogger(HoodieReadClient.class); private static final Logger logger = LogManager.getLogger(HoodieReadClient.class);
private transient final JavaSparkContext jsc; private final transient JavaSparkContext jsc;
private transient final FileSystem fs; private final transient FileSystem fs;
/** /**
* TODO: We need to persist the index type into hoodie.properties and be able to access the index * TODO: We need to persist the index type into hoodie.properties and be able to access the index
* just with a simple basepath pointing to the dataset. Until, then just always assume a * just with a simple basepath pointing to the dataset. Until, then just always assume a
* BloomIndex * BloomIndex
*/ */
private transient final HoodieIndex<T> index; private final transient HoodieIndex<T> index;
private final HoodieTimeline commitTimeline; private final HoodieTimeline commitTimeline;
private HoodieTable hoodieTable; private HoodieTable hoodieTable;
private transient Optional<SQLContext> sqlContextOpt; private transient Optional<SQLContext> sqlContextOpt;
@@ -69,8 +69,7 @@ public class HoodieReadClient<T extends HoodieRecordPayload> implements Serializ
* @param basePath path to Hoodie dataset * @param basePath path to Hoodie dataset
*/ */
public HoodieReadClient(JavaSparkContext jsc, String basePath) { public HoodieReadClient(JavaSparkContext jsc, String basePath) {
this(jsc, HoodieWriteConfig.newBuilder() this(jsc, HoodieWriteConfig.newBuilder().withPath(basePath)
.withPath(basePath)
// by default we use HoodieBloomIndex // by default we use HoodieBloomIndex
.withIndexConfig( .withIndexConfig(
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
@@ -78,7 +77,6 @@ public class HoodieReadClient<T extends HoodieRecordPayload> implements Serializ
} }
/** /**
*
* @param jsc * @param jsc
* @param basePath * @param basePath
* @param sqlContext * @param sqlContext
@@ -96,8 +94,8 @@ public class HoodieReadClient<T extends HoodieRecordPayload> implements Serializ
this.jsc = jsc; this.jsc = jsc;
this.fs = FSUtils.getFs(basePath, jsc.hadoopConfiguration()); this.fs = FSUtils.getFs(basePath, jsc.hadoopConfiguration());
// Create a Hoodie table which encapsulated the commits and files visible // Create a Hoodie table which encapsulated the commits and files visible
this.hoodieTable = HoodieTable.getHoodieTable( this.hoodieTable = HoodieTable
new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath, true), .getHoodieTable(new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath, true),
clientConfig); clientConfig);
this.commitTimeline = hoodieTable.getCommitTimeline().filterCompletedInstants(); this.commitTimeline = hoodieTable.getCommitTimeline().filterCompletedInstants();
this.index = HoodieIndex.createIndex(clientConfig, jsc); this.index = HoodieIndex.createIndex(clientConfig, jsc);
@@ -126,33 +124,27 @@ public class HoodieReadClient<T extends HoodieRecordPayload> implements Serializ
* *
* @return a dataframe * @return a dataframe
*/ */
public Dataset<Row> read(JavaRDD<HoodieKey> hoodieKeys, int parallelism) public Dataset<Row> read(JavaRDD<HoodieKey> hoodieKeys, int parallelism) throws Exception {
throws Exception {
assertSqlContext(); assertSqlContext();
JavaPairRDD<HoodieKey, Optional<String>> keyToFileRDD = JavaPairRDD<HoodieKey, Optional<String>> keyToFileRDD = index
index.fetchRecordLocation(hoodieKeys, hoodieTable); .fetchRecordLocation(hoodieKeys, hoodieTable);
List<String> paths = keyToFileRDD List<String> paths = keyToFileRDD.filter(keyFileTuple -> keyFileTuple._2().isPresent())
.filter(keyFileTuple -> keyFileTuple._2().isPresent()) .map(keyFileTuple -> keyFileTuple._2().get()).collect();
.map(keyFileTuple -> keyFileTuple._2().get())
.collect();
// record locations might be same for multiple keys, so need a unique list // record locations might be same for multiple keys, so need a unique list
Set<String> uniquePaths = new HashSet<>(paths); Set<String> uniquePaths = new HashSet<>(paths);
Dataset<Row> originalDF = sqlContextOpt.get().read() Dataset<Row> originalDF = sqlContextOpt.get().read()
.parquet(uniquePaths.toArray(new String[uniquePaths.size()])); .parquet(uniquePaths.toArray(new String[uniquePaths.size()]));
StructType schema = originalDF.schema(); StructType schema = originalDF.schema();
JavaPairRDD<HoodieKey, Row> keyRowRDD = originalDF.javaRDD() JavaPairRDD<HoodieKey, Row> keyRowRDD = originalDF.javaRDD().mapToPair(row -> {
.mapToPair(row -> { HoodieKey key = new HoodieKey(row.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD),
HoodieKey key = new HoodieKey(
row.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD),
row.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD)); row.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD));
return new Tuple2<>(key, row); return new Tuple2<>(key, row);
}); });
// Now, we need to further filter out, for only rows that match the supplied hoodie keys // Now, we need to further filter out, for only rows that match the supplied hoodie keys
JavaRDD<Row> rowRDD = keyRowRDD.join(keyToFileRDD, parallelism) JavaRDD<Row> rowRDD = keyRowRDD.join(keyToFileRDD, parallelism).map(tuple -> tuple._2()._1());
.map(tuple -> tuple._2()._1());
return sqlContextOpt.get().createDataFrame(rowRDD, schema); return sqlContextOpt.get().createDataFrame(rowRDD, schema);
} }
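
A minimal usage sketch for the key-based read path above, assuming the three-argument constructor described by the Javadoc (jsc, basePath, sqlContext); the dataset path, key values and parallelism are illustrative, and the com.uber.hoodie.common.model package is assumed for HoodieKey:

// Sketch only: fetch specific records back as a DataFrame by HoodieKey.
import com.uber.hoodie.HoodieReadClient;
import com.uber.hoodie.common.model.HoodieKey;
import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;

public class ReadClientExample {
  public static void main(String[] args) throws Exception {
    JavaSparkContext jsc = new JavaSparkContext(
        new SparkConf().setMaster("local[2]").setAppName("read-client-example"));
    SQLContext sqlContext = new SQLContext(jsc);
    HoodieReadClient readClient = new HoodieReadClient(jsc, "/tmp/hoodie/sample-table", sqlContext);
    JavaRDD<HoodieKey> keys = jsc.parallelize(
        Arrays.asList(new HoodieKey("uuid-0001", "2016/08/02")));
    Dataset<Row> rows = readClient.read(keys, 4); // parallelism for the key-to-file join
    rows.show();
    jsc.stop();
  }
}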


@@ -81,18 +81,18 @@ import scala.Tuple2;
/** /**
* Hoodie Write Client helps you build datasets on HDFS [insert()] and then perform efficient * Hoodie Write Client helps you build datasets on HDFS [insert()] and then perform efficient
* mutations on a HDFS dataset [upsert()] * mutations on a HDFS dataset [upsert()]
* * <p>
* Note that, at any given time, there can only be one Spark job performing these operatons on a * Note that, at any given time, there can only be one Spark job performing these operatons on a
* Hoodie dataset. * Hoodie dataset.
*/ */
public class HoodieWriteClient<T extends HoodieRecordPayload> implements Serializable { public class HoodieWriteClient<T extends HoodieRecordPayload> implements Serializable {
private static Logger logger = LogManager.getLogger(HoodieWriteClient.class); private static Logger logger = LogManager.getLogger(HoodieWriteClient.class);
private transient final FileSystem fs; private final transient FileSystem fs;
private transient final JavaSparkContext jsc; private final transient JavaSparkContext jsc;
private final HoodieWriteConfig config; private final HoodieWriteConfig config;
private transient final HoodieMetrics metrics; private final transient HoodieMetrics metrics;
private transient final HoodieIndex<T> index; private final transient HoodieIndex<T> index;
private transient Timer.Context writeContext = null; private transient Timer.Context writeContext = null;
/** /**
@@ -100,8 +100,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
* @param clientConfig * @param clientConfig
* @throws Exception * @throws Exception
*/ */
public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig) public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig) throws Exception {
throws Exception {
this(jsc, clientConfig, false); this(jsc, clientConfig, false);
} }
@@ -129,6 +128,12 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
} }
} }
public static SparkConf registerClasses(SparkConf conf) {
conf.registerKryoClasses(
new Class[] {HoodieWriteConfig.class, HoodieRecord.class, HoodieKey.class});
return conf;
}
/** /**
* Filter out HoodieRecords that already exists in the output folder. This is useful in * Filter out HoodieRecords that already exists in the output folder. This is useful in
* deduplication. * deduplication.
@@ -139,8 +144,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
public JavaRDD<HoodieRecord<T>> filterExists(JavaRDD<HoodieRecord<T>> hoodieRecords) { public JavaRDD<HoodieRecord<T>> filterExists(JavaRDD<HoodieRecord<T>> hoodieRecords) {
// Create a Hoodie table which encapsulated the commits and files visible // Create a Hoodie table which encapsulated the commits and files visible
HoodieTable<T> table = HoodieTable.getHoodieTable( HoodieTable<T> table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config);
config);
JavaRDD<HoodieRecord<T>> recordsWithLocation = index.tagLocation(hoodieRecords, table); JavaRDD<HoodieRecord<T>> recordsWithLocation = index.tagLocation(hoodieRecords, table);
return recordsWithLocation.filter(v1 -> !v1.isCurrentLocationKnown()); return recordsWithLocation.filter(v1 -> !v1.isCurrentLocationKnown());
@@ -153,9 +157,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
HoodieTable<T> table = getTableAndInitCtx(); HoodieTable<T> table = getTableAndInitCtx();
try { try {
// De-dupe/merge if needed // De-dupe/merge if needed
JavaRDD<HoodieRecord<T>> dedupedRecords = JavaRDD<HoodieRecord<T>> dedupedRecords = combineOnCondition(
combineOnCondition(config.shouldCombineBeforeUpsert(), records, config.shouldCombineBeforeUpsert(), records, config.getUpsertShuffleParallelism());
config.getUpsertShuffleParallelism());
// perform index loop up to get existing location of records // perform index loop up to get existing location of records
JavaRDD<HoodieRecord<T>> taggedRecords = index.tagLocation(dedupedRecords, table); JavaRDD<HoodieRecord<T>> taggedRecords = index.tagLocation(dedupedRecords, table);
@@ -170,7 +173,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
/** /**
* Upserts the given prepared records into the Hoodie table, at the supplied commitTime. * Upserts the given prepared records into the Hoodie table, at the supplied commitTime.
* * <p>
* This implementation requires that the input records are already tagged, and de-duped if * This implementation requires that the input records are already tagged, and de-duped if
* needed. * needed.
* *
@@ -187,15 +190,15 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
if (e instanceof HoodieUpsertException) { if (e instanceof HoodieUpsertException) {
throw (HoodieUpsertException) e; throw (HoodieUpsertException) e;
} }
throw new HoodieUpsertException("Failed to upsert prepared records for commit time " + throw new HoodieUpsertException(
commitTime, e); "Failed to upsert prepared records for commit time " + commitTime, e);
} }
} }
/** /**
* Inserts the given HoodieRecords, into the table. This API is intended to be used for normal * Inserts the given HoodieRecords, into the table. This API is intended to be used for normal
* writes. * writes.
* * <p>
* This implementation skips the index check and is able to leverage benefits such as small file * This implementation skips the index check and is able to leverage benefits such as small file
* handling/blocking alignment, as with upsert(), by profiling the workload * handling/blocking alignment, as with upsert(), by profiling the workload
* *
@@ -207,9 +210,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
HoodieTable<T> table = getTableAndInitCtx(); HoodieTable<T> table = getTableAndInitCtx();
try { try {
// De-dupe/merge if needed // De-dupe/merge if needed
JavaRDD<HoodieRecord<T>> dedupedRecords = JavaRDD<HoodieRecord<T>> dedupedRecords = combineOnCondition(
combineOnCondition(config.shouldCombineBeforeInsert(), records, config.shouldCombineBeforeInsert(), records, config.getInsertShuffleParallelism());
config.getInsertShuffleParallelism());
return upsertRecordsInternal(dedupedRecords, commitTime, table, false); return upsertRecordsInternal(dedupedRecords, commitTime, table, false);
} catch (Throwable e) { } catch (Throwable e) {
@@ -222,7 +224,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
/** /**
* Inserts the given prepared records into the Hoodie table, at the supplied commitTime. * Inserts the given prepared records into the Hoodie table, at the supplied commitTime.
* * <p>
* This implementation skips the index check, skips de-duping and is able to leverage benefits * This implementation skips the index check, skips de-duping and is able to leverage benefits
* such as small file handling/blocking alignment, as with insert(), by profiling the workload. * such as small file handling/blocking alignment, as with insert(), by profiling the workload.
* The prepared HoodieRecords should be de-duped if needed. * The prepared HoodieRecords should be de-duped if needed.
@@ -240,8 +242,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
if (e instanceof HoodieInsertException) { if (e instanceof HoodieInsertException) {
throw e; throw e;
} }
throw new HoodieInsertException("Failed to insert prepared records for commit time " + throw new HoodieInsertException(
commitTime, e); "Failed to insert prepared records for commit time " + commitTime, e);
} }
} }
@@ -249,7 +251,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
* Loads the given HoodieRecords, as inserts into the table. This is suitable for doing big bulk * Loads the given HoodieRecords, as inserts into the table. This is suitable for doing big bulk
* loads into a Hoodie table for the very first time (e.g: converting an existing dataset to * loads into a Hoodie table for the very first time (e.g: converting an existing dataset to
* Hoodie). * Hoodie).
* * <p>
* This implementation uses sortBy (which does range partitioning based on reservoir sampling) and * This implementation uses sortBy (which does range partitioning based on reservoir sampling) and
* attempts to control the numbers of files with less memory compared to the {@link * attempts to control the numbers of files with less memory compared to the {@link
* HoodieWriteClient#insert(JavaRDD, String)} * HoodieWriteClient#insert(JavaRDD, String)}
@@ -267,7 +269,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
* Loads the given HoodieRecords, as inserts into the table. This is suitable for doing big bulk * Loads the given HoodieRecords, as inserts into the table. This is suitable for doing big bulk
* loads into a Hoodie table for the very first time (e.g: converting an existing dataset to * loads into a Hoodie table for the very first time (e.g: converting an existing dataset to
* Hoodie). * Hoodie).
* * <p>
* This implementation uses sortBy (which does range partitioning based on reservoir sampling) and * This implementation uses sortBy (which does range partitioning based on reservoir sampling) and
* attempts to control the numbers of files with less memory compared to the {@link * attempts to control the numbers of files with less memory compared to the {@link
* HoodieWriteClient#insert(JavaRDD, String)}. Optionally it allows users to specify their own * HoodieWriteClient#insert(JavaRDD, String)}. Optionally it allows users to specify their own
@@ -280,23 +282,20 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
* before they are inserted into hoodie. * before they are inserted into hoodie.
* @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts * @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts
*/ */
public JavaRDD<WriteStatus> bulkInsert(JavaRDD<HoodieRecord<T>> records, public JavaRDD<WriteStatus> bulkInsert(JavaRDD<HoodieRecord<T>> records, final String commitTime,
final String commitTime,
Option<UserDefinedBulkInsertPartitioner> bulkInsertPartitioner) { Option<UserDefinedBulkInsertPartitioner> bulkInsertPartitioner) {
HoodieTable<T> table = getTableAndInitCtx(); HoodieTable<T> table = getTableAndInitCtx();
try { try {
// De-dupe/merge if needed // De-dupe/merge if needed
JavaRDD<HoodieRecord<T>> dedupedRecords = JavaRDD<HoodieRecord<T>> dedupedRecords = combineOnCondition(
combineOnCondition(config.shouldCombineBeforeInsert(), records, config.shouldCombineBeforeInsert(), records, config.getInsertShuffleParallelism());
config.getInsertShuffleParallelism());
return bulkInsertInternal(dedupedRecords, commitTime, table, bulkInsertPartitioner); return bulkInsertInternal(dedupedRecords, commitTime, table, bulkInsertPartitioner);
} catch (Throwable e) { } catch (Throwable e) {
if (e instanceof HoodieInsertException) { if (e instanceof HoodieInsertException) {
throw e; throw e;
} }
throw new HoodieInsertException("Failed to bulk insert for commit time " + commitTime, throw new HoodieInsertException("Failed to bulk insert for commit time " + commitTime, e);
e);
} }
} }
@@ -304,7 +303,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
* Loads the given HoodieRecords, as inserts into the table. This is suitable for doing big bulk * Loads the given HoodieRecords, as inserts into the table. This is suitable for doing big bulk
* loads into a Hoodie table for the very first time (e.g: converting an existing dataset to * loads into a Hoodie table for the very first time (e.g: converting an existing dataset to
* Hoodie). The input records should contain no duplicates if needed. * Hoodie). The input records should contain no duplicates if needed.
* * <p>
* This implementation uses sortBy (which does range partitioning based on reservoir sampling) and * This implementation uses sortBy (which does range partitioning based on reservoir sampling) and
* attempts to control the numbers of files with less memory compared to the {@link * attempts to control the numbers of files with less memory compared to the {@link
* HoodieWriteClient#insert(JavaRDD, String)}. Optionally it allows users to specify their own * HoodieWriteClient#insert(JavaRDD, String)}. Optionally it allows users to specify their own
@@ -318,8 +317,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
* @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts * @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts
*/ */
public JavaRDD<WriteStatus> bulkInsertPreppedRecords(JavaRDD<HoodieRecord<T>> preppedRecords, public JavaRDD<WriteStatus> bulkInsertPreppedRecords(JavaRDD<HoodieRecord<T>> preppedRecords,
final String commitTime, final String commitTime, Option<UserDefinedBulkInsertPartitioner> bulkInsertPartitioner) {
Option<UserDefinedBulkInsertPartitioner> bulkInsertPartitioner) {
HoodieTable<T> table = getTableAndInitCtx(); HoodieTable<T> table = getTableAndInitCtx();
try { try {
return bulkInsertInternal(preppedRecords, commitTime, table, bulkInsertPartitioner); return bulkInsertInternal(preppedRecords, commitTime, table, bulkInsertPartitioner);
@@ -327,35 +325,29 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
if (e instanceof HoodieInsertException) { if (e instanceof HoodieInsertException) {
throw e; throw e;
} }
throw new HoodieInsertException("Failed to bulk insert prepared records for commit time " + throw new HoodieInsertException(
commitTime, e); "Failed to bulk insert prepared records for commit time " + commitTime, e);
} }
} }
private JavaRDD<WriteStatus> bulkInsertInternal( private JavaRDD<WriteStatus> bulkInsertInternal(JavaRDD<HoodieRecord<T>> dedupedRecords,
JavaRDD<HoodieRecord<T>> dedupedRecords, String commitTime, HoodieTable<T> table,
String commitTime,
HoodieTable<T> table,
Option<UserDefinedBulkInsertPartitioner> bulkInsertPartitioner) { Option<UserDefinedBulkInsertPartitioner> bulkInsertPartitioner) {
final JavaRDD<HoodieRecord<T>> repartitionedRecords; final JavaRDD<HoodieRecord<T>> repartitionedRecords;
if (bulkInsertPartitioner.isDefined()) { if (bulkInsertPartitioner.isDefined()) {
repartitionedRecords = repartitionedRecords = bulkInsertPartitioner.get()
bulkInsertPartitioner.get().repartitionRecords(dedupedRecords, .repartitionRecords(dedupedRecords, config.getBulkInsertShuffleParallelism());
config.getBulkInsertShuffleParallelism());
} else { } else {
// Now, sort the records and line them up nicely for loading. // Now, sort the records and line them up nicely for loading.
repartitionedRecords = dedupedRecords repartitionedRecords = dedupedRecords.sortBy(record -> {
.sortBy(record -> {
// Let's use "partitionPath + key" as the sort key. Spark, will ensure // Let's use "partitionPath + key" as the sort key. Spark, will ensure
// the records split evenly across RDD partitions, such that small partitions fit // the records split evenly across RDD partitions, such that small partitions fit
// into 1 RDD partition, while big ones spread evenly across multiple RDD partitions // into 1 RDD partition, while big ones spread evenly across multiple RDD partitions
return String return String.format("%s+%s", record.getPartitionPath(), record.getRecordKey());
.format("%s+%s", record.getPartitionPath(), record.getRecordKey());
}, true, config.getBulkInsertShuffleParallelism()); }, true, config.getBulkInsertShuffleParallelism());
} }
JavaRDD<WriteStatus> writeStatusRDD = repartitionedRecords JavaRDD<WriteStatus> writeStatusRDD = repartitionedRecords
.mapPartitionsWithIndex(new BulkInsertMapFunction<T>(commitTime, config, table), .mapPartitionsWithIndex(new BulkInsertMapFunction<T>(commitTime, config, table), true)
true)
.flatMap(writeStatuses -> writeStatuses.iterator()); .flatMap(writeStatuses -> writeStatuses.iterator());
return updateIndexAndCommitIfNeeded(writeStatusRDD, table, commitTime); return updateIndexAndCommitIfNeeded(writeStatusRDD, table, commitTime);
@@ -375,8 +367,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
} }
private JavaRDD<HoodieRecord<T>> combineOnCondition(boolean condition, private JavaRDD<HoodieRecord<T>> combineOnCondition(boolean condition,
JavaRDD<HoodieRecord<T>> records, JavaRDD<HoodieRecord<T>> records, int parallelism) {
int parallelism) {
if (condition) { if (condition) {
return deduplicateRecords(records, parallelism); return deduplicateRecords(records, parallelism);
} }
@@ -390,8 +381,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
* files) are rolled back based on commit time. // TODO : Create a new WorkloadProfile metadata * files) are rolled back based on commit time. // TODO : Create a new WorkloadProfile metadata
* file instead of using HoodieCommitMetadata * file instead of using HoodieCommitMetadata
*/ */
private void saveWorkloadProfileMetadataToInflight(WorkloadProfile profile, private void saveWorkloadProfileMetadataToInflight(WorkloadProfile profile, HoodieTable<T> table,
HoodieTable<T> table, String commitTime) throws HoodieCommitException { String commitTime) throws HoodieCommitException {
try { try {
HoodieCommitMetadata metadata = new HoodieCommitMetadata(); HoodieCommitMetadata metadata = new HoodieCommitMetadata();
profile.getPartitionPaths().stream().forEach(path -> { profile.getPartitionPaths().stream().forEach(path -> {
@@ -416,9 +407,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
} }
private JavaRDD<WriteStatus> upsertRecordsInternal(JavaRDD<HoodieRecord<T>> preppedRecords, private JavaRDD<WriteStatus> upsertRecordsInternal(JavaRDD<HoodieRecord<T>> preppedRecords,
String commitTime, String commitTime, HoodieTable<T> hoodieTable, final boolean isUpsert) {
HoodieTable<T> hoodieTable,
final boolean isUpsert) {
// Cache the tagged records, so we don't end up computing both // Cache the tagged records, so we don't end up computing both
// TODO: Consistent contract in HoodieWriteClient regarding preppedRecord storage level handling // TODO: Consistent contract in HoodieWriteClient regarding preppedRecord storage level handling
@@ -441,20 +430,16 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
JavaRDD<WriteStatus> writeStatusRDD = partitionedRecords JavaRDD<WriteStatus> writeStatusRDD = partitionedRecords
.mapPartitionsWithIndex((partition, recordItr) -> { .mapPartitionsWithIndex((partition, recordItr) -> {
if (isUpsert) { if (isUpsert) {
return hoodieTable return hoodieTable.handleUpsertPartition(commitTime, partition, recordItr, partitioner);
.handleUpsertPartition(commitTime, partition, recordItr, partitioner);
} else { } else {
return hoodieTable return hoodieTable.handleInsertPartition(commitTime, partition, recordItr, partitioner);
.handleInsertPartition(commitTime, partition, recordItr, partitioner);
} }
}, true) }, true).flatMap(writeStatuses -> writeStatuses.iterator());
.flatMap(writeStatuses -> writeStatuses.iterator());
return updateIndexAndCommitIfNeeded(writeStatusRDD, hoodieTable, commitTime); return updateIndexAndCommitIfNeeded(writeStatusRDD, hoodieTable, commitTime);
} }
private Partitioner getPartitioner(HoodieTable table, boolean isUpsert, private Partitioner getPartitioner(HoodieTable table, boolean isUpsert, WorkloadProfile profile) {
WorkloadProfile profile) {
if (isUpsert) { if (isUpsert) {
return table.getUpsertPartitioner(profile); return table.getUpsertPartitioner(profile);
} else { } else {
@@ -474,13 +459,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
private JavaRDD<HoodieRecord<T>> partition(JavaRDD<HoodieRecord<T>> dedupedRecords, private JavaRDD<HoodieRecord<T>> partition(JavaRDD<HoodieRecord<T>> dedupedRecords,
Partitioner partitioner) { Partitioner partitioner) {
return dedupedRecords return dedupedRecords.mapToPair(record -> new Tuple2<>(
.mapToPair(record -> new Tuple2<>(record.getKey(), Option.apply(record.getCurrentLocation())), record))
new Tuple2<>( .partitionBy(partitioner).map(tuple -> tuple._2());
new Tuple2<>(record.getKey(), Option.apply(record.getCurrentLocation())),
record))
.partitionBy(partitioner)
.map(tuple -> tuple._2());
} }
/** /**
@@ -493,12 +474,10 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
/** /**
* Commit changes performed at the given commitTime marker * Commit changes performed at the given commitTime marker
*/ */
public boolean commit(String commitTime, public boolean commit(String commitTime, JavaRDD<WriteStatus> writeStatuses,
JavaRDD<WriteStatus> writeStatuses,
Optional<HashMap<String, String>> extraMetadata) { Optional<HashMap<String, String>> extraMetadata) {
HoodieTable<T> table = HoodieTable.getHoodieTable( HoodieTable<T> table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config);
config);
return commit(commitTime, writeStatuses, extraMetadata, table.getCommitActionType()); return commit(commitTime, writeStatuses, extraMetadata, table.getCommitActionType());
} }
@@ -508,15 +487,13 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
logger.info("Commiting " + commitTime); logger.info("Commiting " + commitTime);
// Create a Hoodie table which encapsulated the commits and files visible // Create a Hoodie table which encapsulated the commits and files visible
HoodieTable<T> table = HoodieTable.getHoodieTable( HoodieTable<T> table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config);
config);
HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
List<Tuple2<String, HoodieWriteStat>> stats = writeStatuses List<Tuple2<String, HoodieWriteStat>> stats = writeStatuses.mapToPair(
.mapToPair((PairFunction<WriteStatus, String, HoodieWriteStat>) writeStatus -> (PairFunction<WriteStatus, String, HoodieWriteStat>) writeStatus -> new Tuple2<>(
new Tuple2<>(writeStatus.getPartitionPath(), writeStatus.getStat())) writeStatus.getPartitionPath(), writeStatus.getStat())).collect();
.collect();
HoodieCommitMetadata metadata = new HoodieCommitMetadata(); HoodieCommitMetadata metadata = new HoodieCommitMetadata();
for (Tuple2<String, HoodieWriteStat> stat : stats) { for (Tuple2<String, HoodieWriteStat> stat : stats) {
@@ -531,8 +508,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
durationInMs.ifPresent(duration -> { durationInMs.ifPresent(duration -> {
logger.info("Finalize write elapsed time (milliseconds): " + duration); logger.info("Finalize write elapsed time (milliseconds): " + duration);
metrics.updateFinalizeWriteMetrics(duration, result.get()); metrics.updateFinalizeWriteMetrics(duration, result.get());
} });
);
} }
// add in extra metadata // add in extra metadata
@@ -541,8 +517,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
} }
try { try {
activeTimeline.saveAsComplete( activeTimeline.saveAsComplete(new HoodieInstant(true, actionType, commitTime),
new HoodieInstant(true, actionType, commitTime),
Optional.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8))); Optional.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
// Save was a success // Save was a success
// Do a inline compaction if enabled // Do a inline compaction if enabled
@@ -566,9 +541,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
} }
if (writeContext != null) { if (writeContext != null) {
long durationInMs = metrics.getDurationInMs(writeContext.stop()); long durationInMs = metrics.getDurationInMs(writeContext.stop());
metrics.updateCommitMetrics( metrics
HoodieActiveTimeline.COMMIT_FORMATTER.parse(commitTime).getTime(), durationInMs, .updateCommitMetrics(HoodieActiveTimeline.COMMIT_FORMATTER.parse(commitTime).getTime(),
metadata); durationInMs, metadata);
writeContext = null; writeContext = null;
} }
logger.info("Committed " + commitTime); logger.info("Committed " + commitTime);
@@ -587,10 +562,10 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
* Savepoint a specific commit. Latest version of data files as of the passed in commitTime will * Savepoint a specific commit. Latest version of data files as of the passed in commitTime will
* be referenced in the savepoint and will never be cleaned. The savepointed commit will never be * be referenced in the savepoint and will never be cleaned. The savepointed commit will never be
* rolledback or archived. * rolledback or archived.
* * <p>
* This gives an option to rollback the state to the savepoint anytime. Savepoint needs to be * This gives an option to rollback the state to the savepoint anytime. Savepoint needs to be
* manually created and deleted. * manually created and deleted.
* * <p>
* Savepoint should be on a commit that could not have been cleaned. * Savepoint should be on a commit that could not have been cleaned.
* *
* @param user - User creating the savepoint * @param user - User creating the savepoint
@@ -599,8 +574,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
*/ */
public boolean savepoint(String user, String comment) { public boolean savepoint(String user, String comment) {
HoodieTable<T> table = HoodieTable.getHoodieTable( HoodieTable<T> table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config);
config);
if (table.getCompletedCommitTimeline().empty()) { if (table.getCompletedCommitTimeline().empty()) {
throw new HoodieSavepointException("Could not savepoint. Commit timeline is empty"); throw new HoodieSavepointException("Could not savepoint. Commit timeline is empty");
} }
@@ -614,10 +588,10 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
* Savepoint a specific commit. Latest version of data files as of the passed in commitTime will * Savepoint a specific commit. Latest version of data files as of the passed in commitTime will
* be referenced in the savepoint and will never be cleaned. The savepointed commit will never be * be referenced in the savepoint and will never be cleaned. The savepointed commit will never be
 * rolled back or archived. * rolled back or archived.
* * <p>
* This gives an option to rollback the state to the savepoint anytime. Savepoint needs to be * This gives an option to rollback the state to the savepoint anytime. Savepoint needs to be
* manually created and deleted. * manually created and deleted.
* * <p>
* Savepoint should be on a commit that could not have been cleaned. * Savepoint should be on a commit that could not have been cleaned.
* *
* @param commitTime - commit that should be savepointed * @param commitTime - commit that should be savepointed
@@ -627,8 +601,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
*/ */
public boolean savepoint(String commitTime, String user, String comment) { public boolean savepoint(String commitTime, String user, String comment) {
HoodieTable<T> table = HoodieTable.getHoodieTable( HoodieTable<T> table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config);
config);
Optional<HoodieInstant> cleanInstant = table.getCompletedCleanTimeline().lastInstant(); Optional<HoodieInstant> cleanInstant = table.getCompletedCleanTimeline().lastInstant();
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION,
@@ -646,8 +619,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
table.getActiveTimeline().getInstantDetails(cleanInstant.get()).get()); table.getActiveTimeline().getInstantDetails(cleanInstant.get()).get());
lastCommitRetained = cleanMetadata.getEarliestCommitToRetain(); lastCommitRetained = cleanMetadata.getEarliestCommitToRetain();
} else { } else {
lastCommitRetained = lastCommitRetained = table.getCompletedCommitTimeline().firstInstant().get().getTimestamp();
table.getCompletedCommitTimeline().firstInstant().get().getTimestamp();
} }
// Cannot allow savepoint time on a commit that could have been cleaned // Cannot allow savepoint time on a commit that could have been cleaned
@@ -656,24 +628,23 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
"Could not savepoint commit " + commitTime + " as this is beyond the lookup window " "Could not savepoint commit " + commitTime + " as this is beyond the lookup window "
+ lastCommitRetained); + lastCommitRetained);
Map<String, List<String>> latestFilesMap = jsc.parallelize( Map<String, List<String>> latestFilesMap = jsc.parallelize(FSUtils
FSUtils.getAllPartitionPaths(fs, table.getMetaClient().getBasePath(), .getAllPartitionPaths(fs, table.getMetaClient().getBasePath(),
config.shouldAssumeDatePartitioning())) config.shouldAssumeDatePartitioning()))
.mapToPair((PairFunction<String, String, List<String>>) partitionPath -> { .mapToPair((PairFunction<String, String, List<String>>) partitionPath -> {
// Scan all partitions files with this commit time // Scan all partitions files with this commit time
logger.info("Collecting latest files in partition path " + partitionPath); logger.info("Collecting latest files in partition path " + partitionPath);
TableFileSystemView.ReadOptimizedView view = table.getROFileSystemView(); TableFileSystemView.ReadOptimizedView view = table.getROFileSystemView();
List<String> latestFiles = List<String> latestFiles = view.getLatestDataFilesBeforeOrOn(partitionPath, commitTime)
view.getLatestDataFilesBeforeOrOn(partitionPath, commitTime)
.map(HoodieDataFile::getFileName).collect(Collectors.toList()); .map(HoodieDataFile::getFileName).collect(Collectors.toList());
return new Tuple2<>(partitionPath, latestFiles); return new Tuple2<>(partitionPath, latestFiles);
}).collectAsMap(); }).collectAsMap();
HoodieSavepointMetadata metadata = HoodieSavepointMetadata metadata = AvroUtils
AvroUtils.convertSavepointMetadata(user, comment, latestFilesMap); .convertSavepointMetadata(user, comment, latestFilesMap);
// Nothing to save in the savepoint // Nothing to save in the savepoint
table.getActiveTimeline().saveAsComplete( table.getActiveTimeline()
new HoodieInstant(true, HoodieTimeline.SAVEPOINT_ACTION, commitTime), .saveAsComplete(new HoodieInstant(true, HoodieTimeline.SAVEPOINT_ACTION, commitTime),
AvroUtils.serializeSavepointMetadata(metadata)); AvroUtils.serializeSavepointMetadata(metadata));
logger.info("Savepoint " + commitTime + " created"); logger.info("Savepoint " + commitTime + " created");
return true; return true;
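
A hedged usage sketch of the savepoint API touched above; only methods visible in this diff are called, and the client construction is assumed.

import com.uber.hoodie.HoodieWriteClient;

public class SavepointSketch {
  // Hedged sketch: commitTime must belong to a commit that has not been cleaned,
  // as enforced by the checks in the hunk above.
  public static void pinAndRestore(HoodieWriteClient<?> client, String commitTime) {
    client.savepoint(commitTime, "ops-user", "pin before risky backfill"); // never cleaned or archived
    // if later writes need to be undone, drop everything after the savepoint:
    client.rollbackToSavepoint(commitTime);
    // once the savepoint is no longer needed, release it so the cleaner can reclaim files:
    client.deleteSavepoint(commitTime);
  }
}
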
@@ -691,22 +662,20 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
*/ */
public void deleteSavepoint(String savepointTime) { public void deleteSavepoint(String savepointTime) {
HoodieTable<T> table = HoodieTable.getHoodieTable( HoodieTable<T> table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config);
config);
HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
HoodieInstant savePoint = HoodieInstant savePoint = new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION,
new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION, savepointTime); savepointTime);
boolean isSavepointPresent = boolean isSavepointPresent = table.getCompletedSavepointTimeline().containsInstant(savePoint);
table.getCompletedSavepointTimeline().containsInstant(savePoint);
if (!isSavepointPresent) { if (!isSavepointPresent) {
logger.warn("No savepoint present " + savepointTime); logger.warn("No savepoint present " + savepointTime);
return; return;
} }
activeTimeline.revertToInflight(savePoint); activeTimeline.revertToInflight(savePoint);
activeTimeline.deleteInflight( activeTimeline
new HoodieInstant(true, HoodieTimeline.SAVEPOINT_ACTION, savepointTime)); .deleteInflight(new HoodieInstant(true, HoodieTimeline.SAVEPOINT_ACTION, savepointTime));
logger.info("Savepoint " + savepointTime + " deleted"); logger.info("Savepoint " + savepointTime + " deleted");
} }
@@ -719,30 +688,27 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
*/ */
public boolean rollbackToSavepoint(String savepointTime) { public boolean rollbackToSavepoint(String savepointTime) {
HoodieTable<T> table = HoodieTable.getHoodieTable( HoodieTable<T> table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config);
config);
HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
HoodieTimeline commitTimeline = table.getCommitsTimeline(); HoodieTimeline commitTimeline = table.getCommitsTimeline();
HoodieInstant savePoint = HoodieInstant savePoint = new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION,
new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION, savepointTime); savepointTime);
boolean isSavepointPresent = boolean isSavepointPresent = table.getCompletedSavepointTimeline().containsInstant(savePoint);
table.getCompletedSavepointTimeline().containsInstant(savePoint);
if (!isSavepointPresent) { if (!isSavepointPresent) {
throw new HoodieRollbackException("No savepoint for commitTime " + savepointTime); throw new HoodieRollbackException("No savepoint for commitTime " + savepointTime);
} }
List<String> commitsToRollback = List<String> commitsToRollback = commitTimeline
commitTimeline.findInstantsAfter(savepointTime, Integer.MAX_VALUE).getInstants() .findInstantsAfter(savepointTime, Integer.MAX_VALUE).getInstants()
.map(HoodieInstant::getTimestamp).collect(Collectors.toList()); .map(HoodieInstant::getTimestamp).collect(Collectors.toList());
logger.info("Rolling back commits " + commitsToRollback); logger.info("Rolling back commits " + commitsToRollback);
rollback(commitsToRollback); rollback(commitsToRollback);
// Make sure the rollback was successful // Make sure the rollback was successful
Optional<HoodieInstant> lastInstant = Optional<HoodieInstant> lastInstant = activeTimeline.reload().getCommitsTimeline()
activeTimeline.reload().getCommitsTimeline().filterCompletedInstants() .filterCompletedInstants().lastInstant();
.lastInstant();
Preconditions.checkArgument(lastInstant.isPresent()); Preconditions.checkArgument(lastInstant.isPresent());
Preconditions.checkArgument(lastInstant.get().getTimestamp().equals(savepointTime), Preconditions.checkArgument(lastInstant.get().getTimestamp().equals(savepointTime),
 savepointTime + " is not the last commit after rolling back " + commitsToRollback savepointTime + " is not the last commit after rolling back " + commitsToRollback
@@ -771,16 +737,14 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
// Create a Hoodie table which encapsulated the commits and files visible // Create a Hoodie table which encapsulated the commits and files visible
HoodieTable<T> table = HoodieTable.getHoodieTable( HoodieTable<T> table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config);
config);
HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
HoodieTimeline inflightTimeline = table.getInflightCommitTimeline(); HoodieTimeline inflightTimeline = table.getInflightCommitTimeline();
HoodieTimeline commitTimeline = table.getCompletedCommitTimeline(); HoodieTimeline commitTimeline = table.getCompletedCommitTimeline();
// Check if any of the commits is a savepoint - do not allow rollback on those commits // Check if any of the commits is a savepoint - do not allow rollback on those commits
List<String> savepoints = List<String> savepoints = table.getCompletedSavepointTimeline().getInstants()
table.getCompletedSavepointTimeline().getInstants().map(HoodieInstant::getTimestamp) .map(HoodieInstant::getTimestamp).collect(Collectors.toList());
.collect(Collectors.toList());
commits.forEach(s -> { commits.forEach(s -> {
if (savepoints.contains(s)) { if (savepoints.contains(s)) {
throw new HoodieRollbackException( throw new HoodieRollbackException(
@@ -800,16 +764,15 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
String lastCommit = commits.get(commits.size() - 1); String lastCommit = commits.get(commits.size() - 1);
if (!commitTimeline.empty() && !commitTimeline if (!commitTimeline.empty() && !commitTimeline
.findInstantsAfter(lastCommit, Integer.MAX_VALUE).empty()) { .findInstantsAfter(lastCommit, Integer.MAX_VALUE).empty()) {
throw new HoodieRollbackException("Found commits after time :" + lastCommit + throw new HoodieRollbackException(
", please rollback greater commits first"); "Found commits after time :" + lastCommit + ", please rollback greater commits first");
} }
List<String> inflights = inflightTimeline.getInstants().map(HoodieInstant::getTimestamp) List<String> inflights = inflightTimeline.getInstants().map(HoodieInstant::getTimestamp)
.collect(Collectors.toList()); .collect(Collectors.toList());
if (!inflights.isEmpty() && inflights.indexOf(lastCommit) != inflights.size() - 1) { if (!inflights.isEmpty() && inflights.indexOf(lastCommit) != inflights.size() - 1) {
throw new HoodieRollbackException( throw new HoodieRollbackException("Found in-flight commits after time :" + lastCommit
"Found in-flight commits after time :" + lastCommit + + ", please rollback greater commits first");
", please rollback greater commits first");
} }
List<HoodieRollbackStat> stats = table.rollback(jsc, commits); List<HoodieRollbackStat> stats = table.rollback(jsc, commits);
@@ -817,8 +780,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
// cleanup index entries // cleanup index entries
commits.stream().forEach(s -> { commits.stream().forEach(s -> {
if (!index.rollbackCommit(s)) { if (!index.rollbackCommit(s)) {
throw new HoodieRollbackException( throw new HoodieRollbackException("Rollback index changes failed, for time :" + s);
"Rollback index changes failed, for time :" + s);
} }
}); });
logger.info("Index rolled back for commits " + commits); logger.info("Index rolled back for commits " + commits);
@@ -826,13 +788,12 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
Optional<Long> durationInMs = Optional.empty(); Optional<Long> durationInMs = Optional.empty();
if (context != null) { if (context != null) {
durationInMs = Optional.of(metrics.getDurationInMs(context.stop())); durationInMs = Optional.of(metrics.getDurationInMs(context.stop()));
Long numFilesDeleted = stats.stream() Long numFilesDeleted = stats.stream().mapToLong(stat -> stat.getSuccessDeleteFiles().size())
.mapToLong(stat -> stat.getSuccessDeleteFiles().size())
.sum(); .sum();
metrics.updateRollbackMetrics(durationInMs.get(), numFilesDeleted); metrics.updateRollbackMetrics(durationInMs.get(), numFilesDeleted);
} }
HoodieRollbackMetadata rollbackMetadata = HoodieRollbackMetadata rollbackMetadata = AvroUtils
AvroUtils.convertRollbackMetadata(startRollbackTime, durationInMs, commits, stats); .convertRollbackMetadata(startRollbackTime, durationInMs, commits, stats);
table.getActiveTimeline().saveAsComplete( table.getActiveTimeline().saveAsComplete(
new HoodieInstant(true, HoodieTimeline.ROLLBACK_ACTION, startRollbackTime), new HoodieInstant(true, HoodieTimeline.ROLLBACK_ACTION, startRollbackTime),
AvroUtils.serializeRollbackMetadata(rollbackMetadata)); AvroUtils.serializeRollbackMetadata(rollbackMetadata));
@@ -846,8 +807,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
table.getActiveTimeline().getRollbackTimeline().getInstants()); table.getActiveTimeline().getRollbackTimeline().getInstants());
} }
} catch (IOException e) { } catch (IOException e) {
throw new HoodieRollbackException("Failed to rollback " + throw new HoodieRollbackException(
config.getBasePath() + " commits " + commits, e); "Failed to rollback " + config.getBasePath() + " commits " + commits, e);
} }
} }
@@ -880,8 +841,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
// Create a Hoodie table which encapsulated the commits and files visible // Create a Hoodie table which encapsulated the commits and files visible
HoodieTable<T> table = HoodieTable.getHoodieTable( HoodieTable<T> table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config);
true), config);
List<HoodieCleanStat> cleanStats = table.clean(jsc); List<HoodieCleanStat> cleanStats = table.clean(jsc);
if (cleanStats.isEmpty()) { if (cleanStats.isEmpty()) {
@@ -896,14 +856,14 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
} }
// Create the metadata and save it // Create the metadata and save it
HoodieCleanMetadata metadata = HoodieCleanMetadata metadata = AvroUtils
AvroUtils.convertCleanMetadata(startCleanTime, durationInMs, cleanStats); .convertCleanMetadata(startCleanTime, durationInMs, cleanStats);
logger.info("Cleaned " + metadata.getTotalFilesDeleted() + " files"); logger.info("Cleaned " + metadata.getTotalFilesDeleted() + " files");
metrics.updateCleanMetrics(durationInMs.orElseGet(() -> -1L), metrics
metadata.getTotalFilesDeleted()); .updateCleanMetrics(durationInMs.orElseGet(() -> -1L), metadata.getTotalFilesDeleted());
table.getActiveTimeline().saveAsComplete( table.getActiveTimeline()
new HoodieInstant(true, HoodieTimeline.CLEAN_ACTION, startCleanTime), .saveAsComplete(new HoodieInstant(true, HoodieTimeline.CLEAN_ACTION, startCleanTime),
AvroUtils.serializeCleanMetadata(metadata)); AvroUtils.serializeCleanMetadata(metadata));
logger.info("Marked clean started on " + startCleanTime + " as complete"); logger.info("Marked clean started on " + startCleanTime + " as complete");
@@ -930,12 +890,10 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
public void startCommitWithTime(String commitTime) { public void startCommitWithTime(String commitTime) {
logger.info("Generate a new commit time " + commitTime); logger.info("Generate a new commit time " + commitTime);
HoodieTable<T> table = HoodieTable.getHoodieTable( HoodieTable<T> table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config);
config);
HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
String commitActionType = table.getCommitActionType(); String commitActionType = table.getCommitActionType();
activeTimeline.createInflight( activeTimeline.createInflight(new HoodieInstant(true, commitActionType, commitTime));
new HoodieInstant(true, commitActionType, commitTime));
} }
/** /**
@@ -948,17 +906,16 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
return commitTime; return commitTime;
} }
 /** Since MOR tableType defaults to {@link HoodieTimeline#DELTA_COMMIT_ACTION}, /**
 * we need to explicitly set it to {@link HoodieTimeline#COMMIT_ACTION} for compaction * Since MOR tableType defaults to {@link HoodieTimeline#DELTA_COMMIT_ACTION}, we need to
 * explicitly set it to {@link HoodieTimeline#COMMIT_ACTION} for compaction
*/ */
public void startCompactionWithTime(String commitTime) { public void startCompactionWithTime(String commitTime) {
HoodieTable<T> table = HoodieTable.getHoodieTable( HoodieTable<T> table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config);
config);
HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
String commitActionType = HoodieTimeline.COMMIT_ACTION; String commitActionType = HoodieTimeline.COMMIT_ACTION;
activeTimeline.createInflight( activeTimeline.createInflight(new HoodieInstant(true, commitActionType, commitTime));
new HoodieInstant(true, commitActionType, commitTime));
} }
/** /**
@@ -968,8 +925,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
public JavaRDD<WriteStatus> compact(String commitTime) throws IOException { public JavaRDD<WriteStatus> compact(String commitTime) throws IOException {
// Create a Hoodie table which encapsulated the commits and files visible // Create a Hoodie table which encapsulated the commits and files visible
HoodieTable<T> table = HoodieTable.getHoodieTable( HoodieTable<T> table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config);
config);
JavaRDD<WriteStatus> statuses = table.compact(jsc, commitTime); JavaRDD<WriteStatus> statuses = table.compact(jsc, commitTime);
// Trigger the insert and collect statuses // Trigger the insert and collect statuses
statuses = statuses.persist(config.getWriteStatusStorageLevel()); statuses = statuses.persist(config.getWriteStatusStorageLevel());
@@ -980,9 +936,6 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
/** /**
* Commit a compaction operation * Commit a compaction operation
* @param commitTime
* @param writeStatuses
* @param extraMetadata
*/ */
public void commitCompaction(String commitTime, JavaRDD<WriteStatus> writeStatuses, public void commitCompaction(String commitTime, JavaRDD<WriteStatus> writeStatuses,
Optional<HashMap<String, String>> extraMetadata) { Optional<HashMap<String, String>> extraMetadata) {
@@ -992,8 +945,6 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
/** /**
* Commit a compaction operation * Commit a compaction operation
* @param commitTime
* @param writeStatuses
*/ */
public void commitCompaction(String commitTime, JavaRDD<WriteStatus> writeStatuses) { public void commitCompaction(String commitTime, JavaRDD<WriteStatus> writeStatuses) {
String commitCompactionActionType = HoodieActiveTimeline.COMMIT_ACTION; String commitCompactionActionType = HoodieActiveTimeline.COMMIT_ACTION;
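
For the compaction path reformatted above, a hedged usage sketch; only the three methods visible in this diff are used, and exception handling is simplified.

import com.uber.hoodie.HoodieWriteClient;
import com.uber.hoodie.WriteStatus;
import java.io.IOException;
import org.apache.spark.api.java.JavaRDD;

public class CompactionSketch {
  // Hedged sketch of a manually driven compaction: schedule the instant with COMMIT_ACTION
  // (MOR defaults to DELTA_COMMIT_ACTION), run the compaction, then commit the write statuses.
  public static void compactOnce(HoodieWriteClient<?> client, String compactionTime)
      throws IOException {
    client.startCompactionWithTime(compactionTime);
    JavaRDD<WriteStatus> statuses = client.compact(compactionTime);
    client.commitCompaction(compactionTime, statuses);
  }
}
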
@@ -1006,8 +957,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
*/ */
private void forceCompact(String compactionCommitTime) throws IOException { private void forceCompact(String compactionCommitTime) throws IOException {
// Create a Hoodie table which encapsulated the commits and files visible // Create a Hoodie table which encapsulated the commits and files visible
HoodieTableMetaClient metaClient = HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true); config.getBasePath(), true);
HoodieTable<T> table = HoodieTable.getHoodieTable(metaClient, config); HoodieTable<T> table = HoodieTable.getHoodieTable(metaClient, config);
JavaRDD<WriteStatus> compactedStatuses = table.compact(jsc, compactionCommitTime); JavaRDD<WriteStatus> compactedStatuses = table.compact(jsc, compactionCommitTime);
if (!compactedStatuses.isEmpty()) { if (!compactedStatuses.isEmpty()) {
@@ -1029,8 +980,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
} }
private void commitForceCompaction(JavaRDD<WriteStatus> writeStatuses, private void commitForceCompaction(JavaRDD<WriteStatus> writeStatuses,
HoodieTableMetaClient metaClient, HoodieTableMetaClient metaClient, String compactionCommitTime) {
String compactionCommitTime) {
List<HoodieWriteStat> updateStatusMap = writeStatuses.map(writeStatus -> writeStatus.getStat()) List<HoodieWriteStat> updateStatusMap = writeStatuses.map(writeStatus -> writeStatus.getStat())
.collect(); .collect();
@@ -1054,12 +1004,6 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
} }
} }
public static SparkConf registerClasses(SparkConf conf) {
conf.registerKryoClasses(
new Class[]{HoodieWriteConfig.class, HoodieRecord.class, HoodieKey.class});
return conf;
}
/** /**
 * Deduplicate Hoodie records, using the given deduplication function. * Deduplicate Hoodie records, using the given deduplication function.
*/ */
@@ -1074,13 +1018,13 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
return new Tuple2<>(key, record); return new Tuple2<>(key, record);
}) })
.reduceByKey((rec1, rec2) -> { .reduceByKey((rec1, rec2) -> {
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked") T reducedData = (T) rec1.getData()
T reducedData = (T) rec1.getData().preCombine(rec2.getData()); .preCombine(rec2.getData());
// we cannot allow the user to change the key or partitionPath, since that will affect everything // we cannot allow the user to change the key or partitionPath, since that will affect
// everything
// so pick it from one of the records. // so pick it from one of the records.
return new HoodieRecord<T>(rec1.getKey(), reducedData); return new HoodieRecord<T>(rec1.getKey(), reducedData);
}, parallelism) }, parallelism).map(recordTuple -> recordTuple._2());
.map(recordTuple -> recordTuple._2());
} }
/** /**
@@ -1088,8 +1032,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
*/ */
private void rollbackInflightCommits() { private void rollbackInflightCommits() {
HoodieTable<T> table = HoodieTable.getHoodieTable( HoodieTable<T> table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config);
config);
HoodieTimeline inflightTimeline = table.getCommitsTimeline().filterInflights(); HoodieTimeline inflightTimeline = table.getCommitsTimeline().filterInflights();
List<String> commits = inflightTimeline.getInstants().map(HoodieInstant::getTimestamp) List<String> commits = inflightTimeline.getInstants().map(HoodieInstant::getTimestamp)
.collect(Collectors.toList()); .collect(Collectors.toList());
@@ -1103,7 +1046,6 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
writeContext = metrics.getCommitCtx(); writeContext = metrics.getCommitCtx();
// Create a Hoodie table which encapsulated the commits and files visible // Create a Hoodie table which encapsulated the commits and files visible
return HoodieTable.getHoodieTable( return HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config);
config);
} }
} }

View File

@@ -106,14 +106,14 @@ public class WriteStatus implements Serializable {
return globalError != null; return globalError != null;
} }
public void setGlobalError(Throwable t) {
this.globalError = t;
}
public Throwable getGlobalError() { public Throwable getGlobalError() {
return this.globalError; return this.globalError;
} }
public void setGlobalError(Throwable t) {
this.globalError = t;
}
public List<HoodieRecord> getWrittenRecords() { public List<HoodieRecord> getWrittenRecords() {
return writtenRecords; return writtenRecords;
} }

View File

@@ -30,10 +30,6 @@ public class DefaultHoodieConfig implements Serializable {
this.props = props; this.props = props;
} }
public Properties getProps() {
return props;
}
public static void setDefaultOnCondition(Properties props, boolean condition, String propName, public static void setDefaultOnCondition(Properties props, boolean condition, String propName,
String defaultValue) { String defaultValue) {
if (condition) { if (condition) {
@@ -48,4 +44,8 @@ public class DefaultHoodieConfig implements Serializable {
} }
} }
public Properties getProps() {
return props;
}
} }

View File

@@ -34,81 +34,76 @@ import javax.annotation.concurrent.Immutable;
public class HoodieCompactionConfig extends DefaultHoodieConfig { public class HoodieCompactionConfig extends DefaultHoodieConfig {
public static final String CLEANER_POLICY_PROP = "hoodie.cleaner.policy"; public static final String CLEANER_POLICY_PROP = "hoodie.cleaner.policy";
private static final String DEFAULT_CLEANER_POLICY =
HoodieCleaningPolicy.KEEP_LATEST_COMMITS.name();
public static final String AUTO_CLEAN_PROP = "hoodie.clean.automatic"; public static final String AUTO_CLEAN_PROP = "hoodie.clean.automatic";
private static final String DEFAULT_AUTO_CLEAN = "true";
 // Turn on inline compaction - after a few delta commits an inline compaction will be run // Turn on inline compaction - after a few delta commits an inline compaction will be run
public static final String INLINE_COMPACT_PROP = "hoodie.compact.inline"; public static final String INLINE_COMPACT_PROP = "hoodie.compact.inline";
private static final String DEFAULT_INLINE_COMPACT = "false";
// Run a compaction every N delta commits // Run a compaction every N delta commits
public static final String INLINE_COMPACT_NUM_DELTA_COMMITS_PROP = "hoodie.compact.inline.max.delta.commits"; public static final String INLINE_COMPACT_NUM_DELTA_COMMITS_PROP =
private static final String DEFAULT_INLINE_COMPACT_NUM_DELTA_COMMITS = "10"; "hoodie.compact.inline.max" + ".delta.commits";
public static final String CLEANER_FILE_VERSIONS_RETAINED_PROP = public static final String CLEANER_FILE_VERSIONS_RETAINED_PROP =
"hoodie.cleaner.fileversions.retained"; "hoodie.cleaner.fileversions" + ".retained";
private static final String DEFAULT_CLEANER_FILE_VERSIONS_RETAINED = "3";
public static final String CLEANER_COMMITS_RETAINED_PROP = "hoodie.cleaner.commits.retained"; public static final String CLEANER_COMMITS_RETAINED_PROP = "hoodie.cleaner.commits.retained";
private static final String DEFAULT_CLEANER_COMMITS_RETAINED = "24";
public static final String MAX_COMMITS_TO_KEEP = "hoodie.keep.max.commits"; public static final String MAX_COMMITS_TO_KEEP = "hoodie.keep.max.commits";
private static final String DEFAULT_MAX_COMMITS_TO_KEEP = String.valueOf(128);
public static final String MIN_COMMITS_TO_KEEP = "hoodie.keep.min.commits"; public static final String MIN_COMMITS_TO_KEEP = "hoodie.keep.min.commits";
private static final String DEFAULT_MIN_COMMITS_TO_KEEP = String.valueOf(96);
// Upsert uses this file size to compact new data onto existing files.. // Upsert uses this file size to compact new data onto existing files..
public static final String PARQUET_SMALL_FILE_LIMIT_BYTES = "hoodie.parquet.small.file.limit"; public static final String PARQUET_SMALL_FILE_LIMIT_BYTES = "hoodie.parquet.small.file.limit";
// Turned off by default // Turned off by default
public static final String DEFAULT_PARQUET_SMALL_FILE_LIMIT_BYTES = String.valueOf(0); public static final String DEFAULT_PARQUET_SMALL_FILE_LIMIT_BYTES = String.valueOf(0);
/** /**
* Configs related to specific table types * Configs related to specific table types
**/ **/
 // Number of inserts that will be put into each partition/bucket for writing // Number of inserts that will be put into each partition/bucket for writing
public static final String COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE = "hoodie.copyonwrite.insert.split.size"; public static final String COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE =
"hoodie.copyonwrite.insert" + ".split.size";
// The rationale to pick the insert parallelism is the following. Writing out 100MB files, // The rationale to pick the insert parallelism is the following. Writing out 100MB files,
 // with at least 1kb records, means 100K records per file. We just overprovision to 500K // with at least 1kb records, means 100K records per file. We just overprovision to 500K
public static final String DEFAULT_COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE = String.valueOf(500000); public static final String DEFAULT_COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE = String.valueOf(500000);
// Config to control whether we control insert split sizes automatically based on average
// Config to control whether we control insert split sizes automatically based on average record sizes // record sizes
public static final String COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS = "hoodie.copyonwrite.insert.auto.split"; public static final String COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS =
"hoodie.copyonwrite.insert" + ".auto.split";
// its off by default // its off by default
public static final String DEFAULT_COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS = String.valueOf(false); public static final String DEFAULT_COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS = String.valueOf(false);
 // This value is used as a guesstimate for the record size, if we can't determine this from
 // previous commits
 // This value is used as a guesstimate for the record size, if we can't determine this from previous commits public static final String COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE =
public static final String COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = "hoodie.copyonwrite.record.size.estimate"; "hoodie.copyonwrite" + ".record.size.estimate";
// Used to determine how much more can be packed into a small file, before it exceeds the size limit. // Used to determine how much more can be packed into a small file, before it exceeds the size
// limit.
public static final String DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = String public static final String DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = String
.valueOf(1024); .valueOf(1024);
public static final String CLEANER_PARALLELISM = "hoodie.cleaner.parallelism"; public static final String CLEANER_PARALLELISM = "hoodie.cleaner.parallelism";
public static final String DEFAULT_CLEANER_PARALLELISM = String.valueOf(200); public static final String DEFAULT_CLEANER_PARALLELISM = String.valueOf(200);
public static final String TARGET_IO_PER_COMPACTION_IN_MB_PROP = "hoodie.compaction.target.io"; public static final String TARGET_IO_PER_COMPACTION_IN_MB_PROP = "hoodie.compaction.target.io";
// 500GB of target IO per compaction (both read and write) // 500GB of target IO per compaction (both read and write)
public static final String DEFAULT_TARGET_IO_PER_COMPACTION_IN_MB = String.valueOf(500 * 1024); public static final String DEFAULT_TARGET_IO_PER_COMPACTION_IN_MB = String.valueOf(500 * 1024);
public static final String COMPACTION_STRATEGY_PROP = "hoodie.compaction.strategy"; public static final String COMPACTION_STRATEGY_PROP = "hoodie.compaction.strategy";
// 200GB of target IO per compaction // 200GB of target IO per compaction
public static final String DEFAULT_COMPACTION_STRATEGY = LogFileSizeBasedCompactionStrategy.class public static final String DEFAULT_COMPACTION_STRATEGY = LogFileSizeBasedCompactionStrategy.class
.getName(); .getName();
// used to merge records written to log file // used to merge records written to log file
public static final String DEFAULT_PAYLOAD_CLASS = HoodieAvroPayload.class.getName(); public static final String DEFAULT_PAYLOAD_CLASS = HoodieAvroPayload.class.getName();
public static final String PAYLOAD_CLASS_PROP = "hoodie.compaction.payload.class"; public static final String PAYLOAD_CLASS_PROP = "hoodie.compaction.payload.class";
// used to choose a trade off between IO vs Memory when performing compaction process // used to choose a trade off between IO vs Memory when performing compaction process
// Depending on outputfile_size and memory provided, choose true to avoid OOM for large file size + small memory // Depending on outputfile_size and memory provided, choose true to avoid OOM for large file
public static final String COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP = "hoodie.compaction.lazy.block.read"; // size + small memory
public static final String COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP =
"hoodie.compaction.lazy" + ".block.read";
public static final String DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED = "false"; public static final String DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED = "false";
// used to choose whether to enable reverse log reading (reverse log traversal) // used to choose whether to enable reverse log reading (reverse log traversal)
public static final String COMPACTION_REVERSE_LOG_READ_ENABLED_PROP = "hoodie.compaction.reverse.log.read"; public static final String COMPACTION_REVERSE_LOG_READ_ENABLED_PROP =
"hoodie.compaction" + ".reverse.log.read";
public static final String DEFAULT_COMPACTION_REVERSE_LOG_READ_ENABLED = "false"; public static final String DEFAULT_COMPACTION_REVERSE_LOG_READ_ENABLED = "false";
private static final String DEFAULT_CLEANER_POLICY = HoodieCleaningPolicy.KEEP_LATEST_COMMITS
.name();
private static final String DEFAULT_AUTO_CLEAN = "true";
private static final String DEFAULT_INLINE_COMPACT = "false";
private static final String DEFAULT_INLINE_COMPACT_NUM_DELTA_COMMITS = "10";
private static final String DEFAULT_CLEANER_FILE_VERSIONS_RETAINED = "3";
private static final String DEFAULT_CLEANER_COMMITS_RETAINED = "24";
private static final String DEFAULT_MAX_COMMITS_TO_KEEP = String.valueOf(128);
private static final String DEFAULT_MIN_COMMITS_TO_KEEP = String.valueOf(96);
private HoodieCompactionConfig(Properties props) { private HoodieCompactionConfig(Properties props) {
super(props); super(props);
@@ -159,8 +154,7 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
} }
public Builder retainFileVersions(int fileVersionsRetained) { public Builder retainFileVersions(int fileVersionsRetained) {
props.setProperty(CLEANER_FILE_VERSIONS_RETAINED_PROP, props.setProperty(CLEANER_FILE_VERSIONS_RETAINED_PROP, String.valueOf(fileVersionsRetained));
String.valueOf(fileVersionsRetained));
return this; return this;
} }
@@ -238,22 +232,22 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
public HoodieCompactionConfig build() { public HoodieCompactionConfig build() {
HoodieCompactionConfig config = new HoodieCompactionConfig(props); HoodieCompactionConfig config = new HoodieCompactionConfig(props);
setDefaultOnCondition(props, !props.containsKey(AUTO_CLEAN_PROP), setDefaultOnCondition(props, !props.containsKey(AUTO_CLEAN_PROP), AUTO_CLEAN_PROP,
AUTO_CLEAN_PROP, DEFAULT_AUTO_CLEAN); DEFAULT_AUTO_CLEAN);
setDefaultOnCondition(props, !props.containsKey(INLINE_COMPACT_PROP), setDefaultOnCondition(props, !props.containsKey(INLINE_COMPACT_PROP), INLINE_COMPACT_PROP,
INLINE_COMPACT_PROP, DEFAULT_INLINE_COMPACT); DEFAULT_INLINE_COMPACT);
setDefaultOnCondition(props, !props.containsKey(INLINE_COMPACT_NUM_DELTA_COMMITS_PROP), setDefaultOnCondition(props, !props.containsKey(INLINE_COMPACT_NUM_DELTA_COMMITS_PROP),
INLINE_COMPACT_NUM_DELTA_COMMITS_PROP, DEFAULT_INLINE_COMPACT_NUM_DELTA_COMMITS); INLINE_COMPACT_NUM_DELTA_COMMITS_PROP, DEFAULT_INLINE_COMPACT_NUM_DELTA_COMMITS);
setDefaultOnCondition(props, !props.containsKey(CLEANER_POLICY_PROP), setDefaultOnCondition(props, !props.containsKey(CLEANER_POLICY_PROP), CLEANER_POLICY_PROP,
CLEANER_POLICY_PROP, DEFAULT_CLEANER_POLICY); DEFAULT_CLEANER_POLICY);
setDefaultOnCondition(props, !props.containsKey(CLEANER_FILE_VERSIONS_RETAINED_PROP), setDefaultOnCondition(props, !props.containsKey(CLEANER_FILE_VERSIONS_RETAINED_PROP),
CLEANER_FILE_VERSIONS_RETAINED_PROP, DEFAULT_CLEANER_FILE_VERSIONS_RETAINED); CLEANER_FILE_VERSIONS_RETAINED_PROP, DEFAULT_CLEANER_FILE_VERSIONS_RETAINED);
setDefaultOnCondition(props, !props.containsKey(CLEANER_COMMITS_RETAINED_PROP), setDefaultOnCondition(props, !props.containsKey(CLEANER_COMMITS_RETAINED_PROP),
CLEANER_COMMITS_RETAINED_PROP, DEFAULT_CLEANER_COMMITS_RETAINED); CLEANER_COMMITS_RETAINED_PROP, DEFAULT_CLEANER_COMMITS_RETAINED);
setDefaultOnCondition(props, !props.containsKey(MAX_COMMITS_TO_KEEP), setDefaultOnCondition(props, !props.containsKey(MAX_COMMITS_TO_KEEP), MAX_COMMITS_TO_KEEP,
MAX_COMMITS_TO_KEEP, DEFAULT_MAX_COMMITS_TO_KEEP); DEFAULT_MAX_COMMITS_TO_KEEP);
setDefaultOnCondition(props, !props.containsKey(MIN_COMMITS_TO_KEEP), setDefaultOnCondition(props, !props.containsKey(MIN_COMMITS_TO_KEEP), MIN_COMMITS_TO_KEEP,
MIN_COMMITS_TO_KEEP, DEFAULT_MIN_COMMITS_TO_KEEP); DEFAULT_MIN_COMMITS_TO_KEEP);
setDefaultOnCondition(props, !props.containsKey(PARQUET_SMALL_FILE_LIMIT_BYTES), setDefaultOnCondition(props, !props.containsKey(PARQUET_SMALL_FILE_LIMIT_BYTES),
PARQUET_SMALL_FILE_LIMIT_BYTES, DEFAULT_PARQUET_SMALL_FILE_LIMIT_BYTES); PARQUET_SMALL_FILE_LIMIT_BYTES, DEFAULT_PARQUET_SMALL_FILE_LIMIT_BYTES);
setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE), setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE),
@@ -263,8 +257,8 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE), setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE),
COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE, COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE,
DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE); DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE);
setDefaultOnCondition(props, !props.containsKey(CLEANER_PARALLELISM), setDefaultOnCondition(props, !props.containsKey(CLEANER_PARALLELISM), CLEANER_PARALLELISM,
CLEANER_PARALLELISM, DEFAULT_CLEANER_PARALLELISM); DEFAULT_CLEANER_PARALLELISM);
setDefaultOnCondition(props, !props.containsKey(COMPACTION_STRATEGY_PROP), setDefaultOnCondition(props, !props.containsKey(COMPACTION_STRATEGY_PROP),
COMPACTION_STRATEGY_PROP, DEFAULT_COMPACTION_STRATEGY); COMPACTION_STRATEGY_PROP, DEFAULT_COMPACTION_STRATEGY);
setDefaultOnCondition(props, !props.containsKey(PAYLOAD_CLASS_PROP), setDefaultOnCondition(props, !props.containsKey(PAYLOAD_CLASS_PROP),
@@ -277,8 +271,7 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
COMPACTION_REVERSE_LOG_READ_ENABLED_PROP, DEFAULT_COMPACTION_REVERSE_LOG_READ_ENABLED); COMPACTION_REVERSE_LOG_READ_ENABLED_PROP, DEFAULT_COMPACTION_REVERSE_LOG_READ_ENABLED);
HoodieCleaningPolicy.valueOf(props.getProperty(CLEANER_POLICY_PROP)); HoodieCleaningPolicy.valueOf(props.getProperty(CLEANER_POLICY_PROP));
Preconditions.checkArgument( Preconditions.checkArgument(Integer.parseInt(props.getProperty(MAX_COMMITS_TO_KEEP)) > Integer
Integer.parseInt(props.getProperty(MAX_COMMITS_TO_KEEP)) > Integer
.parseInt(props.getProperty(MIN_COMMITS_TO_KEEP))); .parseInt(props.getProperty(MIN_COMMITS_TO_KEEP)));
return config; return config;
} }
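
A hedged sketch of how the cleaner and compaction knobs above fit together: the keys are copied verbatim from HoodieCompactionConfig, the values are illustrative, and how the Properties object is handed to the builder is not shown in this diff.

import java.util.Properties;

public class CompactionPropsSketch {
  // Illustrative values only; keys come straight from the constants in the hunks above.
  public static Properties cleanerAndCompactionDefaults() {
    Properties props = new Properties();
    props.setProperty("hoodie.cleaner.policy", "KEEP_LATEST_COMMITS"); // DEFAULT_CLEANER_POLICY
    props.setProperty("hoodie.clean.automatic", "true");               // AUTO_CLEAN_PROP
    props.setProperty("hoodie.compact.inline", "false");               // INLINE_COMPACT_PROP
    props.setProperty("hoodie.compact.inline.max.delta.commits", "10");
    props.setProperty("hoodie.cleaner.commits.retained", "24");
    props.setProperty("hoodie.keep.min.commits", "96");                // must stay below max
    props.setProperty("hoodie.keep.max.commits", "128");               // build() checks max > min
    return props;
  }
}
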

View File

@@ -40,23 +40,25 @@ public class HoodieIndexConfig extends DefaultHoodieConfig {
public static final String BLOOM_INDEX_PARALLELISM_PROP = "hoodie.bloom.index.parallelism"; public static final String BLOOM_INDEX_PARALLELISM_PROP = "hoodie.bloom.index.parallelism";
// Disable explicit bloom index parallelism setting by default - hoodie auto computes // Disable explicit bloom index parallelism setting by default - hoodie auto computes
public static final String DEFAULT_BLOOM_INDEX_PARALLELISM = "0"; public static final String DEFAULT_BLOOM_INDEX_PARALLELISM = "0";
public static final String BLOOM_INDEX_PRUNE_BY_RANGES_PROP = "hoodie.bloom.index.prune.by.ranges"; public static final String BLOOM_INDEX_PRUNE_BY_RANGES_PROP =
"hoodie.bloom.index.prune.by" + ".ranges";
public static final String DEFAULT_BLOOM_INDEX_PRUNE_BY_RANGES = "true"; public static final String DEFAULT_BLOOM_INDEX_PRUNE_BY_RANGES = "true";
public static final String BLOOM_INDEX_USE_CACHING_PROP = "hoodie.bloom.index.use.caching"; public static final String BLOOM_INDEX_USE_CACHING_PROP = "hoodie.bloom.index.use.caching";
public static final String DEFAULT_BLOOM_INDEX_USE_CACHING = "true"; public static final String DEFAULT_BLOOM_INDEX_USE_CACHING = "true";
public static final String BLOOM_INDEX_INPUT_STORAGE_LEVEL = "hoodie.bloom.index.input.storage.level"; public static final String BLOOM_INDEX_INPUT_STORAGE_LEVEL =
"hoodie.bloom.index.input.storage" + ".level";
public static final String DEFAULT_BLOOM_INDEX_INPUT_STORAGE_LEVEL = "MEMORY_AND_DISK_SER"; public static final String DEFAULT_BLOOM_INDEX_INPUT_STORAGE_LEVEL = "MEMORY_AND_DISK_SER";
// ***** HBase Index Configs ***** // ***** HBase Index Configs *****
public final static String HBASE_ZKQUORUM_PROP = "hoodie.index.hbase.zkquorum"; public static final String HBASE_ZKQUORUM_PROP = "hoodie.index.hbase.zkquorum";
public final static String HBASE_ZKPORT_PROP = "hoodie.index.hbase.zkport"; public static final String HBASE_ZKPORT_PROP = "hoodie.index.hbase.zkport";
public final static String HBASE_TABLENAME_PROP = "hoodie.index.hbase.table"; public static final String HBASE_TABLENAME_PROP = "hoodie.index.hbase.table";
public final static String HBASE_GET_BATCH_SIZE_PROP = "hoodie.index.hbase.get.batch.size"; public static final String HBASE_GET_BATCH_SIZE_PROP = "hoodie.index.hbase.get.batch.size";
public final static String HBASE_PUT_BATCH_SIZE_PROP = "hoodie.index.hbase.put.batch.size"; public static final String HBASE_PUT_BATCH_SIZE_PROP = "hoodie.index.hbase.put.batch.size";
public final static String DEFAULT_HBASE_BATCH_SIZE = "100"; public static final String DEFAULT_HBASE_BATCH_SIZE = "100";
// ***** Bucketed Index Configs ***** // ***** Bucketed Index Configs *****
public final static String BUCKETED_INDEX_NUM_BUCKETS_PROP = "hoodie.index.bucketed.numbuckets"; public static final String BUCKETED_INDEX_NUM_BUCKETS_PROP = "hoodie.index.bucketed.numbuckets";
private HoodieIndexConfig(Properties props) { private HoodieIndexConfig(Properties props) {
super(props); super(props);
@@ -152,12 +154,12 @@ public class HoodieIndexConfig extends DefaultHoodieConfig {
public HoodieIndexConfig build() { public HoodieIndexConfig build() {
HoodieIndexConfig config = new HoodieIndexConfig(props); HoodieIndexConfig config = new HoodieIndexConfig(props);
setDefaultOnCondition(props, !props.containsKey(INDEX_TYPE_PROP), setDefaultOnCondition(props, !props.containsKey(INDEX_TYPE_PROP), INDEX_TYPE_PROP,
INDEX_TYPE_PROP, DEFAULT_INDEX_TYPE); DEFAULT_INDEX_TYPE);
setDefaultOnCondition(props, !props.containsKey(BLOOM_FILTER_NUM_ENTRIES), setDefaultOnCondition(props, !props.containsKey(BLOOM_FILTER_NUM_ENTRIES),
BLOOM_FILTER_NUM_ENTRIES, DEFAULT_BLOOM_FILTER_NUM_ENTRIES); BLOOM_FILTER_NUM_ENTRIES, DEFAULT_BLOOM_FILTER_NUM_ENTRIES);
setDefaultOnCondition(props, !props.containsKey(BLOOM_FILTER_FPP), setDefaultOnCondition(props, !props.containsKey(BLOOM_FILTER_FPP), BLOOM_FILTER_FPP,
BLOOM_FILTER_FPP, DEFAULT_BLOOM_FILTER_FPP); DEFAULT_BLOOM_FILTER_FPP);
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_PARALLELISM_PROP), setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_PARALLELISM_PROP),
BLOOM_INDEX_PARALLELISM_PROP, DEFAULT_BLOOM_INDEX_PARALLELISM); BLOOM_INDEX_PARALLELISM_PROP, DEFAULT_BLOOM_INDEX_PARALLELISM);
setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_PRUNE_BY_RANGES_PROP), setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_PRUNE_BY_RANGES_PROP),

View File

@@ -30,8 +30,8 @@ import org.apache.spark.util.Utils;
@Immutable @Immutable
public class HoodieMemoryConfig extends DefaultHoodieConfig { public class HoodieMemoryConfig extends DefaultHoodieConfig {
// This fraction is multiplied with the spark.memory.fraction to get a final fraction of heap space to use during merge // This fraction is multiplied with the spark.memory.fraction to get a final fraction of heap space to use
// This makes it easier to scale this value as one increases the spark.executor.memory // during merge. This makes it easier to scale this value as one increases the spark.executor.memory
public static final String MAX_MEMORY_FRACTION_FOR_MERGE_PROP = "hoodie.memory.merge.fraction"; public static final String MAX_MEMORY_FRACTION_FOR_MERGE_PROP = "hoodie.memory.merge.fraction";
// Default max memory fraction during hash-merge, excess spills to disk // Default max memory fraction during hash-merge, excess spills to disk
public static final String DEFAULT_MAX_MEMORY_FRACTION_FOR_MERGE = String.valueOf(0.6); public static final String DEFAULT_MAX_MEMORY_FRACTION_FOR_MERGE = String.valueOf(0.6);
@@ -87,19 +87,21 @@ public class HoodieMemoryConfig extends DefaultHoodieConfig {
} }
/** /**
 * Dynamic calculation of max memory to use for spillable map. user.available.memory = * Dynamic calculation of max memory to use for spillable map. user.available.memory = spark.executor.memory *
* spark.executor.memory * (1 - spark.memory.fraction) spillable.available.memory = * (1 - spark.memory.fraction) spillable.available.memory = user.available.memory * hoodie.memory.fraction. Anytime
* user.available.memory * hoodie.memory.fraction. Anytime the spark.executor.memory or the * the spark.executor.memory or the spark.memory.fraction is changed, the memory used for spillable map changes
* spark.memory.fraction is changed, the memory used for spillable map changes accordingly * accordingly
*/ */
private long getMaxMemoryAllowedForMerge(String maxMemoryFraction) { private long getMaxMemoryAllowedForMerge(String maxMemoryFraction) {
final String SPARK_EXECUTOR_MEMORY_PROP = "spark.executor.memory"; final String SPARK_EXECUTOR_MEMORY_PROP = "spark.executor.memory";
final String SPARK_EXECUTOR_MEMORY_FRACTION_PROP = "spark.memory.fraction"; final String SPARK_EXECUTOR_MEMORY_FRACTION_PROP = "spark.memory.fraction";
// This is hard-coded in spark code {@link https://github.com/apache/spark/blob/576c43fb4226e4efa12189b41c3bc862019862c6/core/src/main/scala/org/apache/spark/memory/UnifiedMemoryManager.scala#L231} // This is hard-coded in spark code {@link
// so have to re-define this here // https://github.com/apache/spark/blob/576c43fb4226e4efa12189b41c3bc862019862c6/core/src/main/scala/org/apache/
// spark/memory/UnifiedMemoryManager.scala#L231} so have to re-define this here
final String DEFAULT_SPARK_EXECUTOR_MEMORY_FRACTION = "0.6"; final String DEFAULT_SPARK_EXECUTOR_MEMORY_FRACTION = "0.6";
// This is hard-coded in spark code {@link https://github.com/apache/spark/blob/576c43fb4226e4efa12189b41c3bc862019862c6/core/src/main/scala/org/apache/spark/SparkContext.scala#L471} // This is hard-coded in spark code {@link
// so have to re-define this here // https://github.com/apache/spark/blob/576c43fb4226e4efa12189b41c3bc862019862c6/core/src/main/scala/org/apache/
// spark/SparkContext.scala#L471} so have to re-define this here
final String DEFAULT_SPARK_EXECUTOR_MEMORY_MB = "1024"; // in MB final String DEFAULT_SPARK_EXECUTOR_MEMORY_MB = "1024"; // in MB
if (SparkEnv.get() != null) { if (SparkEnv.get() != null) {
@@ -109,7 +111,8 @@ public class HoodieMemoryConfig extends DefaultHoodieConfig {
DEFAULT_SPARK_EXECUTOR_MEMORY_MB)) * 1024 DEFAULT_SPARK_EXECUTOR_MEMORY_MB)) * 1024
* 1024L); * 1024L);
// 0.6 is the default value used by Spark, // 0.6 is the default value used by Spark,
// look at {@link https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/SparkConf.scala#L507} // look at {@link
// https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/SparkConf.scala#L507}
double memoryFraction = Double double memoryFraction = Double
.valueOf(SparkEnv.get().conf().get(SPARK_EXECUTOR_MEMORY_FRACTION_PROP, .valueOf(SparkEnv.get().conf().get(SPARK_EXECUTOR_MEMORY_FRACTION_PROP,
DEFAULT_SPARK_EXECUTOR_MEMORY_FRACTION)); DEFAULT_SPARK_EXECUTOR_MEMORY_FRACTION));
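
A worked example of the formula in the reflowed javadoc above, with illustrative numbers (4g executor, the 0.6 defaults cited in the comments); the class and method names are placeholders.

public class MergeMemorySketch {
  // user.available.memory      = spark.executor.memory * (1 - spark.memory.fraction)
  // spillable.available.memory = user.available.memory * hoodie.memory.merge.fraction
  public static void main(String[] args) {
    long executorMemoryBytes = 4L * 1024 * 1024 * 1024;  // assume spark.executor.memory = 4g
    double sparkMemoryFraction = 0.6;                     // spark.memory.fraction default cited above
    double mergeFraction = 0.6;                           // DEFAULT_MAX_MEMORY_FRACTION_FOR_MERGE
    long userAvailable = (long) (executorMemoryBytes * (1 - sparkMemoryFraction)); // ~1.6 GB
    long maxMemoryForMerge = (long) (userAvailable * mergeFraction);               // ~983 MB
    System.out.println("Max memory allowed for merge (bytes): " + maxMemoryForMerge);
  }
}
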
@@ -143,5 +146,4 @@ public class HoodieMemoryConfig extends DefaultHoodieConfig {
return config; return config;
} }
} }
} }

View File

@@ -29,22 +29,22 @@ import javax.annotation.concurrent.Immutable;
@Immutable @Immutable
public class HoodieMetricsConfig extends DefaultHoodieConfig { public class HoodieMetricsConfig extends DefaultHoodieConfig {
public final static String METRIC_PREFIX = "hoodie.metrics"; public static final String METRIC_PREFIX = "hoodie.metrics";
public final static String METRICS_ON = METRIC_PREFIX + ".on"; public static final String METRICS_ON = METRIC_PREFIX + ".on";
public final static boolean DEFAULT_METRICS_ON = false; public static final boolean DEFAULT_METRICS_ON = false;
public final static String METRICS_REPORTER_TYPE = METRIC_PREFIX + ".reporter.type"; public static final String METRICS_REPORTER_TYPE = METRIC_PREFIX + ".reporter.type";
public final static MetricsReporterType DEFAULT_METRICS_REPORTER_TYPE = public static final MetricsReporterType DEFAULT_METRICS_REPORTER_TYPE = MetricsReporterType
MetricsReporterType.GRAPHITE; .GRAPHITE;
// Graphite // Graphite
public final static String GRAPHITE_PREFIX = METRIC_PREFIX + ".graphite"; public static final String GRAPHITE_PREFIX = METRIC_PREFIX + ".graphite";
public final static String GRAPHITE_SERVER_HOST = GRAPHITE_PREFIX + ".host"; public static final String GRAPHITE_SERVER_HOST = GRAPHITE_PREFIX + ".host";
public final static String DEFAULT_GRAPHITE_SERVER_HOST = "localhost"; public static final String DEFAULT_GRAPHITE_SERVER_HOST = "localhost";
public final static String GRAPHITE_SERVER_PORT = GRAPHITE_PREFIX + ".port"; public static final String GRAPHITE_SERVER_PORT = GRAPHITE_PREFIX + ".port";
public final static int DEFAULT_GRAPHITE_SERVER_PORT = 4756; public static final int DEFAULT_GRAPHITE_SERVER_PORT = 4756;
public final static String GRAPHITE_METRIC_PREFIX = GRAPHITE_PREFIX + ".metric.prefix"; public static final String GRAPHITE_METRIC_PREFIX = GRAPHITE_PREFIX + ".metric.prefix";
private HoodieMetricsConfig(Properties props) { private HoodieMetricsConfig(Properties props) {
super(props); super(props);
@@ -103,14 +103,14 @@ public class HoodieMetricsConfig extends DefaultHoodieConfig {
HoodieMetricsConfig config = new HoodieMetricsConfig(props); HoodieMetricsConfig config = new HoodieMetricsConfig(props);
setDefaultOnCondition(props, !props.containsKey(METRICS_ON), METRICS_ON, setDefaultOnCondition(props, !props.containsKey(METRICS_ON), METRICS_ON,
String.valueOf(DEFAULT_METRICS_ON)); String.valueOf(DEFAULT_METRICS_ON));
setDefaultOnCondition(props, !props.containsKey(METRICS_REPORTER_TYPE), setDefaultOnCondition(props, !props.containsKey(METRICS_REPORTER_TYPE), METRICS_REPORTER_TYPE,
METRICS_REPORTER_TYPE, DEFAULT_METRICS_REPORTER_TYPE.name()); DEFAULT_METRICS_REPORTER_TYPE.name());
setDefaultOnCondition(props, !props.containsKey(GRAPHITE_SERVER_HOST), setDefaultOnCondition(props, !props.containsKey(GRAPHITE_SERVER_HOST), GRAPHITE_SERVER_HOST,
GRAPHITE_SERVER_HOST, DEFAULT_GRAPHITE_SERVER_HOST); DEFAULT_GRAPHITE_SERVER_HOST);
setDefaultOnCondition(props, !props.containsKey(GRAPHITE_SERVER_PORT), setDefaultOnCondition(props, !props.containsKey(GRAPHITE_SERVER_PORT), GRAPHITE_SERVER_PORT,
GRAPHITE_SERVER_PORT, String.valueOf(DEFAULT_GRAPHITE_SERVER_PORT)); String.valueOf(DEFAULT_GRAPHITE_SERVER_PORT));
setDefaultOnCondition(props, !props.containsKey(GRAPHITE_SERVER_PORT), setDefaultOnCondition(props, !props.containsKey(GRAPHITE_SERVER_PORT), GRAPHITE_SERVER_PORT,
GRAPHITE_SERVER_PORT, String.valueOf(DEFAULT_GRAPHITE_SERVER_PORT)); String.valueOf(DEFAULT_GRAPHITE_SERVER_PORT));
return config; return config;
} }
} }

View File

@@ -36,7 +36,8 @@ public class HoodieStorageConfig extends DefaultHoodieConfig {
public static final String DEFAULT_PARQUET_PAGE_SIZE_BYTES = String.valueOf(1 * 1024 * 1024); public static final String DEFAULT_PARQUET_PAGE_SIZE_BYTES = String.valueOf(1 * 1024 * 1024);
// used to size log files // used to size log files
public static final String LOGFILE_SIZE_MAX_BYTES = "hoodie.logfile.max.size"; public static final String LOGFILE_SIZE_MAX_BYTES = "hoodie.logfile.max.size";
public static final String DEFAULT_LOGFILE_SIZE_MAX_BYTES = String.valueOf(1024*1024*1024); // 1 GB public static final String DEFAULT_LOGFILE_SIZE_MAX_BYTES = String
.valueOf(1024 * 1024 * 1024); // 1 GB
// used to size data blocks in log file // used to size data blocks in log file
public static final String LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES = "hoodie.logfile.data.block.max.size"; public static final String LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES = "hoodie.logfile.data.block.max.size";
public static final String DEFAULT_LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES = String.valueOf(256 * 1024 * 1024); // 256 MB public static final String DEFAULT_LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES = String.valueOf(256 * 1024 * 1024); // 256 MB

View File

@@ -16,7 +16,6 @@
package com.uber.hoodie.config; package com.uber.hoodie.config;
import com.google.common.base.Preconditions; import com.google.common.base.Preconditions;
import com.uber.hoodie.WriteStatus; import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.model.HoodieCleaningPolicy; import com.uber.hoodie.common.model.HoodieCleaningPolicy;
@@ -24,15 +23,14 @@ import com.uber.hoodie.common.util.ReflectionUtils;
import com.uber.hoodie.index.HoodieIndex; import com.uber.hoodie.index.HoodieIndex;
import com.uber.hoodie.io.compact.strategy.CompactionStrategy; import com.uber.hoodie.io.compact.strategy.CompactionStrategy;
import com.uber.hoodie.metrics.MetricsReporterType; import com.uber.hoodie.metrics.MetricsReporterType;
import org.apache.spark.storage.StorageLevel;
import javax.annotation.concurrent.Immutable;
import java.io.File; import java.io.File;
import java.io.FileReader; import java.io.FileReader;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.util.Map; import java.util.Map;
import java.util.Properties; import java.util.Properties;
import javax.annotation.concurrent.Immutable;
import org.apache.spark.storage.StorageLevel;
/** /**
* Class storing configs for the {@link com.uber.hoodie.HoodieWriteClient} * Class storing configs for the {@link com.uber.hoodie.HoodieWriteClient}
@@ -40,9 +38,9 @@ import java.util.Properties;
@Immutable @Immutable
public class HoodieWriteConfig extends DefaultHoodieConfig { public class HoodieWriteConfig extends DefaultHoodieConfig {
public static final String TABLE_NAME = "hoodie.table.name";
private static final String BASE_PATH_PROP = "hoodie.base.path"; private static final String BASE_PATH_PROP = "hoodie.base.path";
private static final String AVRO_SCHEMA = "hoodie.avro.schema"; private static final String AVRO_SCHEMA = "hoodie.avro.schema";
public static final String TABLE_NAME = "hoodie.table.name";
private static final String DEFAULT_PARALLELISM = "200"; private static final String DEFAULT_PARALLELISM = "200";
private static final String INSERT_PARALLELISM = "hoodie.insert.shuffle.parallelism"; private static final String INSERT_PARALLELISM = "hoodie.insert.shuffle.parallelism";
private static final String BULKINSERT_PARALLELISM = "hoodie.bulkinsert.shuffle.parallelism"; private static final String BULKINSERT_PARALLELISM = "hoodie.bulkinsert.shuffle.parallelism";
@@ -57,13 +55,16 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
private static final String DEFAULT_WRITE_STATUS_STORAGE_LEVEL = "MEMORY_AND_DISK_SER"; private static final String DEFAULT_WRITE_STATUS_STORAGE_LEVEL = "MEMORY_AND_DISK_SER";
private static final String HOODIE_AUTO_COMMIT_PROP = "hoodie.auto.commit"; private static final String HOODIE_AUTO_COMMIT_PROP = "hoodie.auto.commit";
private static final String DEFAULT_HOODIE_AUTO_COMMIT = "true"; private static final String DEFAULT_HOODIE_AUTO_COMMIT = "true";
private static final String HOODIE_ASSUME_DATE_PARTITIONING_PROP = "hoodie.assume.date.partitioning"; private static final String HOODIE_ASSUME_DATE_PARTITIONING_PROP =
"hoodie.assume.date" + ".partitioning";
private static final String DEFAULT_ASSUME_DATE_PARTITIONING = "false"; private static final String DEFAULT_ASSUME_DATE_PARTITIONING = "false";
private static final String HOODIE_WRITE_STATUS_CLASS_PROP = "hoodie.writestatus.class"; private static final String HOODIE_WRITE_STATUS_CLASS_PROP = "hoodie.writestatus.class";
private static final String DEFAULT_HOODIE_WRITE_STATUS_CLASS = WriteStatus.class.getName(); private static final String DEFAULT_HOODIE_WRITE_STATUS_CLASS = WriteStatus.class.getName();
private static final String HOODIE_COPYONWRITE_USE_TEMP_FOLDER_CREATE = "hoodie.copyonwrite.use.temp.folder.for.create"; private static final String HOODIE_COPYONWRITE_USE_TEMP_FOLDER_CREATE =
"hoodie.copyonwrite.use" + ".temp.folder.for.create";
private static final String DEFAULT_HOODIE_COPYONWRITE_USE_TEMP_FOLDER_CREATE = "false"; private static final String DEFAULT_HOODIE_COPYONWRITE_USE_TEMP_FOLDER_CREATE = "false";
private static final String HOODIE_COPYONWRITE_USE_TEMP_FOLDER_MERGE = "hoodie.copyonwrite.use.temp.folder.for.merge"; private static final String HOODIE_COPYONWRITE_USE_TEMP_FOLDER_MERGE =
"hoodie.copyonwrite.use" + ".temp.folder.for.merge";
private static final String DEFAULT_HOODIE_COPYONWRITE_USE_TEMP_FOLDER_MERGE = "false"; private static final String DEFAULT_HOODIE_COPYONWRITE_USE_TEMP_FOLDER_MERGE = "false";
private static final String FINALIZE_WRITE_PARALLELISM = "hoodie.finalize.write.parallelism"; private static final String FINALIZE_WRITE_PARALLELISM = "hoodie.finalize.write.parallelism";
private static final String DEFAULT_FINALIZE_WRITE_PARALLELISM = DEFAULT_PARALLELISM; private static final String DEFAULT_FINALIZE_WRITE_PARALLELISM = DEFAULT_PARALLELISM;
@@ -72,6 +73,10 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
super(props); super(props);
} }
public static HoodieWriteConfig.Builder newBuilder() {
return new Builder();
}
/** /**
* base properties * base properties
**/ **/
@@ -137,8 +142,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
} }
public boolean shouldUseTempFolderForCopyOnWrite() { public boolean shouldUseTempFolderForCopyOnWrite() {
return shouldUseTempFolderForCopyOnWriteForCreate() || return shouldUseTempFolderForCopyOnWriteForCreate()
shouldUseTempFolderForCopyOnWriteForMerge(); || shouldUseTempFolderForCopyOnWriteForMerge();
} }
public int getFinalizeWriteParallelism() { public int getFinalizeWriteParallelism() {
@@ -154,8 +159,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
} }
public int getCleanerFileVersionsRetained() { public int getCleanerFileVersionsRetained() {
return Integer.parseInt( return Integer
props.getProperty(HoodieCompactionConfig.CLEANER_FILE_VERSIONS_RETAINED_PROP)); .parseInt(props.getProperty(HoodieCompactionConfig.CLEANER_FILE_VERSIONS_RETAINED_PROP));
} }
public int getCleanerCommitsRetained() { public int getCleanerCommitsRetained() {
@@ -177,8 +182,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
} }
public int getCopyOnWriteInsertSplitSize() { public int getCopyOnWriteInsertSplitSize() {
return Integer.parseInt( return Integer
props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE)); .parseInt(props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE));
} }
public int getCopyOnWriteRecordSizeEstimate() { public int getCopyOnWriteRecordSizeEstimate() {
@@ -204,8 +209,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
} }
public int getInlineCompactDeltaCommitMax() { public int getInlineCompactDeltaCommitMax() {
return Integer.parseInt( return Integer
props.getProperty(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS_PROP)); .parseInt(props.getProperty(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS_PROP));
} }
public CompactionStrategy getCompactionStrategy() { public CompactionStrategy getCompactionStrategy() {
@@ -341,10 +346,6 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
return props.getProperty(HoodieMetricsConfig.GRAPHITE_METRIC_PREFIX); return props.getProperty(HoodieMetricsConfig.GRAPHITE_METRIC_PREFIX);
} }
public static HoodieWriteConfig.Builder newBuilder() {
return new Builder();
}
/** /**
* memory configs * memory configs
*/ */
@@ -486,15 +487,15 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
public Builder withUseTempFolderCopyOnWriteForCreate( public Builder withUseTempFolderCopyOnWriteForCreate(
boolean shouldUseTempFolderCopyOnWriteForCreate) { boolean shouldUseTempFolderCopyOnWriteForCreate) {
props.setProperty(HOODIE_COPYONWRITE_USE_TEMP_FOLDER_CREATE, String.valueOf props.setProperty(HOODIE_COPYONWRITE_USE_TEMP_FOLDER_CREATE,
(shouldUseTempFolderCopyOnWriteForCreate)); String.valueOf(shouldUseTempFolderCopyOnWriteForCreate));
return this; return this;
} }
public Builder withUseTempFolderCopyOnWriteForMerge( public Builder withUseTempFolderCopyOnWriteForMerge(
boolean shouldUseTempFolderCopyOnWriteForMerge) { boolean shouldUseTempFolderCopyOnWriteForMerge) {
props.setProperty(HOODIE_COPYONWRITE_USE_TEMP_FOLDER_MERGE, String.valueOf props.setProperty(HOODIE_COPYONWRITE_USE_TEMP_FOLDER_MERGE,
(shouldUseTempFolderCopyOnWriteForMerge)); String.valueOf(shouldUseTempFolderCopyOnWriteForMerge));
return this; return this;
} }
@@ -510,8 +511,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
setDefaultOnCondition(props, !props.containsKey(INSERT_PARALLELISM), INSERT_PARALLELISM, setDefaultOnCondition(props, !props.containsKey(INSERT_PARALLELISM), INSERT_PARALLELISM,
DEFAULT_PARALLELISM); DEFAULT_PARALLELISM);
setDefaultOnCondition(props, !props.containsKey(BULKINSERT_PARALLELISM), setDefaultOnCondition(props, !props.containsKey(BULKINSERT_PARALLELISM),
BULKINSERT_PARALLELISM, BULKINSERT_PARALLELISM, DEFAULT_PARALLELISM);
DEFAULT_PARALLELISM);
setDefaultOnCondition(props, !props.containsKey(UPSERT_PARALLELISM), UPSERT_PARALLELISM, setDefaultOnCondition(props, !props.containsKey(UPSERT_PARALLELISM), UPSERT_PARALLELISM,
DEFAULT_PARALLELISM); DEFAULT_PARALLELISM);
setDefaultOnCondition(props, !props.containsKey(COMBINE_BEFORE_INSERT_PROP), setDefaultOnCondition(props, !props.containsKey(COMBINE_BEFORE_INSERT_PROP),
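The config class above stores every setting as a string property and falls back via setDefaultOnCondition only when a key is absent. Below is a minimal sketch of that Properties-plus-Builder pattern; the SimpleConfig class, its single key, and withInsertParallelism are invented for illustration and are not the actual Hoodie API.

```java
import java.util.Properties;

// Sketch of the Properties-backed config + builder pattern used by HoodieWriteConfig.
public class SimpleConfig {
  private static final String INSERT_PARALLELISM = "hoodie.insert.shuffle.parallelism";
  private static final String DEFAULT_PARALLELISM = "200";

  private final Properties props;

  private SimpleConfig(Properties props) {
    this.props = props;
  }

  public int getInsertParallelism() {
    return Integer.parseInt(props.getProperty(INSERT_PARALLELISM));
  }

  public static Builder newBuilder() {
    return new Builder();
  }

  public static class Builder {
    private final Properties props = new Properties();

    public Builder withInsertParallelism(int parallelism) {
      props.setProperty(INSERT_PARALLELISM, String.valueOf(parallelism));
      return this;
    }

    // Only set the default when the caller has not supplied a value.
    private static void setDefaultOnCondition(Properties props, boolean condition,
        String key, String value) {
      if (condition) {
        props.setProperty(key, value);
      }
    }

    public SimpleConfig build() {
      setDefaultOnCondition(props, !props.containsKey(INSERT_PARALLELISM),
          INSERT_PARALLELISM, DEFAULT_PARALLELISM);
      return new SimpleConfig(props);
    }
  }

  public static void main(String[] args) {
    SimpleConfig config = SimpleConfig.newBuilder().build(); // no value set, so default applies
    System.out.println(config.getInsertParallelism());       // 200
  }
}
```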

View File

@@ -21,12 +21,6 @@ import com.google.common.base.Preconditions;
import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordPayload; import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.exception.HoodieException; import com.uber.hoodie.exception.HoodieException;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.util.SizeEstimator;
import java.util.Iterator; import java.util.Iterator;
import java.util.Optional; import java.util.Optional;
import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.LinkedBlockingQueue;
@@ -35,53 +29,63 @@ import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.atomic.AtomicReference;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.util.SizeEstimator;
/** /**
* Used for buffering input records. Buffer limit is controlled by {@link #bufferMemoryLimit}. It internally samples * Used for buffering input records. Buffer limit is controlled by {@link #bufferMemoryLimit}. It
* every {@link #RECORD_SAMPLING_RATE}th record and adjusts number of records in buffer accordingly. This is done to * internally samples every {@link #RECORD_SAMPLING_RATE}th record and adjusts number of records in
* ensure that we don't OOM. * buffer accordingly. This is done to ensure that we don't OOM.
*/ */
public class BufferedIterator<K extends HoodieRecordPayload, T extends HoodieRecord<K>> public class BufferedIterator<K extends HoodieRecordPayload, T extends HoodieRecord<K>> implements
implements Iterator<BufferedIterator.BufferedIteratorPayload<T>> { Iterator<BufferedIterator.BufferedIteratorPayload<T>> {
private static Logger logger = LogManager.getLogger(BufferedIterator.class);
// interval used for polling records in the queue. // interval used for polling records in the queue.
public static final int RECORD_POLL_INTERVAL_SEC = 5; public static final int RECORD_POLL_INTERVAL_SEC = 5;
// rate used for sampling records to determine avg record size in bytes. // rate used for sampling records to determine avg record size in bytes.
public static final int RECORD_SAMPLING_RATE = 64; public static final int RECORD_SAMPLING_RATE = 64;
// maximum records that will be cached // maximum records that will be cached
private static final int RECORD_CACHING_LIMIT = 128 * 1024; private static final int RECORD_CACHING_LIMIT = 128 * 1024;
// It indicates number of records to cache. We will be using sampled record's average size to determine how many private static Logger logger = LogManager.getLogger(BufferedIterator.class);
// It indicates number of records to cache. We will be using sampled record's average size to
// determine how many
// records we should cache and will change (increase/decrease) permits accordingly. // records we should cache and will change (increase/decrease) permits accordingly.
@VisibleForTesting @VisibleForTesting
public final Semaphore rateLimiter = new Semaphore(1); public final Semaphore rateLimiter = new Semaphore(1);
// used for sampling records with "RECORD_SAMPLING_RATE" frequency. // used for sampling records with "RECORD_SAMPLING_RATE" frequency.
public final AtomicLong samplingRecordCounter = new AtomicLong(-1); public final AtomicLong samplingRecordCounter = new AtomicLong(-1);
// indicates rate limit (number of records to cache). it is updated whenever there is a change in avg record size.
@VisibleForTesting
public int currentRateLimit = 1;
// internal buffer to cache buffered records. // internal buffer to cache buffered records.
private final LinkedBlockingQueue<Optional<BufferedIteratorPayload<T>>> buffer = new LinkedBlockingQueue<>(); private final LinkedBlockingQueue<Optional<BufferedIteratorPayload<T>>> buffer = new
LinkedBlockingQueue<>();
// maximum amount of memory to be used for buffering records. // maximum amount of memory to be used for buffering records.
private final long bufferMemoryLimit; private final long bufferMemoryLimit;
// original iterator from where records are read for buffering.
private final Iterator<T> inputIterator;
// it holds the root cause of the exception in case either buffering records (reading from
// inputIterator) fails or
// thread reading records from buffer fails.
private final AtomicReference<Exception> hasFailed = new AtomicReference(null);
// used for indicating that all the records from buffer are read successfully.
private final AtomicBoolean isDone = new AtomicBoolean(false);
// schema used for fetching insertValue from HoodieRecord.
private final Schema schema;
// indicates rate limit (number of records to cache). it is updated whenever there is a change
// in avg record size.
@VisibleForTesting
public int currentRateLimit = 1;
// indicates avg record size in bytes. It is updated whenever a new record is sampled. // indicates avg record size in bytes. It is updated whenever a new record is sampled.
@VisibleForTesting @VisibleForTesting
public long avgRecordSizeInBytes = 0; public long avgRecordSizeInBytes = 0;
// indicates number of samples collected so far. // indicates number of samples collected so far.
private long numSamples = 0; private long numSamples = 0;
// original iterator from where records are read for buffering.
private final Iterator<T> inputIterator;
// it holds the root cause of the exception in case either buffering records (reading from inputIterator) fails or
// thread reading records from buffer fails.
private final AtomicReference<Exception> hasFailed = new AtomicReference(null);
// used for indicating that all the records from buffer are read successfully.
private final AtomicBoolean isDone = new AtomicBoolean(false);
// next record to be read from buffer. // next record to be read from buffer.
private BufferedIteratorPayload<T> nextRecord; private BufferedIteratorPayload<T> nextRecord;
// schema used for fetching insertValue from HoodieRecord.
private final Schema schema;
public BufferedIterator(final Iterator<T> iterator, final long bufferMemoryLimit, final Schema schema) { public BufferedIterator(final Iterator<T> iterator, final long bufferMemoryLimit,
final Schema schema) {
this.inputIterator = iterator; this.inputIterator = iterator;
this.bufferMemoryLimit = bufferMemoryLimit; this.bufferMemoryLimit = bufferMemoryLimit;
this.schema = schema; this.schema = schema;
@@ -92,23 +96,28 @@ public class BufferedIterator<K extends HoodieRecordPayload, T extends HoodieRec
return this.buffer.size(); return this.buffer.size();
} }
// It samples records with "RECORD_SAMPLING_RATE" frequency and computes average record size in bytes. It is used // It samples records with "RECORD_SAMPLING_RATE" frequency and computes average record size in
// for determining how many maximum records to buffer. Based on change in avg size it may increase or decrease // bytes. It is used
// for determining how many maximum records to buffer. Based on change in avg size it may
// increase or decrease
// available permits. // available permits.
private void adjustBufferSizeIfNeeded(final T record) throws InterruptedException { private void adjustBufferSizeIfNeeded(final T record) throws InterruptedException {
if (this.samplingRecordCounter.incrementAndGet() % RECORD_SAMPLING_RATE != 0) { if (this.samplingRecordCounter.incrementAndGet() % RECORD_SAMPLING_RATE != 0) {
return; return;
} }
final long recordSizeInBytes = SizeEstimator.estimate(record); final long recordSizeInBytes = SizeEstimator.estimate(record);
final long newAvgRecordSizeInBytes = final long newAvgRecordSizeInBytes = Math
Math.max(1, (avgRecordSizeInBytes * numSamples + recordSizeInBytes) / (numSamples + 1)); .max(1, (avgRecordSizeInBytes * numSamples + recordSizeInBytes) / (numSamples + 1));
final int newRateLimit = final int newRateLimit = (int) Math
(int) Math.min(RECORD_CACHING_LIMIT, Math.max(1, this.bufferMemoryLimit / newAvgRecordSizeInBytes)); .min(RECORD_CACHING_LIMIT, Math.max(1, this.bufferMemoryLimit / newAvgRecordSizeInBytes));
// System.out.println("recordSizeInBytes:" + recordSizeInBytes + ":newAvgRecordSizeInBytes:" + newAvgRecordSizeInBytes // System.out.println("recordSizeInBytes:" + recordSizeInBytes + ":newAvgRecordSizeInBytes:" +
// + ":newRateLimit:" + newRateLimit + ":currentRateLimit:" + currentRateLimit + ":numSamples:" + numSamples // newAvgRecordSizeInBytes
// + ":newRateLimit:" + newRateLimit + ":currentRateLimit:" + currentRateLimit +
// ":numSamples:" + numSamples
// + ":avgRecordSizeInBytes:" + avgRecordSizeInBytes); // + ":avgRecordSizeInBytes:" + avgRecordSizeInBytes);
// If there is any change in number of records to cache then we will either release (if it increased) or acquire // If there is any change in number of records to cache then we will either release (if it
// increased) or acquire
// (if it decreased) to adjust rate limiting to newly computed value. // (if it decreased) to adjust rate limiting to newly computed value.
if (newRateLimit > currentRateLimit) { if (newRateLimit > currentRateLimit) {
rateLimiter.release(newRateLimit - currentRateLimit); rateLimiter.release(newRateLimit - currentRateLimit);
@@ -120,12 +129,14 @@ public class BufferedIterator<K extends HoodieRecordPayload, T extends HoodieRec
numSamples++; numSamples++;
} }
// inserts record into internal buffer. It also fetches insert value from the record to offload computation work on to // inserts record into internal buffer. It also fetches insert value from the record to offload
// computation work on to
// buffering thread. // buffering thread.
private void insertRecord(T t) throws Exception { private void insertRecord(T t) throws Exception {
rateLimiter.acquire(); rateLimiter.acquire();
adjustBufferSizeIfNeeded(t); adjustBufferSizeIfNeeded(t);
// We are retrieving insert value in the record buffering thread to offload computation around schema validation // We are retrieving insert value in the record buffering thread to offload computation
// around schema validation
// and record creation to it. // and record creation to it.
final BufferedIteratorPayload<T> payload = new BufferedIteratorPayload<>(t, this.schema); final BufferedIteratorPayload<T> payload = new BufferedIteratorPayload<>(t, this.schema);
buffer.put(Optional.of(payload)); buffer.put(Optional.of(payload));
@@ -198,12 +209,15 @@ public class BufferedIterator<K extends HoodieRecordPayload, T extends HoodieRec
public void markAsFailed(Exception e) { public void markAsFailed(Exception e) {
this.hasFailed.set(e); this.hasFailed.set(e);
// release the permits so that if the buffering thread is waiting for permits then it will get it. // release the permits so that if the buffering thread is waiting for permits then it will
// get it.
this.rateLimiter.release(RECORD_CACHING_LIMIT + 1); this.rateLimiter.release(RECORD_CACHING_LIMIT + 1);
} }
// Used for caching HoodieRecord along with insertValue. We need this to offload computation work to buffering thread. // Used for caching HoodieRecord along with insertValue. We need this to offload computation
// work to buffering thread.
static class BufferedIteratorPayload<T extends HoodieRecord> { static class BufferedIteratorPayload<T extends HoodieRecord> {
public T record; public T record;
public Optional<IndexedRecord> insertValue; public Optional<IndexedRecord> insertValue;
// It caches the exception seen while fetching insert value. // It caches the exception seen while fetching insert value.
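The buffering logic above is essentially memory-bounded rate limiting: every RECORD_SAMPLING_RATE-th record is sized, the running average record size is updated, and the Semaphore's permits are grown or shrunk so that roughly bufferMemoryLimit bytes of records are in flight. The following is a self-contained sketch of just that permit-adjustment arithmetic; the 32 MB budget and the caller-supplied record sizes are assumptions, and the real code obtains sizes from Spark's SizeEstimator.

```java
import java.util.concurrent.Semaphore;

// Sketch of the memory-bounded rate limiting inside BufferedIterator.
public class BufferSizingSketch {
  static final int RECORD_CACHING_LIMIT = 128 * 1024;

  final Semaphore rateLimiter = new Semaphore(1);
  int currentRateLimit = 1;
  long avgRecordSizeInBytes = 0;
  long numSamples = 0;
  final long bufferMemoryLimit = 32 * 1024 * 1024; // assumed 32 MB budget

  void adjustBufferSize(long recordSizeInBytes) throws InterruptedException {
    long newAvg = Math.max(1,
        (avgRecordSizeInBytes * numSamples + recordSizeInBytes) / (numSamples + 1));
    int newRateLimit = (int) Math.min(RECORD_CACHING_LIMIT,
        Math.max(1, bufferMemoryLimit / newAvg));
    if (newRateLimit > currentRateLimit) {
      rateLimiter.release(newRateLimit - currentRateLimit);  // allow more records in flight
    } else if (newRateLimit < currentRateLimit) {
      rateLimiter.acquire(currentRateLimit - newRateLimit);  // shrink the in-flight window
    }
    currentRateLimit = newRateLimit;
    avgRecordSizeInBytes = newAvg;
    numSamples++;
  }

  public static void main(String[] args) throws InterruptedException {
    BufferSizingSketch sketch = new BufferSizingSketch();
    sketch.adjustBufferSize(512);   // small sampled record: permit count grows
    System.out.println(sketch.currentRateLimit);
    sketch.adjustBufferSize(4096);  // larger sample raises the average: window shrinks
    System.out.println(sketch.currentRateLimit);
  }
}
```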

View File

@@ -29,8 +29,8 @@ import org.apache.spark.api.java.function.Function2;
/** /**
* Map function that handles a sorted stream of HoodieRecords * Map function that handles a sorted stream of HoodieRecords
*/ */
public class BulkInsertMapFunction<T extends HoodieRecordPayload> public class BulkInsertMapFunction<T extends HoodieRecordPayload> implements
implements Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<List<WriteStatus>>> { Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<List<WriteStatus>>> {
private String commitTime; private String commitTime;
private HoodieWriteConfig config; private HoodieWriteConfig config;
@@ -45,8 +45,7 @@ public class BulkInsertMapFunction<T extends HoodieRecordPayload>
@Override @Override
public Iterator<List<WriteStatus>> call(Integer partition, public Iterator<List<WriteStatus>> call(Integer partition,
Iterator<HoodieRecord<T>> sortedRecordItr) Iterator<HoodieRecord<T>> sortedRecordItr) throws Exception {
throws Exception {
return new LazyInsertIterable<>(sortedRecordItr, config, commitTime, hoodieTable); return new LazyInsertIterable<>(sortedRecordItr, config, commitTime, hoodieTable);
} }
} }

View File

@@ -24,11 +24,6 @@ import com.uber.hoodie.exception.HoodieException;
import com.uber.hoodie.io.HoodieCreateHandle; import com.uber.hoodie.io.HoodieCreateHandle;
import com.uber.hoodie.io.HoodieIOHandle; import com.uber.hoodie.io.HoodieIOHandle;
import com.uber.hoodie.table.HoodieTable; import com.uber.hoodie.table.HoodieTable;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.TaskContext;
import org.apache.spark.TaskContext$;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashSet; import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
@@ -38,6 +33,10 @@ import java.util.Set;
import java.util.concurrent.ExecutorService; import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors; import java.util.concurrent.Executors;
import java.util.concurrent.Future; import java.util.concurrent.Future;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.TaskContext;
import org.apache.spark.TaskContext$;
/** /**
* Lazy Iterable, that writes a stream of HoodieRecords sorted by the partitionPath, into new * Lazy Iterable, that writes a stream of HoodieRecords sorted by the partitionPath, into new
@@ -68,21 +67,22 @@ public class LazyInsertIterable<T extends HoodieRecordPayload> extends
@Override @Override
protected List<WriteStatus> computeNext() { protected List<WriteStatus> computeNext() {
// Need to set current spark thread's TaskContext into newly launched thread so that new thread can access // Need to set current spark thread's TaskContext into newly launched thread so that new
// thread can access
// TaskContext properties. // TaskContext properties.
final TaskContext sparkThreadTaskContext = TaskContext.get(); final TaskContext sparkThreadTaskContext = TaskContext.get();
// Executor service used for launching writer thread. // Executor service used for launching writer thread.
final ExecutorService writerService = Executors.newFixedThreadPool(1); final ExecutorService writerService = Executors.newFixedThreadPool(1);
try { try {
// Used for buffering records which is controlled by HoodieWriteConfig#WRITE_BUFFER_LIMIT_BYTES. // Used for buffering records which is controlled by
final BufferedIterator<T, HoodieRecord<T>> bufferedIterator = // HoodieWriteConfig#WRITE_BUFFER_LIMIT_BYTES.
new BufferedIterator<>(inputItr, hoodieConfig.getWriteBufferLimitBytes(), final BufferedIterator<T, HoodieRecord<T>> bufferedIterator = new BufferedIterator<>(inputItr,
hoodieConfig.getWriteBufferLimitBytes(),
HoodieIOHandle.createHoodieWriteSchema(hoodieConfig)); HoodieIOHandle.createHoodieWriteSchema(hoodieConfig));
Future<List<WriteStatus>> writerResult = Future<List<WriteStatus>> writerResult = writerService.submit(() -> {
writerService.submit(
() -> {
logger.info("starting hoodie writer thread"); logger.info("starting hoodie writer thread");
// Passing parent thread's TaskContext to newly launched thread for it to access original TaskContext // Passing parent thread's TaskContext to newly launched thread for it to access original
// TaskContext
// properties. // properties.
TaskContext$.MODULE$.setTaskContext(sparkThreadTaskContext); TaskContext$.MODULE$.setTaskContext(sparkThreadTaskContext);
List<WriteStatus> statuses = new LinkedList<>(); List<WriteStatus> statuses = new LinkedList<>();
@@ -96,7 +96,8 @@ public class LazyInsertIterable<T extends HoodieRecordPayload> extends
throw e; throw e;
} }
}); });
// Buffering records into internal buffer. This can throw exception either if reading records from spark fails or // Buffering records into internal buffer. This can throw exception either if reading
// records from spark fails or
// if writing buffered records into parquet file fails. // if writing buffered records into parquet file fails.
bufferedIterator.startBuffering(); bufferedIterator.startBuffering();
logger.info("waiting for hoodie write to finish"); logger.info("waiting for hoodie write to finish");
@@ -110,28 +111,27 @@ public class LazyInsertIterable<T extends HoodieRecordPayload> extends
} }
} }
private List<WriteStatus> handleWrite(final BufferedIterator<T, HoodieRecord<T>> bufferedIterator) { private List<WriteStatus> handleWrite(
final BufferedIterator<T, HoodieRecord<T>> bufferedIterator) {
List<WriteStatus> statuses = new ArrayList<>(); List<WriteStatus> statuses = new ArrayList<>();
while (bufferedIterator.hasNext()) { while (bufferedIterator.hasNext()) {
final BufferedIterator.BufferedIteratorPayload<HoodieRecord<T>> payload = bufferedIterator.next(); final BufferedIterator.BufferedIteratorPayload<HoodieRecord<T>> payload = bufferedIterator
.next();
// clean up any partial failures // clean up any partial failures
if (!partitionsCleaned.contains(payload.record.getPartitionPath())) { if (!partitionsCleaned.contains(payload.record.getPartitionPath())) {
// This insert task could fail multiple times, but Spark will faithfully retry with // This insert task could fail multiple times, but Spark will faithfully retry with
// the same data again. Thus, before we open any files under a given partition, we // the same data again. Thus, before we open any files under a given partition, we
// first delete any files in the same partitionPath written by same Spark partition // first delete any files in the same partitionPath written by same Spark partition
HoodieIOHandle.cleanupTmpFilesFromCurrentCommit(hoodieConfig, HoodieIOHandle.cleanupTmpFilesFromCurrentCommit(hoodieConfig, commitTime,
commitTime, payload.record.getPartitionPath(), TaskContext.getPartitionId(), hoodieTable);
payload.record.getPartitionPath(),
TaskContext.getPartitionId(),
hoodieTable);
partitionsCleaned.add(payload.record.getPartitionPath()); partitionsCleaned.add(payload.record.getPartitionPath());
} }
// lazily initialize the handle, for the first time // lazily initialize the handle, for the first time
if (handle == null) { if (handle == null) {
handle = handle = new HoodieCreateHandle(hoodieConfig, commitTime, hoodieTable,
new HoodieCreateHandle(hoodieConfig, commitTime, hoodieTable, payload.record.getPartitionPath()); payload.record.getPartitionPath());
} }
if (handle.canWrite(payload.record)) { if (handle.canWrite(payload.record)) {
@@ -141,9 +141,10 @@ public class LazyInsertIterable<T extends HoodieRecordPayload> extends
// handle is full. // handle is full.
statuses.add(handle.close()); statuses.add(handle.close());
// Need to handle the rejected payload & open new handle // Need to handle the rejected payload & open new handle
handle = handle = new HoodieCreateHandle(hoodieConfig, commitTime, hoodieTable,
new HoodieCreateHandle(hoodieConfig, commitTime, hoodieTable, payload.record.getPartitionPath()); payload.record.getPartitionPath());
handle.write(payload.record, payload.insertValue, payload.exception); // we should be able to write 1 payload. handle.write(payload.record, payload.insertValue,
payload.exception); // we should be able to write 1 payload.
} }
} }
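LazyInsertIterable above splits the work between the Spark task thread, which buffers records, and a single-thread executor that drains the buffer and writes files (it also re-installs the Spark TaskContext on that writer thread). Below is a stripped-down sketch of that producer/consumer hand-off; the String payloads, queue size, and sentinel value are stand-ins for the real record buffering and HoodieCreateHandle writes.

```java
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.LinkedBlockingQueue;

// Sketch of the single-writer-thread pattern used by LazyInsertIterable.
public class WriterThreadSketch {
  public static void main(String[] args) throws Exception {
    BlockingQueue<String> buffer = new LinkedBlockingQueue<>(1024);
    ExecutorService writerService = Executors.newFixedThreadPool(1);

    Future<List<String>> writerResult = writerService.submit(() -> {
      List<String> statuses = new ArrayList<>();
      String record;
      while (!(record = buffer.take()).equals("<EOF>")) {
        statuses.add("wrote:" + record); // stand-in for HoodieCreateHandle.write(...)
      }
      return statuses;
    });

    for (int i = 0; i < 10; i++) {
      buffer.put("record-" + i);         // producer side: buffering records
    }
    buffer.put("<EOF>");                 // signal end of input

    System.out.println(writerResult.get());
    writerService.shutdown();
  }
}
```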

View File

@@ -23,9 +23,9 @@ import java.util.Iterator;
* inputItr classes in order to simplify the implementation of lazy iterators for mapPartitions use * inputItr classes in order to simplify the implementation of lazy iterators for mapPartitions use
* cases. Note [SPARK-3369], which gives the reasons for backwards compatibility with regard to the * cases. Note [SPARK-3369], which gives the reasons for backwards compatibility with regard to the
* iterable API despite Spark's single pass nature. * iterable API despite Spark's single pass nature.
* * <p>
* Provide a way to obtain an inputItr of type O (output), out of an inputItr of type I (input) * Provide a way to obtain an inputItr of type O (output), out of an inputItr of type I (input)
* * <p>
* Things to remember: - Assumes Spark calls hasNext() to check for elements, before calling next() * Things to remember: - Assumes Spark calls hasNext() to check for elements, before calling next()
* to obtain them - Assumes hasNext() gets called at least once. - Concrete Implementation is * to obtain them - Assumes hasNext() gets called at least once. - Concrete Implementation is
* responsible for calling inputIterator.next() and doing the processing in computeNext() * responsible for calling inputIterator.next() and doing the processing in computeNext()

View File

@@ -37,22 +37,30 @@ import org.apache.spark.api.java.JavaSparkContext;
*/ */
public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Serializable { public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Serializable {
protected transient JavaSparkContext jsc = null;
public enum IndexType {
HBASE,
INMEMORY,
BLOOM,
BUCKETED
}
protected final HoodieWriteConfig config; protected final HoodieWriteConfig config;
protected transient JavaSparkContext jsc = null;
protected HoodieIndex(HoodieWriteConfig config, JavaSparkContext jsc) { protected HoodieIndex(HoodieWriteConfig config, JavaSparkContext jsc) {
this.config = config; this.config = config;
this.jsc = jsc; this.jsc = jsc;
} }
public static <T extends HoodieRecordPayload> HoodieIndex<T> createIndex(HoodieWriteConfig config,
JavaSparkContext jsc) throws HoodieIndexException {
switch (config.getIndexType()) {
case HBASE:
return new HBaseIndex<>(config, jsc);
case INMEMORY:
return new InMemoryHashIndex<>(config, jsc);
case BLOOM:
return new HoodieBloomIndex<>(config, jsc);
case BUCKETED:
return new BucketedIndex<>(config, jsc);
default:
throw new HoodieIndexException("Index type unspecified, set " + config.getIndexType());
}
}
/** /**
* Checks if the given [Keys] exist in the hoodie table and returns [Key, Optional[FullFilePath]] * Checks if the given [Keys] exist in the hoodie table and returns [Key, Optional[FullFilePath]]
* If the optional FullFilePath value is not present, then the key is not found. If the * If the optional FullFilePath value is not present, then the key is not found. If the
@@ -71,7 +79,7 @@ public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Seri
/** /**
* Extracts the location of written records, and updates the index. * Extracts the location of written records, and updates the index.
* * <p>
* TODO(vc): We may need to propagate the record as well in a WriteStatus class * TODO(vc): We may need to propagate the record as well in a WriteStatus class
*/ */
public abstract JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD, public abstract JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD,
@@ -107,18 +115,7 @@ public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Seri
public abstract boolean isImplicitWithStorage(); public abstract boolean isImplicitWithStorage();
public static <T extends HoodieRecordPayload> HoodieIndex<T> createIndex( public enum IndexType {
HoodieWriteConfig config, JavaSparkContext jsc) throws HoodieIndexException { HBASE, INMEMORY, BLOOM, BUCKETED
switch (config.getIndexType()) {
case HBASE:
return new HBaseIndex<>(config, jsc);
case INMEMORY:
return new InMemoryHashIndex<>(config, jsc);
case BLOOM:
return new HoodieBloomIndex<>(config, jsc);
case BUCKETED:
return new BucketedIndex<>(config, jsc);
}
throw new HoodieIndexException("Index type unspecified, set " + config.getIndexType());
} }
} }
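The refactor above moves createIndex() next to the constructor and turns the previously unhandled fall-through into an explicit default branch that fails fast. Below is a small sketch of that enum-driven factory shape; the Dummy* classes are placeholders, not the real HBase, in-memory, bloom, or bucketed implementations.

```java
// Sketch of the enum-driven factory implemented by HoodieIndex.createIndex().
public class IndexFactorySketch {
  enum IndexType { HBASE, INMEMORY, BLOOM, BUCKETED }

  interface Index {}
  static class DummyBloomIndex implements Index {}
  static class DummyInMemoryIndex implements Index {}

  static Index createIndex(IndexType type) {
    switch (type) {
      case BLOOM:
        return new DummyBloomIndex();
      case INMEMORY:
        return new DummyInMemoryIndex();
      default:
        // Mirrors the new default branch above: unsupported types fail fast.
        throw new IllegalArgumentException("Index type unsupported: " + type);
    }
  }

  public static void main(String[] args) {
    System.out.println(createIndex(IndexType.BLOOM).getClass().getSimpleName());
  }
}
```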

View File

@@ -49,32 +49,11 @@ public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieInde
} }
@Override @Override
public JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation( public JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(JavaRDD<HoodieKey> hoodieKeys,
JavaRDD<HoodieKey> hoodieKeys, final HoodieTable<T> table) { final HoodieTable<T> table) {
throw new UnsupportedOperationException("InMemory index does not implement check exist yet"); throw new UnsupportedOperationException("InMemory index does not implement check exist yet");
} }
/**
* Function that tags each HoodieRecord with an existing location, if known.
*/
class LocationTagFunction
implements Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<HoodieRecord<T>>> {
@Override
public Iterator<HoodieRecord<T>> call(Integer partitionNum,
Iterator<HoodieRecord<T>> hoodieRecordIterator) {
List<HoodieRecord<T>> taggedRecords = new ArrayList<>();
while (hoodieRecordIterator.hasNext()) {
HoodieRecord<T> rec = hoodieRecordIterator.next();
if (recordLocationMap.containsKey(rec.getKey())) {
rec.setCurrentLocation(recordLocationMap.get(rec.getKey()));
}
taggedRecords.add(rec);
}
return taggedRecords.iterator();
}
}
@Override @Override
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD,
HoodieTable<T> hoodieTable) { HoodieTable<T> hoodieTable) {
@@ -132,4 +111,25 @@ public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieInde
public boolean isImplicitWithStorage() { public boolean isImplicitWithStorage() {
return false; return false;
} }
/**
* Function that tags each HoodieRecord with an existing location, if known.
*/
class LocationTagFunction implements
Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<HoodieRecord<T>>> {
@Override
public Iterator<HoodieRecord<T>> call(Integer partitionNum,
Iterator<HoodieRecord<T>> hoodieRecordIterator) {
List<HoodieRecord<T>> taggedRecords = new ArrayList<>();
while (hoodieRecordIterator.hasNext()) {
HoodieRecord<T> rec = hoodieRecordIterator.next();
if (recordLocationMap.containsKey(rec.getKey())) {
rec.setCurrentLocation(recordLocationMap.get(rec.getKey()));
}
taggedRecords.add(rec);
}
return taggedRecords.iterator();
}
}
} }

View File

@@ -64,8 +64,7 @@ public class BloomIndexFileInfo implements Serializable {
* Does the given key fall within the range (inclusive) * Does the given key fall within the range (inclusive)
*/ */
public boolean isKeyInRange(String recordKey) { public boolean isKeyInRange(String recordKey) {
return minRecordKey.compareTo(recordKey) <= 0 && return minRecordKey.compareTo(recordKey) <= 0 && maxRecordKey.compareTo(recordKey) >= 0;
maxRecordKey.compareTo(recordKey) >= 0;
} }
@Override @Override
@@ -78,9 +77,8 @@ public class BloomIndexFileInfo implements Serializable {
} }
BloomIndexFileInfo that = (BloomIndexFileInfo) o; BloomIndexFileInfo that = (BloomIndexFileInfo) o;
return Objects.equal(that.fileName, fileName) && return Objects.equal(that.fileName, fileName) && Objects.equal(that.minRecordKey, minRecordKey)
Objects.equal(that.minRecordKey, minRecordKey) && && Objects.equal(that.maxRecordKey, maxRecordKey);
Objects.equal(that.maxRecordKey, maxRecordKey);
} }
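isKeyInRange() above is what enables range pruning: a file only needs a bloom-filter and key check when the record key falls inside its [minRecordKey, maxRecordKey] interval. A tiny sketch with made-up, timestamp-prefixed keys:

```java
// Sketch of the record-key range pruning done via BloomIndexFileInfo.isKeyInRange().
public class KeyRangeSketch {
  static boolean isKeyInRange(String min, String max, String recordKey) {
    return min.compareTo(recordKey) <= 0 && max.compareTo(recordKey) >= 0;
  }

  public static void main(String[] args) {
    System.out.println(isKeyInRange("2018/01/01-000", "2018/01/01-999", "2018/01/01-042")); // true
    System.out.println(isKeyInRange("2018/01/01-000", "2018/01/01-999", "2018/01/02-001")); // false
  }
}
```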

View File

@@ -56,12 +56,11 @@ import scala.Tuple2;
*/ */
public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> { public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
private static Logger logger = LogManager.getLogger(HoodieBloomIndex.class);
// we need to limit the join such that it stays within 1.5GB per Spark partition. (SPARK-1476) // we need to limit the join such that it stays within 1.5GB per Spark partition. (SPARK-1476)
private static final int SPARK_MAXIMUM_BYTES_PER_PARTITION = 1500 * 1024 * 1024; private static final int SPARK_MAXIMUM_BYTES_PER_PARTITION = 1500 * 1024 * 1024;
// this is how much a triplet of (partitionPath, fileId, recordKey) costs. // this is how much a triplet of (partitionPath, fileId, recordKey) costs.
private static final int BYTES_PER_PARTITION_FILE_KEY_TRIPLET = 300; private static final int BYTES_PER_PARTITION_FILE_KEY_TRIPLET = 300;
private static Logger logger = LogManager.getLogger(HoodieBloomIndex.class);
private static int MAX_ITEMS_PER_SHUFFLE_PARTITION = private static int MAX_ITEMS_PER_SHUFFLE_PARTITION =
SPARK_MAXIMUM_BYTES_PER_PARTITION / BYTES_PER_PARTITION_FILE_KEY_TRIPLET; SPARK_MAXIMUM_BYTES_PER_PARTITION / BYTES_PER_PARTITION_FILE_KEY_TRIPLET;
@@ -108,27 +107,26 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
return taggedRecordRDD; return taggedRecordRDD;
} }
public JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation( public JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(JavaRDD<HoodieKey> hoodieKeys,
JavaRDD<HoodieKey> hoodieKeys, final HoodieTable<T> table) { final HoodieTable<T> table) {
JavaPairRDD<String, String> partitionRecordKeyPairRDD = JavaPairRDD<String, String> partitionRecordKeyPairRDD = hoodieKeys
hoodieKeys.mapToPair(key -> new Tuple2<>(key.getPartitionPath(), key.getRecordKey())); .mapToPair(key -> new Tuple2<>(key.getPartitionPath(), key.getRecordKey()));
// Lookup indexes for all the partition/recordkey pair // Lookup indexes for all the partition/recordkey pair
JavaPairRDD<String, String> rowKeyFilenamePairRDD = JavaPairRDD<String, String> rowKeyFilenamePairRDD = lookupIndex(partitionRecordKeyPairRDD,
lookupIndex(partitionRecordKeyPairRDD, table); table);
JavaPairRDD<String, HoodieKey> rowKeyHoodieKeyPairRDD = JavaPairRDD<String, HoodieKey> rowKeyHoodieKeyPairRDD = hoodieKeys
hoodieKeys.mapToPair(key -> new Tuple2<>(key.getRecordKey(), key)); .mapToPair(key -> new Tuple2<>(key.getRecordKey(), key));
return rowKeyHoodieKeyPairRDD.leftOuterJoin(rowKeyFilenamePairRDD) return rowKeyHoodieKeyPairRDD.leftOuterJoin(rowKeyFilenamePairRDD).mapToPair(keyPathTuple -> {
.mapToPair(keyPathTuple -> {
Optional<String> recordLocationPath; Optional<String> recordLocationPath;
if (keyPathTuple._2._2.isPresent()) { if (keyPathTuple._2._2.isPresent()) {
String fileName = keyPathTuple._2._2.get(); String fileName = keyPathTuple._2._2.get();
String partitionPath = keyPathTuple._2._1.getPartitionPath(); String partitionPath = keyPathTuple._2._1.getPartitionPath();
recordLocationPath = Optional.of(new Path( recordLocationPath = Optional
new Path(table.getMetaClient().getBasePath(), partitionPath), .of(new Path(new Path(table.getMetaClient().getBasePath(), partitionPath), fileName)
fileName).toUri().getPath()); .toUri().getPath());
} else { } else {
recordLocationPath = Optional.absent(); recordLocationPath = Optional.absent();
} }
@@ -152,21 +150,21 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo = fileInfoList.stream() final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo = fileInfoList.stream()
.collect(groupingBy(Tuple2::_1, mapping(Tuple2::_2, toList()))); .collect(groupingBy(Tuple2::_1, mapping(Tuple2::_2, toList())));
// Step 3: Obtain a RDD, for each incoming record, that already exists, with the file id, that contains it. // Step 3: Obtain a RDD, for each incoming record, that already exists, with the file id,
// that contains it.
int parallelism = autoComputeParallelism(recordsPerPartition, partitionToFileInfo, int parallelism = autoComputeParallelism(recordsPerPartition, partitionToFileInfo,
partitionRecordKeyPairRDD); partitionRecordKeyPairRDD);
return findMatchingFilesForRecordKeys(hoodieTable, partitionToFileInfo, return findMatchingFilesForRecordKeys(hoodieTable, partitionToFileInfo,
partitionRecordKeyPairRDD, partitionRecordKeyPairRDD, parallelism);
parallelism);
} }
/** /**
* The index lookup can be skewed in three dimensions: #files, #partitions, #records * The index lookup can be skewed in three dimensions: #files, #partitions, #records
* * <p>
* To be able to smoothly handle skews, we need to compute how to split each partitions into * To be able to smoothly handle skews, we need to compute how to split each partitions into
* subpartitions. We do it here, in a way that keeps the amount of each Spark join partition to < * subpartitions. We do it here, in a way that keeps the amount of each Spark join partition to <
* 2GB. * 2GB.
* * <p>
* If {@link com.uber.hoodie.config.HoodieIndexConfig#BLOOM_INDEX_PARALLELISM_PROP} is specified * If {@link com.uber.hoodie.config.HoodieIndexConfig#BLOOM_INDEX_PARALLELISM_PROP} is specified
* as a NON-zero number, then that is used explicitly. * as a NON-zero number, then that is used explicitly.
*/ */
@@ -184,7 +182,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
// records for a partition. // records for a partition.
Map<String, Long> filesPerPartition = partitionToFileInfo.entrySet().stream() Map<String, Long> filesPerPartition = partitionToFileInfo.entrySet().stream()
.collect(Collectors.toMap(Map.Entry::getKey, e -> Long.valueOf(e.getValue().size()))); .collect(Collectors.toMap(Map.Entry::getKey, e -> Long.valueOf(e.getValue().size())));
long totalFiles = 0, totalRecords = 0; long totalFiles = 0;
long totalRecords = 0;
for (String partitionPath : recordsPerPartition.keySet()) { for (String partitionPath : recordsPerPartition.keySet()) {
long numRecords = recordsPerPartition.get(partitionPath); long numRecords = recordsPerPartition.get(partitionPath);
long numFiles = long numFiles =
@@ -210,22 +209,22 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
/** /**
* It's crucial to pick the right parallelism. * It's crucial to pick the right parallelism.
* * <p>
* totalSubPartitions : this is deemed safe limit, to be nice with Spark. inputParallelism : * totalSubPartitions : this is deemed safe limit, to be nice with Spark. inputParallelism :
* typically number of input file splits * typically number of input file splits
* * <p>
* We pick the max such that we are always safe, but go higher if, say, there are a lot of input * We pick the max such that we are always safe, but go higher if, say, there are a lot of input
* files. (otherwise, we will fallback to number of partitions in input and end up with slow * files. (otherwise, we will fallback to number of partitions in input and end up with slow
* performance) * performance)
*/ */
private int determineParallelism(int inputParallelism, int totalSubPartitions) { private int determineParallelism(int inputParallelism, int totalSubPartitions) {
// If bloom index parallelism is set, use it to check against the input parallelism and take the max // If bloom index parallelism is set, use it to check against the input parallelism and
// take the max
int indexParallelism = Math.max(inputParallelism, config.getBloomIndexParallelism()); int indexParallelism = Math.max(inputParallelism, config.getBloomIndexParallelism());
int joinParallelism = Math.max(totalSubPartitions, indexParallelism); int joinParallelism = Math.max(totalSubPartitions, indexParallelism);
logger.info("InputParallelism: ${" + inputParallelism + "}, " + logger.info("InputParallelism: ${" + inputParallelism + "}, " + "IndexParallelism: ${" + config
"IndexParallelism: ${" + config.getBloomIndexParallelism() + "}, " + .getBloomIndexParallelism() + "}, " + "TotalSubParts: ${" + totalSubPartitions + "}, "
"TotalSubParts: ${" + totalSubPartitions + "}, " + + "Join Parallelism set to : " + joinParallelism);
"Join Parallelism set to : " + joinParallelism);
return joinParallelism; return joinParallelism;
} }
@@ -237,29 +236,24 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
final HoodieTable<T> hoodieTable) { final HoodieTable<T> hoodieTable) {
// Obtain the latest data files from all the partitions. // Obtain the latest data files from all the partitions.
List<Tuple2<String, HoodieDataFile>> dataFilesList = jsc List<Tuple2<String, HoodieDataFile>> dataFilesList = jsc
.parallelize(partitions, Math.max(partitions.size(), 1)) .parallelize(partitions, Math.max(partitions.size(), 1)).flatMapToPair(partitionPath -> {
.flatMapToPair(partitionPath -> { java.util.Optional<HoodieInstant> latestCommitTime = hoodieTable.getCommitsTimeline()
java.util.Optional<HoodieInstant> latestCommitTime = .filterCompletedInstants().lastInstant();
hoodieTable.getCommitsTimeline().filterCompletedInstants().lastInstant();
List<Tuple2<String, HoodieDataFile>> filteredFiles = new ArrayList<>(); List<Tuple2<String, HoodieDataFile>> filteredFiles = new ArrayList<>();
if (latestCommitTime.isPresent()) { if (latestCommitTime.isPresent()) {
filteredFiles = filteredFiles = hoodieTable.getROFileSystemView()
hoodieTable.getROFileSystemView().getLatestDataFilesBeforeOrOn(partitionPath, .getLatestDataFilesBeforeOrOn(partitionPath, latestCommitTime.get().getTimestamp())
latestCommitTime.get().getTimestamp()) .map(f -> new Tuple2<>(partitionPath, f)).collect(toList());
.map(f -> new Tuple2<>(partitionPath, f))
.collect(toList());
} }
return filteredFiles.iterator(); return filteredFiles.iterator();
}).collect(); }).collect();
if (config.getBloomIndexPruneByRanges()) { if (config.getBloomIndexPruneByRanges()) {
// also obtain file ranges, if range pruning is enabled // also obtain file ranges, if range pruning is enabled
return jsc.parallelize(dataFilesList, Math.max(dataFilesList.size(), 1)) return jsc.parallelize(dataFilesList, Math.max(dataFilesList.size(), 1)).mapToPair(ft -> {
.mapToPair(ft -> {
try { try {
String[] minMaxKeys = ParquetUtils String[] minMaxKeys = ParquetUtils
.readMinMaxRecordKeys(hoodieTable.getHadoopConf(), .readMinMaxRecordKeys(hoodieTable.getHadoopConf(), ft._2().getFileStatus().getPath());
ft._2().getFileStatus().getPath());
return new Tuple2<>(ft._1(), return new Tuple2<>(ft._1(),
new BloomIndexFileInfo(ft._2().getFileName(), minMaxKeys[0], minMaxKeys[1])); new BloomIndexFileInfo(ft._2().getFileName(), minMaxKeys[0], minMaxKeys[1]));
} catch (MetadataNotFoundException me) { } catch (MetadataNotFoundException me) {
@@ -320,21 +314,20 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
* (e.g: timestamp as prefix), the number of files to be compared gets cut down a lot from range * (e.g: timestamp as prefix), the number of files to be compared gets cut down a lot from range
* pruning. * pruning.
*/ */
// sub-partition to ensure the records can be looked up against files & also prune file<=>record comparisons based on recordKey // sub-partition to ensure the records can be looked up against files & also prune
// file<=>record comparisons based on recordKey
// ranges in the index info. // ranges in the index info.
@VisibleForTesting @VisibleForTesting
JavaPairRDD<String, Tuple2<String, HoodieKey>> explodeRecordRDDWithFileComparisons( JavaPairRDD<String, Tuple2<String, HoodieKey>> explodeRecordRDDWithFileComparisons(
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo, final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
JavaPairRDD<String, String> partitionRecordKeyPairRDD) { JavaPairRDD<String, String> partitionRecordKeyPairRDD) {
return partitionRecordKeyPairRDD return partitionRecordKeyPairRDD.map(partitionRecordKeyPair -> {
.map(partitionRecordKeyPair -> {
String recordKey = partitionRecordKeyPair._2(); String recordKey = partitionRecordKeyPair._2();
String partitionPath = partitionRecordKeyPair._1(); String partitionPath = partitionRecordKeyPair._1();
List<BloomIndexFileInfo> indexInfos = partitionToFileIndexInfo.get(partitionPath); List<BloomIndexFileInfo> indexInfos = partitionToFileIndexInfo.get(partitionPath);
List<Tuple2<String, Tuple2<String, HoodieKey>>> recordComparisons = new ArrayList<>(); List<Tuple2<String, Tuple2<String, HoodieKey>>> recordComparisons = new ArrayList<>();
if (indexInfos if (indexInfos != null) { // could be null, if there are no files in a given partition yet.
!= null) { // could be null, if there are no files in a given partition yet.
// for each candidate file in partition, that needs to be compared. // for each candidate file in partition, that needs to be compared.
for (BloomIndexFileInfo indexInfo : indexInfos) { for (BloomIndexFileInfo indexInfo : indexInfos) {
if (shouldCompareWithFile(indexInfo, recordKey)) { if (shouldCompareWithFile(indexInfo, recordKey)) {
@@ -346,35 +339,34 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
} }
} }
return recordComparisons; return recordComparisons;
}) }).flatMapToPair(t -> t.iterator());
.flatMapToPair(t -> t.iterator());
} }
/** /**
* Find out the <RowKey, filename> pairs. All workload is grouped at the file level. * Find out the <RowKey, filename> pairs. All workload is grouped at the file level.
* * <p>
* Join PairRDD(PartitionPath, RecordKey) and PairRDD(PartitionPath, File) & then repartition such * Join PairRDD(PartitionPath, RecordKey) and PairRDD(PartitionPath, File) & then repartition such
* that each RDD partition is a file, then for each file, we do (1) load bloom filter, (2) load * that each RDD partition is a file, then for each file, we do (1) load bloom filter, (2) load
* rowKeys, (3) Tag rowKey * rowKeys, (3) Tag rowKey
* * <p>
* Make sure the parallelism is atleast the groupby parallelism for tagging location * Make sure the parallelism is atleast the groupby parallelism for tagging location
*/ */
@VisibleForTesting @VisibleForTesting
JavaPairRDD<String, String> findMatchingFilesForRecordKeys(HoodieTable hoodieTable, JavaPairRDD<String, String> findMatchingFilesForRecordKeys(HoodieTable hoodieTable,
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo, final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
JavaPairRDD<String, String> partitionRecordKeyPairRDD, JavaPairRDD<String, String> partitionRecordKeyPairRDD, int totalSubpartitions) {
int totalSubpartitions) {
int joinParallelism = determineParallelism(partitionRecordKeyPairRDD.partitions().size(), int joinParallelism = determineParallelism(partitionRecordKeyPairRDD.partitions().size(),
totalSubpartitions); totalSubpartitions);
JavaPairRDD<String, Tuple2<String, HoodieKey>> fileSortedTripletRDD = explodeRecordRDDWithFileComparisons( JavaPairRDD<String, Tuple2<String, HoodieKey>> fileSortedTripletRDD =
explodeRecordRDDWithFileComparisons(
partitionToFileIndexInfo, partitionRecordKeyPairRDD) partitionToFileIndexInfo, partitionRecordKeyPairRDD)
// sort further based on filename, such that all checking for the file can happen within a single partition, on-the-fly // sort further based on filename, such that all checking for the file can happen within
// a single partition, on-the-fly
.sortByKey(true, joinParallelism); .sortByKey(true, joinParallelism);
return fileSortedTripletRDD return fileSortedTripletRDD.mapPartitionsWithIndex(
.mapPartitionsWithIndex(
new HoodieBloomIndexCheckFunction(hoodieTable, config.getBasePath()), true) new HoodieBloomIndexCheckFunction(hoodieTable, config.getBasePath()), true)
.flatMap(indexLookupResults -> indexLookupResults.iterator()) .flatMap(indexLookupResults -> indexLookupResults.iterator())
.filter(lookupResult -> lookupResult.getMatchingRecordKeys().size() > 0) .filter(lookupResult -> lookupResult.getMatchingRecordKeys().size() > 0)
@@ -391,14 +383,13 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
* Tag the <rowKey, filename> back to the original HoodieRecord RDD. * Tag the <rowKey, filename> back to the original HoodieRecord RDD.
*/ */
private JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords( private JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(
JavaPairRDD<String, String> rowKeyFilenamePairRDD, JavaPairRDD<String, String> rowKeyFilenamePairRDD, JavaRDD<HoodieRecord<T>> recordRDD) {
JavaRDD<HoodieRecord<T>> recordRDD) {
JavaPairRDD<String, HoodieRecord<T>> rowKeyRecordPairRDD = recordRDD JavaPairRDD<String, HoodieRecord<T>> rowKeyRecordPairRDD = recordRDD
.mapToPair(record -> new Tuple2<>(record.getRecordKey(), record)); .mapToPair(record -> new Tuple2<>(record.getRecordKey(), record));
// Here as the recordRDD might have more data than rowKeyRDD (some rowKeys' fileId is null), so we do left outer join. // Here as the recordRDD might have more data than rowKeyRDD (some rowKeys' fileId is null),
return rowKeyRecordPairRDD.leftOuterJoin(rowKeyFilenamePairRDD).values().map( // so we do left outer join.
v1 -> { return rowKeyRecordPairRDD.leftOuterJoin(rowKeyFilenamePairRDD).values().map(v1 -> {
HoodieRecord<T> record = v1._1(); HoodieRecord<T> record = v1._1();
if (v1._2().isPresent()) { if (v1._2().isPresent()) {
String filename = v1._2().get(); String filename = v1._2().get();
@@ -408,8 +399,7 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
} }
} }
return record; return record;
} });
);
} }
@Override @Override
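determineParallelism() above combines three inputs: the parallelism of the incoming RDD, the configured bloom index parallelism, and the number of sub-partitions needed to keep each Spark join partition under roughly 1.5 GB at about 300 bytes per (partition, file, key) triplet. The sketch below walks through that arithmetic with made-up counts; the sub-partition estimate is a simplification of autoComputeParallelism, whose full body is not shown in this diff.

```java
// Sketch of the join-parallelism arithmetic in HoodieBloomIndex.
public class BloomParallelismSketch {
  static final int SPARK_MAXIMUM_BYTES_PER_PARTITION = 1500 * 1024 * 1024;
  static final int BYTES_PER_PARTITION_FILE_KEY_TRIPLET = 300;
  static final long MAX_ITEMS_PER_SHUFFLE_PARTITION =
      SPARK_MAXIMUM_BYTES_PER_PARTITION / BYTES_PER_PARTITION_FILE_KEY_TRIPLET;

  public static void main(String[] args) {
    long totalComparisons = 200_000_000L;  // assumed (record, file) comparisons to shuffle
    int inputParallelism = 500;            // assumed number of input splits
    int configuredBloomParallelism = 0;    // 0 means "auto" in the real config

    int totalSubPartitions =
        (int) Math.ceil((double) totalComparisons / MAX_ITEMS_PER_SHUFFLE_PARTITION);
    int indexParallelism = Math.max(inputParallelism, configuredBloomParallelism);
    int joinParallelism = Math.max(totalSubPartitions, indexParallelism);

    System.out.println("sub-partitions needed: " + totalSubPartitions
        + ", join parallelism: " + joinParallelism);
  }
}
```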

View File

@@ -41,7 +41,8 @@ import scala.Tuple2;
* actual files * actual files
*/ */
public class HoodieBloomIndexCheckFunction implements public class HoodieBloomIndexCheckFunction implements
Function2<Integer, Iterator<Tuple2<String, Tuple2<String, HoodieKey>>>, Iterator<List<IndexLookupResult>>> { Function2<Integer, Iterator<Tuple2<String, Tuple2<String, HoodieKey>>>,
Iterator<List<IndexLookupResult>>> {
private static Logger logger = LogManager.getLogger(HoodieBloomIndexCheckFunction.class); private static Logger logger = LogManager.getLogger(HoodieBloomIndexCheckFunction.class);
@@ -58,8 +59,7 @@ public class HoodieBloomIndexCheckFunction implements
* Given a list of row keys and one file, return only row keys existing in that file. * Given a list of row keys and one file, return only row keys existing in that file.
*/ */
public static List<String> checkCandidatesAgainstFile(Configuration configuration, public static List<String> checkCandidatesAgainstFile(Configuration configuration,
List<String> candidateRecordKeys, List<String> candidateRecordKeys, Path filePath) throws HoodieIndexException {
Path filePath) throws HoodieIndexException {
List<String> foundRecordKeys = new ArrayList<>(); List<String> foundRecordKeys = new ArrayList<>();
try { try {
// Load all rowKeys from the file, to double-confirm // Load all rowKeys from the file, to double-confirm
@@ -86,6 +86,13 @@ public class HoodieBloomIndexCheckFunction implements
return foundRecordKeys; return foundRecordKeys;
} }
@Override
public Iterator<List<IndexLookupResult>> call(Integer partition,
Iterator<Tuple2<String, Tuple2<String, HoodieKey>>> fileParitionRecordKeyTripletItr)
throws Exception {
return new LazyKeyCheckIterator(fileParitionRecordKeyTripletItr);
}
class LazyKeyCheckIterator extends class LazyKeyCheckIterator extends
LazyIterableIterator<Tuple2<String, Tuple2<String, HoodieKey>>, List<IndexLookupResult>> { LazyIterableIterator<Tuple2<String, Tuple2<String, HoodieKey>>, List<IndexLookupResult>> {
@@ -143,7 +150,8 @@ public class HoodieBloomIndexCheckFunction implements
// if continue on current file) // if continue on current file)
if (fileName.equals(currentFile)) { if (fileName.equals(currentFile)) {
// check record key against bloom filter of current file & add to possible keys if needed // check record key against bloom filter of current file & add to possible keys if
// needed
if (bloomFilter.mightContain(recordKey)) { if (bloomFilter.mightContain(recordKey)) {
if (logger.isDebugEnabled()) { if (logger.isDebugEnabled()) {
logger.debug("#1 Adding " + recordKey + " as candidate for file " + fileName); logger.debug("#1 Adding " + recordKey + " as candidate for file " + fileName);
@@ -201,12 +209,4 @@ public class HoodieBloomIndexCheckFunction implements
protected void end() { protected void end() {
} }
} }
@Override
public Iterator<List<IndexLookupResult>> call(Integer partition,
Iterator<Tuple2<String, Tuple2<String, HoodieKey>>> fileParitionRecordKeyTripletItr)
throws Exception {
return new LazyKeyCheckIterator(fileParitionRecordKeyTripletItr);
}
} }
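The check function above works in two steps per file: the bloom filter cheaply rejects keys that definitely are not present, and the surviving candidates are then confirmed against the actual record keys read from the file, because bloom filters can return false positives. Below is a sketch of that two-step check using Guava's BloomFilter and an in-memory key set as stand-ins for Hoodie's filter and the Parquet key scan.

```java
import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

// Sketch of the "bloom filter first, then confirm against file contents" check.
public class BloomCheckSketch {
  public static void main(String[] args) {
    Set<String> keysInFile = new HashSet<>(Arrays.asList("key-1", "key-2", "key-3"));
    BloomFilter<String> filter =
        BloomFilter.create(Funnels.stringFunnel(StandardCharsets.UTF_8), 1000, 0.01);
    keysInFile.forEach(filter::put);

    List<String> incoming = Arrays.asList("key-2", "key-9", "key-3");
    List<String> found = new ArrayList<>();
    for (String recordKey : incoming) {
      if (filter.mightContain(recordKey)        // cheap check, may be a false positive
          && keysInFile.contains(recordKey)) {  // authoritative check against file contents
        found.add(recordKey);
      }
    }
    System.out.println(found); // [key-2, key-3]
  }
}
```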

View File

@@ -38,9 +38,9 @@ import scala.Tuple2;
/** /**
* A `stateless` index implementation that uses a deterministic mapping function to determine * A `stateless` index implementation that uses a deterministic mapping function to determine
* the fileID for a given record. * the fileID for a given record.
* * <p>
* Pros: - Fast * Pros: - Fast
* * <p>
* Cons : - Need to tune the number of buckets per partition path manually (FIXME: Need to autotune * Cons : - Need to tune the number of buckets per partition path manually (FIXME: Need to autotune
* this) - Could increase write amplification on copy-on-write storage since inserts always rewrite * this) - Could increase write amplification on copy-on-write storage since inserts always rewrite
* files - Not global. * files - Not global.

View File

@@ -27,12 +27,16 @@ import com.uber.hoodie.common.model.HoodieRecordLocation;
import com.uber.hoodie.common.model.HoodieRecordPayload; import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.config.HoodieIndexConfig;
import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieDependentSystemUnavailableException; import com.uber.hoodie.exception.HoodieDependentSystemUnavailableException;
import com.uber.hoodie.exception.HoodieIndexException; import com.uber.hoodie.exception.HoodieIndexException;
import com.uber.hoodie.index.HoodieIndex; import com.uber.hoodie.index.HoodieIndex;
import com.uber.hoodie.table.HoodieTable; import com.uber.hoodie.table.HoodieTable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.TableName;
@@ -51,23 +55,18 @@ import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2; import org.apache.spark.api.java.function.Function2;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
/** /**
* Hoodie Index implementation backed by HBase * Hoodie Index implementation backed by HBase
*/ */
public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> { public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
private final static byte[] SYSTEM_COLUMN_FAMILY = Bytes.toBytes("_s");
private final static byte[] COMMIT_TS_COLUMN = Bytes.toBytes("commit_ts"); private static final byte[] SYSTEM_COLUMN_FAMILY = Bytes.toBytes("_s");
private final static byte[] FILE_NAME_COLUMN = Bytes.toBytes("file_name"); private static final byte[] COMMIT_TS_COLUMN = Bytes.toBytes("commit_ts");
private final static byte[] PARTITION_PATH_COLUMN = Bytes.toBytes("partition_path"); private static final byte[] FILE_NAME_COLUMN = Bytes.toBytes("file_name");
private static final byte[] PARTITION_PATH_COLUMN = Bytes.toBytes("partition_path");
private static Logger logger = LogManager.getLogger(HBaseIndex.class); private static Logger logger = LogManager.getLogger(HBaseIndex.class);
private static Connection hbaseConnection = null;
private final String tableName; private final String tableName;
public HBaseIndex(HoodieWriteConfig config, JavaSparkContext jsc) { public HBaseIndex(HoodieWriteConfig config, JavaSparkContext jsc) {
@@ -77,14 +76,12 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
} }
@Override @Override
public JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation( public JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(JavaRDD<HoodieKey> hoodieKeys,
JavaRDD<HoodieKey> hoodieKeys, HoodieTable<T> table) { HoodieTable<T> table) {
//TODO : Change/Remove filterExists in HoodieReadClient() and revisit //TODO : Change/Remove filterExists in HoodieReadClient() and revisit
throw new UnsupportedOperationException("HBase index does not implement check exist"); throw new UnsupportedOperationException("HBase index does not implement check exist");
} }
private static Connection hbaseConnection = null;
private Connection getHBaseConnection() { private Connection getHBaseConnection() {
Configuration hbaseConfig = HBaseConfiguration.create(); Configuration hbaseConfig = HBaseConfiguration.create();
String quorum = config.getHbaseZkQuorum(); String quorum = config.getHbaseZkQuorum();
@@ -100,8 +97,8 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
} }
/** /**
* Since we are sharing the HbaseConnection across tasks in a JVM, make sure the HbaseConnection is closed when * Since we are sharing the HbaseConnection across tasks in a JVM, make sure the HbaseConnection is
* JVM exits * closed when JVM exits
*/ */
private void addShutDownHook() { private void addShutDownHook() {
Runtime.getRuntime().addShutdownHook(new Thread() { Runtime.getRuntime().addShutdownHook(new Thread() {
@@ -126,10 +123,11 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
HoodieTimeline commitTimeline = hoodieTable.getCompletedCommitTimeline(); HoodieTimeline commitTimeline = hoodieTable.getCompletedCommitTimeline();
// Check if the last commit ts for this row is 1) present in the timeline or // Check if the last commit ts for this row is 1) present in the timeline or
// 2) is less than the first commit ts in the timeline // 2) is less than the first commit ts in the timeline
return !commitTimeline.empty() && (commitTimeline.containsInstant( return !commitTimeline.empty() && (commitTimeline
new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTs)) || .containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTs))
HoodieTimeline.compareTimestamps(commitTimeline.firstInstant().get().getTimestamp(), || HoodieTimeline
commitTs, HoodieTimeline.GREATER)); .compareTimestamps(commitTimeline.firstInstant().get().getTimestamp(), commitTs,
HoodieTimeline.GREATER));
} }
/** /**
@@ -171,16 +169,17 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
HoodieRecord currentRecord = currentBatchOfRecords.remove(0); HoodieRecord currentRecord = currentBatchOfRecords.remove(0);
if (result.getRow() != null) { if (result.getRow() != null) {
String keyFromResult = Bytes.toString(result.getRow()); String keyFromResult = Bytes.toString(result.getRow());
String commitTs = String commitTs = Bytes
Bytes.toString(result.getValue(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN)); .toString(result.getValue(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN));
String fileId = String fileId = Bytes
Bytes.toString(result.getValue(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN)); .toString(result.getValue(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN));
String partitionPath = String partitionPath = Bytes
Bytes.toString(result.getValue(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN)); .toString(result.getValue(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN));
if (checkIfValidCommit(hoodieTable, commitTs)) { if (checkIfValidCommit(hoodieTable, commitTs)) {
currentRecord = new HoodieRecord(new HoodieKey(currentRecord.getRecordKey(), currentRecord = new HoodieRecord(
partitionPath), currentRecord.getData()); new HoodieKey(currentRecord.getRecordKey(), partitionPath),
currentRecord.getData());
currentRecord.setCurrentLocation(new HoodieRecordLocation(commitTs, fileId)); currentRecord.setCurrentLocation(new HoodieRecordLocation(commitTs, fileId));
taggedRecords.add(currentRecord); taggedRecords.add(currentRecord);
// the key from Result and the key being processed should be same // the key from Result and the key being processed should be same
@@ -217,10 +216,10 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
return recordRDD.mapPartitionsWithIndex(locationTagFunction(hoodieTable), true); return recordRDD.mapPartitionsWithIndex(locationTagFunction(hoodieTable), true);
} }
private Function2<Integer, Iterator<WriteStatus>, Iterator<WriteStatus>> updateLocationFunction() { private Function2<Integer, Iterator<WriteStatus>, Iterator<WriteStatus>>
updateLocationFunction() {
return (Function2<Integer, Iterator<WriteStatus>, Iterator<WriteStatus>>) (partition, statusIterator) -> { return (Function2<Integer, Iterator<WriteStatus>, Iterator<WriteStatus>>) (partition,
statusIterator) -> {
Integer multiPutBatchSize = config.getHbaseIndexPutBatchSize(); Integer multiPutBatchSize = config.getHbaseIndexPutBatchSize();
List<WriteStatus> writeStatusList = new ArrayList<>(); List<WriteStatus> writeStatusList = new ArrayList<>();
@@ -292,12 +291,9 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
/** /**
* Helper method to facilitate performing puts and deletes in Hbase * Helper method to facilitate performing puts and deletes in Hbase
* @param hTable
* @param puts
* @param deletes
* @throws IOException
*/ */
private void doPutsAndDeletes(HTable hTable, List<Put> puts, List<Delete> deletes) throws IOException { private void doPutsAndDeletes(HTable hTable, List<Put> puts, List<Delete> deletes)
throws IOException {
if (puts.size() > 0) { if (puts.size() > 0) {
hTable.put(puts); hTable.put(puts);
} }
@@ -323,7 +319,6 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
/** /**
* Only looks up by recordKey * Only looks up by recordKey
* @return
*/ */
@Override @Override
public boolean isGlobal() { public boolean isGlobal() {
@@ -332,7 +327,6 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
/** /**
* Mapping is available in HBase already. * Mapping is available in HBase already.
* @return
*/ */
@Override @Override
public boolean canIndexLogFiles() { public boolean canIndexLogFiles() {
@@ -341,7 +335,6 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
/** /**
* Index needs to be explicitly updated after storage write. * Index needs to be explicitly updated after storage write.
* @return
*/ */
@Override @Override
public boolean isImplicitWithStorage() { public boolean isImplicitWithStorage() {

View File

@@ -37,14 +37,6 @@ import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieAppendException; import com.uber.hoodie.exception.HoodieAppendException;
import com.uber.hoodie.exception.HoodieUpsertException; import com.uber.hoodie.exception.HoodieUpsertException;
import com.uber.hoodie.table.HoodieTable; import com.uber.hoodie.table.HoodieTable;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.TaskContext;
import org.apache.spark.util.SizeEstimator;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Comparator; import java.util.Comparator;
@@ -53,6 +45,13 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Optional; import java.util.Optional;
import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicLong;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.TaskContext;
import org.apache.spark.util.SizeEstimator;
/** /**
* IO Operation to append data onto an existing file. * IO Operation to append data onto an existing file.
@@ -61,14 +60,13 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOH
private static Logger logger = LogManager.getLogger(HoodieAppendHandle.class); private static Logger logger = LogManager.getLogger(HoodieAppendHandle.class);
private static AtomicLong recordIndex = new AtomicLong(1); private static AtomicLong recordIndex = new AtomicLong(1);
private TableFileSystemView.RealtimeView fileSystemView;
private final WriteStatus writeStatus; private final WriteStatus writeStatus;
private final String fileId; private final String fileId;
private String partitionPath;
private Iterator<HoodieRecord<T>> recordItr;
List<IndexedRecord> recordList = new ArrayList<>(); List<IndexedRecord> recordList = new ArrayList<>();
List<String> keysToDelete = new ArrayList<>(); List<String> keysToDelete = new ArrayList<>();
private TableFileSystemView.RealtimeView fileSystemView;
private String partitionPath;
private Iterator<HoodieRecord<T>> recordItr;
private long recordsWritten = 0; private long recordsWritten = 0;
private long recordsDeleted = 0; private long recordsDeleted = 0;
private long averageRecordSize = 0; private long averageRecordSize = 0;
@@ -76,11 +74,8 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOH
private Writer writer; private Writer writer;
private boolean doInit = true; private boolean doInit = true;
public HoodieAppendHandle(HoodieWriteConfig config, public HoodieAppendHandle(HoodieWriteConfig config, String commitTime, HoodieTable<T> hoodieTable,
String commitTime, String fileId, Iterator<HoodieRecord<T>> recordItr) {
HoodieTable<T> hoodieTable,
String fileId,
Iterator<HoodieRecord<T>> recordItr) {
super(config, commitTime, hoodieTable); super(config, commitTime, hoodieTable);
WriteStatus writeStatus = ReflectionUtils.loadClass(config.getWriteStatusClassName()); WriteStatus writeStatus = ReflectionUtils.loadClass(config.getWriteStatusClassName());
writeStatus.setStat(new HoodieDeltaWriteStat()); writeStatus.setStat(new HoodieDeltaWriteStat());
@@ -94,8 +89,8 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOH
// extract some information from the first record // extract some information from the first record
FileSlice fileSlice = fileSystemView.getLatestFileSlices(partitionPath) FileSlice fileSlice = fileSystemView.getLatestFileSlices(partitionPath)
.filter(fileSlice1 -> fileSlice1.getDataFile().get().getFileId().equals(fileId)) .filter(fileSlice1 -> fileSlice1.getDataFile().get().getFileId().equals(fileId)).findFirst()
.findFirst().get(); .get();
// HACK(vc) This also assumes a base file. It will break, if appending without one. // HACK(vc) This also assumes a base file. It will break, if appending without one.
String latestValidFilePath = fileSlice.getDataFile().get().getFileName(); String latestValidFilePath = fileSlice.getDataFile().get().getFileName();
String baseCommitTime = FSUtils.getCommitTime(latestValidFilePath); String baseCommitTime = FSUtils.getCommitTime(latestValidFilePath);
@@ -108,23 +103,21 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOH
try { try {
this.writer = HoodieLogFormat.newWriterBuilder() this.writer = HoodieLogFormat.newWriterBuilder()
.onParentPath(new Path(hoodieTable.getMetaClient().getBasePath(), partitionPath)) .onParentPath(new Path(hoodieTable.getMetaClient().getBasePath(), partitionPath))
.withFileId(fileId).overBaseCommit(baseCommitTime).withLogVersion(fileSlice.getLogFiles() .withFileId(fileId).overBaseCommit(baseCommitTime).withLogVersion(
.map(logFile -> logFile.getLogVersion()) fileSlice.getLogFiles().map(logFile -> logFile.getLogVersion())
.max(Comparator.naturalOrder()).orElse(HoodieLogFile.LOGFILE_BASE_VERSION)) .max(Comparator.naturalOrder()).orElse(HoodieLogFile.LOGFILE_BASE_VERSION))
.withSizeThreshold(config.getLogFileMaxSize()) .withSizeThreshold(config.getLogFileMaxSize()).withFs(fs)
.withFs(fs).withFileExtension(HoodieLogFile.DELTA_EXTENSION).build(); .withFileExtension(HoodieLogFile.DELTA_EXTENSION).build();
this.currentLogFile = writer.getLogFile(); this.currentLogFile = writer.getLogFile();
((HoodieDeltaWriteStat) writeStatus.getStat()) ((HoodieDeltaWriteStat) writeStatus.getStat()).setLogVersion(currentLogFile.getLogVersion());
.setLogVersion(currentLogFile.getLogVersion()); ((HoodieDeltaWriteStat) writeStatus.getStat()).setLogOffset(writer.getCurrentSize());
((HoodieDeltaWriteStat) writeStatus.getStat())
.setLogOffset(writer.getCurrentSize());
} catch (Exception e) { } catch (Exception e) {
logger.error("Error in update task at commit " + commitTime, e); logger.error("Error in update task at commit " + commitTime, e);
writeStatus.setGlobalError(e); writeStatus.setGlobalError(e);
throw new HoodieUpsertException( throw new HoodieUpsertException(
"Failed to initialize HoodieUpdateHandle for FileId: " + fileId "Failed to initialize HoodieUpdateHandle for FileId: " + fileId + " on commit "
+ " on commit " + commitTime + " on HDFS path " + hoodieTable + commitTime + " on HDFS path " + hoodieTable.getMetaClient().getBasePath()
.getMetaClient().getBasePath() + partitionPath, e); + partitionPath, e);
} }
Path path = new Path(partitionPath, Path path = new Path(partitionPath,
FSUtils.makeDataFileName(commitTime, TaskContext.getPartitionId(), fileId)); FSUtils.makeDataFileName(commitTime, TaskContext.getPartitionId(), fileId));
@@ -150,7 +143,8 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOH
} }
writeStatus.markSuccess(hoodieRecord, recordMetadata); writeStatus.markSuccess(hoodieRecord, recordMetadata);
// deflate record payload after recording success. This will help users access payload as a part of marking // deflate record payload after recording success. This will help users access payload as a
// part of marking
// record successful. // record successful.
hoodieRecord.deflate(); hoodieRecord.deflate();
return avroRecord; return avroRecord;
@@ -165,7 +159,8 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOH
// to make sure we don't append records with older (shorter) schema than already appended // to make sure we don't append records with older (shorter) schema than already appended
public void doAppend() { public void doAppend() {
int maxBlockSize = config.getLogFileDataBlockMaxSize(); int numberOfRecords = 0; int maxBlockSize = config.getLogFileDataBlockMaxSize();
int numberOfRecords = 0;
Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap(); Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap();
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, commitTime); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, commitTime);
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
@@ -180,7 +175,8 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOH
} }
// Append if max number of records reached to achieve block size // Append if max number of records reached to achieve block size
if (numberOfRecords >= (int) (maxBlockSize / averageRecordSize)) { if (numberOfRecords >= (int) (maxBlockSize / averageRecordSize)) {
// Recompute averageRecordSize before writing a new block and update existing value with avg of new and old // Recompute averageRecordSize before writing a new block and update existing value with
// avg of new and old
logger.info("AvgRecordSize => " + averageRecordSize); logger.info("AvgRecordSize => " + averageRecordSize);
averageRecordSize = (averageRecordSize + SizeEstimator.estimate(record)) / 2; averageRecordSize = (averageRecordSize + SizeEstimator.estimate(record)) / 2;
doAppend(header); doAppend(header);

View File

@@ -68,8 +68,7 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
throws IOException { throws IOException {
logger.info("Cleaning " + partitionPath + ", retaining latest " + config logger.info("Cleaning " + partitionPath + ", retaining latest " + config
.getCleanerFileVersionsRetained() + " file versions. "); .getCleanerFileVersionsRetained() + " file versions. ");
List<HoodieFileGroup> fileGroups = List<HoodieFileGroup> fileGroups = fileSystemView.getAllFileGroups(partitionPath)
fileSystemView.getAllFileGroups(partitionPath)
.collect(Collectors.toList()); .collect(Collectors.toList());
List<String> deletePaths = new ArrayList<>(); List<String> deletePaths = new ArrayList<>();
// Collect all the datafiles savepointed by all the savepoints // Collect all the datafiles savepointed by all the savepoints
@@ -94,11 +93,9 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
FileSlice nextSlice = fileSliceIterator.next(); FileSlice nextSlice = fileSliceIterator.next();
HoodieDataFile dataFile = nextSlice.getDataFile().get(); HoodieDataFile dataFile = nextSlice.getDataFile().get();
deletePaths.add(dataFile.getFileStatus().getPath().toString()); deletePaths.add(dataFile.getFileStatus().getPath().toString());
if (hoodieTable.getMetaClient().getTableType() if (hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) {
== HoodieTableType.MERGE_ON_READ) {
// If merge on read, then clean the log files for the commits as well // If merge on read, then clean the log files for the commits as well
deletePaths.addAll(nextSlice.getLogFiles() deletePaths.addAll(nextSlice.getLogFiles().map(file -> file.getPath().toString())
.map(file -> file.getPath().toString())
.collect(Collectors.toList())); .collect(Collectors.toList()));
} }
} }
@@ -121,8 +118,8 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
private List<String> getFilesToCleanKeepingLatestCommits(String partitionPath) private List<String> getFilesToCleanKeepingLatestCommits(String partitionPath)
throws IOException { throws IOException {
int commitsRetained = config.getCleanerCommitsRetained(); int commitsRetained = config.getCleanerCommitsRetained();
logger.info( logger
"Cleaning " + partitionPath + ", retaining latest " + commitsRetained + " commits. "); .info("Cleaning " + partitionPath + ", retaining latest " + commitsRetained + " commits. ");
List<String> deletePaths = new ArrayList<>(); List<String> deletePaths = new ArrayList<>();
// Collect all the datafiles savepointed by all the savepoints // Collect all the datafiles savepointed by all the savepoints
@@ -132,15 +129,14 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
// determine if we have enough commits, to start cleaning. // determine if we have enough commits, to start cleaning.
if (commitTimeline.countInstants() > commitsRetained) { if (commitTimeline.countInstants() > commitsRetained) {
HoodieInstant earliestCommitToRetain = getEarliestCommitToRetain().get(); HoodieInstant earliestCommitToRetain = getEarliestCommitToRetain().get();
List<HoodieFileGroup> fileGroups = List<HoodieFileGroup> fileGroups = fileSystemView.getAllFileGroups(partitionPath)
fileSystemView.getAllFileGroups(partitionPath)
.collect(Collectors.toList()); .collect(Collectors.toList());
for (HoodieFileGroup fileGroup : fileGroups) { for (HoodieFileGroup fileGroup : fileGroups) {
List<FileSlice> fileSliceList = fileGroup.getAllFileSlices().collect(Collectors.toList()); List<FileSlice> fileSliceList = fileGroup.getAllFileSlices().collect(Collectors.toList());
HoodieDataFile dataFile = fileSliceList.get(0).getDataFile().get(); HoodieDataFile dataFile = fileSliceList.get(0).getDataFile().get();
String lastVersion = dataFile.getCommitTime(); String lastVersion = dataFile.getCommitTime();
String lastVersionBeforeEarliestCommitToRetain = String lastVersionBeforeEarliestCommitToRetain = getLatestVersionBeforeCommit(fileSliceList,
getLatestVersionBeforeCommit(fileSliceList, earliestCommitToRetain); earliestCommitToRetain);
// Ensure there are more than 1 version of the file (we only clean old files from updates) // Ensure there are more than 1 version of the file (we only clean old files from updates)
// i.e always spare the last commit. // i.e always spare the last commit.
@@ -151,28 +147,26 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
// do not clean up a savepoint data file // do not clean up a savepoint data file
continue; continue;
} }
// Don't delete the latest commit and also the last commit before the earliest commit we are retaining // Don't delete the latest commit and also the last commit before the earliest commit we
// The window of commit retain == max query run time. So a query could be running which still // are retaining
// The window of commit retain == max query run time. So a query could be running which
// still
// uses this file. // uses this file.
if (fileCommitTime.equals(lastVersion) || ( if (fileCommitTime.equals(lastVersion) || (lastVersionBeforeEarliestCommitToRetain != null
lastVersionBeforeEarliestCommitToRetain != null && fileCommitTime && fileCommitTime.equals(lastVersionBeforeEarliestCommitToRetain))) {
.equals(lastVersionBeforeEarliestCommitToRetain))) {
// move on to the next file // move on to the next file
continue; continue;
} }
// Always keep the last commit // Always keep the last commit
if (HoodieTimeline.compareTimestamps( if (HoodieTimeline
earliestCommitToRetain.getTimestamp(), .compareTimestamps(earliestCommitToRetain.getTimestamp(), fileCommitTime,
fileCommitTime,
HoodieTimeline.GREATER)) { HoodieTimeline.GREATER)) {
// this is a commit, that should be cleaned. // this is a commit, that should be cleaned.
deletePaths.add(aFile.getFileStatus().getPath().toString()); deletePaths.add(aFile.getFileStatus().getPath().toString());
if (hoodieTable.getMetaClient().getTableType() if (hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) {
== HoodieTableType.MERGE_ON_READ) {
// If merge on read, then clean the log files for the commits as well // If merge on read, then clean the log files for the commits as well
deletePaths.addAll(aSlice.getLogFiles() deletePaths.addAll(aSlice.getLogFiles().map(file -> file.getPath().toString())
.map(file -> file.getPath().toString())
.collect(Collectors.toList())); .collect(Collectors.toList()));
} }
} }
@@ -190,9 +184,10 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
HoodieInstant commitTime) { HoodieInstant commitTime) {
for (FileSlice file : fileSliceList) { for (FileSlice file : fileSliceList) {
String fileCommitTime = file.getDataFile().get().getCommitTime(); String fileCommitTime = file.getDataFile().get().getCommitTime();
if (HoodieTimeline.compareTimestamps(commitTime.getTimestamp(), fileCommitTime, if (HoodieTimeline
HoodieTimeline.GREATER)) { .compareTimestamps(commitTime.getTimestamp(), fileCommitTime, HoodieTimeline.GREATER)) {
// fileList is sorted on the reverse, so the first commit we find <= commitTime is the one we want // fileList is sorted on the reverse, so the first commit we find <= commitTime is the
// one we want
return fileCommitTime; return fileCommitTime;
} }
} }
@@ -213,8 +208,7 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
} else { } else {
throw new IllegalArgumentException("Unknown cleaning policy : " + policy.name()); throw new IllegalArgumentException("Unknown cleaning policy : " + policy.name());
} }
logger.info( logger.info(deletePaths.size() + " patterns used to delete in partition path:" + partitionPath);
deletePaths.size() + " patterns used to delete in partition path:" + partitionPath);
return deletePaths; return deletePaths;
} }
@@ -227,8 +221,8 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
int commitsRetained = config.getCleanerCommitsRetained(); int commitsRetained = config.getCleanerCommitsRetained();
if (config.getCleanerPolicy() == HoodieCleaningPolicy.KEEP_LATEST_COMMITS if (config.getCleanerPolicy() == HoodieCleaningPolicy.KEEP_LATEST_COMMITS
&& commitTimeline.countInstants() > commitsRetained) { && commitTimeline.countInstants() > commitsRetained) {
earliestCommitToRetain = earliestCommitToRetain = commitTimeline
commitTimeline.nthInstant(commitTimeline.countInstants() - commitsRetained); .nthInstant(commitTimeline.countInstants() - commitsRetained);
} }
return earliestCommitToRetain; return earliestCommitToRetain;
} }

View File

@@ -40,13 +40,6 @@ import com.uber.hoodie.exception.HoodieCommitException;
import com.uber.hoodie.exception.HoodieException; import com.uber.hoodie.exception.HoodieException;
import com.uber.hoodie.exception.HoodieIOException; import com.uber.hoodie.exception.HoodieIOException;
import com.uber.hoodie.table.HoodieTable; import com.uber.hoodie.table.HoodieTable;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
@@ -54,6 +47,11 @@ import java.util.Map;
import java.util.Optional; import java.util.Optional;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
/** /**
* Archiver to bound the growth of <action>.commit files * Archiver to bound the growth of <action>.commit files
@@ -76,11 +74,9 @@ public class HoodieCommitArchiveLog {
private HoodieLogFormat.Writer openWriter() { private HoodieLogFormat.Writer openWriter() {
try { try {
if (this.writer == null) { if (this.writer == null) {
return HoodieLogFormat.newWriterBuilder() return HoodieLogFormat.newWriterBuilder().onParentPath(archiveFilePath.getParent())
.onParentPath(archiveFilePath.getParent())
.withFileId(archiveFilePath.getName()) .withFileId(archiveFilePath.getName())
.withFileExtension(HoodieArchivedLogFile.ARCHIVE_EXTENSION) .withFileExtension(HoodieArchivedLogFile.ARCHIVE_EXTENSION).withFs(metaClient.getFs())
.withFs(metaClient.getFs())
.overBaseCommit("").build(); .overBaseCommit("").build();
} else { } else {
return this.writer; return this.writer;
@@ -136,21 +132,19 @@ public class HoodieCommitArchiveLog {
.getTimelineOfActions(Sets.newHashSet(HoodieTimeline.CLEAN_ACTION)) .getTimelineOfActions(Sets.newHashSet(HoodieTimeline.CLEAN_ACTION))
.filterCompletedInstants(); .filterCompletedInstants();
Stream<HoodieInstant> instants = cleanAndRollbackTimeline.getInstants() Stream<HoodieInstant> instants = cleanAndRollbackTimeline.getInstants()
.collect(Collectors.groupingBy(s -> s.getAction())) .collect(Collectors.groupingBy(s -> s.getAction())).entrySet().stream().map(i -> {
.entrySet()
.stream()
.map(i -> {
if (i.getValue().size() > maxCommitsToKeep) { if (i.getValue().size() > maxCommitsToKeep) {
return i.getValue().subList(0, i.getValue().size() - minCommitsToKeep); return i.getValue().subList(0, i.getValue().size() - minCommitsToKeep);
} else { } else {
return new ArrayList<HoodieInstant>(); return new ArrayList<HoodieInstant>();
} }
}) }).flatMap(i -> i.stream());
.flatMap(i -> i.stream());
//TODO (na) : Add a way to return actions associated with a timeline and then merge/unify with logic above to avoid Stream.concats //TODO (na) : Add a way to return actions associated with a timeline and then merge/unify
// with logic above to avoid Stream.concats
HoodieTimeline commitTimeline = table.getCompletedCommitTimeline(); HoodieTimeline commitTimeline = table.getCompletedCommitTimeline();
// We cannot have any holes in the commit timeline. We cannot archive any commits which are made after the first savepoint present. // We cannot have any holes in the commit timeline. We cannot archive any commits which are
// made after the first savepoint present.
Optional<HoodieInstant> firstSavepoint = table.getCompletedSavepointTimeline().firstInstant(); Optional<HoodieInstant> firstSavepoint = table.getCompletedSavepointTimeline().firstInstant();
if (!commitTimeline.empty() && commitTimeline.countInstants() > maxCommitsToKeep) { if (!commitTimeline.empty() && commitTimeline.countInstants() > maxCommitsToKeep) {
// Actually do the commits // Actually do the commits
@@ -169,16 +163,14 @@ public class HoodieCommitArchiveLog {
log.info("Deleting instants " + archivedInstants); log.info("Deleting instants " + archivedInstants);
boolean success = true; boolean success = true;
for (HoodieInstant archivedInstant : archivedInstants) { for (HoodieInstant archivedInstant : archivedInstants) {
Path commitFile = Path commitFile = new Path(metaClient.getMetaPath(), archivedInstant.getFileName());
new Path(metaClient.getMetaPath(), archivedInstant.getFileName());
try { try {
if (metaClient.getFs().exists(commitFile)) { if (metaClient.getFs().exists(commitFile)) {
success &= metaClient.getFs().delete(commitFile, false); success &= metaClient.getFs().delete(commitFile, false);
log.info("Archived and deleted instant file " + commitFile); log.info("Archived and deleted instant file " + commitFile);
} }
} catch (IOException e) { } catch (IOException e) {
throw new HoodieIOException("Failed to delete archived instant " + archivedInstant, throw new HoodieIOException("Failed to delete archived instant " + archivedInstant, e);
e);
} }
} }
return success; return success;
@@ -186,8 +178,8 @@ public class HoodieCommitArchiveLog {
public void archive(List<HoodieInstant> instants) throws HoodieCommitException { public void archive(List<HoodieInstant> instants) throws HoodieCommitException {
try { try {
HoodieTimeline commitTimeline = HoodieTimeline commitTimeline = metaClient.getActiveTimeline().getAllCommitsTimeline()
metaClient.getActiveTimeline().getAllCommitsTimeline().filterCompletedInstants(); .filterCompletedInstants();
Schema wrapperSchema = HoodieArchivedMetaEntry.getClassSchema(); Schema wrapperSchema = HoodieArchivedMetaEntry.getClassSchema();
log.info("Wrapper schema " + wrapperSchema.toString()); log.info("Wrapper schema " + wrapperSchema.toString());
List<IndexedRecord> records = new ArrayList<>(); List<IndexedRecord> records = new ArrayList<>();
@@ -247,6 +239,8 @@ public class HoodieCommitArchiveLog {
archivedMetaWrapper.setActionType(ActionType.commit.name()); archivedMetaWrapper.setActionType(ActionType.commit.name());
break; break;
} }
default:
throw new UnsupportedOperationException("Action not fully supported yet");
} }
return archivedMetaWrapper; return archivedMetaWrapper;
} }
@@ -256,9 +250,8 @@ public class HoodieCommitArchiveLog {
ObjectMapper mapper = new ObjectMapper(); ObjectMapper mapper = new ObjectMapper();
//Need this to ignore other public get() methods //Need this to ignore other public get() methods
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
com.uber.hoodie.avro.model.HoodieCommitMetadata avroMetaData = com.uber.hoodie.avro.model.HoodieCommitMetadata avroMetaData = mapper
mapper.convertValue(hoodieCommitMetadata, .convertValue(hoodieCommitMetadata, com.uber.hoodie.avro.model.HoodieCommitMetadata.class);
com.uber.hoodie.avro.model.HoodieCommitMetadata.class);
return avroMetaData; return avroMetaData;
} }
} }

View File

@@ -49,8 +49,8 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOH
private long recordsWritten = 0; private long recordsWritten = 0;
private long recordsDeleted = 0; private long recordsDeleted = 0;
public HoodieCreateHandle(HoodieWriteConfig config, String commitTime, public HoodieCreateHandle(HoodieWriteConfig config, String commitTime, HoodieTable<T> hoodieTable,
HoodieTable<T> hoodieTable, String partitionPath) { String partitionPath) {
super(config, commitTime, hoodieTable); super(config, commitTime, hoodieTable);
this.status = ReflectionUtils.loadClass(config.getWriteStatusClassName()); this.status = ReflectionUtils.loadClass(config.getWriteStatusClassName());
status.setFileId(UUID.randomUUID().toString()); status.setFileId(UUID.randomUUID().toString());
@@ -64,13 +64,10 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOH
} }
try { try {
HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(fs, HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(fs, commitTime,
commitTime, new Path(config.getBasePath()), new Path(config.getBasePath(), partitionPath));
new Path(config.getBasePath()),
new Path(config.getBasePath(), partitionPath));
partitionMetadata.trySave(TaskContext.getPartitionId()); partitionMetadata.trySave(TaskContext.getPartitionId());
this.storageWriter = this.storageWriter = HoodieStorageWriterFactory
HoodieStorageWriterFactory
.getStorageWriter(commitTime, getStorageWriterPath(), hoodieTable, config, schema); .getStorageWriter(commitTime, getStorageWriterPath(), hoodieTable, config, schema);
} catch (IOException e) { } catch (IOException e) {
throw new HoodieInsertException( throw new HoodieInsertException(
@@ -81,13 +78,12 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOH
/** /**
* Determines whether we can accept the incoming records into the current file, depending on * Determines whether we can accept the incoming records into the current file, depending on
* * <p>
* - Whether it belongs to the same partitionPath as existing records - Whether the current file * - Whether it belongs to the same partitionPath as existing records - Whether the current file
* written bytes lt max file size * written bytes lt max file size
*/ */
public boolean canWrite(HoodieRecord record) { public boolean canWrite(HoodieRecord record) {
return storageWriter.canWrite() && record.getPartitionPath() return storageWriter.canWrite() && record.getPartitionPath().equals(status.getPartitionPath());
.equals(status.getPartitionPath());
} }
/** /**
@@ -111,7 +107,8 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOH
recordsDeleted++; recordsDeleted++;
} }
status.markSuccess(record, recordMetadata); status.markSuccess(record, recordMetadata);
// deflate record payload after recording success. This will help users access payload as a part of marking // deflate record payload after recording success. This will help users access payload as a
// part of marking
// record successful. // record successful.
record.deflate(); record.deflate();
} catch (Throwable t) { } catch (Throwable t) {
@@ -126,8 +123,7 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOH
* Performs actions to durably, persist the current changes and returns a WriteStatus object * Performs actions to durably, persist the current changes and returns a WriteStatus object
*/ */
public WriteStatus close() { public WriteStatus close() {
logger.info( logger.info("Closing the file " + status.getFileId() + " as we are done with all the records "
"Closing the file " + status.getFileId() + " as we are done with all the records "
+ recordsWritten); + recordsWritten);
try { try {
storageWriter.close(); storageWriter.close();
@@ -144,8 +140,7 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOH
return status; return status;
} catch (IOException e) { } catch (IOException e) {
throw new HoodieInsertException("Failed to close the Insert Handle for path " + path, throw new HoodieInsertException("Failed to close the Insert Handle for path " + path, e);
e);
} }
} }

View File

@@ -39,11 +39,10 @@ public abstract class HoodieIOHandle<T extends HoodieRecordPayload> {
protected final HoodieWriteConfig config; protected final HoodieWriteConfig config;
protected final FileSystem fs; protected final FileSystem fs;
protected final HoodieTable<T> hoodieTable; protected final HoodieTable<T> hoodieTable;
protected HoodieTimeline hoodieTimeline;
protected final Schema schema; protected final Schema schema;
protected HoodieTimeline hoodieTimeline;
public HoodieIOHandle(HoodieWriteConfig config, String commitTime, public HoodieIOHandle(HoodieWriteConfig config, String commitTime, HoodieTable<T> hoodieTable) {
HoodieTable<T> hoodieTable) {
this.commitTime = commitTime; this.commitTime = commitTime;
this.config = config; this.config = config;
this.fs = hoodieTable.getMetaClient().getFs(); this.fs = hoodieTable.getMetaClient().getFs();
@@ -52,6 +51,32 @@ public abstract class HoodieIOHandle<T extends HoodieRecordPayload> {
this.schema = createHoodieWriteSchema(config); this.schema = createHoodieWriteSchema(config);
} }
/**
* Deletes any new tmp files written during the current commit, into the partition
*/
public static void cleanupTmpFilesFromCurrentCommit(HoodieWriteConfig config, String commitTime,
String partitionPath, int taskPartitionId, HoodieTable hoodieTable) {
FileSystem fs = hoodieTable.getMetaClient().getFs();
try {
FileStatus[] prevFailedFiles = fs.globStatus(new Path(String
.format("%s/%s/%s", config.getBasePath(), partitionPath,
FSUtils.maskWithoutFileId(commitTime, taskPartitionId))));
if (prevFailedFiles != null) {
logger.info(
"Deleting " + prevFailedFiles.length + " files generated by previous failed attempts.");
for (FileStatus status : prevFailedFiles) {
fs.delete(status.getPath(), false);
}
}
} catch (IOException e) {
throw new HoodieIOException("Failed to cleanup Temp files from commit " + commitTime, e);
}
}
public static Schema createHoodieWriteSchema(HoodieWriteConfig config) {
return HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()));
}
public Path makeNewPath(String partitionPath, int taskPartitionId, String fileName) { public Path makeNewPath(String partitionPath, int taskPartitionId, String fileName) {
Path path = new Path(config.getBasePath(), partitionPath); Path path = new Path(config.getBasePath(), partitionPath);
try { try {
@@ -72,37 +97,7 @@ public abstract class HoodieIOHandle<T extends HoodieRecordPayload> {
taskAttemptId)); taskAttemptId));
} }
/**
* Deletes any new tmp files written during the current commit, into the partition
*/
public static void cleanupTmpFilesFromCurrentCommit(HoodieWriteConfig config,
String commitTime,
String partitionPath,
int taskPartitionId,
HoodieTable hoodieTable) {
FileSystem fs = hoodieTable.getMetaClient().getFs();
try {
FileStatus[] prevFailedFiles = fs.globStatus(new Path(String
.format("%s/%s/%s", config.getBasePath(), partitionPath,
FSUtils.maskWithoutFileId(commitTime, taskPartitionId))));
if (prevFailedFiles != null) {
logger.info("Deleting " + prevFailedFiles.length
+ " files generated by previous failed attempts.");
for (FileStatus status : prevFailedFiles) {
fs.delete(status.getPath(), false);
}
}
} catch (IOException e) {
throw new HoodieIOException("Failed to cleanup Temp files from commit " + commitTime,
e);
}
}
public Schema getSchema() { public Schema getSchema() {
return schema; return schema;
} }
public static Schema createHoodieWriteSchema(HoodieWriteConfig config) {
return HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()));
}
} }

View File

@@ -26,14 +26,18 @@ import com.uber.hoodie.common.table.TableFileSystemView;
import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.ReflectionUtils; import com.uber.hoodie.common.util.ReflectionUtils;
import com.uber.hoodie.common.util.collection.ExternalSpillableMap; import com.uber.hoodie.common.util.collection.ExternalSpillableMap;
import com.uber.hoodie.common.util.collection.converter.StringConverter;
import com.uber.hoodie.common.util.collection.converter.HoodieRecordConverter; import com.uber.hoodie.common.util.collection.converter.HoodieRecordConverter;
import com.uber.hoodie.common.util.collection.converter.StringConverter;
import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieIOException; import com.uber.hoodie.exception.HoodieIOException;
import com.uber.hoodie.exception.HoodieUpsertException; import com.uber.hoodie.exception.HoodieUpsertException;
import com.uber.hoodie.io.storage.HoodieStorageWriter; import com.uber.hoodie.io.storage.HoodieStorageWriter;
import com.uber.hoodie.io.storage.HoodieStorageWriterFactory; import com.uber.hoodie.io.storage.HoodieStorageWriterFactory;
import com.uber.hoodie.table.HoodieTable; import com.uber.hoodie.table.HoodieTable;
import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import java.util.Optional;
import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord; import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
@@ -41,11 +45,6 @@ import org.apache.log4j.LogManager;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.apache.spark.TaskContext; import org.apache.spark.TaskContext;
import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import java.util.Optional;
@SuppressWarnings("Duplicates") @SuppressWarnings("Duplicates")
public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHandle<T> { public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHandle<T> {
@@ -62,59 +61,46 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
private long recordsDeleted = 0; private long recordsDeleted = 0;
private long updatedRecordsWritten = 0; private long updatedRecordsWritten = 0;
public HoodieMergeHandle(HoodieWriteConfig config, public HoodieMergeHandle(HoodieWriteConfig config, String commitTime, HoodieTable<T> hoodieTable,
String commitTime, Iterator<HoodieRecord<T>> recordItr, String fileId) {
HoodieTable<T> hoodieTable,
Iterator<HoodieRecord<T>> recordItr,
String fileId) {
super(config, commitTime, hoodieTable); super(config, commitTime, hoodieTable);
this.fileSystemView = hoodieTable.getROFileSystemView(); this.fileSystemView = hoodieTable.getROFileSystemView();
init(fileId, init(fileId, recordItr)); init(fileId, init(fileId, recordItr));
} }
public HoodieMergeHandle(HoodieWriteConfig config, public HoodieMergeHandle(HoodieWriteConfig config, String commitTime, HoodieTable<T> hoodieTable,
String commitTime, Map<String, HoodieRecord<T>> keyToNewRecords, String fileId) {
HoodieTable<T> hoodieTable,
Map<String, HoodieRecord<T>> keyToNewRecords,
String fileId) {
super(config, commitTime, hoodieTable); super(config, commitTime, hoodieTable);
this.fileSystemView = hoodieTable.getROFileSystemView(); this.fileSystemView = hoodieTable.getROFileSystemView();
this.keyToNewRecords = keyToNewRecords; this.keyToNewRecords = keyToNewRecords;
init(fileId, keyToNewRecords.get(keyToNewRecords.keySet().stream().findFirst().get()).getPartitionPath()); init(fileId, keyToNewRecords.get(keyToNewRecords.keySet().stream().findFirst().get())
.getPartitionPath());
} }
/** /**
* Extract old file path, initialize StorageWriter and WriteStatus * Extract old file path, initialize StorageWriter and WriteStatus
* @param fileId
* @param partitionPath
*/ */
private void init(String fileId, String partitionPath) { private void init(String fileId, String partitionPath) {
WriteStatus writeStatus = ReflectionUtils.loadClass(config.getWriteStatusClassName()); WriteStatus writeStatus = ReflectionUtils.loadClass(config.getWriteStatusClassName());
writeStatus.setStat(new HoodieWriteStat()); writeStatus.setStat(new HoodieWriteStat());
this.writeStatus = writeStatus; this.writeStatus = writeStatus;
try { try {
String latestValidFilePath = fileSystemView String latestValidFilePath = fileSystemView.getLatestDataFiles(partitionPath)
.getLatestDataFiles(partitionPath) .filter(dataFile -> dataFile.getFileId().equals(fileId)).findFirst().get().getFileName();
.filter(dataFile -> dataFile.getFileId().equals(fileId))
.findFirst()
.get().getFileName();
writeStatus.getStat().setPrevCommit(FSUtils.getCommitTime(latestValidFilePath)); writeStatus.getStat().setPrevCommit(FSUtils.getCommitTime(latestValidFilePath));
HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(fs, HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(fs, commitTime,
commitTime, new Path(config.getBasePath()), new Path(config.getBasePath(), partitionPath));
new Path(config.getBasePath()),
new Path(config.getBasePath(), partitionPath));
partitionMetadata.trySave(TaskContext.getPartitionId()); partitionMetadata.trySave(TaskContext.getPartitionId());
oldFilePath = new Path( oldFilePath = new Path(
config.getBasePath() + "/" + partitionPath + "/" config.getBasePath() + "/" + partitionPath + "/" + latestValidFilePath);
+ latestValidFilePath);
String relativePath = new Path(partitionPath + "/" + FSUtils String relativePath = new Path(partitionPath + "/" + FSUtils
.makeDataFileName(commitTime, TaskContext.getPartitionId(), fileId)).toString(); .makeDataFileName(commitTime, TaskContext.getPartitionId(), fileId)).toString();
newFilePath = new Path(config.getBasePath(), relativePath); newFilePath = new Path(config.getBasePath(), relativePath);
if (config.shouldUseTempFolderForCopyOnWriteForMerge()) { if (config.shouldUseTempFolderForCopyOnWriteForMerge()) {
this.tempPath = makeTempPath(partitionPath, TaskContext.getPartitionId(), this.tempPath = makeTempPath(partitionPath, TaskContext.getPartitionId(), fileId,
fileId, TaskContext.get().stageId(), TaskContext.get().taskAttemptId()); TaskContext.get().stageId(), TaskContext.get().taskAttemptId());
} }
// handle cases of partial failures, for update task // handle cases of partial failures, for update task
@@ -122,8 +108,9 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
fs.delete(newFilePath, false); fs.delete(newFilePath, false);
} }
logger.info(String.format("Merging new data into oldPath %s, as newPath %s", logger.info(String
oldFilePath.toString(), getStorageWriterPath().toString())); .format("Merging new data into oldPath %s, as newPath %s", oldFilePath.toString(),
getStorageWriterPath().toString()));
// file name is same for all records, in this bunch // file name is same for all records, in this bunch
writeStatus.setFileId(fileId); writeStatus.setFileId(fileId);
writeStatus.setPartitionPath(partitionPath); writeStatus.setPartitionPath(partitionPath);
@@ -143,9 +130,6 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
/** /**
* Load the new incoming records in a map and return partitionPath * Load the new incoming records in a map and return partitionPath
* @param fileId
* @param newRecordsItr
* @return
*/ */
private String init(String fileId, Iterator<HoodieRecord<T>> newRecordsItr) { private String init(String fileId, Iterator<HoodieRecord<T>> newRecordsItr) {
try { try {
@@ -164,14 +148,14 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
// update the new location of the record, so we know where to find it next // update the new location of the record, so we know where to find it next
record.setNewLocation(new HoodieRecordLocation(commitTime, fileId)); record.setNewLocation(new HoodieRecordLocation(commitTime, fileId));
} }
logger.debug("Number of entries in MemoryBasedMap => " + logger.debug("Number of entries in MemoryBasedMap => "
((ExternalSpillableMap) keyToNewRecords).getInMemoryMapNumEntries() + ((ExternalSpillableMap) keyToNewRecords).getInMemoryMapNumEntries()
+ "Total size in bytes of MemoryBasedMap => " + + "Total size in bytes of MemoryBasedMap => "
((ExternalSpillableMap) keyToNewRecords).getCurrentInMemoryMapSize() + ((ExternalSpillableMap) keyToNewRecords).getCurrentInMemoryMapSize()
+ "Number of entries in DiskBasedMap => " + + "Number of entries in DiskBasedMap => "
((ExternalSpillableMap) keyToNewRecords).getDiskBasedMapNumEntries() + ((ExternalSpillableMap) keyToNewRecords).getDiskBasedMapNumEntries()
+ "Size of file spilled to disk => " + + "Size of file spilled to disk => "
((ExternalSpillableMap) keyToNewRecords).getSizeOfFileOnDiskInBytes()); + ((ExternalSpillableMap) keyToNewRecords).getSizeOfFileOnDiskInBytes());
return partitionPath; return partitionPath;
} }
@@ -189,7 +173,8 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
} }
writeStatus.markSuccess(hoodieRecord, recordMetadata); writeStatus.markSuccess(hoodieRecord, recordMetadata);
// deflate record payload after recording success. This will help users access payload as a part of marking // deflate record payload after recording success. This will help users access payload as a
// part of marking
// record successful. // record successful.
hoodieRecord.deflate(); hoodieRecord.deflate();
return true; return true;
@@ -201,8 +186,7 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
} }
/** /**
* Go through an old record. Here, if a newer version shows up, we write the new one to * Go through an old record. Here, if a newer version shows up, we write the new one to the file.
* the file.
*/ */
public void write(GenericRecord oldRecord) { public void write(GenericRecord oldRecord) {
String key = oldRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); String key = oldRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
@@ -236,8 +220,7 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
try { try {
storageWriter.writeAvro(key, oldRecord); storageWriter.writeAvro(key, oldRecord);
} catch (ClassCastException e) { } catch (ClassCastException e) {
logger.error( logger.error("Schema mismatch when rewriting old record " + oldRecord + " from file "
"Schema mismatch when rewriting old record " + oldRecord + " from file "
+ getOldFilePath() + " to file " + getStorageWriterPath() + " with schema " + schema + getOldFilePath() + " to file " + getStorageWriterPath() + " with schema " + schema
.toString(true)); .toString(true));
throw new HoodieUpsertException(errMsg, e); throw new HoodieUpsertException(errMsg, e);

View File

@@ -53,8 +53,8 @@ public class CompactionOperation implements Serializable {
this.partitionPath = partitionPath; this.partitionPath = partitionPath;
this.dataFileCommitTime = dataFile.getCommitTime(); this.dataFileCommitTime = dataFile.getCommitTime();
this.dataFileSize = dataFile.getFileSize(); this.dataFileSize = dataFile.getFileSize();
this.deltaFilePaths = logFiles.stream().map(s -> s.getPath().toString()).collect( this.deltaFilePaths = logFiles.stream().map(s -> s.getPath().toString())
Collectors.toList()); .collect(Collectors.toList());
this.metrics = writeConfig.getCompactionStrategy() this.metrics = writeConfig.getCompactionStrategy()
.captureMetrics(dataFile, partitionPath, logFiles); .captureMetrics(dataFile, partitionPath, logFiles);
} }

View File

@@ -17,17 +17,15 @@
package com.uber.hoodie.io.compact; package com.uber.hoodie.io.compact;
import com.uber.hoodie.WriteStatus; import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.model.HoodieCommitMetadata;
import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline; import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.table.HoodieTable; import com.uber.hoodie.table.HoodieTable;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import java.io.Serializable; import java.io.Serializable;
import java.util.Date; import java.util.Date;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
/** /**
* A HoodieCompactor runs compaction on a hoodie table * A HoodieCompactor runs compaction on a hoodie table

View File

@@ -17,6 +17,7 @@
package com.uber.hoodie.io.compact; package com.uber.hoodie.io.compact;
import static java.util.stream.Collectors.toList; import static java.util.stream.Collectors.toList;
import com.google.common.base.Preconditions; import com.google.common.base.Preconditions;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
@@ -70,9 +71,8 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
} }
private JavaRDD<WriteStatus> executeCompaction(JavaSparkContext jsc, private JavaRDD<WriteStatus> executeCompaction(JavaSparkContext jsc,
List<CompactionOperation> operations, List<CompactionOperation> operations, HoodieTable hoodieTable, HoodieWriteConfig config,
HoodieTable hoodieTable, String compactionCommitTime) throws IOException {
HoodieWriteConfig config, String compactionCommitTime) throws IOException {
log.info("After filtering, Compacting " + operations + " files"); log.info("After filtering, Compacting " + operations + " files");
return jsc.parallelize(operations, operations.size()) return jsc.parallelize(operations, operations.size())
@@ -80,18 +80,19 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
.flatMap(writeStatusesItr -> writeStatusesItr.iterator()); .flatMap(writeStatusesItr -> writeStatusesItr.iterator());
} }
private List<WriteStatus> compact(HoodieTable hoodieTable, private List<WriteStatus> compact(HoodieTable hoodieTable, HoodieWriteConfig config,
HoodieWriteConfig config, CompactionOperation operation, String commitTime) CompactionOperation operation, String commitTime) throws IOException {
throws IOException {
FileSystem fs = hoodieTable.getMetaClient().getFs(); FileSystem fs = hoodieTable.getMetaClient().getFs();
Schema readerSchema = Schema readerSchema = HoodieAvroUtils
HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema())); .addMetadataFields(new Schema.Parser().parse(config.getSchema()));
log.info("Compacting base " + operation.getDataFilePath() + " with delta files " + operation log.info("Compacting base " + operation.getDataFilePath() + " with delta files " + operation
.getDeltaFilePaths() + " for commit " + commitTime); .getDeltaFilePaths() + " for commit " + commitTime);
// TODO - FIX THIS // TODO - FIX THIS
// Reads the entire avro file. Always only specific blocks should be read from the avro file (failure recover). // Reads the entire avro file. Always only specific blocks should be read from the avro file
// Load all the delta commits since the last compaction commit and get all the blocks to be loaded and load it using CompositeAvroLogReader // (failure recover).
// Load all the delta commits since the last compaction commit and get all the blocks to be
// loaded and load it using CompositeAvroLogReader
// Since a DeltaCommit is not defined yet, reading all the records. revisit this soon. // Since a DeltaCommit is not defined yet, reading all the records. revisit this soon.
HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
String maxInstantTime = metaClient.getActiveTimeline() String maxInstantTime = metaClient.getActiveTimeline()
@@ -114,25 +115,22 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
Iterator<List<WriteStatus>> result = table Iterator<List<WriteStatus>> result = table
.handleUpdate(commitTime, operation.getFileId(), scanner.getRecords()); .handleUpdate(commitTime, operation.getFileId(), scanner.getRecords());
Iterable<List<WriteStatus>> resultIterable = () -> result; Iterable<List<WriteStatus>> resultIterable = () -> result;
return StreamSupport.stream(resultIterable.spliterator(), false) return StreamSupport.stream(resultIterable.spliterator(), false).flatMap(Collection::stream)
.flatMap(Collection::stream)
.map(s -> { .map(s -> {
s.getStat().setTotalRecordsToBeUpdate(scanner.getTotalRecordsToUpdate()); s.getStat().setTotalRecordsToBeUpdate(scanner.getTotalRecordsToUpdate());
s.getStat().setTotalLogFiles(scanner.getTotalLogFiles()); s.getStat().setTotalLogFiles(scanner.getTotalLogFiles());
s.getStat().setTotalLogRecords(scanner.getTotalLogRecords()); s.getStat().setTotalLogRecords(scanner.getTotalLogRecords());
s.getStat().setPartitionPath(operation.getPartitionPath()); s.getStat().setPartitionPath(operation.getPartitionPath());
return s; return s;
}) }).collect(toList());
.collect(toList());
} }
private List<CompactionOperation> getCompactionWorkload(JavaSparkContext jsc, private List<CompactionOperation> getCompactionWorkload(JavaSparkContext jsc,
HoodieTable hoodieTable, HoodieTable hoodieTable, HoodieWriteConfig config, String compactionCommitTime)
HoodieWriteConfig config, String compactionCommitTime)
throws IOException { throws IOException {
Preconditions.checkArgument( Preconditions
hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ, .checkArgument(hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ,
"HoodieRealtimeTableCompactor can only compact table of type " "HoodieRealtimeTableCompactor can only compact table of type "
+ HoodieTableType.MERGE_ON_READ + " and not " + hoodieTable.getMetaClient() + HoodieTableType.MERGE_ON_READ + " and not " + hoodieTable.getMetaClient()
.getTableType().name()); .getTableType().name());
@@ -141,25 +139,23 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
// TODO - rollback any compactions in flight // TODO - rollback any compactions in flight
HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
log.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommitTime); log.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommitTime);
List<String> partitionPaths = List<String> partitionPaths = FSUtils
FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(), .getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(),
config.shouldAssumeDatePartitioning()); config.shouldAssumeDatePartitioning());
TableFileSystemView.RealtimeView fileSystemView = hoodieTable.getRTFileSystemView(); TableFileSystemView.RealtimeView fileSystemView = hoodieTable.getRTFileSystemView();
log.info("Compaction looking for files to compact in " + partitionPaths + " partitions"); log.info("Compaction looking for files to compact in " + partitionPaths + " partitions");
List<CompactionOperation> operations = List<CompactionOperation> operations = jsc.parallelize(partitionPaths, partitionPaths.size())
jsc.parallelize(partitionPaths, partitionPaths.size())
.flatMap((FlatMapFunction<String, CompactionOperation>) partitionPath -> fileSystemView .flatMap((FlatMapFunction<String, CompactionOperation>) partitionPath -> fileSystemView
.getLatestFileSlices(partitionPath) .getLatestFileSlices(partitionPath).map(
.map(s -> new CompactionOperation(s.getDataFile().get(), s -> new CompactionOperation(s.getDataFile().get(), partitionPath,
partitionPath,
s.getLogFiles().sorted(HoodieLogFile.getLogVersionComparator().reversed()) s.getLogFiles().sorted(HoodieLogFile.getLogVersionComparator().reversed())
.collect(Collectors.toList()), config)) .collect(Collectors.toList()), config))
.filter(c -> !c.getDeltaFilePaths().isEmpty()) .filter(c -> !c.getDeltaFilePaths().isEmpty()).collect(toList()).iterator()).collect();
.collect(toList()).iterator()).collect();
log.info("Total of " + operations.size() + " compactions are retrieved"); log.info("Total of " + operations.size() + " compactions are retrieved");
// Filter the compactions with the passed in filter. This lets us choose most effective compactions only // Filter the compactions with the passed in filter. This lets us choose most effective
// compactions only
operations = config.getCompactionStrategy().orderAndFilter(config, operations); operations = config.getCompactionStrategy().orderAndFilter(config, operations);
if (operations.isEmpty()) { if (operations.isEmpty()) {
log.warn("After filtering, Nothing to compact for " + metaClient.getBasePath()); log.warn("After filtering, Nothing to compact for " + metaClient.getBasePath());


@@ -44,9 +44,9 @@ public class BoundedIOCompactionStrategy implements CompactionStrategy {
List<HoodieLogFile> logFiles) { List<HoodieLogFile> logFiles) {
Map<String, Object> metrics = Maps.newHashMap(); Map<String, Object> metrics = Maps.newHashMap();
// Total size of all the log files // Total size of all the log files
Long totalLogFileSize = logFiles.stream().map(HoodieLogFile::getFileSize).filter( Long totalLogFileSize = logFiles.stream().map(HoodieLogFile::getFileSize)
Optional::isPresent).map(Optional::get).reduce( .filter(Optional::isPresent).map(Optional::get).reduce((size1, size2) -> size1 + size2)
(size1, size2) -> size1 + size2).orElse(0L); .orElse(0L);
// Total read will be the base file + all the log files // Total read will be the base file + all the log files
Long totalIORead = FSUtils.getSizeInMB(dataFile.getFileSize() + totalLogFileSize); Long totalIORead = FSUtils.getSizeInMB(dataFile.getFileSize() + totalLogFileSize);
// Total write will be similar to the size of the base file // Total write will be similar to the size of the base file
@@ -64,7 +64,8 @@ public class BoundedIOCompactionStrategy implements CompactionStrategy {
@Override @Override
public List<CompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig, public List<CompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig,
List<CompactionOperation> operations) { List<CompactionOperation> operations) {
// Iterate through the operations in order and accept operations as long as we are within the IO limit // Iterate through the operations in order and accept operations as long as we are within the
// IO limit
// Preserves the original ordering of compactions // Preserves the original ordering of compactions
List<CompactionOperation> finalOperations = Lists.newArrayList(); List<CompactionOperation> finalOperations = Lists.newArrayList();
long targetIORemaining = writeConfig.getTargetIOPerCompactionInMB(); long targetIORemaining = writeConfig.getTargetIOPerCompactionInMB();
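Aside (illustrative, not part of this commit): the loop this hunk reformats accepts operations in their given order until the configured IO budget for one compaction run is exhausted. A hedged sketch of that selection over plain per-operation cost estimates:

import java.util.ArrayList;
import java.util.List;

// Illustrative sketch: accept operations (represented by their estimated IO in MB) in order
// until the compaction IO budget is used up, preserving the incoming ordering.
public class BoundedIoSelectionSketch {
  public static List<Long> selectWithinBudget(List<Long> estimatedIoPerOperationMb, long targetIoMb) {
    List<Long> accepted = new ArrayList<>();
    long remaining = targetIoMb;
    for (Long cost : estimatedIoPerOperationMb) {
      if (cost > remaining) {
        break; // budget exhausted
      }
      remaining -= cost;
      accepted.add(cost);
    }
    return accepted;
  }
}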


@@ -28,7 +28,7 @@ import java.util.Map;
* Strategy for compaction. Pluggable implementation to define how compaction should be done. The * Strategy for compaction. Pluggable implementation to define how compaction should be done. The
* implementations of this interface can capture the relevant metrics to order and filter the final * implementations of this interface can capture the relevant metrics to order and filter the final
* list of compaction operations to run in a single compaction. * list of compaction operations to run in a single compaction.
* * <p>
* Implementation of CompactionStrategy cannot hold any state. Different instantiations can be * Implementation of CompactionStrategy cannot hold any state. Different instantiations can be
* passed in every time * passed in every time
* *


@@ -20,7 +20,6 @@ package com.uber.hoodie.io.compact.strategy;
import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieException; import com.uber.hoodie.exception.HoodieException;
import com.uber.hoodie.io.compact.CompactionOperation; import com.uber.hoodie.io.compact.CompactionOperation;
import java.text.ParseException; import java.text.ParseException;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
import java.util.Comparator; import java.util.Comparator;
@@ -30,15 +29,16 @@ import java.util.Locale;
import java.util.stream.Collectors; import java.util.stream.Collectors;
/** /**
* This strategy orders compactions in reverse order of creation of Hive Partitions. * This strategy orders compactions in reverse order of creation of Hive Partitions. It helps to
* It helps to compact data in latest partitions first and then older capped at the Total_IO allowed. * compact data in latest partitions first and then older capped at the Total_IO allowed.
*/ */
public class DayBasedCompactionStrategy extends BoundedIOCompactionStrategy { public class DayBasedCompactionStrategy extends BoundedIOCompactionStrategy {
// For now, use SimpleDateFormat as default partition format // For now, use SimpleDateFormat as default partition format
private static String datePartitionFormat = "yyyy/MM/dd"; private static String datePartitionFormat = "yyyy/MM/dd";
// Sorts compaction in LastInFirstCompacted order // Sorts compaction in LastInFirstCompacted order
private static Comparator<CompactionOperation> comparator = (CompactionOperation leftC, CompactionOperation rightC) -> { private static Comparator<CompactionOperation> comparator = (CompactionOperation leftC,
CompactionOperation rightC) -> {
try { try {
Date left = new SimpleDateFormat(datePartitionFormat, Locale.ENGLISH) Date left = new SimpleDateFormat(datePartitionFormat, Locale.ENGLISH)
.parse(leftC.getPartitionPath()); .parse(leftC.getPartitionPath());
@@ -55,8 +55,10 @@ public class DayBasedCompactionStrategy extends BoundedIOCompactionStrategy {
} }
@Override @Override
public List<CompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig, List<CompactionOperation> operations) { public List<CompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig,
List<CompactionOperation> operations) {
// Iterate through the operations and accept operations as long as we are within the IO limit // Iterate through the operations and accept operations as long as we are within the IO limit
return super.orderAndFilter(writeConfig, operations.stream().sorted(comparator).collect(Collectors.toList())); return super.orderAndFilter(writeConfig,
operations.stream().sorted(comparator).collect(Collectors.toList()));
} }
} }
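Aside (illustrative, not part of this commit): the comparator above sorts compactions so that the most recently created Hive-style partitions come first. The same ordering can be reproduced over plain "yyyy/MM/dd" partition paths, for example:

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Date;
import java.util.List;
import java.util.Locale;

// Illustrative sketch: newest partition first, mirroring the LastInFirstCompacted ordering.
public class PartitionPathOrderingSketch {
  private static final String FORMAT = "yyyy/MM/dd";

  static Date parse(String path) {
    try {
      return new SimpleDateFormat(FORMAT, Locale.ENGLISH).parse(path);
    } catch (ParseException e) {
      throw new IllegalArgumentException("Unexpected partition path: " + path, e);
    }
  }

  public static void main(String[] args) {
    List<String> paths = Arrays.asList("2018/03/18", "2018/03/20", "2018/03/19");
    paths.sort(Comparator.comparing(PartitionPathOrderingSketch::parse).reversed());
    System.out.println(paths); // [2018/03/20, 2018/03/19, 2018/03/18]
  }
}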


@@ -44,9 +44,9 @@ public class LogFileSizeBasedCompactionStrategy extends BoundedIOCompactionStrat
Map<String, Object> metrics = super.captureMetrics(dataFile, partitionPath, logFiles); Map<String, Object> metrics = super.captureMetrics(dataFile, partitionPath, logFiles);
// Total size of all the log files // Total size of all the log files
Long totalLogFileSize = logFiles.stream().map(HoodieLogFile::getFileSize).filter( Long totalLogFileSize = logFiles.stream().map(HoodieLogFile::getFileSize)
Optional::isPresent).map(Optional::get).reduce( .filter(Optional::isPresent).map(Optional::get).reduce((size1, size2) -> size1 + size2)
(size1, size2) -> size1 + size2).orElse(0L); .orElse(0L);
// save the metrics needed during the order // save the metrics needed during the order
metrics.put(TOTAL_LOG_FILE_SIZE, totalLogFileSize); metrics.put(TOTAL_LOG_FILE_SIZE, totalLogFileSize);
return metrics; return metrics;


@@ -36,8 +36,8 @@ import org.apache.spark.TaskContext;
* HoodieParquetWriter extends the ParquetWriter to help limit the size of underlying file. Provides * HoodieParquetWriter extends the ParquetWriter to help limit the size of underlying file. Provides
* a way to check if the current file can take more records with the <code>canWrite()</code> * a way to check if the current file can take more records with the <code>canWrite()</code>
*/ */
public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends IndexedRecord> public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends IndexedRecord> extends
extends ParquetWriter<IndexedRecord> implements HoodieStorageWriter<R> { ParquetWriter<IndexedRecord> implements HoodieStorageWriter<R> {
private static AtomicLong recordIndex = new AtomicLong(1); private static AtomicLong recordIndex = new AtomicLong(1);
@@ -49,6 +49,29 @@ public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends Indexe
private final Schema schema; private final Schema schema;
public HoodieParquetWriter(String commitTime, Path file, HoodieParquetConfig parquetConfig,
Schema schema) throws IOException {
super(HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()),
ParquetFileWriter.Mode.CREATE, parquetConfig.getWriteSupport(),
parquetConfig.getCompressionCodecName(), parquetConfig.getBlockSize(),
parquetConfig.getPageSize(), parquetConfig.getPageSize(),
ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED, ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED,
ParquetWriter.DEFAULT_WRITER_VERSION,
registerFileSystem(file, parquetConfig.getHadoopConf()));
this.file = HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf());
this.fs = (HoodieWrapperFileSystem) this.file
.getFileSystem(registerFileSystem(file, parquetConfig.getHadoopConf()));
// We cannot accurately measure the snappy compressed output file size. We are choosing a
// conservative 10%
// TODO - compute this compression ratio dynamically by looking at the bytes written to the
// stream and the actual file size reported by HDFS
this.maxFileSize = parquetConfig.getMaxFileSize() + Math
.round(parquetConfig.getMaxFileSize() * parquetConfig.getCompressionRatio());
this.writeSupport = parquetConfig.getWriteSupport();
this.commitTime = commitTime;
this.schema = schema;
}
private static Configuration registerFileSystem(Path file, Configuration conf) { private static Configuration registerFileSystem(Path file, Configuration conf) {
Configuration returnConf = new Configuration(conf); Configuration returnConf = new Configuration(conf);
String scheme = FSUtils.getFs(file.toString(), conf).getScheme(); String scheme = FSUtils.getFs(file.toString(), conf).getScheme();
@@ -57,37 +80,12 @@ public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends Indexe
return returnConf; return returnConf;
} }
public HoodieParquetWriter(String commitTime, Path file,
HoodieParquetConfig parquetConfig, Schema schema) throws IOException {
super(HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()),
ParquetFileWriter.Mode.CREATE, parquetConfig.getWriteSupport(),
parquetConfig.getCompressionCodecName(), parquetConfig.getBlockSize(),
parquetConfig.getPageSize(), parquetConfig.getPageSize(),
ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED,
ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED, ParquetWriter.DEFAULT_WRITER_VERSION,
registerFileSystem(file, parquetConfig.getHadoopConf()));
this.file =
HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf());
this.fs = (HoodieWrapperFileSystem) this.file
.getFileSystem(registerFileSystem(file, parquetConfig.getHadoopConf()));
// We cannot accurately measure the snappy compressed output file size. We are choosing a conservative 10%
// TODO - compute this compression ratio dynamically by looking at the bytes written to the stream and the actual file size reported by HDFS
this.maxFileSize = parquetConfig.getMaxFileSize() + Math
.round(parquetConfig.getMaxFileSize() * parquetConfig.getCompressionRatio());
this.writeSupport = parquetConfig.getWriteSupport();
this.commitTime = commitTime;
this.schema = schema;
}
@Override @Override
public void writeAvroWithMetadata(R avroRecord, HoodieRecord record) throws IOException { public void writeAvroWithMetadata(R avroRecord, HoodieRecord record) throws IOException {
String seqId = HoodieRecord.generateSequenceId(commitTime, TaskContext.getPartitionId(), String seqId = HoodieRecord.generateSequenceId(commitTime, TaskContext.getPartitionId(),
recordIndex.getAndIncrement()); recordIndex.getAndIncrement());
HoodieAvroUtils.addHoodieKeyToRecord((GenericRecord) avroRecord, HoodieAvroUtils.addHoodieKeyToRecord((GenericRecord) avroRecord, record.getRecordKey(),
record.getRecordKey(), record.getPartitionPath(), file.getName());
record.getPartitionPath(),
file.getName());
HoodieAvroUtils.addCommitMetadataToRecord((GenericRecord) avroRecord, commitTime, seqId); HoodieAvroUtils.addCommitMetadataToRecord((GenericRecord) avroRecord, commitTime, seqId);
super.write(avroRecord); super.write(avroRecord);
writeSupport.add(record.getRecordKey()); writeSupport.add(record.getRecordKey());
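Aside (illustrative, not part of this commit): because the size of the snappy-compressed output cannot be measured exactly while writing, the writer above pads the configured maximum by the compression ratio; with a 120 MB limit and a 0.1 ratio the effective cap becomes 132 MB. A hedged sketch of that size check:

// Illustrative sketch only, not the actual writer: the effective cap adds a compression-ratio
// cushion on top of the configured maximum, and canWrite() compares bytes written against it.
public class SizeCappedWriterSketch {
  private final long maxFileSize;
  private long bytesWritten;

  public SizeCappedWriterSketch(long configuredMaxFileSize, double compressionRatio) {
    this.maxFileSize = configuredMaxFileSize
        + Math.round(configuredMaxFileSize * compressionRatio);
  }

  public boolean canWrite() {
    return bytesWritten < maxFileSize;
  }

  public void recordBytesWritten(long count) {
    bytesWritten += count;
  }
}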


@@ -30,22 +30,22 @@ import org.apache.parquet.hadoop.metadata.CompressionCodecName;
public class HoodieStorageWriterFactory { public class HoodieStorageWriterFactory {
public static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieStorageWriter<R> getStorageWriter( public static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieStorageWriter<R>
String commitTime, Path path, HoodieTable<T> hoodieTable, HoodieWriteConfig config, getStorageWriter(String commitTime, Path path, HoodieTable<T> hoodieTable,
Schema schema) HoodieWriteConfig config, Schema schema) throws IOException {
throws IOException {
//TODO - based on the metadata choose the implementation of HoodieStorageWriter //TODO - based on the metadata choose the implementation of HoodieStorageWriter
// Currently only parquet is supported // Currently only parquet is supported
return newParquetStorageWriter(commitTime, path, config, schema, hoodieTable); return newParquetStorageWriter(commitTime, path, config, schema, hoodieTable);
} }
private static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieStorageWriter<R> newParquetStorageWriter( private static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieStorageWriter<R>
newParquetStorageWriter(
String commitTime, Path path, HoodieWriteConfig config, Schema schema, String commitTime, Path path, HoodieWriteConfig config, Schema schema,
HoodieTable hoodieTable) throws IOException { HoodieTable hoodieTable) throws IOException {
BloomFilter filter = BloomFilter filter = new BloomFilter(config.getBloomFilterNumEntries(),
new BloomFilter(config.getBloomFilterNumEntries(), config.getBloomFilterFPP()); config.getBloomFilterFPP());
HoodieAvroWriteSupport writeSupport = HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(
new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter); new AvroSchemaConverter().convert(schema), schema, filter);
HoodieParquetConfig parquetConfig = HoodieParquetConfig parquetConfig =
new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP, new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP,


@@ -59,8 +59,8 @@ import org.apache.hadoop.util.Progressable;
*/ */
public class HoodieWrapperFileSystem extends FileSystem { public class HoodieWrapperFileSystem extends FileSystem {
private static final Set<String> SUPPORT_SCHEMES;
public static final String HOODIE_SCHEME_PREFIX = "hoodie-"; public static final String HOODIE_SCHEME_PREFIX = "hoodie-";
private static final Set<String> SUPPORT_SCHEMES;
static { static {
SUPPORT_SCHEMES = new HashSet<>(); SUPPORT_SCHEMES = new HashSet<>();
@@ -69,18 +69,50 @@ public class HoodieWrapperFileSystem extends FileSystem {
SUPPORT_SCHEMES.add("s3"); SUPPORT_SCHEMES.add("s3");
SUPPORT_SCHEMES.add("s3a"); SUPPORT_SCHEMES.add("s3a");
// Hoodie currently relies on underlying object store being fully // Hoodie currently relies on underlying object store being fully
// consistent so only regional buckets should be used. // consistent so only regional buckets should be used.
SUPPORT_SCHEMES.add("gs"); SUPPORT_SCHEMES.add("gs");
SUPPORT_SCHEMES.add("viewfs"); SUPPORT_SCHEMES.add("viewfs");
} }
private ConcurrentMap<String, SizeAwareFSDataOutputStream> openStreams = private ConcurrentMap<String, SizeAwareFSDataOutputStream> openStreams = new
new ConcurrentHashMap<>(); ConcurrentHashMap<>();
private FileSystem fileSystem; private FileSystem fileSystem;
private URI uri; private URI uri;
public static Path convertToHoodiePath(Path file, Configuration conf) {
try {
String scheme = FSUtils.getFs(file.toString(), conf).getScheme();
return convertPathWithScheme(file, getHoodieScheme(scheme));
} catch (HoodieIOException e) {
throw e;
}
}
private static Path convertPathWithScheme(Path oldPath, String newScheme) {
URI oldURI = oldPath.toUri();
URI newURI;
try {
newURI = new URI(newScheme, oldURI.getUserInfo(), oldURI.getHost(), oldURI.getPort(),
oldURI.getPath(), oldURI.getQuery(), oldURI.getFragment());
return new Path(newURI);
} catch (URISyntaxException e) {
// TODO - Better Exception handling
throw new RuntimeException(e);
}
}
public static String getHoodieScheme(String scheme) {
String newScheme;
if (SUPPORT_SCHEMES.contains(scheme)) {
newScheme = HOODIE_SCHEME_PREFIX + scheme;
} else {
throw new IllegalArgumentException(
"BlockAlignedAvroParquetWriter does not support scheme " + scheme);
}
return newScheme;
}
@Override @Override
public void initialize(URI uri, Configuration conf) throws IOException { public void initialize(URI uri, Configuration conf) throws IOException {
// Get the default filesystem to decorate // Get the default filesystem to decorate
@@ -90,7 +122,8 @@ public class HoodieWrapperFileSystem extends FileSystem {
path = new Path(path.toString().replace(HOODIE_SCHEME_PREFIX, "")); path = new Path(path.toString().replace(HOODIE_SCHEME_PREFIX, ""));
} }
this.fileSystem = FSUtils.getFs(path.toString(), conf); this.fileSystem = FSUtils.getFs(path.toString(), conf);
// Do not need to explicitly initialize the default filesystem, its done already in the above FileSystem.get // Do not need to explicitly initialize the default filesystem, its done already in the above
// FileSystem.get
// fileSystem.initialize(FileSystem.getDefaultUri(conf), conf); // fileSystem.initialize(FileSystem.getDefaultUri(conf), conf);
// fileSystem.setConf(conf); // fileSystem.setConf(conf);
this.uri = uri; this.uri = uri;
@@ -108,8 +141,7 @@ public class HoodieWrapperFileSystem extends FileSystem {
@Override @Override
public FSDataOutputStream create(Path f, FsPermission permission, boolean overwrite, public FSDataOutputStream create(Path f, FsPermission permission, boolean overwrite,
int bufferSize, short replication, long blockSize, Progressable progress) int bufferSize, short replication, long blockSize, Progressable progress) throws IOException {
throws IOException {
final Path translatedPath = convertToDefaultPath(f); final Path translatedPath = convertToDefaultPath(f);
return wrapOutputStream(f, fileSystem return wrapOutputStream(f, fileSystem
.create(translatedPath, permission, overwrite, bufferSize, replication, blockSize, .create(translatedPath, permission, overwrite, bufferSize, replication, blockSize,
@@ -122,8 +154,8 @@ public class HoodieWrapperFileSystem extends FileSystem {
return fsDataOutputStream; return fsDataOutputStream;
} }
SizeAwareFSDataOutputStream os = SizeAwareFSDataOutputStream os = new SizeAwareFSDataOutputStream(fsDataOutputStream,
new SizeAwareFSDataOutputStream(fsDataOutputStream, new Runnable() { new Runnable() {
@Override @Override
public void run() { public void run() {
openStreams.remove(path.getName()); openStreams.remove(path.getName());
@@ -160,14 +192,13 @@ public class HoodieWrapperFileSystem extends FileSystem {
} }
@Override @Override
public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize) public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize) throws IOException {
throws IOException {
return fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize); return fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize);
} }
@Override @Override
public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, Progressable progress)
Progressable progress) throws IOException { throws IOException {
return fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize, progress); return fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize, progress);
} }
@@ -175,14 +206,12 @@ public class HoodieWrapperFileSystem extends FileSystem {
public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, short replication, public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, short replication,
long blockSize, Progressable progress) throws IOException { long blockSize, Progressable progress) throws IOException {
return fileSystem return fileSystem
.create(convertToDefaultPath(f), overwrite, bufferSize, replication, blockSize, .create(convertToDefaultPath(f), overwrite, bufferSize, replication, blockSize, progress);
progress);
} }
@Override @Override
public FSDataOutputStream create(Path f, FsPermission permission, EnumSet<CreateFlag> flags, public FSDataOutputStream create(Path f, FsPermission permission, EnumSet<CreateFlag> flags,
int bufferSize, short replication, long blockSize, Progressable progress) int bufferSize, short replication, long blockSize, Progressable progress) throws IOException {
throws IOException {
return fileSystem return fileSystem
.create(convertToDefaultPath(f), permission, flags, bufferSize, replication, blockSize, .create(convertToDefaultPath(f), permission, flags, bufferSize, replication, blockSize,
progress); progress);
@@ -197,7 +226,6 @@ public class HoodieWrapperFileSystem extends FileSystem {
progress, checksumOpt); progress, checksumOpt);
} }
@Override @Override
public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, short replication, public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, short replication,
long blockSize) throws IOException { long blockSize) throws IOException {
@@ -205,7 +233,6 @@ public class HoodieWrapperFileSystem extends FileSystem {
.create(convertToDefaultPath(f), overwrite, bufferSize, replication, blockSize); .create(convertToDefaultPath(f), overwrite, bufferSize, replication, blockSize);
} }
@Override @Override
public FSDataOutputStream append(Path f, int bufferSize, Progressable progress) public FSDataOutputStream append(Path f, int bufferSize, Progressable progress)
throws IOException { throws IOException {
@@ -228,13 +255,13 @@ public class HoodieWrapperFileSystem extends FileSystem {
} }
@Override @Override
public void setWorkingDirectory(Path new_dir) { public Path getWorkingDirectory() {
fileSystem.setWorkingDirectory(convertToDefaultPath(new_dir)); return convertToHoodiePath(fileSystem.getWorkingDirectory());
} }
@Override @Override
public Path getWorkingDirectory() { public void setWorkingDirectory(Path newDir) {
return convertToHoodiePath(fileSystem.getWorkingDirectory()); fileSystem.setWorkingDirectory(convertToDefaultPath(newDir));
} }
@Override @Override
@@ -290,8 +317,7 @@ public class HoodieWrapperFileSystem extends FileSystem {
} }
@Override @Override
public BlockLocation[] getFileBlockLocations(Path p, long start, long len) public BlockLocation[] getFileBlockLocations(Path p, long start, long len) throws IOException {
throws IOException {
return fileSystem.getFileBlockLocations(convertToDefaultPath(p), start, len); return fileSystem.getFileBlockLocations(convertToDefaultPath(p), start, len);
} }
@@ -319,17 +345,16 @@ public class HoodieWrapperFileSystem extends FileSystem {
public FSDataOutputStream createNonRecursive(Path f, boolean overwrite, int bufferSize, public FSDataOutputStream createNonRecursive(Path f, boolean overwrite, int bufferSize,
short replication, long blockSize, Progressable progress) throws IOException { short replication, long blockSize, Progressable progress) throws IOException {
return fileSystem return fileSystem
.createNonRecursive(convertToDefaultPath(f), overwrite, bufferSize, replication, .createNonRecursive(convertToDefaultPath(f), overwrite, bufferSize, replication, blockSize,
blockSize, progress); progress);
} }
@Override @Override
public FSDataOutputStream createNonRecursive(Path f, FsPermission permission, boolean overwrite, public FSDataOutputStream createNonRecursive(Path f, FsPermission permission, boolean overwrite,
int bufferSize, short replication, long blockSize, Progressable progress) int bufferSize, short replication, long blockSize, Progressable progress) throws IOException {
throws IOException {
return fileSystem return fileSystem
.createNonRecursive(convertToDefaultPath(f), permission, overwrite, bufferSize, .createNonRecursive(convertToDefaultPath(f), permission, overwrite, bufferSize, replication,
replication, blockSize, progress); blockSize, progress);
} }
@Override @Override
@@ -418,20 +443,17 @@ public class HoodieWrapperFileSystem extends FileSystem {
} }
@Override @Override
public FileStatus[] listStatus(Path f, PathFilter filter) public FileStatus[] listStatus(Path f, PathFilter filter) throws IOException {
throws IOException {
return fileSystem.listStatus(convertToDefaultPath(f), filter); return fileSystem.listStatus(convertToDefaultPath(f), filter);
} }
@Override @Override
public FileStatus[] listStatus(Path[] files) public FileStatus[] listStatus(Path[] files) throws IOException {
throws IOException {
return fileSystem.listStatus(convertDefaults(files)); return fileSystem.listStatus(convertDefaults(files));
} }
@Override @Override
public FileStatus[] listStatus(Path[] files, PathFilter filter) public FileStatus[] listStatus(Path[] files, PathFilter filter) throws IOException {
throws IOException {
return fileSystem.listStatus(convertDefaults(files), filter); return fileSystem.listStatus(convertDefaults(files), filter);
} }
@@ -441,20 +463,17 @@ public class HoodieWrapperFileSystem extends FileSystem {
} }
@Override @Override
public FileStatus[] globStatus(Path pathPattern, PathFilter filter) public FileStatus[] globStatus(Path pathPattern, PathFilter filter) throws IOException {
throws IOException {
return fileSystem.globStatus(convertToDefaultPath(pathPattern), filter); return fileSystem.globStatus(convertToDefaultPath(pathPattern), filter);
} }
@Override @Override
public RemoteIterator<LocatedFileStatus> listLocatedStatus(Path f) public RemoteIterator<LocatedFileStatus> listLocatedStatus(Path f) throws IOException {
throws IOException {
return fileSystem.listLocatedStatus(convertToDefaultPath(f)); return fileSystem.listLocatedStatus(convertToDefaultPath(f));
} }
@Override @Override
public RemoteIterator<LocatedFileStatus> listFiles(Path f, boolean recursive) public RemoteIterator<LocatedFileStatus> listFiles(Path f, boolean recursive) throws IOException {
throws IOException {
return fileSystem.listFiles(convertToDefaultPath(f), recursive); return fileSystem.listFiles(convertToDefaultPath(f), recursive);
} }
@@ -498,8 +517,8 @@ public class HoodieWrapperFileSystem extends FileSystem {
@Override @Override
public void copyFromLocalFile(boolean delSrc, boolean overwrite, Path src, Path dst) public void copyFromLocalFile(boolean delSrc, boolean overwrite, Path src, Path dst)
throws IOException { throws IOException {
fileSystem.copyFromLocalFile(delSrc, overwrite, convertToDefaultPath(src), fileSystem
convertToDefaultPath(dst)); .copyFromLocalFile(delSrc, overwrite, convertToDefaultPath(src), convertToDefaultPath(dst));
} }
@Override @Override
@@ -525,15 +544,13 @@ public class HoodieWrapperFileSystem extends FileSystem {
} }
@Override @Override
public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile) public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile) throws IOException {
throws IOException { return convertToHoodiePath(fileSystem
return convertToHoodiePath(fileSystem.startLocalOutput(convertToDefaultPath(fsOutputFile), .startLocalOutput(convertToDefaultPath(fsOutputFile), convertToDefaultPath(tmpLocalFile)));
convertToDefaultPath(tmpLocalFile)));
} }
@Override @Override
public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile) public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile) throws IOException {
throws IOException {
fileSystem.completeLocalOutput(convertToDefaultPath(fsOutputFile), fileSystem.completeLocalOutput(convertToDefaultPath(fsOutputFile),
convertToDefaultPath(tmpLocalFile)); convertToDefaultPath(tmpLocalFile));
} }
@@ -574,23 +591,18 @@ public class HoodieWrapperFileSystem extends FileSystem {
} }
@Override @Override
public void access(Path path, FsAction mode) public void access(Path path, FsAction mode) throws IOException {
throws IOException {
fileSystem.access(convertToDefaultPath(path), mode); fileSystem.access(convertToDefaultPath(path), mode);
} }
@Override @Override
public void createSymlink(Path target, Path link, boolean createParent) public void createSymlink(Path target, Path link, boolean createParent) throws IOException {
throws
IOException {
fileSystem fileSystem
.createSymlink(convertToDefaultPath(target), convertToDefaultPath(link), createParent); .createSymlink(convertToDefaultPath(target), convertToDefaultPath(link), createParent);
} }
@Override @Override
public FileStatus getFileLinkStatus(Path f) public FileStatus getFileLinkStatus(Path f) throws IOException {
throws
IOException {
return fileSystem.getFileLinkStatus(convertToDefaultPath(f)); return fileSystem.getFileLinkStatus(convertToDefaultPath(f));
} }
@@ -651,8 +663,7 @@ public class HoodieWrapperFileSystem extends FileSystem {
@Override @Override
public Path createSnapshot(Path path, String snapshotName) throws IOException { public Path createSnapshot(Path path, String snapshotName) throws IOException {
return convertToHoodiePath( return convertToHoodiePath(fileSystem.createSnapshot(convertToDefaultPath(path), snapshotName));
fileSystem.createSnapshot(convertToDefaultPath(path), snapshotName));
} }
@Override @Override
@@ -718,8 +729,7 @@ public class HoodieWrapperFileSystem extends FileSystem {
} }
@Override @Override
public Map<String, byte[]> getXAttrs(Path path, List<String> names) public Map<String, byte[]> getXAttrs(Path path, List<String> names) throws IOException {
throws IOException {
return fileSystem.getXAttrs(convertToDefaultPath(path), names); return fileSystem.getXAttrs(convertToDefaultPath(path), names);
} }
@@ -734,13 +744,13 @@ public class HoodieWrapperFileSystem extends FileSystem {
} }
@Override @Override
public void setConf(Configuration conf) { public Configuration getConf() {
// ignore this. we will set conf on init return fileSystem.getConf();
} }
@Override @Override
public Configuration getConf() { public void setConf(Configuration conf) {
return fileSystem.getConf(); // ignore this. we will set conf on init
} }
@Override @Override
@@ -762,15 +772,6 @@ public class HoodieWrapperFileSystem extends FileSystem {
return convertPathWithScheme(oldPath, getHoodieScheme(fileSystem.getScheme())); return convertPathWithScheme(oldPath, getHoodieScheme(fileSystem.getScheme()));
} }
public static Path convertToHoodiePath(Path file, Configuration conf) {
try {
String scheme = FSUtils.getFs(file.toString(), conf).getScheme();
return convertPathWithScheme(file, getHoodieScheme(scheme));
} catch (HoodieIOException e) {
throw e;
}
}
private Path convertToDefaultPath(Path oldPath) { private Path convertToDefaultPath(Path oldPath) {
return convertPathWithScheme(oldPath, fileSystem.getScheme()); return convertPathWithScheme(oldPath, fileSystem.getScheme());
} }
@@ -783,30 +784,6 @@ public class HoodieWrapperFileSystem extends FileSystem {
return psrcsNew; return psrcsNew;
} }
private static Path convertPathWithScheme(Path oldPath, String newScheme) {
URI oldURI = oldPath.toUri();
URI newURI;
try {
newURI = new URI(newScheme, oldURI.getUserInfo(), oldURI.getHost(), oldURI.getPort(),
oldURI.getPath(), oldURI.getQuery(), oldURI.getFragment());
return new Path(newURI);
} catch (URISyntaxException e) {
// TODO - Better Exception handling
throw new RuntimeException(e);
}
}
public static String getHoodieScheme(String scheme) {
String newScheme;
if (SUPPORT_SCHEMES.contains(scheme)) {
newScheme = HOODIE_SCHEME_PREFIX + scheme;
} else {
throw new IllegalArgumentException(
"BlockAlignedAvroParquetWriter does not support scheme " + scheme);
}
return newScheme;
}
public long getBytesWritten(Path file) { public long getBytesWritten(Path file) {
if (openStreams.containsKey(file.getName())) { if (openStreams.containsKey(file.getName())) {
return openStreams.get(file.getName()).getBytesWritten(); return openStreams.get(file.getName()).getBytesWritten();
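Aside (illustrative, not part of this commit): the wrapper file system above decorates a real scheme with a "hoodie-" prefix so its output streams can be intercepted and their written bytes tracked. The URI rewrite it performs can be sketched as follows (the exact set of supported schemes is configured elsewhere in the class, so the set below is an assumption):

import java.net.URI;
import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

// Illustrative sketch: hdfs://namenode:8020/tmp/f.parquet -> hoodie-hdfs://namenode:8020/tmp/f.parquet
public class SchemePrefixSketch {
  private static final Set<String> SUPPORTED =
      new HashSet<>(Arrays.asList("file", "hdfs", "s3", "s3a", "gs", "viewfs")); // assumed set

  public static URI toHoodieUri(URI uri) throws URISyntaxException {
    String scheme = uri.getScheme();
    if (!SUPPORTED.contains(scheme)) {
      throw new IllegalArgumentException("Unsupported scheme " + scheme);
    }
    return new URI("hoodie-" + scheme, uri.getUserInfo(), uri.getHost(), uri.getPort(),
        uri.getPath(), uri.getQuery(), uri.getFragment());
  }

  public static void main(String[] args) throws URISyntaxException {
    System.out.println(toHoodieUri(URI.create("hdfs://namenode:8020/tmp/f.parquet")));
  }
}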


@@ -30,14 +30,14 @@ import org.apache.log4j.Logger;
*/ */
public class HoodieMetrics { public class HoodieMetrics {
private HoodieWriteConfig config = null;
private String tableName = null;
private static Logger logger = LogManager.getLogger(HoodieMetrics.class); private static Logger logger = LogManager.getLogger(HoodieMetrics.class);
// Some timers // Some timers
public String rollbackTimerName = null; public String rollbackTimerName = null;
public String cleanTimerName = null; public String cleanTimerName = null;
public String commitTimerName = null; public String commitTimerName = null;
public String finalizeTimerName = null; public String finalizeTimerName = null;
private HoodieWriteConfig config = null;
private String tableName = null;
private Timer rollbackTimer = null; private Timer rollbackTimer = null;
private Timer cleanTimer = null; private Timer cleanTimer = null;
private Timer commitTimer = null; private Timer commitTimer = null;
@@ -113,8 +113,9 @@ public class HoodieMetrics {
public void updateRollbackMetrics(long durationInMs, long numFilesDeleted) { public void updateRollbackMetrics(long durationInMs, long numFilesDeleted) {
if (config.isMetricsOn()) { if (config.isMetricsOn()) {
logger.info(String.format("Sending rollback metrics (duration=%d, numFilesDeleted=$d)", logger.info(String
durationInMs, numFilesDeleted)); .format("Sending rollback metrics (duration=%d, numFilesDeleted=$d)", durationInMs,
numFilesDeleted));
registerGauge(getMetricsName("rollback", "duration"), durationInMs); registerGauge(getMetricsName("rollback", "duration"), durationInMs);
registerGauge(getMetricsName("rollback", "numFilesDeleted"), numFilesDeleted); registerGauge(getMetricsName("rollback", "numFilesDeleted"), numFilesDeleted);
} }
@@ -122,8 +123,9 @@ public class HoodieMetrics {
public void updateCleanMetrics(long durationInMs, int numFilesDeleted) { public void updateCleanMetrics(long durationInMs, int numFilesDeleted) {
if (config.isMetricsOn()) { if (config.isMetricsOn()) {
logger.info(String.format("Sending clean metrics (duration=%d, numFilesDeleted=%d)", logger.info(String
durationInMs, numFilesDeleted)); .format("Sending clean metrics (duration=%d, numFilesDeleted=%d)", durationInMs,
numFilesDeleted));
registerGauge(getMetricsName("clean", "duration"), durationInMs); registerGauge(getMetricsName("clean", "duration"), durationInMs);
registerGauge(getMetricsName("clean", "numFilesDeleted"), numFilesDeleted); registerGauge(getMetricsName("clean", "numFilesDeleted"), numFilesDeleted);
} }
@@ -131,7 +133,8 @@ public class HoodieMetrics {
public void updateFinalizeWriteMetrics(long durationInMs, int numFilesFinalized) { public void updateFinalizeWriteMetrics(long durationInMs, int numFilesFinalized) {
if (config.isMetricsOn()) { if (config.isMetricsOn()) {
logger.info(String.format("Sending finalize write metrics (duration=%d, numFilesFinalized=%d)", logger.info(String
.format("Sending finalize write metrics (duration=%d, numFilesFinalized=%d)",
durationInMs, numFilesFinalized)); durationInMs, numFilesFinalized));
registerGauge(getMetricsName("finalize", "duration"), durationInMs); registerGauge(getMetricsName("finalize", "duration"), durationInMs);
registerGauge(getMetricsName("finalize", "numFilesFinalized"), numFilesFinalized); registerGauge(getMetricsName("finalize", "numFilesFinalized"), numFilesFinalized);
@@ -140,8 +143,7 @@ public class HoodieMetrics {
@VisibleForTesting @VisibleForTesting
String getMetricsName(String action, String metric) { String getMetricsName(String action, String metric) {
return config == null ? null : return config == null ? null : String.format("%s.%s.%s", tableName, action, metric);
String.format("%s.%s.%s", tableName, action, metric);
} }
void registerGauge(String metricName, final long value) { void registerGauge(String metricName, final long value) {
@@ -154,7 +156,8 @@ public class HoodieMetrics {
} }
}); });
} catch (Exception e) { } catch (Exception e) {
// Here we catch all exception, so the major upsert pipeline will not be affected if the metrics system // Here we catch all exception, so the major upsert pipeline will not be affected if the
// metrics system
// has some issues. // has some issues.
logger.error("Failed to send metrics: ", e); logger.error("Failed to send metrics: ", e);
} }
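Aside (illustrative, not part of this commit): metric names above follow the table.action.metric pattern and gauge registration is wrapped in a catch-all so the ingestion path is never failed by the metrics system. A hedged sketch of that guard using the Dropwizard MetricRegistry:

import com.codahale.metrics.Gauge;
import com.codahale.metrics.MetricRegistry;

// Illustrative sketch: register a gauge named <table>.<action>.<metric>, swallowing metric errors.
public class GuardedGaugeSketch {
  private final MetricRegistry registry = new MetricRegistry();

  public void registerGauge(String tableName, String action, String metric, long value) {
    try {
      String name = String.format("%s.%s.%s", tableName, action, metric);
      registry.register(name, (Gauge<Long>) () -> value);
    } catch (Exception e) {
      // Log and move on; metrics must never break the upsert pipeline.
      System.err.println("Failed to send metrics: " + e);
    }
  }
}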


@@ -33,14 +33,13 @@ import org.apache.log4j.Logger;
*/ */
public class MetricsGraphiteReporter extends MetricsReporter { public class MetricsGraphiteReporter extends MetricsReporter {
private static Logger logger = LogManager.getLogger(MetricsGraphiteReporter.class);
private final MetricRegistry registry; private final MetricRegistry registry;
private final GraphiteReporter graphiteReporter; private final GraphiteReporter graphiteReporter;
private final HoodieWriteConfig config; private final HoodieWriteConfig config;
private String serverHost; private String serverHost;
private int serverPort; private int serverPort;
private static Logger logger = LogManager.getLogger(MetricsGraphiteReporter.class);
public MetricsGraphiteReporter(HoodieWriteConfig config, MetricRegistry registry) { public MetricsGraphiteReporter(HoodieWriteConfig config, MetricRegistry registry) {
this.registry = registry; this.registry = registry;
this.config = config; this.config = config;
@@ -49,8 +48,8 @@ public class MetricsGraphiteReporter extends MetricsReporter {
this.serverHost = config.getGraphiteServerHost(); this.serverHost = config.getGraphiteServerHost();
this.serverPort = config.getGraphiteServerPort(); this.serverPort = config.getGraphiteServerPort();
if (serverHost == null || serverPort == 0) { if (serverHost == null || serverPort == 0) {
throw new RuntimeException( throw new RuntimeException(String
String.format("Graphite cannot be initialized with serverHost[%s] and serverPort[%s].", .format("Graphite cannot be initialized with serverHost[%s] and serverPort[%s].",
serverHost, serverPort)); serverHost, serverPort));
} }
@@ -81,14 +80,10 @@ public class MetricsGraphiteReporter extends MetricsReporter {
} }
private GraphiteReporter createGraphiteReport() { private GraphiteReporter createGraphiteReport() {
Graphite graphite = new Graphite( Graphite graphite = new Graphite(new InetSocketAddress(serverHost, serverPort));
new InetSocketAddress(serverHost, serverPort));
String reporterPrefix = config.getGraphiteMetricPrefix(); String reporterPrefix = config.getGraphiteMetricPrefix();
return GraphiteReporter.forRegistry(registry) return GraphiteReporter.forRegistry(registry).prefixedWith(reporterPrefix)
.prefixedWith(reporterPrefix) .convertRatesTo(TimeUnit.SECONDS).convertDurationsTo(TimeUnit.MILLISECONDS)
.convertRatesTo(TimeUnit.SECONDS) .filter(MetricFilter.ALL).build(graphite);
.convertDurationsTo(TimeUnit.MILLISECONDS)
.filter(MetricFilter.ALL)
.build(graphite);
} }
} }
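Aside (illustrative, not part of this commit): the builder chain above produces a Dropwizard GraphiteReporter; in typical use the reporter is also started with a reporting period. A hedged sketch (the 30 second period is an assumed value, not taken from this code):

import com.codahale.metrics.MetricFilter;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.graphite.Graphite;
import com.codahale.metrics.graphite.GraphiteReporter;
import java.net.InetSocketAddress;
import java.util.concurrent.TimeUnit;

// Illustrative sketch: build a Graphite reporter the same way and ship metrics periodically.
public class GraphiteReporterSketch {
  public static GraphiteReporter start(MetricRegistry registry, String host, int port, String prefix) {
    Graphite graphite = new Graphite(new InetSocketAddress(host, port));
    GraphiteReporter reporter = GraphiteReporter.forRegistry(registry)
        .prefixedWith(prefix)
        .convertRatesTo(TimeUnit.SECONDS)
        .convertDurationsTo(TimeUnit.MILLISECONDS)
        .filter(MetricFilter.ALL)
        .build(graphite);
    reporter.start(30, TimeUnit.SECONDS); // assumed reporting period
    return reporter;
  }
}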


@@ -28,8 +28,7 @@ public class MetricsReporterFactory {
private static Logger logger = LogManager.getLogger(MetricsReporterFactory.class); private static Logger logger = LogManager.getLogger(MetricsReporterFactory.class);
public static MetricsReporter createReporter(HoodieWriteConfig config, public static MetricsReporter createReporter(HoodieWriteConfig config, MetricRegistry registry) {
MetricRegistry registry) {
MetricsReporterType type = config.getMetricsReporterType(); MetricsReporterType type = config.getMetricsReporterType();
MetricsReporter reporter = null; MetricsReporter reporter = null;
switch (type) { switch (type) {


@@ -21,6 +21,5 @@ package com.uber.hoodie.metrics;
* future. * future.
*/ */
public enum MetricsReporterType { public enum MetricsReporterType {
GRAPHITE, GRAPHITE, INMEMORY
INMEMORY
} }


@@ -75,23 +75,425 @@ import scala.Tuple2;
/** /**
* Implementation of a very heavily read-optimized Hoodie Table where * Implementation of a very heavily read-optimized Hoodie Table where
* * <p>
* INSERTS - Produce new files, block aligned to desired size (or) Merge with the smallest existing * INSERTS - Produce new files, block aligned to desired size (or) Merge with the smallest existing
* file, to expand it * file, to expand it
* * <p>
* UPDATES - Produce a new version of the file, just replacing the updated records with new values * UPDATES - Produce a new version of the file, just replacing the updated records with new values
*/ */
public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends HoodieTable<T> { public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends HoodieTable<T> {
private static Logger logger = LogManager.getLogger(HoodieCopyOnWriteTable.class);
public HoodieCopyOnWriteTable(HoodieWriteConfig config, HoodieTableMetaClient metaClient) { public HoodieCopyOnWriteTable(HoodieWriteConfig config, HoodieTableMetaClient metaClient) {
super(config, metaClient); super(config, metaClient);
} }
private static Logger logger = LogManager.getLogger(HoodieCopyOnWriteTable.class); private static PairFlatMapFunction<Iterator<Tuple2<String, String>>, String,
PartitionCleanStat> deleteFilesFunc(
HoodieTable table) {
return (PairFlatMapFunction<Iterator<Tuple2<String, String>>, String, PartitionCleanStat>)
iter -> {
Map<String, PartitionCleanStat> partitionCleanStatMap = new HashMap<>();
FileSystem fs = table.getMetaClient().getFs();
while (iter.hasNext()) {
Tuple2<String, String> partitionDelFileTuple = iter.next();
String partitionPath = partitionDelFileTuple._1();
String deletePathStr = partitionDelFileTuple._2();
Boolean deletedFileResult = deleteFileAndGetResult(fs, deletePathStr);
if (!partitionCleanStatMap.containsKey(partitionPath)) {
partitionCleanStatMap.put(partitionPath, new PartitionCleanStat(partitionPath));
}
PartitionCleanStat partitionCleanStat = partitionCleanStatMap.get(partitionPath);
partitionCleanStat.addDeleteFilePatterns(deletePathStr);
partitionCleanStat.addDeletedFileResult(deletePathStr, deletedFileResult);
}
return partitionCleanStatMap.entrySet().stream()
.map(e -> new Tuple2<>(e.getKey(), e.getValue()))
.collect(Collectors.toList()).iterator();
};
}
private static PairFlatMapFunction<String, String, String> getFilesToDeleteFunc(HoodieTable table,
HoodieWriteConfig config) {
return (PairFlatMapFunction<String, String, String>) partitionPathToClean -> {
HoodieCleanHelper cleaner = new HoodieCleanHelper(table, config);
return cleaner.getDeletePaths(partitionPathToClean).stream()
.map(deleteFile -> new Tuple2<>(partitionPathToClean, deleteFile.toString())).iterator();
};
}
private static Boolean deleteFileAndGetResult(FileSystem fs, String deletePathStr)
throws IOException {
Path deletePath = new Path(deletePathStr);
logger.debug("Working on delete path :" + deletePath);
boolean deleteResult = fs.delete(deletePath, false);
if (deleteResult) {
logger.debug("Cleaned file at path :" + deletePath);
}
return deleteResult;
}
@Override
public Partitioner getUpsertPartitioner(WorkloadProfile profile) {
if (profile == null) {
throw new HoodieUpsertException("Need workload profile to construct the upsert partitioner.");
}
return new UpsertPartitioner(profile);
}
@Override
public Partitioner getInsertPartitioner(WorkloadProfile profile) {
return getUpsertPartitioner(profile);
}
@Override
public boolean isWorkloadProfileNeeded() {
return true;
}
@Override
public JavaRDD<WriteStatus> compact(JavaSparkContext jsc, String commitTime) {
throw new HoodieNotSupportedException("Compaction is not supported from a CopyOnWrite table");
}
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileLoc,
Iterator<HoodieRecord<T>> recordItr) throws IOException {
// these are updates
HoodieMergeHandle upsertHandle = getUpdateHandle(commitTime, fileLoc, recordItr);
return handleUpdateInternal(upsertHandle, commitTime, fileLoc);
}
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileLoc,
Map<String, HoodieRecord<T>> keyToNewRecords) throws IOException {
// these are updates
HoodieMergeHandle upsertHandle = getUpdateHandle(commitTime, fileLoc, keyToNewRecords);
return handleUpdateInternal(upsertHandle, commitTime, fileLoc);
}
protected Iterator<List<WriteStatus>> handleUpdateInternal(HoodieMergeHandle upsertHandle,
String commitTime, String fileLoc) throws IOException {
if (upsertHandle.getOldFilePath() == null) {
throw new HoodieUpsertException(
"Error in finding the old file path at commit " + commitTime + " at fileLoc: " + fileLoc);
} else {
AvroReadSupport.setAvroReadSchema(getHadoopConf(), upsertHandle.getSchema());
ParquetReader<IndexedRecord> reader = AvroParquetReader.builder(upsertHandle.getOldFilePath())
.withConf(getHadoopConf()).build();
try {
IndexedRecord record;
while ((record = reader.read()) != null) {
// Two types of writes here (new record, and old record).
// We have already catch the exception during writing new records.
// But for old records, we should fail if any exception happens.
upsertHandle.write((GenericRecord) record);
}
} catch (IOException e) {
throw new HoodieUpsertException(
"Failed to read record from " + upsertHandle.getOldFilePath() + " with new Schema "
+ upsertHandle.getSchema(), e);
} finally {
reader.close();
upsertHandle.close();
}
}
//TODO(vc): This needs to be revisited
if (upsertHandle.getWriteStatus().getPartitionPath() == null) {
logger.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath() + ", "
+ upsertHandle.getWriteStatus());
}
return Collections.singletonList(Collections.singletonList(upsertHandle.getWriteStatus()))
.iterator();
}
protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileLoc,
Iterator<HoodieRecord<T>> recordItr) {
return new HoodieMergeHandle<>(config, commitTime, this, recordItr, fileLoc);
}
protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileLoc,
Map<String, HoodieRecord<T>> keyToNewRecords) {
return new HoodieMergeHandle<>(config, commitTime, this, keyToNewRecords, fileLoc);
}
public Iterator<List<WriteStatus>> handleInsert(String commitTime,
Iterator<HoodieRecord<T>> recordItr) throws Exception {
return new LazyInsertIterable<>(recordItr, config, commitTime, this);
}
@SuppressWarnings("unchecked")
@Override
public Iterator<List<WriteStatus>> handleUpsertPartition(String commitTime, Integer partition,
Iterator recordItr, Partitioner partitioner) {
UpsertPartitioner upsertPartitioner = (UpsertPartitioner) partitioner;
BucketInfo binfo = upsertPartitioner.getBucketInfo(partition);
BucketType btype = binfo.bucketType;
try {
if (btype.equals(BucketType.INSERT)) {
return handleInsert(commitTime, recordItr);
} else if (btype.equals(BucketType.UPDATE)) {
return handleUpdate(commitTime, binfo.fileLoc, recordItr);
} else {
throw new HoodieUpsertException(
"Unknown bucketType " + btype + " for partition :" + partition);
}
} catch (Throwable t) {
String msg = "Error upserting bucketType " + btype + " for partition :" + partition;
logger.error(msg, t);
throw new HoodieUpsertException(msg, t);
}
}
@Override
public Iterator<List<WriteStatus>> handleInsertPartition(String commitTime, Integer partition,
Iterator recordItr, Partitioner partitioner) {
return handleUpsertPartition(commitTime, partition, recordItr, partitioner);
}
/**
* Performs cleaning of partition paths according to cleaning policy and returns the number of
* files cleaned. Handles skews in partitions to clean by making files to clean as the unit of
* task distribution.
*
* @throws IllegalArgumentException if unknown cleaning policy is provided
*/
@Override
public List<HoodieCleanStat> clean(JavaSparkContext jsc) {
try {
FileSystem fs = getMetaClient().getFs();
List<String> partitionsToClean = FSUtils
.getAllPartitionPaths(fs, getMetaClient().getBasePath(),
config.shouldAssumeDatePartitioning());
logger.info("Partitions to clean up : " + partitionsToClean + ", with policy " + config
.getCleanerPolicy());
if (partitionsToClean.isEmpty()) {
logger.info("Nothing to clean here mom. It is already clean");
return Collections.emptyList();
}
return cleanPartitionPaths(partitionsToClean, jsc);
} catch (IOException e) {
throw new HoodieIOException("Failed to clean up after commit", e);
}
}
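// Aside (illustrative sketch, not part of this commit): the cleanPartitionPaths flow later in
// this class spreads file deletions across tasks and then merges per-partition results with
// reduceByKey, which only works because the per-partition stat object can be merged
// associatively, for example:
import java.util.ArrayList;
import java.util.List;

public class PartitionCleanCounterSketch {
  final String partitionPath;
  final List<String> deletedFiles = new ArrayList<>();
  final List<String> failedFiles = new ArrayList<>();

  PartitionCleanCounterSketch(String partitionPath) {
    this.partitionPath = partitionPath;
  }

  PartitionCleanCounterSketch merge(PartitionCleanCounterSketch other) {
    // Associative merge so partial results from different tasks can be combined in any order.
    PartitionCleanCounterSketch merged = new PartitionCleanCounterSketch(partitionPath);
    merged.deletedFiles.addAll(deletedFiles);
    merged.deletedFiles.addAll(other.deletedFiles);
    merged.failedFiles.addAll(failedFiles);
    merged.failedFiles.addAll(other.failedFiles);
    return merged;
  }
}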
/**
* Common method used for cleaning out parquet files under a partition path during rollback of a
* set of commits
*/
protected Map<FileStatus, Boolean> deleteCleanedFiles(String partitionPath, List<String> commits)
throws IOException {
logger.info("Cleaning path " + partitionPath);
FileSystem fs = getMetaClient().getFs();
FileStatus[] toBeDeleted = fs
.listStatus(new Path(config.getBasePath(), partitionPath), path -> {
if (!path.toString().contains(".parquet")) {
return false;
}
String fileCommitTime = FSUtils.getCommitTime(path.getName());
return commits.contains(fileCommitTime);
});
Map<FileStatus, Boolean> results = Maps.newHashMap();
for (FileStatus file : toBeDeleted) {
boolean success = fs.delete(file.getPath(), false);
results.put(file, success);
logger.info("Delete file " + file.getPath() + "\t" + success);
}
return results;
}
@Override
public List<HoodieRollbackStat> rollback(JavaSparkContext jsc, List<String> commits)
throws IOException {
String actionType = this.getCommitActionType();
HoodieActiveTimeline activeTimeline = this.getActiveTimeline();
List<String> inflights = this.getInflightCommitTimeline().getInstants()
.map(HoodieInstant::getTimestamp).collect(Collectors.toList());
// Atomically unpublish all the commits
commits.stream().filter(s -> !inflights.contains(s))
.map(s -> new HoodieInstant(false, actionType, s))
.forEach(activeTimeline::revertToInflight);
logger.info("Unpublished " + commits);
// delete all the data files for all these commits
logger.info("Clean out all parquet files generated for commits: " + commits);
List<HoodieRollbackStat> stats = jsc.parallelize(FSUtils
.getAllPartitionPaths(metaClient.getFs(), getMetaClient().getBasePath(),
config.shouldAssumeDatePartitioning()))
.map((Function<String, HoodieRollbackStat>) partitionPath -> {
// Scan all partitions files with this commit time
Map<FileStatus, Boolean> results = deleteCleanedFiles(partitionPath, commits);
return HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
.withDeletedFileResults(results).build();
}).collect();
// clean temporary data files
cleanTemporaryDataFiles(jsc);
// Remove the rolled back inflight commits
commits.stream().map(s -> new HoodieInstant(true, actionType, s))
.forEach(activeTimeline::deleteInflight);
logger.info("Deleted inflight commits " + commits);
return stats;
}
/**
* Finalize the written data files
*
* @param writeStatuses List of WriteStatus
* @return number of files finalized
*/
@Override
@SuppressWarnings("unchecked")
public Optional<Integer> finalizeWrite(JavaSparkContext jsc, List writeStatuses) {
if (!config.shouldUseTempFolderForCopyOnWrite()) {
return Optional.empty();
}
// This is to rename each data file from temporary path to its final location
List<Tuple2<String, Boolean>> results = jsc
.parallelize(writeStatuses, config.getFinalizeWriteParallelism()).map(writeStatus -> {
Tuple2<String, HoodieWriteStat> writeStatTuple2 = (Tuple2<String, HoodieWriteStat>)
writeStatus;
HoodieWriteStat writeStat = writeStatTuple2._2();
final FileSystem fs = getMetaClient().getFs();
final Path finalPath = new Path(config.getBasePath(), writeStat.getPath());
if (writeStat.getTempPath() != null) {
final Path tempPath = new Path(config.getBasePath(), writeStat.getTempPath());
boolean success;
try {
logger.info("Renaming temporary file: " + tempPath + " to " + finalPath);
success = fs.rename(tempPath, finalPath);
} catch (IOException e) {
throw new HoodieIOException(
"Failed to rename file: " + tempPath + " to " + finalPath);
}
if (!success) {
throw new HoodieIOException(
"Failed to rename file: " + tempPath + " to " + finalPath);
}
}
return new Tuple2<>(writeStat.getPath(), true);
}).collect();
// clean temporary data files
cleanTemporaryDataFiles(jsc);
return Optional.of(results.size());
}
/**
* Clean temporary data files that are produced from previous failed commit or retried spark
* stages.
*/
private void cleanTemporaryDataFiles(JavaSparkContext jsc) {
if (!config.shouldUseTempFolderForCopyOnWrite()) {
return;
}
final FileSystem fs = getMetaClient().getFs();
final Path temporaryFolder = new Path(config.getBasePath(),
HoodieTableMetaClient.TEMPFOLDER_NAME);
try {
if (!fs.exists(temporaryFolder)) {
logger.info("Temporary folder does not exist: " + temporaryFolder);
return;
}
List<FileStatus> fileStatusesList = Arrays.asList(fs.listStatus(temporaryFolder));
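// Delete the temporary files in parallel; any failed delete below fails the whole clean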
List<Tuple2<String, Boolean>> results = jsc
.parallelize(fileStatusesList, config.getFinalizeWriteParallelism()).map(fileStatus -> {
FileSystem fs1 = getMetaClient().getFs();
boolean success = fs1.delete(fileStatus.getPath(), false);
logger
.info("Deleting file in temporary folder" + fileStatus.getPath() + "\t" + success);
return new Tuple2<>(fileStatus.getPath().toString(), success);
}).collect();
for (Tuple2<String, Boolean> result : results) {
if (!result._2()) {
logger.info("Failed to delete file: " + result._1());
throw new HoodieIOException("Failed to delete file in temporary folder: " + result._1());
}
}
} catch (IOException e) {
throw new HoodieIOException(
"Failed to clean data files in temporary folder: " + temporaryFolder);
}
}
private List<HoodieCleanStat> cleanPartitionPaths(List<String> partitionsToClean,
JavaSparkContext jsc) {
int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism());
logger.info("Using cleanerParallelism: " + cleanerParallelism);
List<Tuple2<String, PartitionCleanStat>> partitionCleanStats = jsc
.parallelize(partitionsToClean, cleanerParallelism)
.flatMapToPair(getFilesToDeleteFunc(this, config))
.repartition(cleanerParallelism) // repartition to remove skews
.mapPartitionsToPair(deleteFilesFunc(this)).reduceByKey(
// merge partition level clean stats below
(Function2<PartitionCleanStat, PartitionCleanStat, PartitionCleanStat>) (e1, e2) -> e1
.merge(e2)).collect();
Map<String, PartitionCleanStat> partitionCleanStatsMap = partitionCleanStats.stream()
.collect(Collectors.toMap(e -> e._1(), e -> e._2()));
HoodieCleanHelper cleaner = new HoodieCleanHelper(this, config);
// Return PartitionCleanStat for each partition passed.
return partitionsToClean.stream().map(partitionPath -> {
PartitionCleanStat partitionCleanStat =
(partitionCleanStatsMap.containsKey(partitionPath)) ? partitionCleanStatsMap
.get(partitionPath) : new PartitionCleanStat(partitionPath);
return HoodieCleanStat.newBuilder().withPolicy(config.getCleanerPolicy())
.withPartitionPath(partitionPath)
.withEarliestCommitRetained(cleaner.getEarliestCommitToRetain())
.withDeletePathPattern(partitionCleanStat.deletePathPatterns)
.withSuccessfulDeletes(partitionCleanStat.successDeleteFiles)
.withFailedDeletes(partitionCleanStat.failedDeleteFiles).build();
}).collect(Collectors.toList());
}
enum BucketType {
UPDATE, INSERT
}
private static class PartitionCleanStat implements Serializable {
private final String partitionPath;
private final List<String> deletePathPatterns = new ArrayList<>();
private final List<String> successDeleteFiles = new ArrayList<>();
private final List<String> failedDeleteFiles = new ArrayList<>();
private PartitionCleanStat(String partitionPath) {
this.partitionPath = partitionPath;
}
private void addDeletedFileResult(String deletePathStr, Boolean deletedFileResult) {
if (deletedFileResult) {
successDeleteFiles.add(deletePathStr);
} else {
failedDeleteFiles.add(deletePathStr);
}
}
private void addDeleteFilePatterns(String deletePathStr) {
deletePathPatterns.add(deletePathStr);
}
private PartitionCleanStat merge(PartitionCleanStat other) {
if (!this.partitionPath.equals(other.partitionPath)) {
throw new RuntimeException(String
.format("partitionPath is not a match: (%s, %s)", partitionPath, other.partitionPath));
}
successDeleteFiles.addAll(other.successDeleteFiles);
deletePathPatterns.addAll(other.deletePathPatterns);
failedDeleteFiles.addAll(other.failedDeleteFiles);
return this;
}
}
/**
@@ -150,45 +552,37 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
}
}
/**
* Packs incoming records to be upserted, into buckets (1 bucket = 1 RDD partition)
*/
class UpsertPartitioner extends Partitioner {
/**
* List of all small files to be corrected
*/
List<SmallFile> smallFiles = new ArrayList<SmallFile>();
/**
* Total number of RDD partitions, is determined by total buckets we want to pack the incoming
* workload into
*/
private int totalBuckets = 0;
/**
* Stat for the current workload. Helps in determining total inserts, upserts etc.
*/
private WorkloadStat globalStat;
/**
* Helps decide which bucket an incoming update should go to.
*/
private HashMap<String, Integer> updateLocationToBucket;
/**
* Helps us pack inserts into 1 or more buckets depending on number of incoming records.
*/
private HashMap<String, List<InsertBucket>> partitionPathToInsertBuckets;
/**
* Remembers what type each bucket is for later.
*/
private HashMap<Integer, BucketInfo> bucketInfoMap;
/**
* List of all small files to be corrected
*/
List<SmallFile> smallFiles = new ArrayList<SmallFile>();
UpsertPartitioner(WorkloadProfile profile) {
updateLocationToBucket = new HashMap<>();
partitionPathToInsertBuckets = new HashMap<>();
@@ -198,16 +592,17 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
assignUpdates(profile);
assignInserts(profile);
logger.info(
"Total Buckets :" + totalBuckets + ", " + "buckets info => " + bucketInfoMap + ", \n"
+ "Partition to insert buckets => " + partitionPathToInsertBuckets + ", \n"
+ "UpdateLocations mapped to buckets =>" + updateLocationToBucket);
}
private void assignUpdates(WorkloadProfile profile) {
// each update location gets a partition
WorkloadStat gStat = profile.getGlobalStat();
for (Map.Entry<String, Pair<String, Long>> updateLocEntry : gStat.getUpdateLocationToCount()
.entrySet()) {
addUpdateBucket(updateLocEntry.getKey());
}
}
@@ -270,10 +665,10 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
}
int insertBuckets = (int) Math.max(totalUnassignedInserts / insertRecordsPerBucket, 1L);
logger.info(
"After small file assignment: unassignedInserts => " + totalUnassignedInserts
+ ", totalInsertBuckets => " + insertBuckets + ", recordsPerBucket => "
+ insertRecordsPerBucket);
for (int b = 0; b < insertBuckets; b++) {
bucketNumbers.add(totalBuckets);
recordsPerBucket.add(totalUnassignedInserts / insertBuckets);
@@ -339,8 +734,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
*/
private long averageBytesPerRecord() {
long avgSize = 0L;
HoodieTimeline commitTimeline = metaClient.getActiveTimeline().getCommitTimeline()
.filterCompletedInstants();
try {
if (!commitTimeline.empty()) {
HoodieInstant latestCommitTime = commitTimeline.lastInstant().get();
@@ -372,7 +767,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
@Override
public int getPartition(Object key) {
Tuple2<HoodieKey, Option<HoodieRecordLocation>> keyLocation = (Tuple2<HoodieKey,
Option<HoodieRecordLocation>>) key;
if (keyLocation._2().isDefined()) {
HoodieRecordLocation location = keyLocation._2().get();
return updateLocationToBucket.get(location.getFileId());
@@ -396,420 +792,4 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
}
}
}
@Override
public Partitioner getUpsertPartitioner(WorkloadProfile profile) {
if (profile == null) {
throw new HoodieUpsertException("Need workload profile to construct the upsert partitioner.");
}
return new UpsertPartitioner(profile);
}
@Override
public Partitioner getInsertPartitioner(WorkloadProfile profile) {
return getUpsertPartitioner(profile);
}
@Override
public boolean isWorkloadProfileNeeded() {
return true;
}
@Override
public JavaRDD<WriteStatus> compact(JavaSparkContext jsc, String commitTime) {
throw new HoodieNotSupportedException("Compaction is not supported from a CopyOnWrite table");
}
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileLoc,
Iterator<HoodieRecord<T>> recordItr)
throws IOException {
// these are updates
HoodieMergeHandle upsertHandle = getUpdateHandle(commitTime, fileLoc, recordItr);
return handleUpdateInternal(upsertHandle, commitTime, fileLoc);
}
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileLoc,
Map<String, HoodieRecord<T>> keyToNewRecords)
throws IOException {
// these are updates
HoodieMergeHandle upsertHandle = getUpdateHandle(commitTime, fileLoc, keyToNewRecords);
return handleUpdateInternal(upsertHandle, commitTime, fileLoc);
}
protected Iterator<List<WriteStatus>> handleUpdateInternal(HoodieMergeHandle upsertHandle, String commitTime, String fileLoc)
throws IOException {
if (upsertHandle.getOldFilePath() == null) {
throw new HoodieUpsertException("Error in finding the old file path at commit " +
commitTime + " at fileLoc: " + fileLoc);
} else {
AvroReadSupport.setAvroReadSchema(getHadoopConf(), upsertHandle.getSchema());
ParquetReader<IndexedRecord> reader =
AvroParquetReader.builder(upsertHandle.getOldFilePath()).withConf(getHadoopConf())
.build();
try {
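// Stream every record out of the old parquet file through the merge handle, which reconciles it with the incoming updates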
IndexedRecord record;
while ((record = reader.read()) != null) {
// Two types of writes here (new record, and old record).
// We have already caught the exception while writing new records.
// But for old records, we should fail if any exception happens.
upsertHandle.write((GenericRecord) record);
}
} catch (IOException e) {
throw new HoodieUpsertException(
"Failed to read record from " + upsertHandle.getOldFilePath()
+ " with new Schema " + upsertHandle.getSchema(), e);
} finally {
reader.close();
upsertHandle.close();
}
}
//TODO(vc): This needs to be revisited
if (upsertHandle.getWriteStatus().getPartitionPath() == null) {
logger.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath()
+ ", " + upsertHandle.getWriteStatus());
}
return Collections.singletonList(Collections.singletonList(upsertHandle.getWriteStatus()))
.iterator();
}
protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileLoc,
Iterator<HoodieRecord<T>> recordItr) {
return new HoodieMergeHandle<>(config, commitTime, this, recordItr, fileLoc);
}
protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileLoc,
Map<String, HoodieRecord<T>> keyToNewRecords) {
return new HoodieMergeHandle<>(config, commitTime, this, keyToNewRecords, fileLoc);
}
public Iterator<List<WriteStatus>> handleInsert(String commitTime,
Iterator<HoodieRecord<T>> recordItr) throws Exception {
return new LazyInsertIterable<>(recordItr, config, commitTime, this);
}
@SuppressWarnings("unchecked")
@Override
public Iterator<List<WriteStatus>> handleUpsertPartition(String commitTime, Integer partition,
Iterator recordItr, Partitioner partitioner) {
UpsertPartitioner upsertPartitioner = (UpsertPartitioner) partitioner;
BucketInfo binfo = upsertPartitioner.getBucketInfo(partition);
BucketType btype = binfo.bucketType;
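// Dispatch to the insert or update path based on how the partitioner classified this bucket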
try {
if (btype.equals(BucketType.INSERT)) {
return handleInsert(commitTime, recordItr);
} else if (btype.equals(BucketType.UPDATE)) {
return handleUpdate(commitTime, binfo.fileLoc, recordItr);
} else {
throw new HoodieUpsertException(
"Unknown bucketType " + btype + " for partition :" + partition);
}
} catch (Throwable t) {
String msg = "Error upserting bucketType " + btype + " for partition :" + partition;
logger.error(msg, t);
throw new HoodieUpsertException(msg, t);
}
}
@Override
public Iterator<List<WriteStatus>> handleInsertPartition(String commitTime, Integer partition,
Iterator recordItr,
Partitioner partitioner) {
return handleUpsertPartition(commitTime, partition, recordItr, partitioner);
}
/**
* Performs cleaning of partition paths according to cleaning policy and returns the number of
* files cleaned. Handles skews in partitions to clean by making files to clean as the unit of
* task distribution.
*
* @throws IllegalArgumentException if unknown cleaning policy is provided
*/
@Override
public List<HoodieCleanStat> clean(JavaSparkContext jsc) {
try {
FileSystem fs = getMetaClient().getFs();
List<String> partitionsToClean =
FSUtils.getAllPartitionPaths(fs, getMetaClient().getBasePath(),
config.shouldAssumeDatePartitioning());
logger.info("Partitions to clean up : " + partitionsToClean + ", with policy " + config
.getCleanerPolicy());
if (partitionsToClean.isEmpty()) {
logger.info("Nothing to clean here mom. It is already clean");
return Collections.emptyList();
}
return cleanPartitionPaths(partitionsToClean, jsc);
} catch (IOException e) {
throw new HoodieIOException("Failed to clean up after commit", e);
}
}
/**
* Common method used for cleaning out parquet files under a partition path during rollback of a
* set of commits
*/
protected Map<FileStatus, Boolean> deleteCleanedFiles(String partitionPath, List<String> commits)
throws IOException {
logger.info("Cleaning path " + partitionPath);
FileSystem fs = getMetaClient().getFs();
FileStatus[] toBeDeleted =
fs.listStatus(new Path(config.getBasePath(), partitionPath), path -> {
if (!path.toString().contains(".parquet")) {
return false;
}
String fileCommitTime = FSUtils.getCommitTime(path.getName());
return commits.contains(fileCommitTime);
});
Map<FileStatus, Boolean> results = Maps.newHashMap();
for (FileStatus file : toBeDeleted) {
boolean success = fs.delete(file.getPath(), false);
results.put(file, success);
logger.info("Delete file " + file.getPath() + "\t" + success);
}
return results;
}
@Override
public List<HoodieRollbackStat> rollback(JavaSparkContext jsc, List<String> commits)
throws IOException {
String actionType = this.getCommitActionType();
HoodieActiveTimeline activeTimeline = this.getActiveTimeline();
List<String> inflights = this.getInflightCommitTimeline().getInstants()
.map(HoodieInstant::getTimestamp)
.collect(Collectors.toList());
// Atomically unpublish all the commits
commits.stream().filter(s -> !inflights.contains(s))
.map(s -> new HoodieInstant(false, actionType, s))
.forEach(activeTimeline::revertToInflight);
logger.info("Unpublished " + commits);
// delete all the data files for all these commits
logger.info("Clean out all parquet files generated for commits: " + commits);
List<HoodieRollbackStat> stats = jsc.parallelize(
FSUtils.getAllPartitionPaths(metaClient.getFs(), getMetaClient().getBasePath(),
config.shouldAssumeDatePartitioning()))
.map((Function<String, HoodieRollbackStat>) partitionPath -> {
// Scan all partitions files with this commit time
Map<FileStatus, Boolean> results = deleteCleanedFiles(partitionPath, commits);
return HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
.withDeletedFileResults(results).build();
}).collect();
// clean temporary data files
cleanTemporaryDataFiles(jsc);
// Remove the rolled back inflight commits
commits.stream().map(s -> new HoodieInstant(true, actionType, s))
.forEach(activeTimeline::deleteInflight);
logger.info("Deleted inflight commits " + commits);
return stats;
}
/**
* Finalize the written data files
*
* @param writeStatuses List of WriteStatus
* @return number of files finalized
*/
@Override
@SuppressWarnings("unchecked")
public Optional<Integer> finalizeWrite(JavaSparkContext jsc, List writeStatuses) {
if (!config.shouldUseTempFolderForCopyOnWrite()) {
return Optional.empty();
}
// This is to rename each data file from temporary path to its final location
List<Tuple2<String, Boolean>> results = jsc.parallelize(writeStatuses, config.getFinalizeWriteParallelism())
.map(writeStatus -> {
Tuple2<String, HoodieWriteStat> writeStatTuple2 = (Tuple2<String, HoodieWriteStat>) writeStatus;
HoodieWriteStat writeStat = writeStatTuple2._2();
final FileSystem fs = getMetaClient().getFs();
final Path finalPath = new Path(config.getBasePath(), writeStat.getPath());
if (writeStat.getTempPath() != null) {
final Path tempPath = new Path(config.getBasePath(), writeStat.getTempPath());
boolean success;
try {
logger.info("Renaming temporary file: " + tempPath + " to " + finalPath);
success = fs.rename(tempPath, finalPath);
} catch (IOException e) {
throw new HoodieIOException("Failed to rename file: " + tempPath + " to " + finalPath);
}
if (!success) {
throw new HoodieIOException("Failed to rename file: " + tempPath + " to " + finalPath);
}
}
return new Tuple2<>(writeStat.getPath(), true);
}).collect();
// clean temporary data files
cleanTemporaryDataFiles(jsc);
return Optional.of(results.size());
}
/**
* Clean temporary data files that are produced from previous failed commit or retried spark
* stages.
*/
private void cleanTemporaryDataFiles(JavaSparkContext jsc) {
if (!config.shouldUseTempFolderForCopyOnWrite()) {
return;
}
final FileSystem fs = getMetaClient().getFs();
final Path temporaryFolder = new Path(config.getBasePath(),
HoodieTableMetaClient.TEMPFOLDER_NAME);
try {
if (!fs.exists(temporaryFolder)) {
logger.info("Temporary folder does not exist: " + temporaryFolder);
return;
}
List<FileStatus> fileStatusesList = Arrays.asList(fs.listStatus(temporaryFolder));
List<Tuple2<String, Boolean>> results = jsc
.parallelize(fileStatusesList, config.getFinalizeWriteParallelism())
.map(fileStatus -> {
FileSystem fs1 = getMetaClient().getFs();
boolean success = fs1.delete(fileStatus.getPath(), false);
logger.info("Deleting file in temporary folder" + fileStatus.getPath() + "\t"
+ success);
return new Tuple2<>(fileStatus.getPath().toString(), success);
}).collect();
for (Tuple2<String, Boolean> result : results) {
if (!result._2()) {
logger.info("Failed to delete file: " + result._1());
throw new HoodieIOException(
"Failed to delete file in temporary folder: " + result._1());
}
}
} catch (IOException e) {
throw new HoodieIOException(
"Failed to clean data files in temporary folder: " + temporaryFolder);
}
}
private static class PartitionCleanStat implements Serializable {
private final String partitionPath;
private final List<String> deletePathPatterns = new ArrayList<>();
private final List<String> successDeleteFiles = new ArrayList<>();
private final List<String> failedDeleteFiles = new ArrayList<>();
private PartitionCleanStat(String partitionPath) {
this.partitionPath = partitionPath;
}
private void addDeletedFileResult(String deletePathStr, Boolean deletedFileResult) {
if (deletedFileResult) {
successDeleteFiles.add(deletePathStr);
} else {
failedDeleteFiles.add(deletePathStr);
}
}
private void addDeleteFilePatterns(String deletePathStr) {
deletePathPatterns.add(deletePathStr);
}
private PartitionCleanStat merge(PartitionCleanStat other) {
if (!this.partitionPath.equals(other.partitionPath)) {
throw new RuntimeException(String.format(
"partitionPath is not a match: (%s, %s)",
partitionPath, other.partitionPath));
}
successDeleteFiles.addAll(other.successDeleteFiles);
deletePathPatterns.addAll(other.deletePathPatterns);
failedDeleteFiles.addAll(other.failedDeleteFiles);
return this;
}
}
private List<HoodieCleanStat> cleanPartitionPaths(List<String> partitionsToClean,
JavaSparkContext jsc) {
int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism());
logger.info("Using cleanerParallelism: " + cleanerParallelism);
List<Tuple2<String, PartitionCleanStat>> partitionCleanStats = jsc
.parallelize(partitionsToClean, cleanerParallelism)
.flatMapToPair(getFilesToDeleteFunc(this, config))
.repartition(cleanerParallelism) // repartition to remove skews
.mapPartitionsToPair(deleteFilesFunc(this))
.reduceByKey(
// merge partition level clean stats below
(Function2<PartitionCleanStat, PartitionCleanStat, PartitionCleanStat>) (e1, e2) -> e1
.merge(e2))
.collect();
Map<String, PartitionCleanStat> partitionCleanStatsMap = partitionCleanStats
.stream().collect(Collectors.toMap(e -> e._1(), e -> e._2()));
HoodieCleanHelper cleaner = new HoodieCleanHelper(this, config);
// Return PartitionCleanStat for each partition passed.
return partitionsToClean.stream().map(partitionPath -> {
PartitionCleanStat partitionCleanStat =
(partitionCleanStatsMap.containsKey(partitionPath)) ?
partitionCleanStatsMap.get(partitionPath)
: new PartitionCleanStat(partitionPath);
return HoodieCleanStat.newBuilder()
.withPolicy(config.getCleanerPolicy())
.withPartitionPath(partitionPath)
.withEarliestCommitRetained(cleaner.getEarliestCommitToRetain())
.withDeletePathPattern(partitionCleanStat.deletePathPatterns)
.withSuccessfulDeletes(partitionCleanStat.successDeleteFiles)
.withFailedDeletes(partitionCleanStat.failedDeleteFiles)
.build();
}).collect(Collectors.toList());
}
private static PairFlatMapFunction<Iterator<Tuple2<String, String>>, String, PartitionCleanStat> deleteFilesFunc(
HoodieTable table) {
return (PairFlatMapFunction<Iterator<Tuple2<String, String>>, String, PartitionCleanStat>) iter -> {
Map<String, PartitionCleanStat> partitionCleanStatMap = new HashMap<>();
FileSystem fs = table.getMetaClient().getFs();
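// Each incoming tuple pairs a partition path with one file to delete; accumulate per-partition results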
while (iter.hasNext()) {
Tuple2<String, String> partitionDelFileTuple = iter.next();
String partitionPath = partitionDelFileTuple._1();
String deletePathStr = partitionDelFileTuple._2();
Boolean deletedFileResult = deleteFileAndGetResult(fs, deletePathStr);
if (!partitionCleanStatMap.containsKey(partitionPath)) {
partitionCleanStatMap.put(partitionPath,
new PartitionCleanStat(partitionPath));
}
PartitionCleanStat partitionCleanStat = partitionCleanStatMap.get(partitionPath);
partitionCleanStat.addDeleteFilePatterns(deletePathStr);
partitionCleanStat.addDeletedFileResult(deletePathStr, deletedFileResult);
}
return partitionCleanStatMap.entrySet().stream()
.map(e -> new Tuple2<>(e.getKey(), e.getValue()))
.collect(Collectors.toList()).iterator();
};
}
private static PairFlatMapFunction<String, String, String> getFilesToDeleteFunc(
HoodieTable table, HoodieWriteConfig config) {
return (PairFlatMapFunction<String, String, String>) partitionPathToClean -> {
HoodieCleanHelper cleaner = new HoodieCleanHelper(table, config);
return cleaner.getDeletePaths(partitionPathToClean).stream()
.map(deleteFile -> new Tuple2<>(partitionPathToClean, deleteFile.toString()))
.iterator();
};
}
private static Boolean deleteFileAndGetResult(FileSystem fs, String deletePathStr)
throws IOException {
Path deletePath = new Path(deletePathStr);
logger.debug("Working on delete path :" + deletePath);
boolean deleteResult = fs.delete(deletePath, false);
if (deleteResult) {
logger.debug("Cleaned file at path :" + deletePath);
}
return deleteResult;
}
}

View File

@@ -62,19 +62,12 @@ import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
/**
* Implementation of a more real-time read-optimized Hoodie Table where <p> INSERTS - Same as
* HoodieCopyOnWriteTable - Produce new files, block aligned to desired size (or) Merge with the
* smallest existing file, to expand it </p> <p> UPDATES - Appends the changes to a rolling log file
* maintained per file Id. Compaction merges the log file into the base file. </p> <p> WARNING - MOR
* table type does not support nested rollbacks, every rollback must be followed by an attempted
* commit action </p>
*/
public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
HoodieCopyOnWriteTable<T> {
@@ -88,57 +81,6 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
super(config, metaClient);
}
/**
* UpsertPartitioner for MergeOnRead table type, this allows auto correction of small parquet
* files to larger ones without the need for an index in the logFile.
*/
class MergeOnReadUpsertPartitioner extends HoodieCopyOnWriteTable.UpsertPartitioner {
MergeOnReadUpsertPartitioner(WorkloadProfile profile) {
super(profile);
}
@Override
protected List<SmallFile> getSmallFiles(String partitionPath) {
// smallFiles only for partitionPath
List<SmallFile> smallFileLocations = new ArrayList<>();
// Init here since this class (and member variables) might not have been initialized
HoodieTimeline commitTimeline = getCompletedCommitTimeline();
if (!commitTimeline.empty()) {
HoodieInstant latestCommitTime = commitTimeline.lastInstant().get();
// find smallest file in partition and append to it
Optional<FileSlice> smallFileSlice = getRTFileSystemView()
.getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp())
.filter(fileSlice -> fileSlice.getLogFiles().count() < 1 &&
fileSlice.getDataFile().get().getFileSize() < config.getParquetSmallFileLimit())
.sorted((FileSlice left, FileSlice right) ->
left.getDataFile().get().getFileSize() < right.getDataFile().get().getFileSize() ? -1 : 1)
.findFirst();
if(smallFileSlice.isPresent()) {
String filename = smallFileSlice.get().getDataFile().get().getFileName();
SmallFile sf = new SmallFile();
sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename),
FSUtils.getFileId(filename));
sf.sizeBytes = smallFileSlice.get().getDataFile().get().getFileSize();
smallFileLocations.add(sf);
// Update the global small files list
smallFiles.add(sf);
}
}
return smallFileLocations;
}
public List<String> getSmallFileIds() {
return (List<String>) smallFiles.stream().map(smallFile -> ((SmallFile) smallFile).location.getFileId())
.collect(Collectors.toList());
}
}
@Override
public Partitioner getUpsertPartitioner(WorkloadProfile profile) {
if (profile == null) {
@@ -154,11 +96,12 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
logger.info("Merging updates for commit " + commitTime + " for file " + fileId); logger.info("Merging updates for commit " + commitTime + " for file " + fileId);
if (mergeOnReadUpsertPartitioner.getSmallFileIds().contains(fileId)) { if (mergeOnReadUpsertPartitioner.getSmallFileIds().contains(fileId)) {
logger.info("Small file corrections for updates for commit " + commitTime + " for file " + fileId); logger.info(
"Small file corrections for updates for commit " + commitTime + " for file " + fileId);
return super.handleUpdate(commitTime, fileId, recordItr); return super.handleUpdate(commitTime, fileId, recordItr);
} else { } else {
HoodieAppendHandle<T> appendHandle = HoodieAppendHandle<T> appendHandle = new HoodieAppendHandle<>(config, commitTime, this,
new HoodieAppendHandle<>(config, commitTime, this, fileId, recordItr); fileId, recordItr);
appendHandle.doAppend(); appendHandle.doAppend();
appendHandle.close(); appendHandle.close();
return Collections.singletonList(Collections.singletonList(appendHandle.getWriteStatus())) return Collections.singletonList(Collections.singletonList(appendHandle.getWriteStatus()))
@@ -202,11 +145,9 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
if (commits.size() > 1) {
throw new UnsupportedOperationException("Nested Rollbacks are not supported");
}
Map<String, HoodieInstant> commitsAndCompactions = this.getActiveTimeline()
.getTimelineOfActions(Sets.newHashSet(HoodieActiveTimeline.COMMIT_ACTION,
HoodieActiveTimeline.DELTA_COMMIT_ACTION)).getInstants()
.filter(i -> commits.contains(i.getTimestamp()))
.collect(Collectors.toMap(i -> i.getTimestamp(), i -> i));
@@ -218,9 +159,9 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
Long startTime = System.currentTimeMillis();
List<HoodieRollbackStat> allRollbackStats = jsc.parallelize(FSUtils
.getAllPartitionPaths(this.metaClient.getFs(), this.getMetaClient().getBasePath(),
config.shouldAssumeDatePartitioning()))
.map((Function<String, List<HoodieRollbackStat>>) partitionPath -> {
return commits.stream().map(commit -> {
HoodieInstant instant = commitsAndCompactions.get(commit);
@@ -228,23 +169,27 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
switch (instant.getAction()) {
case HoodieTimeline.COMMIT_ACTION:
try {
Map<FileStatus, Boolean> results = super
.deleteCleanedFiles(partitionPath, Arrays.asList(commit));
hoodieRollbackStats = HoodieRollbackStat.newBuilder()
.withPartitionPath(partitionPath).withDeletedFileResults(results).build();
break;
} catch (IOException io) {
throw new UncheckedIOException("Failed to rollback for commit " + commit, io);
}
case HoodieTimeline.DELTA_COMMIT_ACTION:
try {
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(
this.getCommitTimeline().getInstantDetails(
new HoodieInstant(true, instant.getAction(), instant.getTimestamp()))
.get());
// read commit file and (either append delete blocks or delete file)
Map<FileStatus, Boolean> filesToDeletedStatus = new HashMap<>();
Map<FileStatus, Long> filesToNumBlocksRollback = new HashMap<>();
// we do not know fileIds for inserts (first inserts are parquet files), delete
// all parquet files for the corresponding failed commit, if present (same as COW)
filesToDeletedStatus = super
.deleteCleanedFiles(partitionPath, Arrays.asList(commit));
@@ -252,32 +197,35 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
if (commitMetadata.getPartitionToWriteStats().containsKey(partitionPath)) {
commitMetadata.getPartitionToWriteStats().get(partitionPath).stream()
.filter(wStat -> {
return wStat != null
&& wStat.getPrevCommit() != HoodieWriteStat.NULL_COMMIT
&& wStat.getPrevCommit() != null;
}).forEach(wStat -> {
HoodieLogFormat.Writer writer = null;
try {
writer = HoodieLogFormat.newWriterBuilder().onParentPath(
new Path(this.getMetaClient().getBasePath(), partitionPath))
.withFileId(wStat.getFileId()).overBaseCommit(wStat.getPrevCommit())
.withFs(this.metaClient.getFs())
.withFileExtension(HoodieLogFile.DELTA_EXTENSION).build();
Long numRollbackBlocks = 0L;
// generate metadata
Map<HoodieLogBlock.HeaderMetadataType, String> header =
Maps.newHashMap();
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME,
metaClient.getActiveTimeline().lastInstant().get().getTimestamp());
header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME,
commit);
header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE, String
.valueOf(
HoodieCommandBlock.HoodieCommandBlockTypeEnum
.ROLLBACK_PREVIOUS_BLOCK
.ordinal()));
// if update belongs to an existing log file
writer = writer.appendBlock(new HoodieCommandBlock(header));
numRollbackBlocks++;
filesToNumBlocksRollback.put(this.getMetaClient().getFs()
.getFileStatus(writer.getLogFile().getPath()), numRollbackBlocks);
} catch (IOException | InterruptedException io) {
throw new HoodieRollbackException(
"Failed to rollback for commit " + commit, io);
@@ -289,7 +237,8 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
}
}
});
hoodieRollbackStats = HoodieRollbackStat.newBuilder()
.withPartitionPath(partitionPath)
.withDeletedFileResults(filesToDeletedStatus)
.withRollbackBlockAppendResults(filesToNumBlocksRollback).build();
}
@@ -297,17 +246,19 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
} catch (IOException io) {
throw new UncheckedIOException("Failed to rollback for commit " + commit, io);
}
default:
break;
}
return hoodieRollbackStats;
}).collect(Collectors.toList());
}).flatMap(x -> x.iterator()).filter(x -> x != null).collect();
commitsAndCompactions.entrySet().stream().map(
entry -> new HoodieInstant(true, entry.getValue().getAction(),
entry.getValue().getTimestamp())).forEach(this.getActiveTimeline()::deleteInflight);
logger
.debug("Time(in ms) taken to finish rollback " + (System.currentTimeMillis() - startTime));
return allRollbackStats;
}
@@ -317,4 +268,56 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
// do nothing for MOR tables
return Optional.empty();
}
/**
* UpsertPartitioner for MergeOnRead table type, this allows auto correction of small parquet
* files to larger ones without the need for an index in the logFile.
*/
class MergeOnReadUpsertPartitioner extends HoodieCopyOnWriteTable.UpsertPartitioner {
MergeOnReadUpsertPartitioner(WorkloadProfile profile) {
super(profile);
}
@Override
protected List<SmallFile> getSmallFiles(String partitionPath) {
// smallFiles only for partitionPath
List<SmallFile> smallFileLocations = new ArrayList<>();
// Init here since this class (and member variables) might not have been initialized
HoodieTimeline commitTimeline = getCompletedCommitTimeline();
if (!commitTimeline.empty()) {
HoodieInstant latestCommitTime = commitTimeline.lastInstant().get();
// find smallest file in partition and append to it
Optional<FileSlice> smallFileSlice = getRTFileSystemView()
.getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp()).filter(
fileSlice -> fileSlice.getLogFiles().count() < 1
&& fileSlice.getDataFile().get().getFileSize() < config
.getParquetSmallFileLimit()).sorted((FileSlice left, FileSlice right) ->
left.getDataFile().get().getFileSize() < right.getDataFile().get().getFileSize()
? -1 : 1).findFirst();
if (smallFileSlice.isPresent()) {
String filename = smallFileSlice.get().getDataFile().get().getFileName();
SmallFile sf = new SmallFile();
sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename),
FSUtils.getFileId(filename));
sf.sizeBytes = smallFileSlice.get().getDataFile().get().getFileSize();
smallFileLocations.add(sf);
// Update the global small files list
smallFiles.add(sf);
}
}
return smallFileLocations;
}
public List<String> getSmallFileIds() {
return (List<String>) smallFiles.stream()
.map(smallFile -> ((SmallFile) smallFile).location.getFileId())
.collect(Collectors.toList());
}
}
}

View File

@@ -60,18 +60,28 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
this.metaClient = metaClient;
}
public static <T extends HoodieRecordPayload> HoodieTable<T> getHoodieTable(
HoodieTableMetaClient metaClient, HoodieWriteConfig config) {
switch (metaClient.getTableType()) {
case COPY_ON_WRITE:
return new HoodieCopyOnWriteTable<>(config, metaClient);
case MERGE_ON_READ:
return new HoodieMergeOnReadTable<>(config, metaClient);
default:
throw new HoodieException("Unsupported table type :" + metaClient.getTableType());
}
}
/**
* Provides a partitioner to perform the upsert operation, based on the workload profile
*/
public abstract Partitioner getUpsertPartitioner(WorkloadProfile profile);
/**
* Provides a partitioner to perform the insert operation, based on the workload profile
*/
public abstract Partitioner getInsertPartitioner(WorkloadProfile profile);
/**
* Return whether this HoodieTable implementation can benefit from workload profiling
*/
@@ -131,7 +141,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
return getCommitsTimeline().filterInflights();
}
/**
* Get only the completed (no-inflights) clean timeline
*/
@@ -162,12 +171,12 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
throw new HoodieSavepointException(
"Could not get data files for savepoint " + savepointTime + ". No such savepoint.");
}
HoodieInstant instant = new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION,
savepointTime);
HoodieSavepointMetadata metadata = null;
try {
metadata = AvroUtils
.deserializeHoodieSavepointMetadata(getActiveTimeline().getInstantDetails(instant).get());
} catch (IOException e) {
throw new HoodieSavepointException(
"Could not get savepointed data files for savepoint " + savepointTime, e);
@@ -189,7 +198,8 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
return getActiveTimeline().getCommitTimeline();
case MERGE_ON_READ:
// We need to include the parquet files written out in delta commits
// Include commit action to be able to start doing a MOR over a COW dataset - no
// migration required
return getActiveTimeline().getCommitsTimeline();
default:
throw new HoodieException("Unsupported table type :" + metaClient.getTableType());
@@ -219,10 +229,11 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
return HoodieActiveTimeline.COMMIT_ACTION;
case MERGE_ON_READ:
return HoodieActiveTimeline.DELTA_COMMIT_ACTION;
default:
throw new HoodieCommitException(
"Could not commit on unknown storage type " + metaClient.getTableType());
}
}
/**
* Perform the ultimate IO for a given upserted (RDD) partition
@@ -236,21 +247,9 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
public abstract Iterator<List<WriteStatus>> handleInsertPartition(String commitTime,
Integer partition, Iterator<HoodieRecord<T>> recordIterator, Partitioner partitioner);
public static <T extends HoodieRecordPayload> HoodieTable<T> getHoodieTable(
HoodieTableMetaClient metaClient, HoodieWriteConfig config) {
switch (metaClient.getTableType()) {
case COPY_ON_WRITE:
return new HoodieCopyOnWriteTable<>(config, metaClient);
case MERGE_ON_READ:
return new HoodieMergeOnReadTable<>(config, metaClient);
default:
throw new HoodieException("Unsupported table type :" + metaClient.getTableType());
}
}
/**
* Run Compaction on the table. Compaction arranges the data so that it is optimized for data
* access
*/
public abstract JavaRDD<WriteStatus> compact(JavaSparkContext jsc, String commitTime);

View File

@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.table;
import com.uber.hoodie.common.model.HoodieRecord;

View File

@@ -16,7 +16,6 @@
package com.uber.hoodie.table;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordLocation;
import com.uber.hoodie.common.model.HoodieRecordPayload;
@@ -31,7 +30,7 @@ import scala.Tuple2;
/**
* Information about incoming records for upsert/insert obtained either via sampling or
* introspecting the data fully
* <p>
* TODO(vc): Think about obtaining this directly from index.tagLocation
*/
public class WorkloadProfile<T extends HoodieRecordPayload> implements Serializable {
@@ -60,11 +59,9 @@ public class WorkloadProfile<T extends HoodieRecordPayload> implements Serializa
private void buildProfile() {
Map<Tuple2<String, Option<HoodieRecordLocation>>, Long> partitionLocationCounts = taggedRecords
.mapToPair(record -> new Tuple2<>(
new Tuple2<>(record.getPartitionPath(), Option.apply(record.getCurrentLocation())),
record)).countByKey();
for (Map.Entry<Tuple2<String, Option<HoodieRecordLocation>>, Long> e : partitionLocationCounts
.entrySet()) {

View File

@@ -17,10 +17,9 @@
package com.uber.hoodie.table;
import com.uber.hoodie.common.model.HoodieRecordLocation;
import java.io.Serializable;
import java.util.HashMap;
import org.apache.commons.lang3.tuple.Pair;
/**
* Wraps stats about a single partition path.

View File

@@ -14,7 +14,6 @@
* limitations under the License.
*/
import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import com.uber.hoodie.HoodieWriteClient;
@@ -38,24 +37,19 @@ import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
/**
* Driver program that uses the Hoodie client with synthetic workload, and performs basic operations. <p>
*/
public class HoodieClientExample {
private static Logger logger = LogManager.getLogger(HoodieClientExample.class);
@Parameter(names = {"--help", "-h"}, help = true)
public Boolean help = false;
@Parameter(names = {"--table-path", "-p"}, description = "path for Hoodie sample table")
private String tablePath = "file:///tmp/hoodie/sample-table";
@Parameter(names = {"--table-name", "-n"}, description = "table name for Hoodie sample table")
private String tableName = "hoodie_rt";
@Parameter(names = {"--table-type", "-t"}, description = "One of COPY_ON_WRITE or MERGE_ON_READ")
private String tableType = HoodieTableType.COPY_ON_WRITE.name();
public static void main(String[] args) throws Exception {
HoodieClientExample cli = new HoodieClientExample();
@@ -92,10 +86,10 @@ public class HoodieClientExample {
// Create the write client to write some records in
HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(tablePath)
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
.forTable(tableName)
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(IndexType.BLOOM).build())
.withCompactionConfig(
HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 3).build()).build();
HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
/**

View File

@@ -16,7 +16,6 @@
package com.uber.hoodie;
import static org.junit.Assert.assertEquals;
import com.uber.hoodie.common.HoodieClientTestUtils;
@@ -58,11 +57,11 @@ public class TestMultiFS implements Serializable {
private static MiniDFSCluster dfsCluster;
private static DistributedFileSystem dfs;
private static Logger logger = LogManager.getLogger(TestMultiFS.class);
private static JavaSparkContext jsc;
private static SQLContext sqlContext;
private String tablePath = "file:///tmp/hoodie/sample-table";
private String tableName = "hoodie_rt";
private String tableType = HoodieTableType.COPY_ON_WRITE.name();
@BeforeClass
public static void initClass() throws Exception {
@@ -92,7 +91,8 @@ public class TestMultiFS implements Serializable {
hdfsTestService.stop();
dfsCluster.shutdown();
}
// Need to closeAll to clear FileSystem.Cache, required because DFS and LocalFS used in the
// same JVM
FileSystem.closeAll();
}
@@ -111,8 +111,7 @@ public class TestMultiFS implements Serializable {
HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(dfsBasePath)
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
.forTable(tableName).withIndexConfig(
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build();
HoodieWriteClient hdfsWriteClient = new HoodieWriteClient(jsc, cfg);
// Write generated data to hdfs (only inserts)
@@ -125,10 +124,8 @@ public class TestMultiFS implements Serializable {
// Read from hdfs
FileSystem fs = FSUtils.getFs(dfsBasePath, HoodieTestUtils.getDefaultHadoopConf());
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), dfsBasePath);
HoodieTimeline timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
Dataset<Row> readRecords = HoodieClientTestUtils.readCommit(dfsBasePath, sqlContext, timeline, readCommitTime);
assertEquals("Should contain 100 records", readRecords.count(), records.size());
// Write to local
@@ -138,8 +135,7 @@ public class TestMultiFS implements Serializable {
HoodieWriteConfig localConfig = HoodieWriteConfig.newBuilder().withPath(tablePath) HoodieWriteConfig localConfig = HoodieWriteConfig.newBuilder().withPath(tablePath)
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
.forTable(tableName).withIndexConfig( .forTable(tableName).withIndexConfig(
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build();
.build();
HoodieWriteClient localWriteClient = new HoodieWriteClient(jsc, localConfig); HoodieWriteClient localWriteClient = new HoodieWriteClient(jsc, localConfig);
String writeCommitTime = localWriteClient.startCommit(); String writeCommitTime = localWriteClient.startCommit();
@@ -153,8 +149,7 @@ public class TestMultiFS implements Serializable {
fs = FSUtils.getFs(tablePath, HoodieTestUtils.getDefaultHadoopConf()); fs = FSUtils.getFs(tablePath, HoodieTestUtils.getDefaultHadoopConf());
metaClient = new HoodieTableMetaClient(fs.getConf(), tablePath); metaClient = new HoodieTableMetaClient(fs.getConf(), tablePath);
timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline(); timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
Dataset<Row> localReadRecords = HoodieClientTestUtils Dataset<Row> localReadRecords = HoodieClientTestUtils.readCommit(tablePath, sqlContext, timeline, writeCommitTime);
.readCommit(tablePath, sqlContext, timeline, writeCommitTime);
assertEquals("Should contain 100 records", localReadRecords.count(), localRecords.size()); assertEquals("Should contain 100 records", localReadRecords.count(), localRecords.size());
} }
} }
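For context on the teardown above: Hadoop caches FileSystem instances per scheme and authority, so when DFS and LocalFS are exercised in the same JVM a stale cached handle can leak across tests. A minimal sketch of that caching behaviour, using only the stock Hadoop API (illustrative, not Hudi code):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class FsCacheSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem a = new Path("file:///tmp/hoodie").getFileSystem(conf);
    FileSystem b = new Path("file:///tmp/other").getFileSystem(conf);
    System.out.println(a == b); // true: instances are cached per scheme/authority
    FileSystem.closeAll();      // drops every cached instance so the next test starts clean
  }
}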

View File

@@ -69,8 +69,7 @@ public class HoodieClientTestUtils {
return keys; return keys;
} }
private static void fakeMetaFile(String basePath, String commitTime, String suffix) private static void fakeMetaFile(String basePath, String commitTime, String suffix) throws IOException {
throws IOException {
String parentPath = basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME; String parentPath = basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME;
new File(parentPath).mkdirs(); new File(parentPath).mkdirs();
new File(parentPath + "/" + commitTime + suffix).createNewFile(); new File(parentPath + "/" + commitTime + suffix).createNewFile();
@@ -85,55 +84,48 @@ public class HoodieClientTestUtils {
fakeMetaFile(basePath, commitTime, HoodieTimeline.INFLIGHT_EXTENSION); fakeMetaFile(basePath, commitTime, HoodieTimeline.INFLIGHT_EXTENSION);
} }
public static void fakeDataFile(String basePath, String partitionPath, String commitTime, public static void fakeDataFile(String basePath, String partitionPath, String commitTime, String fileId)
String fileId) throws Exception { throws Exception {
fakeDataFile(basePath, partitionPath, commitTime, fileId, 0); fakeDataFile(basePath, partitionPath, commitTime, fileId, 0);
} }
public static void fakeDataFile(String basePath, String partitionPath, String commitTime, public static void fakeDataFile(String basePath, String partitionPath, String commitTime, String fileId, long length)
String fileId, long length) throws Exception { throws Exception {
String parentPath = String.format("%s/%s", basePath, partitionPath); String parentPath = String.format("%s/%s", basePath, partitionPath);
new File(parentPath).mkdirs(); new File(parentPath).mkdirs();
String path = String String path = String.format("%s/%s", parentPath, FSUtils.makeDataFileName(commitTime, 0, fileId));
.format("%s/%s", parentPath, FSUtils.makeDataFileName(commitTime, 0, fileId));
new File(path).createNewFile(); new File(path).createNewFile();
new RandomAccessFile(path, "rw").setLength(length); new RandomAccessFile(path, "rw").setLength(length);
} }
public static SparkConf getSparkConfForTest(String appName) { public static SparkConf getSparkConfForTest(String appName) {
SparkConf sparkConf = new SparkConf() SparkConf sparkConf = new SparkConf().setAppName(appName)
.setAppName(appName) .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .setMaster("local[1]");
.setMaster("local[1]");
return HoodieReadClient.addHoodieSupport(sparkConf); return HoodieReadClient.addHoodieSupport(sparkConf);
} }
public static HashMap<String, String> getLatestFileIDsToFullPath(String basePath, public static HashMap<String, String> getLatestFileIDsToFullPath(String basePath, HoodieTimeline commitTimeline,
HoodieTimeline commitTimeline,
List<HoodieInstant> commitsToReturn) throws IOException { List<HoodieInstant> commitsToReturn) throws IOException {
HashMap<String, String> fileIdToFullPath = new HashMap<>(); HashMap<String, String> fileIdToFullPath = new HashMap<>();
for (HoodieInstant commit : commitsToReturn) { for (HoodieInstant commit : commitsToReturn) {
HoodieCommitMetadata metadata = HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes(commitTimeline.getInstantDetails(commit).get());
HoodieCommitMetadata.fromBytes(commitTimeline.getInstantDetails(commit).get());
fileIdToFullPath.putAll(metadata.getFileIdAndFullPaths(basePath)); fileIdToFullPath.putAll(metadata.getFileIdAndFullPaths(basePath));
} }
return fileIdToFullPath; return fileIdToFullPath;
} }
public static Dataset<Row> readCommit(String basePath, public static Dataset<Row> readCommit(String basePath, SQLContext sqlContext, HoodieTimeline commitTimeline,
SQLContext sqlContext,
HoodieTimeline commitTimeline,
String commitTime) { String commitTime) {
HoodieInstant commitInstant = HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
if (!commitTimeline.containsInstant(commitInstant)) { if (!commitTimeline.containsInstant(commitInstant)) {
throw new HoodieException("No commit exists at " + commitTime); throw new HoodieException("No commit exists at " + commitTime);
} }
try { try {
HashMap<String, String> paths = getLatestFileIDsToFullPath(basePath, commitTimeline, HashMap<String, String> paths = getLatestFileIDsToFullPath(basePath, commitTimeline,
Arrays.asList(commitInstant)); Arrays.asList(commitInstant));
return sqlContext.read() return sqlContext.read().parquet(paths.values().toArray(new String[paths.size()]))
.parquet(paths.values().toArray(new String[paths.size()]))
.filter(String.format("%s ='%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitTime)); .filter(String.format("%s ='%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitTime));
} catch (Exception e) { } catch (Exception e) {
throw new HoodieException("Error reading commit " + commitTime, e); throw new HoodieException("Error reading commit " + commitTime, e);
@@ -143,50 +135,37 @@ public class HoodieClientTestUtils {
/** /**
* Obtain all new data written into the Hoodie dataset since the given timestamp. * Obtain all new data written into the Hoodie dataset since the given timestamp.
*/ */
public static Dataset<Row> readSince(String basePath, public static Dataset<Row> readSince(String basePath, SQLContext sqlContext, HoodieTimeline commitTimeline,
SQLContext sqlContext,
HoodieTimeline commitTimeline,
String lastCommitTime) { String lastCommitTime) {
List<HoodieInstant> commitsToReturn = List<HoodieInstant> commitsToReturn = commitTimeline.findInstantsAfter(lastCommitTime, Integer.MAX_VALUE)
commitTimeline.findInstantsAfter(lastCommitTime, Integer.MAX_VALUE)
.getInstants().collect(Collectors.toList()); .getInstants().collect(Collectors.toList());
try { try {
// Go over the commit metadata, and obtain the new files that need to be read. // Go over the commit metadata, and obtain the new files that need to be read.
HashMap<String, String> fileIdToFullPath = getLatestFileIDsToFullPath(basePath, HashMap<String, String> fileIdToFullPath = getLatestFileIDsToFullPath(basePath, commitTimeline, commitsToReturn);
commitTimeline, commitsToReturn); return sqlContext.read().parquet(fileIdToFullPath.values().toArray(new String[fileIdToFullPath.size()]))
return sqlContext.read() .filter(String.format("%s >'%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTime));
.parquet(fileIdToFullPath.values().toArray(new String[fileIdToFullPath.size()]))
.filter(
String.format("%s >'%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTime));
} catch (IOException e) { } catch (IOException e) {
throw new HoodieException( throw new HoodieException("Error pulling data incrementally from commitTimestamp :" + lastCommitTime, e);
"Error pulling data incrementally from commitTimestamp :" + lastCommitTime, e);
} }
} }
/** /**
* Reads the paths under a hoodie dataset out as a DataFrame * Reads the paths under a hoodie dataset out as a DataFrame
*/ */
public static Dataset<Row> read(String basePath, public static Dataset<Row> read(String basePath, SQLContext sqlContext, FileSystem fs, String... paths) {
SQLContext sqlContext,
FileSystem fs,
String... paths) {
List<String> filteredPaths = new ArrayList<>(); List<String> filteredPaths = new ArrayList<>();
try { try {
HoodieTable hoodieTable = HoodieTable HoodieTable hoodieTable = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(fs.getConf(), basePath, true), null); .getHoodieTable(new HoodieTableMetaClient(fs.getConf(), basePath, true), null);
for (String path : paths) { for (String path : paths) {
TableFileSystemView.ReadOptimizedView fileSystemView = new HoodieTableFileSystemView( TableFileSystemView.ReadOptimizedView fileSystemView = new HoodieTableFileSystemView(
hoodieTable.getMetaClient(), hoodieTable.getMetaClient(), hoodieTable.getCompletedCommitTimeline(), fs.globStatus(new Path(path)));
hoodieTable.getCompletedCommitTimeline(), fs.globStatus(new Path(path))); List<HoodieDataFile> latestFiles = fileSystemView.getLatestDataFiles().collect(Collectors.toList());
List<HoodieDataFile> latestFiles = fileSystemView.getLatestDataFiles().collect(
Collectors.toList());
for (HoodieDataFile file : latestFiles) { for (HoodieDataFile file : latestFiles) {
filteredPaths.add(file.getPath()); filteredPaths.add(file.getPath());
} }
} }
return sqlContext.read() return sqlContext.read().parquet(filteredPaths.toArray(new String[filteredPaths.size()]));
.parquet(filteredPaths.toArray(new String[filteredPaths.size()]));
} catch (Exception e) { } catch (Exception e) {
throw new HoodieException("Error reading hoodie dataset as a dataframe", e); throw new HoodieException("Error reading hoodie dataset as a dataframe", e);
} }
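A hedged sketch of how a test might combine the helpers above to read back one commit and then pull incrementally; the calls follow the signatures shown here, while sqlContext, fs, basePath and the commit time "001" are assumed to come from the surrounding test harness:

HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), basePath);
HoodieTimeline commits = new HoodieActiveTimeline(metaClient).getCommitTimeline();
// All records written by commit "001"
Dataset<Row> firstCommit = HoodieClientTestUtils.readCommit(basePath, sqlContext, commits, "001");
// Everything written after commit "001"
Dataset<Row> sinceFirst = HoodieClientTestUtils.readSince(basePath, sqlContext, commits, "001");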

View File

@@ -42,8 +42,7 @@ import org.apache.hadoop.mapred.RecordReader;
*/ */
public class HoodieMergeOnReadTestUtils { public class HoodieMergeOnReadTestUtils {
public static List<GenericRecord> getRecordsUsingInputFormat(List<String> inputPaths, public static List<GenericRecord> getRecordsUsingInputFormat(List<String> inputPaths, String basePath)
String basePath)
throws IOException { throws IOException {
JobConf jobConf = new JobConf(); JobConf jobConf = new JobConf();
Schema schema = HoodieAvroUtils.addMetadataFields(Schema.parse(TRIP_EXAMPLE_SCHEMA)); Schema schema = HoodieAvroUtils.addMetadataFields(Schema.parse(TRIP_EXAMPLE_SCHEMA));
@@ -59,7 +58,8 @@ public class HoodieMergeOnReadTestUtils {
ArrayWritable writable = (ArrayWritable) recordReader.createValue(); ArrayWritable writable = (ArrayWritable) recordReader.createValue();
while (recordReader.next(key, writable)) { while (recordReader.next(key, writable)) {
GenericRecordBuilder newRecord = new GenericRecordBuilder(schema); GenericRecordBuilder newRecord = new GenericRecordBuilder(schema);
// writable returns an array with [field1, field2, _hoodie_commit_time, _hoodie_commit_seqno] // writable returns an array with [field1, field2, _hoodie_commit_time,
// _hoodie_commit_seqno]
Writable[] values = writable.get(); Writable[] values = writable.get();
schema.getFields().forEach(field -> { schema.getFields().forEach(field -> {
newRecord.set(field, values[2]); newRecord.set(field, values[2]);
@@ -76,12 +76,11 @@ public class HoodieMergeOnReadTestUtils {
}).get(); }).get();
} }
private static void setPropsForInputFormat(HoodieRealtimeInputFormat inputFormat, JobConf jobConf, private static void setPropsForInputFormat(HoodieRealtimeInputFormat inputFormat, JobConf jobConf, Schema schema,
Schema schema, String basePath) { String basePath) {
List<Schema.Field> fields = schema.getFields(); List<Schema.Field> fields = schema.getFields();
String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(",")); String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
String postions = fields.stream().map(f -> String.valueOf(f.pos())) String postions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
.collect(Collectors.joining(","));
Configuration conf = HoodieTestUtils.getDefaultHadoopConf(); Configuration conf = HoodieTestUtils.getDefaultHadoopConf();
jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names); jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions); jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);

View File

@@ -41,20 +41,15 @@ import org.apache.hadoop.fs.Path;
/** /**
* Class to be used in tests to keep generating test inserts and updates against a corpus. * Class to be used in tests to keep generating test inserts and updates against a corpus.
* * <p>
* Test data uses a toy Uber trips data model. * Test data uses a toy Uber trips data model.
*/ */
public class HoodieTestDataGenerator { public class HoodieTestDataGenerator {
static class KeyPartition { // based on examination of sample file, the schema produces the following per record size
public static final int SIZE_PER_RECORD = 50 * 1024;
HoodieKey key; public static final String[] DEFAULT_PARTITION_PATHS = {"2016/03/15", "2015/03/16", "2015/03/17"};
String partitionPath; public static String TRIP_EXAMPLE_SCHEMA = "{\"type\": \"record\"," + "\"name\": \"triprec\"," + "\"fields\": [ "
}
public static String TRIP_EXAMPLE_SCHEMA = "{\"type\": \"record\","
+ "\"name\": \"triprec\","
+ "\"fields\": [ "
+ "{\"name\": \"timestamp\",\"type\": \"double\"}," + "{\"name\": \"timestamp\",\"type\": \"double\"},"
+ "{\"name\": \"_row_key\", \"type\": \"string\"}," + "{\"name\": \"_row_key\", \"type\": \"string\"},"
+ "{\"name\": \"rider\", \"type\": \"string\"}," + "{\"name\": \"rider\", \"type\": \"string\"},"
@@ -64,25 +59,9 @@ public class HoodieTestDataGenerator {
+ "{\"name\": \"end_lat\", \"type\": \"double\"}," + "{\"name\": \"end_lat\", \"type\": \"double\"},"
+ "{\"name\": \"end_lon\", \"type\": \"double\"}," + "{\"name\": \"end_lon\", \"type\": \"double\"},"
+ "{\"name\":\"fare\",\"type\": \"double\"}]}"; + "{\"name\":\"fare\",\"type\": \"double\"}]}";
public static Schema avroSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA));
// based on examination of sample file, the schema produces the following per record size
public static final int SIZE_PER_RECORD = 50 * 1024;
public static final String[] DEFAULT_PARTITION_PATHS = {"2016/03/15", "2015/03/16", "2015/03/17"};
public static void writePartitionMetadata(FileSystem fs, String[] partitionPaths,
String basePath) {
for (String partitionPath : partitionPaths) {
new HoodiePartitionMetadata(fs, "000", new Path(basePath), new Path(basePath, partitionPath))
.trySave(0);
}
}
private List<KeyPartition> existingKeysList = new ArrayList<>();
public static Schema avroSchema = HoodieAvroUtils
.addMetadataFields(new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA));
private static Random rand = new Random(46474747); private static Random rand = new Random(46474747);
private List<KeyPartition> existingKeysList = new ArrayList<>();
private String[] partitionPaths = DEFAULT_PARTITION_PATHS; private String[] partitionPaths = DEFAULT_PARTITION_PATHS;
public HoodieTestDataGenerator(String[] partitionPaths) { public HoodieTestDataGenerator(String[] partitionPaths) {
@@ -93,10 +72,66 @@ public class HoodieTestDataGenerator {
this(new String[] {"2016/03/15", "2015/03/16", "2015/03/17"}); this(new String[] {"2016/03/15", "2015/03/16", "2015/03/17"});
} }
public static void writePartitionMetadata(FileSystem fs, String[] partitionPaths, String basePath) {
for (String partitionPath : partitionPaths) {
new HoodiePartitionMetadata(fs, "000", new Path(basePath), new Path(basePath, partitionPath)).trySave(0);
}
}
/** /**
* Generates new inserts, uniformly across the partition paths above. It also updates the list of * Generates a new avro record of the above schema format, retaining the key if optionally provided.
* existing keys. */
public static TestRawTripPayload generateRandomValue(HoodieKey key, String commitTime) throws IOException {
GenericRecord rec = generateGenericRecord(key.getRecordKey(), "rider-" + commitTime, "driver-" + commitTime, 0.0);
HoodieAvroUtils.addCommitMetadataToRecord(rec, commitTime, "-1");
return new TestRawTripPayload(rec.toString(), key.getRecordKey(), key.getPartitionPath(), TRIP_EXAMPLE_SCHEMA);
}
public static GenericRecord generateGenericRecord(String rowKey, String riderName, String driverName,
double timestamp) {
GenericRecord rec = new GenericData.Record(avroSchema);
rec.put("_row_key", rowKey);
rec.put("timestamp", timestamp);
rec.put("rider", riderName);
rec.put("driver", driverName);
rec.put("begin_lat", rand.nextDouble());
rec.put("begin_lon", rand.nextDouble());
rec.put("end_lat", rand.nextDouble());
rec.put("end_lon", rand.nextDouble());
rec.put("fare", rand.nextDouble() * 100);
return rec;
}
public static void createCommitFile(String basePath, String commitTime) throws IOException {
Path commitFile = new Path(
basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline.makeCommitFileName(commitTime));
FileSystem fs = FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf());
FSDataOutputStream os = fs.create(commitFile, true);
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
try {
// Write empty commit metadata
os.writeBytes(new String(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
} finally {
os.close();
}
}
public static void createSavepointFile(String basePath, String commitTime) throws IOException {
Path commitFile = new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME
+ "/" + HoodieTimeline.makeSavePointFileName(commitTime));
FileSystem fs = FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf());
FSDataOutputStream os = fs.create(commitFile, true);
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
try {
// Write empty commit metadata
os.writeBytes(new String(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
} finally {
os.close();
}
}
/**
* Generates new inserts, uniformly across the partition paths above. It also updates the list of existing keys.
*/ */
public List<HoodieRecord> generateInserts(String commitTime, int n) throws IOException { public List<HoodieRecord> generateInserts(String commitTime, int n) throws IOException {
List<HoodieRecord> inserts = new ArrayList<>(); List<HoodieRecord> inserts = new ArrayList<>();
@@ -119,8 +154,7 @@ public class HoodieTestDataGenerator {
return generateDeletesFromExistingRecords(inserts); return generateDeletesFromExistingRecords(inserts);
} }
public List<HoodieRecord> generateDeletesFromExistingRecords(List<HoodieRecord> existingRecords) public List<HoodieRecord> generateDeletesFromExistingRecords(List<HoodieRecord> existingRecords) throws IOException {
throws IOException {
List<HoodieRecord> deletes = new ArrayList<>(); List<HoodieRecord> deletes = new ArrayList<>();
for (HoodieRecord existingRecord : existingRecords) { for (HoodieRecord existingRecord : existingRecords) {
HoodieRecord record = generateDeleteRecord(existingRecord); HoodieRecord record = generateDeleteRecord(existingRecord);
@@ -132,17 +166,15 @@ public class HoodieTestDataGenerator {
public HoodieRecord generateDeleteRecord(HoodieRecord existingRecord) throws IOException { public HoodieRecord generateDeleteRecord(HoodieRecord existingRecord) throws IOException {
HoodieKey key = existingRecord.getKey(); HoodieKey key = existingRecord.getKey();
TestRawTripPayload payload = new TestRawTripPayload(Optional.empty(), key.getRecordKey(), TestRawTripPayload payload = new TestRawTripPayload(Optional.empty(), key.getRecordKey(), key.getPartitionPath(),
key.getPartitionPath(), null, true); null, true);
return new HoodieRecord(key, payload); return new HoodieRecord(key, payload);
} }
public List<HoodieRecord> generateUpdates(String commitTime, List<HoodieRecord> baseRecords) public List<HoodieRecord> generateUpdates(String commitTime, List<HoodieRecord> baseRecords) throws IOException {
throws IOException {
List<HoodieRecord> updates = new ArrayList<>(); List<HoodieRecord> updates = new ArrayList<>();
for (HoodieRecord baseRecord : baseRecords) { for (HoodieRecord baseRecord : baseRecords) {
HoodieRecord record = new HoodieRecord(baseRecord.getKey(), HoodieRecord record = new HoodieRecord(baseRecord.getKey(), generateRandomValue(baseRecord.getKey(), commitTime));
generateRandomValue(baseRecord.getKey(), commitTime));
updates.add(record); updates.add(record);
} }
return updates; return updates;
@@ -161,68 +193,13 @@ public class HoodieTestDataGenerator {
return updates; return updates;
} }
/**
* Generates a new avro record of the above schema format, retaining the key if optionally
* provided.
*/
public static TestRawTripPayload generateRandomValue(HoodieKey key, String commitTime)
throws IOException {
GenericRecord rec = generateGenericRecord(key.getRecordKey(), "rider-" + commitTime,
"driver-" + commitTime, 0.0);
HoodieAvroUtils.addCommitMetadataToRecord(rec, commitTime, "-1");
return new TestRawTripPayload(rec.toString(), key.getRecordKey(), key.getPartitionPath(),
TRIP_EXAMPLE_SCHEMA);
}
public static GenericRecord generateGenericRecord(String rowKey, String riderName,
String driverName, double timestamp) {
GenericRecord rec = new GenericData.Record(avroSchema);
rec.put("_row_key", rowKey);
rec.put("timestamp", timestamp);
rec.put("rider", riderName);
rec.put("driver", driverName);
rec.put("begin_lat", rand.nextDouble());
rec.put("begin_lon", rand.nextDouble());
rec.put("end_lat", rand.nextDouble());
rec.put("end_lon", rand.nextDouble());
rec.put("fare", rand.nextDouble() * 100);
return rec;
}
public static void createCommitFile(String basePath, String commitTime) throws IOException {
Path commitFile =
new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline
.makeCommitFileName(commitTime));
FileSystem fs = FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf());
FSDataOutputStream os = fs.create(commitFile, true);
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
try {
// Write empty commit metadata
os.writeBytes(new String(commitMetadata.toJsonString().getBytes(
StandardCharsets.UTF_8)));
} finally {
os.close();
}
}
public static void createSavepointFile(String basePath, String commitTime) throws IOException {
Path commitFile =
new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline
.makeSavePointFileName(commitTime));
FileSystem fs = FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf());
FSDataOutputStream os = fs.create(commitFile, true);
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
try {
// Write empty commit metadata
os.writeBytes(new String(commitMetadata.toJsonString().getBytes(
StandardCharsets.UTF_8)));
} finally {
os.close();
}
}
public String[] getPartitionPaths() { public String[] getPartitionPaths() {
return partitionPaths; return partitionPaths;
} }
static class KeyPartition {
HoodieKey key;
String partitionPath;
}
} }
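A hedged usage sketch of the generator above (commit times are illustrative; each call throws IOException, so a test would declare or handle it):

HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
List<HoodieRecord> inserts = dataGen.generateInserts("001", 100);                  // 100 new records
List<HoodieRecord> updates = dataGen.generateUpdates("002", inserts);              // same keys, new commit time
List<HoodieRecord> deletes = dataGen.generateDeletesFromExistingRecords(inserts);  // tombstone payloads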

View File

@@ -43,15 +43,15 @@ import org.apache.commons.io.IOUtils;
*/ */
public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayload> { public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayload> {
private transient static final ObjectMapper mapper = new ObjectMapper(); private static final transient ObjectMapper mapper = new ObjectMapper();
private String partitionPath; private String partitionPath;
private String rowKey; private String rowKey;
private byte[] jsonDataCompressed; private byte[] jsonDataCompressed;
private int dataSize; private int dataSize;
private boolean isDeleted; private boolean isDeleted;
public TestRawTripPayload(Optional<String> jsonData, String rowKey, String partitionPath, public TestRawTripPayload(Optional<String> jsonData, String rowKey, String partitionPath, String schemaStr,
String schemaStr, Boolean isDeleted) throws IOException { Boolean isDeleted) throws IOException {
if (jsonData.isPresent()) { if (jsonData.isPresent()) {
this.jsonDataCompressed = compressData(jsonData.get()); this.jsonDataCompressed = compressData(jsonData.get());
this.dataSize = jsonData.get().length(); this.dataSize = jsonData.get().length();
@@ -61,8 +61,7 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
this.isDeleted = isDeleted; this.isDeleted = isDeleted;
} }
public TestRawTripPayload(String jsonData, String rowKey, String partitionPath, public TestRawTripPayload(String jsonData, String rowKey, String partitionPath, String schemaStr) throws IOException {
String schemaStr) throws IOException {
this(Optional.of(jsonData), rowKey, partitionPath, schemaStr, false); this(Optional.of(jsonData), rowKey, partitionPath, schemaStr, false);
} }
@@ -86,8 +85,7 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
} }
@Override @Override
public Optional<IndexedRecord> combineAndGetUpdateValue(IndexedRecord oldRec, Schema schema) public Optional<IndexedRecord> combineAndGetUpdateValue(IndexedRecord oldRec, Schema schema) throws IOException {
throws IOException {
return this.getInsertValue(schema); return this.getInsertValue(schema);
} }
@@ -120,8 +118,7 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
private byte[] compressData(String jsonData) throws IOException { private byte[] compressData(String jsonData) throws IOException {
ByteArrayOutputStream baos = new ByteArrayOutputStream(); ByteArrayOutputStream baos = new ByteArrayOutputStream();
DeflaterOutputStream dos = DeflaterOutputStream dos = new DeflaterOutputStream(baos, new Deflater(Deflater.BEST_COMPRESSION), true);
new DeflaterOutputStream(baos, new Deflater(Deflater.BEST_COMPRESSION), true);
try { try {
dos.write(jsonData.getBytes()); dos.write(jsonData.getBytes());
} finally { } finally {
@@ -140,13 +137,36 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
} }
/** /**
* A custom {@link WriteStatus} that merges the metadata key value map passed to {@code * A custom {@link WriteStatus} that merges the metadata key value map passed to {@code WriteStatus.markSuccess()} and
* WriteStatus.markSuccess()} and {@code WriteStatus.markFailure()}. * {@code WriteStatus.markFailure()}.
*/ */
public static class MetadataMergeWriteStatus extends WriteStatus { public static class MetadataMergeWriteStatus extends WriteStatus {
private Map<String, String> mergedMetadataMap = new HashMap<>(); private Map<String, String> mergedMetadataMap = new HashMap<>();
public static Map<String, String> mergeMetadataForWriteStatuses(List<WriteStatus> writeStatuses) {
Map<String, String> allWriteStatusMergedMetadataMap = new HashMap<>();
for (WriteStatus writeStatus : writeStatuses) {
MetadataMergeWriteStatus.mergeMetadataMaps(((MetadataMergeWriteStatus) writeStatus).getMergedMetadataMap(),
allWriteStatusMergedMetadataMap);
}
return allWriteStatusMergedMetadataMap;
}
private static void mergeMetadataMaps(Map<String, String> mergeFromMap, Map<String, String> mergeToMap) {
for (Entry<String, String> entry : mergeFromMap.entrySet()) {
String key = entry.getKey();
if (!mergeToMap.containsKey(key)) {
mergeToMap.put(key, "0");
}
mergeToMap.put(key, addStrsAsInt(entry.getValue(), mergeToMap.get(key)));
}
}
private static String addStrsAsInt(String a, String b) {
return String.valueOf(Integer.parseInt(a) + Integer.parseInt(b));
}
@Override @Override
public void markSuccess(HoodieRecord record, Optional<Map<String, String>> recordMetadata) { public void markSuccess(HoodieRecord record, Optional<Map<String, String>> recordMetadata) {
super.markSuccess(record, recordMetadata); super.markSuccess(record, recordMetadata);
@@ -156,43 +176,15 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
} }
@Override @Override
public void markFailure(HoodieRecord record, Throwable t, public void markFailure(HoodieRecord record, Throwable t, Optional<Map<String, String>> recordMetadata) {
Optional<Map<String, String>> recordMetadata) {
super.markFailure(record, t, recordMetadata); super.markFailure(record, t, recordMetadata);
if (recordMetadata.isPresent()) { if (recordMetadata.isPresent()) {
mergeMetadataMaps(recordMetadata.get(), mergedMetadataMap); mergeMetadataMaps(recordMetadata.get(), mergedMetadataMap);
} }
} }
public static Map<String, String> mergeMetadataForWriteStatuses(
List<WriteStatus> writeStatuses) {
Map<String, String> allWriteStatusMergedMetadataMap = new HashMap<>();
for (WriteStatus writeStatus : writeStatuses) {
MetadataMergeWriteStatus.mergeMetadataMaps(
((MetadataMergeWriteStatus) writeStatus).getMergedMetadataMap(),
allWriteStatusMergedMetadataMap);
}
return allWriteStatusMergedMetadataMap;
}
private static void mergeMetadataMaps(Map<String, String> mergeFromMap,
Map<String, String> mergeToMap) {
for (Entry<String, String> entry : mergeFromMap.entrySet()) {
String key = entry.getKey();
if (!mergeToMap.containsKey(key)) {
mergeToMap.put(key, "0");
}
mergeToMap
.put(key, addStrsAsInt(entry.getValue(), mergeToMap.get(key)));
}
}
private Map<String, String> getMergedMetadataMap() { private Map<String, String> getMergedMetadataMap() {
return mergedMetadataMap; return mergedMetadataMap;
} }
private static String addStrsAsInt(String a, String b) {
return String.valueOf(Integer.parseInt(a) + Integer.parseInt(b));
}
} }
} }
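The merge semantics above boil down to summing string-encoded counters per key across write statuses. A tiny stand-alone illustration of that arithmetic (plain Java, mirroring addStrsAsInt; illustrative only):

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

public class MetadataMergeSketch {
  public static void main(String[] args) {
    Map<String, String> merged = new HashMap<>();
    // e.g. the "numInserts" values reported by two different WriteStatus objects
    for (String value : Arrays.asList("2", "3")) {
      merged.merge("numInserts", value,
          (a, b) -> String.valueOf(Integer.parseInt(a) + Integer.parseInt(b)));
    }
    System.out.println(merged.get("numInserts")); // prints 5
  }
}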

View File

@@ -49,8 +49,7 @@ public class HoodieWriteConfigTest {
assertEquals(config.getMinCommitsToKeep(), 2); assertEquals(config.getMinCommitsToKeep(), 2);
} }
private ByteArrayOutputStream saveParamsIntoOutputStream(Map<String, String> params) private ByteArrayOutputStream saveParamsIntoOutputStream(Map<String, String> params) throws IOException {
throws IOException {
Properties properties = new Properties(); Properties properties = new Properties();
properties.putAll(params); properties.putAll(params);
ByteArrayOutputStream outStream = new ByteArrayOutputStream(); ByteArrayOutputStream outStream = new ByteArrayOutputStream();

View File

@@ -16,18 +16,13 @@
package com.uber.hoodie.func; package com.uber.hoodie.func;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
import com.uber.hoodie.common.HoodieTestDataGenerator; import com.uber.hoodie.common.HoodieTestDataGenerator;
import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline; import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
import com.uber.hoodie.exception.HoodieException; import com.uber.hoodie.exception.HoodieException;
import org.apache.avro.generic.IndexedRecord;
import org.apache.commons.io.FileUtils;
import org.apache.spark.util.SizeEstimator;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import java.io.IOException; import java.io.IOException;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
@@ -37,9 +32,13 @@ import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors; import java.util.concurrent.Executors;
import java.util.concurrent.Future; import java.util.concurrent.Future;
import java.util.concurrent.Semaphore; import java.util.concurrent.Semaphore;
import org.apache.avro.generic.IndexedRecord;
import static org.mockito.Mockito.mock; import org.apache.commons.io.FileUtils;
import static org.mockito.Mockito.when; import org.apache.spark.util.SizeEstimator;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
public class TestBufferedIterator { public class TestBufferedIterator {
@@ -60,26 +59,24 @@ public class TestBufferedIterator {
} }
} }
// Test to ensure that we are reading all records from buffered iterator in the same order without any exceptions. // Test to ensure that we are reading all records from buffered iterator in the same order
// without any exceptions.
@Test(timeout = 60000) @Test(timeout = 60000)
public void testRecordReading() throws IOException, ExecutionException, InterruptedException { public void testRecordReading() throws IOException, ExecutionException, InterruptedException {
final int numRecords = 128; final int numRecords = 128;
final List<HoodieRecord> hoodieRecords = hoodieTestDataGenerator.generateInserts(commitTime, numRecords); final List<HoodieRecord> hoodieRecords = hoodieTestDataGenerator.generateInserts(commitTime, numRecords);
final BufferedIterator bufferedIterator = final BufferedIterator bufferedIterator = new BufferedIterator(hoodieRecords.iterator(), FileUtils.ONE_KB,
new BufferedIterator(hoodieRecords.iterator(), FileUtils.ONE_KB, HoodieTestDataGenerator.avroSchema); HoodieTestDataGenerator.avroSchema);
Future<Boolean> result = Future<Boolean> result = recordReader.submit(() -> {
recordReader.submit(
() -> {
bufferedIterator.startBuffering(); bufferedIterator.startBuffering();
return true; return true;
} });
);
final Iterator<HoodieRecord> originalRecordIterator = hoodieRecords.iterator(); final Iterator<HoodieRecord> originalRecordIterator = hoodieRecords.iterator();
int recordsRead = 0; int recordsRead = 0;
while (bufferedIterator.hasNext()) { while (bufferedIterator.hasNext()) {
final HoodieRecord originalRecord = originalRecordIterator.next(); final HoodieRecord originalRecord = originalRecordIterator.next();
final Optional<IndexedRecord> originalInsertValue = final Optional<IndexedRecord> originalInsertValue = originalRecord.getData()
originalRecord.getData().getInsertValue(HoodieTestDataGenerator.avroSchema); .getInsertValue(HoodieTestDataGenerator.avroSchema);
final BufferedIterator.BufferedIteratorPayload payload = bufferedIterator.next(); final BufferedIterator.BufferedIteratorPayload payload = bufferedIterator.next();
// Ensure that record ordering is guaranteed. // Ensure that record ordering is guaranteed.
Assert.assertEquals(originalRecord, payload.record); Assert.assertEquals(originalRecord, payload.record);
@@ -102,15 +99,12 @@ public class TestBufferedIterator {
// maximum number of records to keep in memory. // maximum number of records to keep in memory.
final int recordLimit = 5; final int recordLimit = 5;
final long memoryLimitInBytes = recordLimit * SizeEstimator.estimate(hoodieRecords.get(0)); final long memoryLimitInBytes = recordLimit * SizeEstimator.estimate(hoodieRecords.get(0));
final BufferedIterator bufferedIterator = final BufferedIterator bufferedIterator = new BufferedIterator(hoodieRecords.iterator(), memoryLimitInBytes,
new BufferedIterator(hoodieRecords.iterator(), memoryLimitInBytes, HoodieTestDataGenerator.avroSchema); HoodieTestDataGenerator.avroSchema);
Future<Boolean> result = Future<Boolean> result = recordReader.submit(() -> {
recordReader.submit(
() -> {
bufferedIterator.startBuffering(); bufferedIterator.startBuffering();
return true; return true;
} });
);
// waiting for permits to expire. // waiting for permits to expire.
while (!isQueueFull(bufferedIterator.rateLimiter)) { while (!isQueueFull(bufferedIterator.rateLimiter)) {
Thread.sleep(10); Thread.sleep(10);
@@ -128,7 +122,8 @@ public class TestBufferedIterator {
while (!isQueueFull(bufferedIterator.rateLimiter)) { while (!isQueueFull(bufferedIterator.rateLimiter)) {
Thread.sleep(10); Thread.sleep(10);
} }
// No change is expected in rate limit or number of buffered records. We only expect buffering thread to read // No change is expected in rate limit or number of buffered records. We only expect
// buffering thread to read
// 2 more records into the buffer. // 2 more records into the buffer.
Assert.assertEquals(0, bufferedIterator.rateLimiter.availablePermits()); Assert.assertEquals(0, bufferedIterator.rateLimiter.availablePermits());
Assert.assertEquals(recordLimit, bufferedIterator.currentRateLimit); Assert.assertEquals(recordLimit, bufferedIterator.currentRateLimit);
@@ -136,7 +131,8 @@ public class TestBufferedIterator {
Assert.assertEquals(recordLimit - 1 + 2, bufferedIterator.samplingRecordCounter.get()); Assert.assertEquals(recordLimit - 1 + 2, bufferedIterator.samplingRecordCounter.get());
} }
// Test to ensure that exception in either buffering thread or BufferedIterator-reader thread is propagated to // Test to ensure that exception in either buffering thread or BufferedIterator-reader thread
// is propagated to
// another thread. // another thread.
@Test(timeout = 60000) @Test(timeout = 60000)
public void testException() throws IOException, InterruptedException { public void testException() throws IOException, InterruptedException {
@@ -145,17 +141,15 @@ public class TestBufferedIterator {
// buffer memory limit // buffer memory limit
final long memoryLimitInBytes = 4 * SizeEstimator.estimate(hoodieRecords.get(0)); final long memoryLimitInBytes = 4 * SizeEstimator.estimate(hoodieRecords.get(0));
// first let us throw exception from bufferIterator reader and test that buffering thread stops and throws // first let us throw exception from bufferIterator reader and test that buffering thread
// stops and throws
// correct exception back. // correct exception back.
BufferedIterator bufferedIterator1 = BufferedIterator bufferedIterator1 = new BufferedIterator(hoodieRecords.iterator(), memoryLimitInBytes,
new BufferedIterator(hoodieRecords.iterator(), memoryLimitInBytes, HoodieTestDataGenerator.avroSchema); HoodieTestDataGenerator.avroSchema);
Future<Boolean> result = Future<Boolean> result = recordReader.submit(() -> {
recordReader.submit(
() -> {
bufferedIterator1.startBuffering(); bufferedIterator1.startBuffering();
return true; return true;
} });
);
// waiting for permits to expire. // waiting for permits to expire.
while (!isQueueFull(bufferedIterator1.rateLimiter)) { while (!isQueueFull(bufferedIterator1.rateLimiter)) {
Thread.sleep(10); Thread.sleep(10);
@@ -171,21 +165,19 @@ public class TestBufferedIterator {
Assert.assertEquals(e, e1.getCause().getCause()); Assert.assertEquals(e, e1.getCause().getCause());
} }
// second let us raise an exception while doing record buffering. this exception should get propagated to // second let us raise an exception while doing record buffering. this exception should get
// propagated to
// buffered iterator reader. // buffered iterator reader.
final RuntimeException expectedException = new RuntimeException("failing record reading"); final RuntimeException expectedException = new RuntimeException("failing record reading");
final Iterator<HoodieRecord> mockHoodieRecordsIterator = mock(Iterator.class); final Iterator<HoodieRecord> mockHoodieRecordsIterator = mock(Iterator.class);
when(mockHoodieRecordsIterator.hasNext()).thenReturn(true); when(mockHoodieRecordsIterator.hasNext()).thenReturn(true);
when(mockHoodieRecordsIterator.next()).thenThrow(expectedException); when(mockHoodieRecordsIterator.next()).thenThrow(expectedException);
BufferedIterator bufferedIterator2 = BufferedIterator bufferedIterator2 = new BufferedIterator(mockHoodieRecordsIterator, memoryLimitInBytes,
new BufferedIterator(mockHoodieRecordsIterator, memoryLimitInBytes, HoodieTestDataGenerator.avroSchema); HoodieTestDataGenerator.avroSchema);
Future<Boolean> result2 = Future<Boolean> result2 = recordReader.submit(() -> {
recordReader.submit(
() -> {
bufferedIterator2.startBuffering(); bufferedIterator2.startBuffering();
return true; return true;
} });
);
try { try {
bufferedIterator2.hasNext(); bufferedIterator2.hasNext();
Assert.fail("exception is expected"); Assert.fail("exception is expected");
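The tests above exercise a memory-bounded producer/consumer: permits are derived from an estimated record size, and the buffering thread blocks once the reader falls behind by that many records. A generic, self-contained sketch of that bounding idea (plain java.util.concurrent, not Hudi code):

import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.Semaphore;

public class BoundedBufferSketch {
  public static void main(String[] args) throws Exception {
    int permits = 5; // roughly memoryLimitInBytes / estimated size per record
    Semaphore rateLimiter = new Semaphore(permits);
    BlockingQueue<Integer> buffer = new LinkedBlockingQueue<>();
    ExecutorService pool = Executors.newSingleThreadExecutor();
    pool.submit(() -> {
      for (int i = 0; i < 20; i++) {
        rateLimiter.acquire(); // blocks once the buffer already holds 'permits' records
        buffer.put(i);
      }
      return null;
    });
    for (int read = 0; read < 20; read++) {
      buffer.take();         // consume one record
      rateLimiter.release(); // frees a slot so the producer can advance
    }
    pool.shutdown();
  }
}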

View File

@@ -55,32 +55,24 @@ public class TestUpdateMapFunction {
public void testSchemaEvolutionOnUpdate() throws Exception { public void testSchemaEvolutionOnUpdate() throws Exception {
// Create a bunch of records with a old version of schema // Create a bunch of records with a old version of schema
HoodieWriteConfig config = makeHoodieClientConfig("/exampleSchema.txt"); HoodieWriteConfig config = makeHoodieClientConfig("/exampleSchema.txt");
HoodieTableMetaClient metaClient = new HoodieTableMetaClient( HoodieTableMetaClient metaClient = new HoodieTableMetaClient(HoodieTestUtils.getDefaultHadoopConf(), basePath);
HoodieTestUtils.getDefaultHadoopConf(), basePath);
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metaClient); HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metaClient);
String recordStr1 = String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
"{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
String recordStr2 = String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
"{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
String recordStr3 = String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
"{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
List<HoodieRecord> records = new ArrayList<>(); List<HoodieRecord> records = new ArrayList<>();
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
records.add( records.add(new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1));
new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
rowChange1));
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
records.add( records.add(new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2));
new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
rowChange2));
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3); TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
records.add( records.add(new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));
new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
rowChange3));
Iterator<List<WriteStatus>> insertResult = table.handleInsert("100", records.iterator()); Iterator<List<WriteStatus>> insertResult = table.handleInsert("100", records.iterator());
Path commitFile = Path commitFile = new Path(config.getBasePath() + "/.hoodie/" + HoodieTimeline.makeCommitFileName("100"));
new Path(config.getBasePath() + "/.hoodie/" + HoodieTimeline.makeCommitFileName("100"));
FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf()).create(commitFile); FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf()).create(commitFile);
// Now try an update with an evolved schema // Now try an update with an evolved schema
@@ -92,12 +84,11 @@ public class TestUpdateMapFunction {
table = new HoodieCopyOnWriteTable(config, metaClient); table = new HoodieCopyOnWriteTable(config, metaClient);
// New content with values for the newly added field // New content with values for the newly added field
recordStr1 = recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
"{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12,\"added_field\":1}"; + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12,\"added_field\":1}";
records = new ArrayList<>(); records = new ArrayList<>();
rowChange1 = new TestRawTripPayload(recordStr1); rowChange1 = new TestRawTripPayload(recordStr1);
HoodieRecord record1 = HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
rowChange1); rowChange1);
record1.setCurrentLocation(new HoodieRecordLocation("100", fileId)); record1.setCurrentLocation(new HoodieRecordLocation("100", fileId));
records.add(record1); records.add(record1);
@@ -105,8 +96,8 @@ public class TestUpdateMapFunction {
try { try {
table.handleUpdate("101", fileId, records.iterator()); table.handleUpdate("101", fileId, records.iterator());
} catch (ClassCastException e) { } catch (ClassCastException e) {
fail( fail("UpdateFunction could not read records written with exampleSchema.txt using the "
"UpdateFunction could not read records written with exampleSchema.txt using the exampleEvolvedSchema.txt"); + "exampleEvolvedSchema.txt");
} }
} }
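The test above depends on Avro schema resolution: records written with the old schema must still be readable once the table schema gains added_field. A hedged sketch of the rule it relies on, using plain Avro (record and field names are illustrative):

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

public class SchemaEvolutionSketch {
  public static void main(String[] args) {
    Schema oldSchema = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"rec\",\"fields\":[{\"name\":\"number\",\"type\":\"int\"}]}");
    Schema evolvedSchema = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"rec\",\"fields\":[{\"name\":\"number\",\"type\":\"int\"},"
            + "{\"name\":\"added_field\",\"type\":\"int\",\"default\":0}]}");
    // Writer schema = old, reader schema = evolved: decoding bytes written with the old schema
    // yields records where added_field takes its declared default.
    GenericDatumReader<GenericRecord> reader = new GenericDatumReader<>(oldSchema, evolvedSchema);
    System.out.println(reader.getExpected()); // the evolved (reader) schema
  }
}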

View File

@@ -16,6 +16,12 @@
package com.uber.hoodie.index; package com.uber.hoodie.index;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.mockito.Matchers.anyObject;
import static org.mockito.Mockito.atMost;
import static org.mockito.Mockito.times;
import com.uber.hoodie.HoodieWriteClient; import com.uber.hoodie.HoodieWriteClient;
import com.uber.hoodie.WriteStatus; import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.HoodieTestDataGenerator; import com.uber.hoodie.common.HoodieTestDataGenerator;
@@ -23,19 +29,16 @@ import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieTableType; import com.uber.hoodie.common.model.HoodieTableType;
import com.uber.hoodie.common.table.HoodieTableConfig; import com.uber.hoodie.common.table.HoodieTableConfig;
import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.TableFileSystemView;
import com.uber.hoodie.common.table.view.HoodieTableFileSystemView;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.config.HoodieCompactionConfig; import com.uber.hoodie.config.HoodieCompactionConfig;
import com.uber.hoodie.config.HoodieIndexConfig; import com.uber.hoodie.config.HoodieIndexConfig;
import com.uber.hoodie.config.HoodieStorageConfig; import com.uber.hoodie.config.HoodieStorageConfig;
import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.index.hbase.HBaseIndex; import com.uber.hoodie.index.hbase.HBaseIndex;
import com.uber.hoodie.table.HoodieTable; import com.uber.hoodie.table.HoodieTable;
import java.io.File;
import java.util.List;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseTestingUtility; import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection; import org.apache.hadoop.hbase.client.Connection;
@@ -56,37 +59,26 @@ import org.junit.Test;
import org.junit.rules.TemporaryFolder; import org.junit.rules.TemporaryFolder;
import org.junit.runners.MethodSorters; import org.junit.runners.MethodSorters;
import org.mockito.Mockito; import org.mockito.Mockito;
import scala.Tuple2;
import java.io.File;
import java.io.IOException;
import java.util.List;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.mockito.Matchers.anyObject;
import static org.mockito.Mockito.atLeast;
import static org.mockito.Mockito.atMost;
import static org.mockito.Mockito.times;
/** /**
* Note :: HBaseTestingUtility is really flaky with issues where the HbaseMiniCluster fails to shutdown * Note :: HBaseTestingUtility is really flaky with issues where the HbaseMiniCluster fails to shutdown across tests,
* across tests, (see one problem here : https://issues.apache.org/jira/browse/HBASE-15835). * (see one problem here : https://issues.apache.org/jira/browse/HBASE-15835). Hence, the need to use
* Hence, the need to use MethodSorters.NAME_ASCENDING to make sure the tests run in order. Please alter * MethodSorters.NAME_ASCENDING to make sure the tests run in order. Please alter the order of tests running carefully.
* the order of tests running carefully.
*/ */
@FixMethodOrder(MethodSorters.NAME_ASCENDING) @FixMethodOrder(MethodSorters.NAME_ASCENDING)
public class TestHbaseIndex { public class TestHbaseIndex {
private static JavaSparkContext jsc = null; private static JavaSparkContext jsc = null;
private String basePath = null;
private transient FileSystem fs;
private static HBaseTestingUtility utility; private static HBaseTestingUtility utility;
private static Configuration hbaseConfig; private static Configuration hbaseConfig;
private static String tableName = "test_table"; private static String tableName = "test_table";
private String basePath = null;
private transient FileSystem fs;
private HoodieTableMetaClient metaClient; private HoodieTableMetaClient metaClient;
public TestHbaseIndex() throws Exception {
}
@AfterClass @AfterClass
public static void clean() throws Exception { public static void clean() throws Exception {
if (jsc != null) { if (jsc != null) {
@@ -97,6 +89,20 @@ public class TestHbaseIndex {
} }
} }
@BeforeClass
public static void init() throws Exception {
// Initialize HbaseMiniCluster
utility = new HBaseTestingUtility();
utility.startMiniCluster();
hbaseConfig = utility.getConnection().getConfiguration();
utility.createTable(TableName.valueOf(tableName), Bytes.toBytes("_s"));
// Initialize a local spark env
SparkConf sparkConf = new SparkConf().setAppName("TestHbaseIndex").setMaster("local[1]");
jsc = new JavaSparkContext(sparkConf);
jsc.hadoopConfiguration().addResource(utility.getConfiguration());
}
@After @After
public void clear() throws Exception { public void clear() throws Exception {
if (basePath != null) { if (basePath != null) {
@@ -112,25 +118,8 @@ public class TestHbaseIndex {
basePath = folder.getRoot().getAbsolutePath(); basePath = folder.getRoot().getAbsolutePath();
// Initialize table // Initialize table
metaClient = HoodieTableMetaClient metaClient = HoodieTableMetaClient
.initTableType(utility.getConfiguration(), basePath, HoodieTableType.COPY_ON_WRITE, .initTableType(utility.getConfiguration(), basePath, HoodieTableType.COPY_ON_WRITE, tableName,
tableName, HoodieTableConfig.DEFAULT_PAYLOAD_CLASS); HoodieTableConfig.DEFAULT_PAYLOAD_CLASS);
}
public TestHbaseIndex() throws Exception {
}
@BeforeClass
public static void init() throws Exception {
// Initialize HbaseMiniCluster
utility = new HBaseTestingUtility();
utility.startMiniCluster();
hbaseConfig = utility.getConnection().getConfiguration();
utility.createTable(TableName.valueOf(tableName), Bytes.toBytes("_s"));
// Initialize a local spark env
SparkConf sparkConf = new SparkConf().setAppName("TestHbaseIndex").setMaster("local[1]");
jsc = new JavaSparkContext(sparkConf);
jsc.hadoopConfiguration().addResource(utility.getConfiguration());
} }
@Test @Test
@@ -156,7 +145,8 @@ public class TestHbaseIndex {
JavaRDD<WriteStatus> writeStatues = writeClient.upsert(writeRecords, newCommitTime); JavaRDD<WriteStatus> writeStatues = writeClient.upsert(writeRecords, newCommitTime);
assertNoWriteErrors(writeStatues.collect()); assertNoWriteErrors(writeStatues.collect());
// Now tagLocation for these records, hbaseIndex should not tag them since it was a failed commit // Now tagLocation for these records, hbaseIndex should not tag them since it was a failed
// commit
javaRDD = index.tagLocation(writeRecords, hoodieTable); javaRDD = index.tagLocation(writeRecords, hoodieTable);
assert (javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 0); assert (javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 0);
@@ -167,8 +157,9 @@ public class TestHbaseIndex {
javaRDD = index.tagLocation(writeRecords, hoodieTable); javaRDD = index.tagLocation(writeRecords, hoodieTable);
assertTrue(javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 200); assertTrue(javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 200);
assertTrue(javaRDD.map(record -> record.getKey().getRecordKey()).distinct().count() == 200); assertTrue(javaRDD.map(record -> record.getKey().getRecordKey()).distinct().count() == 200);
assertTrue(javaRDD.filter(record -> (record.getCurrentLocation() != null assertTrue(javaRDD.filter(
&& record.getCurrentLocation().getCommitTime().equals(newCommitTime))).distinct().count() == 200); record -> (record.getCurrentLocation() != null && record.getCurrentLocation().getCommitTime()
.equals(newCommitTime))).distinct().count() == 200);
} }
@@ -208,7 +199,8 @@ public class TestHbaseIndex {
// Rollback the last commit // Rollback the last commit
writeClient.rollback(newCommitTime); writeClient.rollback(newCommitTime);
// Now tagLocation for these records, hbaseIndex should not tag them since it was a rolled back commit // Now tagLocation for these records, hbaseIndex should not tag them since it was a rolled
// back commit
javaRDD = index.tagLocation(writeRecords, hoodieTable); javaRDD = index.tagLocation(writeRecords, hoodieTable);
assert (javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 0); assert (javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 0);
assert (javaRDD.filter(record -> record.getCurrentLocation() != null).collect().size() == 0); assert (javaRDD.filter(record -> record.getCurrentLocation() != null).collect().size() == 0);
@@ -302,12 +294,10 @@ public class TestHbaseIndex {
} }
private HoodieWriteConfig.Builder getConfigBuilder() { private HoodieWriteConfig.Builder getConfigBuilder() {
return HoodieWriteConfig.newBuilder().withPath(basePath) return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA)
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(1, 1) .withParallelism(1, 1).withCompactionConfig(
.withCompactionConfig( HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).withInlineCompaction(false)
HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024) .build()).withAutoCommit(false)
.withInlineCompaction(false).build())
.withAutoCommit(false)
.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build()) .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build())
.forTable("test-trip-table").withIndexConfig( .forTable("test-trip-table").withIndexConfig(
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.HBASE) HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.HBASE)

View File

@@ -31,16 +31,14 @@ public class TestHoodieIndex {
HoodieWriteConfig.Builder clientConfigBuilder = HoodieWriteConfig.newBuilder(); HoodieWriteConfig.Builder clientConfigBuilder = HoodieWriteConfig.newBuilder();
HoodieIndexConfig.Builder indexConfigBuilder = HoodieIndexConfig.newBuilder(); HoodieIndexConfig.Builder indexConfigBuilder = HoodieIndexConfig.newBuilder();
// Different types // Different types
HoodieWriteConfig config = clientConfigBuilder.withPath("") HoodieWriteConfig config = clientConfigBuilder.withPath("").withIndexConfig(
.withIndexConfig(indexConfigBuilder.withIndexType(HoodieIndex.IndexType.HBASE).build()) indexConfigBuilder.withIndexType(HoodieIndex.IndexType.HBASE).build()).build();
.build();
assertTrue(HoodieIndex.createIndex(config, null) instanceof HBaseIndex); assertTrue(HoodieIndex.createIndex(config, null) instanceof HBaseIndex);
config = clientConfigBuilder.withPath("").withIndexConfig( config = clientConfigBuilder.withPath("")
indexConfigBuilder.withIndexType(HoodieIndex.IndexType.INMEMORY).build()).build(); .withIndexConfig(indexConfigBuilder.withIndexType(HoodieIndex.IndexType.INMEMORY).build()).build();
assertTrue(HoodieIndex.createIndex(config, null) instanceof InMemoryHashIndex); assertTrue(HoodieIndex.createIndex(config, null) instanceof InMemoryHashIndex);
config = clientConfigBuilder.withPath("") config = clientConfigBuilder.withPath("")
.withIndexConfig(indexConfigBuilder.withIndexType(HoodieIndex.IndexType.BLOOM).build()) .withIndexConfig(indexConfigBuilder.withIndexType(HoodieIndex.IndexType.BLOOM).build()).build();
.build();
assertTrue(HoodieIndex.createIndex(config, null) instanceof HoodieBloomIndex); assertTrue(HoodieIndex.createIndex(config, null) instanceof HoodieBloomIndex);
} }
} }


@@ -98,31 +98,33 @@ public class TestHoodieBloomIndex {
@Test @Test
public void testLoadUUIDsInMemory() throws IOException { public void testLoadUUIDsInMemory() throws IOException {
// Create one RDD of hoodie record // Create one RDD of hoodie record
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}"; + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
HoodieRecord record1 = new HoodieRecord( HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); rowChange1);
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
HoodieRecord record2 = new HoodieRecord( HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); rowChange2);
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3); TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
HoodieRecord record3 = new HoodieRecord( HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); rowChange3);
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4); TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
HoodieRecord record4 = new HoodieRecord( HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()),
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); rowChange4);
JavaRDD<HoodieRecord> recordRDD = jsc JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4));
.parallelize(Arrays.asList(record1, record2, record3, record4));
// Load to memory // Load to memory
Map<String, Iterable<String>> map = recordRDD Map<String, Iterable<String>> map = recordRDD.mapToPair(
.mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey())) record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey())).groupByKey().collectAsMap();
.groupByKey().collectAsMap();
assertEquals(map.size(), 2); assertEquals(map.size(), 2);
List<String> list1 = Lists.newArrayList(map.get("2016/01/31")); List<String> list1 = Lists.newArrayList(map.get("2016/01/31"));
List<String> list2 = Lists.newArrayList(map.get("2015/01/31")); List<String> list2 = Lists.newArrayList(map.get("2015/01/31"));
@@ -132,44 +134,40 @@ public class TestHoodieBloomIndex {
@Test @Test
public void testLoadInvolvedFiles() throws IOException { public void testLoadInvolvedFiles() throws IOException {
HoodieWriteConfig config = HoodieWriteConfig.newBuilder() HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
.withPath(basePath)
.build();
HoodieBloomIndex index = new HoodieBloomIndex(config, jsc); HoodieBloomIndex index = new HoodieBloomIndex(config, jsc);
// Create some partitions, and put some files // Create some partitions, and put some files
// "2016/01/21": 0 file // "2016/01/21": 0 file
// "2016/04/01": 1 file (2_0_20160401010101.parquet) // "2016/04/01": 1 file (2_0_20160401010101.parquet)
// "2015/03/12": 3 files (1_0_20150312101010.parquet, 3_0_20150312101010.parquet, 4_0_20150312101010.parquet) // "2015/03/12": 3 files (1_0_20150312101010.parquet, 3_0_20150312101010.parquet,
// 4_0_20150312101010.parquet)
new File(basePath + "/2016/01/21").mkdirs(); new File(basePath + "/2016/01/21").mkdirs();
new File(basePath + "/2016/04/01").mkdirs(); new File(basePath + "/2016/04/01").mkdirs();
new File(basePath + "/2015/03/12").mkdirs(); new File(basePath + "/2015/03/12").mkdirs();
TestRawTripPayload rowChange1 = new TestRawTripPayload( TestRawTripPayload rowChange1 = new TestRawTripPayload(
"{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); "{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
HoodieRecord record1 = new HoodieRecord( HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); rowChange1);
TestRawTripPayload rowChange2 = new TestRawTripPayload( TestRawTripPayload rowChange2 = new TestRawTripPayload(
"{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); "{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
HoodieRecord record2 = new HoodieRecord( HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); rowChange2);
TestRawTripPayload rowChange3 = new TestRawTripPayload( TestRawTripPayload rowChange3 = new TestRawTripPayload(
"{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); "{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
HoodieRecord record3 = new HoodieRecord( HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); rowChange3);
TestRawTripPayload rowChange4 = new TestRawTripPayload( TestRawTripPayload rowChange4 = new TestRawTripPayload(
"{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); "{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
HoodieRecord record4 = new HoodieRecord( HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()),
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); rowChange4);
writeParquetFile("2016/04/01", "2_0_20160401010101.parquet", Lists.newArrayList(), schema, null, writeParquetFile("2016/04/01", "2_0_20160401010101.parquet", Lists.newArrayList(), schema, null, false);
writeParquetFile("2015/03/12", "1_0_20150312101010.parquet", Lists.newArrayList(), schema, null, false);
writeParquetFile("2015/03/12", "3_0_20150312101010.parquet", Arrays.asList(record1), schema, null, false);
writeParquetFile("2015/03/12", "4_0_20150312101010.parquet", Arrays.asList(record2, record3, record4), schema, null,
false); false);
writeParquetFile("2015/03/12", "1_0_20150312101010.parquet", Lists.newArrayList(), schema, null,
false);
writeParquetFile("2015/03/12", "3_0_20150312101010.parquet", Arrays.asList(record1), schema,
null, false);
writeParquetFile("2015/03/12", "4_0_20150312101010.parquet",
Arrays.asList(record2, record3, record4), schema, null, false);
List<String> partitions = Arrays.asList("2016/01/21", "2016/04/01", "2015/03/12"); List<String> partitions = Arrays.asList("2016/01/21", "2016/04/01", "2015/03/12");
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
@@ -198,51 +196,32 @@ public class TestHoodieBloomIndex {
List<Tuple2<String, BloomIndexFileInfo>> expected = Arrays.asList( List<Tuple2<String, BloomIndexFileInfo>> expected = Arrays.asList(
new Tuple2<>("2016/04/01", new BloomIndexFileInfo("2_0_20160401010101.parquet")), new Tuple2<>("2016/04/01", new BloomIndexFileInfo("2_0_20160401010101.parquet")),
new Tuple2<>("2015/03/12", new BloomIndexFileInfo("1_0_20150312101010.parquet")), new Tuple2<>("2015/03/12", new BloomIndexFileInfo("1_0_20150312101010.parquet")),
new Tuple2<>("2015/03/12", new Tuple2<>("2015/03/12", new BloomIndexFileInfo("3_0_20150312101010.parquet", "000", "000")),
new BloomIndexFileInfo("3_0_20150312101010.parquet", "000", "000")), new Tuple2<>("2015/03/12", new BloomIndexFileInfo("4_0_20150312101010.parquet", "001", "003")));
new Tuple2<>("2015/03/12",
new BloomIndexFileInfo("4_0_20150312101010.parquet", "001", "003"))
);
assertEquals(expected, filesList); assertEquals(expected, filesList);
} }
@Test @Test
public void testRangePruning() { public void testRangePruning() {
HoodieWriteConfig config = HoodieWriteConfig.newBuilder() HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
.withPath(basePath)
.build();
HoodieBloomIndex index = new HoodieBloomIndex(config, jsc); HoodieBloomIndex index = new HoodieBloomIndex(config, jsc);
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo = new HashMap<>(); final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo = new HashMap<>();
partitionToFileIndexInfo.put("2017/10/22", Arrays.asList( partitionToFileIndexInfo.put("2017/10/22", Arrays.asList(new BloomIndexFileInfo("f1"),
new BloomIndexFileInfo("f1"), new BloomIndexFileInfo("f2", "000", "000"), new BloomIndexFileInfo("f3", "001", "003"),
new BloomIndexFileInfo("f2", "000", "000"), new BloomIndexFileInfo("f4", "002", "007"), new BloomIndexFileInfo("f5", "009", "010")));
new BloomIndexFileInfo("f3", "001", "003"),
new BloomIndexFileInfo("f4", "002", "007"),
new BloomIndexFileInfo("f5", "009", "010")
));
JavaPairRDD<String, String> partitionRecordKeyPairRDD = jsc JavaPairRDD<String, String> partitionRecordKeyPairRDD = jsc.parallelize(Arrays.asList(
.parallelize(Arrays.asList( new Tuple2<>("2017/10/22", "003"), new Tuple2<>("2017/10/22", "002"), new Tuple2<>("2017/10/22", "005"),
new Tuple2<>("2017/10/22", "003"), new Tuple2<>("2017/10/22", "004"))).mapToPair(t -> t);
new Tuple2<>("2017/10/22", "002"),
new Tuple2<>("2017/10/22", "005"),
new Tuple2<>("2017/10/22", "004")
))
.mapToPair(t -> t);
List<Tuple2<String, Tuple2<String, HoodieKey>>> comparisonKeyList = index List<Tuple2<String, Tuple2<String, HoodieKey>>> comparisonKeyList = index.explodeRecordRDDWithFileComparisons(
.explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD) partitionToFileIndexInfo, partitionRecordKeyPairRDD).collect();
.collect();
assertEquals(10, comparisonKeyList.size()); assertEquals(10, comparisonKeyList.size());
Map<String, List<String>> recordKeyToFileComps = comparisonKeyList.stream() Map<String, List<String>> recordKeyToFileComps = comparisonKeyList.stream().collect(Collectors.groupingBy(
.collect(Collectors.groupingBy( t -> t._2()._2().getRecordKey(), Collectors.mapping(t -> t._2()._1().split("#")[0], Collectors.toList())));
t -> t._2()._2().getRecordKey(),
Collectors.mapping(t -> t._2()._1().split("#")[0], Collectors.toList()
)
));
assertEquals(4, recordKeyToFileComps.size()); assertEquals(4, recordKeyToFileComps.size());
assertEquals(Arrays.asList("f1", "f3", "f4"), recordKeyToFileComps.get("002")); assertEquals(Arrays.asList("f1", "f3", "f4"), recordKeyToFileComps.get("002"));
@@ -252,32 +231,35 @@ public class TestHoodieBloomIndex {
} }
@Test @Test
public void testCheckUUIDsAgainstOneFile() public void testCheckUUIDsAgainstOneFile() throws IOException, InterruptedException, ClassNotFoundException {
throws IOException, InterruptedException, ClassNotFoundException {
// Create some records to use // Create some records to use
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":32}"; + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":32}";
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
HoodieRecord record1 = new HoodieRecord( HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); rowChange1);
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
HoodieRecord record2 = new HoodieRecord( HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); rowChange2);
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3); TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
HoodieRecord record3 = new HoodieRecord( HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); rowChange3);
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4); TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
HoodieRecord record4 = new HoodieRecord( HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()),
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); rowChange4);
// We write record1, record2 to a parquet file, but the bloom filter contains (record1, record2, record3). // We write record1, record2 to a parquet file, but the bloom filter contains (record1,
// record2, record3).
BloomFilter filter = new BloomFilter(10000, 0.0000001); BloomFilter filter = new BloomFilter(10000, 0.0000001);
filter.add(record3.getRecordKey()); filter.add(record3.getRecordKey());
String filename = writeParquetFile("2016/01/31", Arrays.asList(record1, record2), schema, String filename = writeParquetFile("2016/01/31", Arrays.asList(record1, record2), schema, filter, true);
filter, true);
// The bloom filter contains 3 records // The bloom filter contains 3 records
assertTrue(filter.mightContain(record1.getRecordKey())); assertTrue(filter.mightContain(record1.getRecordKey()));
@@ -286,17 +268,16 @@ public class TestHoodieBloomIndex {
assertFalse(filter.mightContain(record4.getRecordKey())); assertFalse(filter.mightContain(record4.getRecordKey()));
// Compare with file // Compare with file
List<String> uuids = Arrays.asList(record1.getRecordKey(), record2.getRecordKey(), List<String> uuids = Arrays.asList(record1.getRecordKey(), record2.getRecordKey(), record3.getRecordKey(),
record3.getRecordKey(), record4.getRecordKey()); record4.getRecordKey());
List<String> results = HoodieBloomIndexCheckFunction List<String> results = HoodieBloomIndexCheckFunction.checkCandidatesAgainstFile(jsc.hadoopConfiguration(), uuids,
.checkCandidatesAgainstFile(jsc.hadoopConfiguration(), uuids,
new Path(basePath + "/2016/01/31/" + filename)); new Path(basePath + "/2016/01/31/" + filename));
assertEquals(results.size(), 2); assertEquals(results.size(), 2);
assertTrue(results.get(0).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0") assertTrue(results.get(0).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0") || results.get(1).equals(
|| results.get(1).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")); "1eb5b87a-1feh-4edd-87b4-6ec96dc405a0"));
assertTrue(results.get(0).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0") assertTrue(results.get(0).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0") || results.get(1).equals(
|| results.get(1).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")); "2eb5b87b-1feu-4edd-87b4-6ec96dc405a0"));
// TODO(vc): Need more coverage on actual filenames // TODO(vc): Need more coverage on actual filenames
//assertTrue(results.get(0)._2().equals(filename)); //assertTrue(results.get(0)._2().equals(filename));
//assertTrue(results.get(1)._2().equals(filename)); //assertTrue(results.get(1)._2().equals(filename));
@@ -317,8 +298,7 @@ public class TestHoodieBloomIndex {
try { try {
bloomIndex.tagLocation(recordRDD, table); bloomIndex.tagLocation(recordRDD, table);
} catch (IllegalArgumentException e) { } catch (IllegalArgumentException e) {
fail( fail("EmptyRDD should not result in IllegalArgumentException: Positive number of slices " + "required");
"EmptyRDD should not result in IllegalArgumentException: Positive number of slices required");
} }
} }
@@ -327,24 +307,27 @@ public class TestHoodieBloomIndex {
public void testTagLocation() throws Exception { public void testTagLocation() throws Exception {
// We have some records to be tagged (two different partitions) // We have some records to be tagged (two different partitions)
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}"; + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
HoodieRecord record1 = new HoodieRecord( HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); rowChange1);
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
HoodieRecord record2 = new HoodieRecord( HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); rowChange2);
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3); TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
HoodieRecord record3 = new HoodieRecord( HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); rowChange3);
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4); TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
HoodieRecord record4 = new HoodieRecord( HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()),
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); rowChange4);
JavaRDD<HoodieRecord> recordRDD = jsc JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4));
.parallelize(Arrays.asList(record1, record2, record3, record4));
// Also create the metadata and config // Also create the metadata and config
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
@@ -389,10 +372,14 @@ public class TestHoodieBloomIndex {
public void testCheckExists() throws Exception { public void testCheckExists() throws Exception {
// We have some records to be tagged (two different partitions) // We have some records to be tagged (two different partitions)
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}"; + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
HoodieKey key1 = new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()); HoodieKey key1 = new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath());
HoodieRecord record1 = new HoodieRecord(key1, rowChange1); HoodieRecord record1 = new HoodieRecord(key1, rowChange1);
@@ -414,8 +401,7 @@ public class TestHoodieBloomIndex {
// Let's tag // Let's tag
HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc); HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc);
JavaPairRDD<HoodieKey, Optional<String>> taggedRecordRDD = bloomIndex JavaPairRDD<HoodieKey, Optional<String>> taggedRecordRDD = bloomIndex.fetchRecordLocation(keysRDD, table);
.fetchRecordLocation(keysRDD, table);
// Should not find any files // Should not find any files
for (Tuple2<HoodieKey, Optional<String>> record : taggedRecordRDD.collect()) { for (Tuple2<HoodieKey, Optional<String>> record : taggedRecordRDD.collect()) {
@@ -456,16 +442,18 @@ public class TestHoodieBloomIndex {
@Test @Test
public void testBloomFilterFalseError() throws IOException, InterruptedException { public void testBloomFilterFalseError() throws IOException, InterruptedException {
// We have two hoodie records // We have two hoodie records
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
// We write record1 to a parquet file, using a bloom filter having both records // We write record1 to a parquet file, using a bloom filter having both records
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
HoodieRecord record1 = new HoodieRecord( HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); rowChange1);
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
HoodieRecord record2 = new HoodieRecord( HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); rowChange2);
BloomFilter filter = new BloomFilter(10000, 0.0000001); BloomFilter filter = new BloomFilter(10000, 0.0000001);
filter.add(record2.getRecordKey()); filter.add(record2.getRecordKey());
@@ -492,8 +480,8 @@ public class TestHoodieBloomIndex {
} }
} }
private String writeParquetFile(String partitionPath, List<HoodieRecord> records, Schema schema, private String writeParquetFile(String partitionPath, List<HoodieRecord> records, Schema schema, BloomFilter filter,
BloomFilter filter, boolean createCommitTime) throws IOException, InterruptedException { boolean createCommitTime) throws IOException, InterruptedException {
Thread.sleep(1000); Thread.sleep(1000);
String commitTime = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()); String commitTime = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date());
String fileId = UUID.randomUUID().toString(); String fileId = UUID.randomUUID().toString();
@@ -502,15 +490,14 @@ public class TestHoodieBloomIndex {
return writeParquetFile(partitionPath, filename, records, schema, filter, createCommitTime); return writeParquetFile(partitionPath, filename, records, schema, filter, createCommitTime);
} }
private String writeParquetFile(String partitionPath, String filename, List<HoodieRecord> records, private String writeParquetFile(String partitionPath, String filename, List<HoodieRecord> records, Schema schema,
Schema schema,
BloomFilter filter, boolean createCommitTime) throws IOException { BloomFilter filter, boolean createCommitTime) throws IOException {
if (filter == null) { if (filter == null) {
filter = new BloomFilter(10000, 0.0000001); filter = new BloomFilter(10000, 0.0000001);
} }
HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport( HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema,
new AvroSchemaConverter().convert(schema), schema, filter); filter);
String commitTime = FSUtils.getCommitTime(filename); String commitTime = FSUtils.getCommitTime(filename);
HoodieParquetConfig config = new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP, HoodieParquetConfig config = new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP,
ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024, ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024,
@@ -525,9 +512,7 @@ public class TestHoodieBloomIndex {
for (HoodieRecord record : records) { for (HoodieRecord record : records) {
GenericRecord avroRecord = (GenericRecord) record.getData().getInsertValue(schema).get(); GenericRecord avroRecord = (GenericRecord) record.getData().getInsertValue(schema).get();
HoodieAvroUtils.addCommitMetadataToRecord(avroRecord, commitTime, "" + seqId++); HoodieAvroUtils.addCommitMetadataToRecord(avroRecord, commitTime, "" + seqId++);
HoodieAvroUtils HoodieAvroUtils.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(), filename);
.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(),
filename);
writer.writeAvro(record.getRecordKey(), avroRecord); writer.writeAvro(record.getRecordKey(), avroRecord);
filter.add(record.getRecordKey()); filter.add(record.getRecordKey());
} }
@@ -536,9 +521,7 @@ public class TestHoodieBloomIndex {
if (createCommitTime) { if (createCommitTime) {
// Also make sure the commit is valid // Also make sure the commit is valid
new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME).mkdirs(); new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME).mkdirs();
new File( new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitTime + ".commit").createNewFile();
basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitTime + ".commit")
.createNewFile();
} }
return filename; return filename;
} }


@@ -77,7 +77,8 @@ public class TestHoodieCommitArchiveLog {
public void testArchiveDatasetWithArchival() throws IOException { public void testArchiveDatasetWithArchival() throws IOException {
HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath)
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
.withCompactionConfig(HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 4).build()) .withCompactionConfig(
HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 4).build())
.forTable("test-trip-table").build(); .forTable("test-trip-table").build();
HoodieTestUtils.init(hadoopConf, basePath); HoodieTestUtils.init(hadoopConf, basePath);
HoodieTestDataGenerator.createCommitFile(basePath, "100"); HoodieTestDataGenerator.createCommitFile(basePath, "100");
@@ -88,8 +89,7 @@ public class TestHoodieCommitArchiveLog {
HoodieTestDataGenerator.createCommitFile(basePath, "105"); HoodieTestDataGenerator.createCommitFile(basePath, "105");
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), basePath); HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), basePath);
HoodieTimeline timeline = HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
assertEquals("Loaded 6 commits and the count should match", 6, timeline.countInstants()); assertEquals("Loaded 6 commits and the count should match", 6, timeline.countInstants());
@@ -103,8 +103,7 @@ public class TestHoodieCommitArchiveLog {
HoodieTestUtils.createInflightCleanFiles(basePath, "106", "107"); HoodieTestUtils.createInflightCleanFiles(basePath, "106", "107");
//reload the timeline and get all the commits before archive //reload the timeline and get all the commits before archive
timeline = metaClient.getActiveTimeline().reload().getAllCommitsTimeline() timeline = metaClient.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants();
.filterCompletedInstants();
List<HoodieInstant> originalCommits = timeline.getInstants().collect(Collectors.toList()); List<HoodieInstant> originalCommits = timeline.getInstants().collect(Collectors.toList());
assertEquals("Loaded 6 commits and the count should match", 12, timeline.countInstants()); assertEquals("Loaded 6 commits and the count should match", 12, timeline.countInstants());
@@ -118,13 +117,12 @@ public class TestHoodieCommitArchiveLog {
assertTrue(archiveLog.archiveIfRequired()); assertTrue(archiveLog.archiveIfRequired());
//reload the timeline and remove the remaining commits //reload the timeline and remove the remaining commits
timeline = metaClient.getActiveTimeline().reload().getAllCommitsTimeline() timeline = metaClient.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants();
.filterCompletedInstants();
originalCommits.removeAll(timeline.getInstants().collect(Collectors.toList())); originalCommits.removeAll(timeline.getInstants().collect(Collectors.toList()));
//read the file //read the file
HoodieLogFormat.Reader reader = HoodieLogFormat HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(fs,
.newReader(fs, new HoodieLogFile(new Path(basePath + "/.hoodie/.commits_.archive.1")), new HoodieLogFile(new Path(basePath + "/.hoodie/.commits_.archive.1")),
HoodieArchivedMetaEntry.getClassSchema()); HoodieArchivedMetaEntry.getClassSchema());
int archivedRecordsCount = 0; int archivedRecordsCount = 0;
@@ -137,8 +135,7 @@ public class TestHoodieCommitArchiveLog {
assertEquals("Archived and read records for each block are same", 8, records.size()); assertEquals("Archived and read records for each block are same", 8, records.size());
archivedRecordsCount += records.size(); archivedRecordsCount += records.size();
} }
assertEquals("Total archived records and total read records are the same count", 8, assertEquals("Total archived records and total read records are the same count", 8, archivedRecordsCount);
archivedRecordsCount);
//make sure the archived commits are the same as the (originalcommits - commitsleft) //make sure the archived commits are the same as the (originalcommits - commitsleft)
List<String> readCommits = readRecords.stream().map(r -> (GenericRecord) r).map(r -> { List<String> readCommits = readRecords.stream().map(r -> (GenericRecord) r).map(r -> {
@@ -146,10 +143,8 @@ public class TestHoodieCommitArchiveLog {
}).collect(Collectors.toList()); }).collect(Collectors.toList());
Collections.sort(readCommits); Collections.sort(readCommits);
assertEquals( assertEquals("Read commits map should match the originalCommits - commitsLoadedFromArchival",
"Read commits map should match the originalCommits - commitsLoadedFromArchival", originalCommits.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList()), readCommits);
originalCommits.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList()),
readCommits);
// verify in-flight instants after archive // verify in-flight instants after archive
verifyInflightInstants(metaClient, 3); verifyInflightInstants(metaClient, 3);
@@ -168,15 +163,12 @@ public class TestHoodieCommitArchiveLog {
HoodieTestDataGenerator.createCommitFile(basePath, "102"); HoodieTestDataGenerator.createCommitFile(basePath, "102");
HoodieTestDataGenerator.createCommitFile(basePath, "103"); HoodieTestDataGenerator.createCommitFile(basePath, "103");
HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline() HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
.filterCompletedInstants();
assertEquals("Loaded 4 commits and the count should match", 4, timeline.countInstants()); assertEquals("Loaded 4 commits and the count should match", 4, timeline.countInstants());
boolean result = archiveLog.archiveIfRequired(); boolean result = archiveLog.archiveIfRequired();
assertTrue(result); assertTrue(result);
timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline() timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants();
.filterCompletedInstants(); assertEquals("Should not archive commits when maxCommitsToKeep is 5", 4, timeline.countInstants());
assertEquals("Should not archive commits when maxCommitsToKeep is 5", 4,
timeline.countInstants());
} }
@Test @Test
@@ -194,21 +186,15 @@ public class TestHoodieCommitArchiveLog {
HoodieTestDataGenerator.createCommitFile(basePath, "104"); HoodieTestDataGenerator.createCommitFile(basePath, "104");
HoodieTestDataGenerator.createCommitFile(basePath, "105"); HoodieTestDataGenerator.createCommitFile(basePath, "105");
HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline() HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
.filterCompletedInstants();
assertEquals("Loaded 6 commits and the count should match", 6, timeline.countInstants()); assertEquals("Loaded 6 commits and the count should match", 6, timeline.countInstants());
boolean result = archiveLog.archiveIfRequired(); boolean result = archiveLog.archiveIfRequired();
assertTrue(result); assertTrue(result);
timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline() timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants();
.filterCompletedInstants(); assertTrue("Archived commits should always be safe", timeline.containsOrBeforeTimelineStarts("100"));
assertTrue("Archived commits should always be safe", assertTrue("Archived commits should always be safe", timeline.containsOrBeforeTimelineStarts("101"));
timeline.containsOrBeforeTimelineStarts("100")); assertTrue("Archived commits should always be safe", timeline.containsOrBeforeTimelineStarts("102"));
assertTrue("Archived commits should always be safe", assertTrue("Archived commits should always be safe", timeline.containsOrBeforeTimelineStarts("103"));
timeline.containsOrBeforeTimelineStarts("101"));
assertTrue("Archived commits should always be safe",
timeline.containsOrBeforeTimelineStarts("102"));
assertTrue("Archived commits should always be safe",
timeline.containsOrBeforeTimelineStarts("103"));
} }
@Test @Test
@@ -227,16 +213,14 @@ public class TestHoodieCommitArchiveLog {
HoodieTestDataGenerator.createCommitFile(basePath, "104"); HoodieTestDataGenerator.createCommitFile(basePath, "104");
HoodieTestDataGenerator.createCommitFile(basePath, "105"); HoodieTestDataGenerator.createCommitFile(basePath, "105");
HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline() HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
.filterCompletedInstants();
assertEquals("Loaded 6 commits and the count should match", 6, timeline.countInstants()); assertEquals("Loaded 6 commits and the count should match", 6, timeline.countInstants());
boolean result = archiveLog.archiveIfRequired(); boolean result = archiveLog.archiveIfRequired();
assertTrue(result); assertTrue(result);
timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline() timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants();
.filterCompletedInstants();
assertEquals( assertEquals(
"Since we have a savepoint at 101, we should never archive any commit after 101 (we only archive 100)", "Since we have a savepoint at 101, we should never archive any commit after 101 (we only " + "archive 100)", 5,
5, timeline.countInstants()); timeline.countInstants());
assertTrue("Archived commits should always be safe", assertTrue("Archived commits should always be safe",
timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "101"))); timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "101")));
assertTrue("Archived commits should always be safe", assertTrue("Archived commits should always be safe",
@@ -248,7 +232,7 @@ public class TestHoodieCommitArchiveLog {
private void verifyInflightInstants(HoodieTableMetaClient metaClient, int expectedTotalInstants) { private void verifyInflightInstants(HoodieTableMetaClient metaClient, int expectedTotalInstants) {
HoodieTimeline timeline = metaClient.getActiveTimeline().reload() HoodieTimeline timeline = metaClient.getActiveTimeline().reload()
.getTimelineOfActions(Sets.newHashSet(HoodieTimeline.CLEAN_ACTION)).filterInflights(); .getTimelineOfActions(Sets.newHashSet(HoodieTimeline.CLEAN_ACTION)).filterInflights();
assertEquals("Loaded inflight clean actions and the count should match", assertEquals("Loaded inflight clean actions and the count should match", expectedTotalInstants,
expectedTotalInstants, timeline.countInstants()); timeline.countInstants());
} }
} }


@@ -93,32 +93,27 @@ public class TestHoodieCompactor {
} }
private HoodieWriteConfig.Builder getConfigBuilder() { private HoodieWriteConfig.Builder getConfigBuilder() {
return HoodieWriteConfig.newBuilder().withPath(basePath) return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA)
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) .withParallelism(2, 2).withCompactionConfig(
.withCompactionConfig( HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).withInlineCompaction(false)
HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024) .build()).withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build())
.withInlineCompaction(false).build()) .forTable("test-trip-table")
.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build()) .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build());
.forTable("test-trip-table").withIndexConfig(
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build());
} }
@Test(expected = IllegalArgumentException.class) @Test(expected = IllegalArgumentException.class)
public void testCompactionOnCopyOnWriteFail() throws Exception { public void testCompactionOnCopyOnWriteFail() throws Exception {
HoodieTestUtils.initTableType(hadoopConf, basePath, HoodieTableType.COPY_ON_WRITE); HoodieTestUtils.initTableType(hadoopConf, basePath, HoodieTableType.COPY_ON_WRITE);
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
basePath);
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig()); HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
compactor.compact(jsc, getConfig(), table, HoodieActiveTimeline.createNewCommitTime()); compactor.compact(jsc, getConfig(), table, HoodieActiveTimeline.createNewCommitTime());
} }
@Test @Test
public void testCompactionEmpty() throws Exception { public void testCompactionEmpty() throws Exception {
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
basePath);
HoodieWriteConfig config = getConfig(); HoodieWriteConfig config = getConfig();
HoodieTable table = HoodieTable HoodieTable table = HoodieTable.getHoodieTable(metaClient, config);
.getHoodieTable(metaClient, config);
HoodieWriteClient writeClient = new HoodieWriteClient(jsc, config); HoodieWriteClient writeClient = new HoodieWriteClient(jsc, config);
String newCommitTime = writeClient.startCommit(); String newCommitTime = writeClient.startCommit();
@@ -126,10 +121,9 @@ public class TestHoodieCompactor {
JavaRDD<HoodieRecord> recordsRDD = jsc.parallelize(records, 1); JavaRDD<HoodieRecord> recordsRDD = jsc.parallelize(records, 1);
writeClient.insert(recordsRDD, newCommitTime).collect(); writeClient.insert(recordsRDD, newCommitTime).collect();
JavaRDD<WriteStatus> result = JavaRDD<WriteStatus> result = compactor
compactor.compact(jsc, getConfig(), table, HoodieActiveTimeline.createNewCommitTime()); .compact(jsc, getConfig(), table, HoodieActiveTimeline.createNewCommitTime());
assertTrue("If there is nothing to compact, result will be empty", assertTrue("If there is nothing to compact, result will be empty", result.isEmpty());
result.isEmpty());
} }
@Test @Test
@@ -145,8 +139,7 @@ public class TestHoodieCompactor {
List<WriteStatus> statuses = writeClient.insert(recordsRDD, newCommitTime).collect(); List<WriteStatus> statuses = writeClient.insert(recordsRDD, newCommitTime).collect();
// Update all the 100 records // Update all the 100 records
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
basePath);
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config); HoodieTable table = HoodieTable.getHoodieTable(metaClient, config);
newCommitTime = "101"; newCommitTime = "101";
@@ -159,19 +152,16 @@ public class TestHoodieCompactor {
// Write them to corresponding avro logfiles // Write them to corresponding avro logfiles
HoodieTestUtils HoodieTestUtils
.writeRecordsToLogFiles(fs, metaClient.getBasePath(), HoodieTestDataGenerator.avroSchema, .writeRecordsToLogFiles(fs, metaClient.getBasePath(), HoodieTestDataGenerator.avroSchema, updatedRecords);
updatedRecords);
// Verify that all data file has one log file // Verify that all data file has one log file
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
table = HoodieTable.getHoodieTable(metaClient, config); table = HoodieTable.getHoodieTable(metaClient, config);
for (String partitionPath : dataGen.getPartitionPaths()) { for (String partitionPath : dataGen.getPartitionPaths()) {
List<FileSlice> groupedLogFiles = List<FileSlice> groupedLogFiles = table.getRTFileSystemView().getLatestFileSlices(partitionPath)
table.getRTFileSystemView().getLatestFileSlices(partitionPath)
.collect(Collectors.toList()); .collect(Collectors.toList());
for (FileSlice fileSlice : groupedLogFiles) { for (FileSlice fileSlice : groupedLogFiles) {
assertEquals("There should be 1 log file written for every data file", 1, assertEquals("There should be 1 log file written for every data file", 1, fileSlice.getLogFiles().count());
fileSlice.getLogFiles().count());
} }
} }
@@ -179,18 +169,19 @@ public class TestHoodieCompactor {
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
table = HoodieTable.getHoodieTable(metaClient, config); table = HoodieTable.getHoodieTable(metaClient, config);
JavaRDD<WriteStatus> result = JavaRDD<WriteStatus> result = compactor
compactor.compact(jsc, getConfig(), table, HoodieActiveTimeline.createNewCommitTime()); .compact(jsc, getConfig(), table, HoodieActiveTimeline.createNewCommitTime());
// Verify that all partition paths are present in the WriteStatus result // Verify that all partition paths are present in the WriteStatus result
for (String partitionPath : dataGen.getPartitionPaths()) { for (String partitionPath : dataGen.getPartitionPaths()) {
List<WriteStatus> writeStatuses = result.collect(); List<WriteStatus> writeStatuses = result.collect();
assertTrue(writeStatuses.stream() assertTrue(writeStatuses.stream()
.filter(writeStatus -> writeStatus.getStat().getPartitionPath() .filter(writeStatus -> writeStatus.getStat().getPartitionPath().contentEquals(partitionPath))
.contentEquals(partitionPath)).count() > 0); .count() > 0);
} }
} }
// TODO - after modifying HoodieReadClient to support realtime tables - add more tests to make sure the data read is the updated data (compaction correctness) // TODO - after modifying HoodieReadClient to support realtime tables - add more tests to make
// sure the data read is the updated data (compaction correctness)
// TODO - add more test cases for compactions after a failed commit/compaction // TODO - add more test cases for compactions after a failed commit/compaction
} }

View File

@@ -16,6 +16,9 @@
package com.uber.hoodie.io.strategy; package com.uber.hoodie.io.strategy;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import com.beust.jcommander.internal.Lists; import com.beust.jcommander.internal.Lists;
import com.google.common.collect.Maps; import com.google.common.collect.Maps;
import com.uber.hoodie.config.HoodieCompactionConfig; import com.uber.hoodie.config.HoodieCompactionConfig;
@@ -25,15 +28,11 @@ import com.uber.hoodie.io.compact.strategy.BoundedIOCompactionStrategy;
import com.uber.hoodie.io.compact.strategy.DayBasedCompactionStrategy; import com.uber.hoodie.io.compact.strategy.DayBasedCompactionStrategy;
import com.uber.hoodie.io.compact.strategy.LogFileSizeBasedCompactionStrategy; import com.uber.hoodie.io.compact.strategy.LogFileSizeBasedCompactionStrategy;
import com.uber.hoodie.io.compact.strategy.UnBoundedCompactionStrategy; import com.uber.hoodie.io.compact.strategy.UnBoundedCompactionStrategy;
import org.junit.Test;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Random; import java.util.Random;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.junit.Test;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
public class TestHoodieCompactionStrategy { public class TestHoodieCompactionStrategy {
@@ -48,8 +47,7 @@ public class TestHoodieCompactionStrategy {
sizesMap.put(100 * MB, Lists.newArrayList(MB)); sizesMap.put(100 * MB, Lists.newArrayList(MB));
sizesMap.put(90 * MB, Lists.newArrayList(1024 * MB)); sizesMap.put(90 * MB, Lists.newArrayList(1024 * MB));
UnBoundedCompactionStrategy strategy = new UnBoundedCompactionStrategy(); UnBoundedCompactionStrategy strategy = new UnBoundedCompactionStrategy();
HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp") HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp").withCompactionConfig(
.withCompactionConfig(
HoodieCompactionConfig.newBuilder().withCompactionStrategy(strategy).build()).build(); HoodieCompactionConfig.newBuilder().withCompactionStrategy(strategy).build()).build();
List<CompactionOperation> operations = createCompactionOperations(writeConfig, sizesMap); List<CompactionOperation> operations = createCompactionOperations(writeConfig, sizesMap);
List<CompactionOperation> returned = strategy.orderAndFilter(writeConfig, operations); List<CompactionOperation> returned = strategy.orderAndFilter(writeConfig, operations);
@@ -64,23 +62,19 @@ public class TestHoodieCompactionStrategy {
sizesMap.put(100 * MB, Lists.newArrayList(MB)); sizesMap.put(100 * MB, Lists.newArrayList(MB));
sizesMap.put(90 * MB, Lists.newArrayList(1024 * MB)); sizesMap.put(90 * MB, Lists.newArrayList(1024 * MB));
BoundedIOCompactionStrategy strategy = new BoundedIOCompactionStrategy(); BoundedIOCompactionStrategy strategy = new BoundedIOCompactionStrategy();
HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp") HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp").withCompactionConfig(
.withCompactionConfig( HoodieCompactionConfig.newBuilder().withCompactionStrategy(strategy).withTargetIOPerCompactionInMB(400).build())
HoodieCompactionConfig.newBuilder().withCompactionStrategy(strategy) .build();
.withTargetIOPerCompactionInMB(400).build()).build();
List<CompactionOperation> operations = createCompactionOperations(writeConfig, sizesMap); List<CompactionOperation> operations = createCompactionOperations(writeConfig, sizesMap);
List<CompactionOperation> returned = strategy.orderAndFilter(writeConfig, operations); List<CompactionOperation> returned = strategy.orderAndFilter(writeConfig, operations);
assertTrue("BoundedIOCompaction should have resulted in fewer compactions", assertTrue("BoundedIOCompaction should have resulted in fewer compactions", returned.size() < operations.size());
returned.size() < operations.size()); assertEquals("BoundedIOCompaction should have resulted in 2 compactions being chosen", 2, returned.size());
assertEquals("BoundedIOCompaction should have resulted in 2 compactions being chosen",
2, returned.size());
// Total size of all the log files // Total size of all the log files
Long returnedSize = returned.stream() Long returnedSize = returned.stream().map(s -> s.getMetrics().get(BoundedIOCompactionStrategy.TOTAL_IO_MB))
.map(s -> s.getMetrics().get(BoundedIOCompactionStrategy.TOTAL_IO_MB)).map(s -> (Long) s) .map(s -> (Long) s).reduce((size1, size2) -> size1 + size2).orElse(0L);
.reduce((size1, size2) -> size1 + size2).orElse(0L); assertEquals("Should choose the first 2 compactions which should result in a total IO of 690 MB", 610,
assertEquals("Should choose the first 2 compactions which should result in a total IO of 690 MB", (long) returnedSize);
610, (long) returnedSize);
} }
@Test @Test
@@ -91,23 +85,20 @@ public class TestHoodieCompactionStrategy {
sizesMap.put(100 * MB, Lists.newArrayList(MB)); sizesMap.put(100 * MB, Lists.newArrayList(MB));
sizesMap.put(90 * MB, Lists.newArrayList(1024 * MB)); sizesMap.put(90 * MB, Lists.newArrayList(1024 * MB));
LogFileSizeBasedCompactionStrategy strategy = new LogFileSizeBasedCompactionStrategy(); LogFileSizeBasedCompactionStrategy strategy = new LogFileSizeBasedCompactionStrategy();
HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp") HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp").withCompactionConfig(
.withCompactionConfig( HoodieCompactionConfig.newBuilder().withCompactionStrategy(strategy).withTargetIOPerCompactionInMB(400).build())
HoodieCompactionConfig.newBuilder().withCompactionStrategy(strategy) .build();
.withTargetIOPerCompactionInMB(400).build()).build();
List<CompactionOperation> operations = createCompactionOperations(writeConfig, sizesMap); List<CompactionOperation> operations = createCompactionOperations(writeConfig, sizesMap);
List<CompactionOperation> returned = strategy.orderAndFilter(writeConfig, operations); List<CompactionOperation> returned = strategy.orderAndFilter(writeConfig, operations);
assertTrue("LogFileSizeBasedCompactionStrategy should have resulted in fewer compactions", assertTrue("LogFileSizeBasedCompactionStrategy should have resulted in fewer compactions",
returned.size() < operations.size()); returned.size() < operations.size());
assertEquals("LogFileSizeBasedCompactionStrategy should have resulted in 1 compaction", assertEquals("LogFileSizeBasedCompactionStrategy should have resulted in 1 compaction", 1, returned.size());
1, returned.size());
// Total size of all the log files // Total size of all the log files
Long returnedSize = returned.stream() Long returnedSize = returned.stream().map(s -> s.getMetrics().get(BoundedIOCompactionStrategy.TOTAL_IO_MB))
.map(s -> s.getMetrics().get(BoundedIOCompactionStrategy.TOTAL_IO_MB)).map(s -> (Long) s) .map(s -> (Long) s).reduce((size1, size2) -> size1 + size2).orElse(0L);
.reduce((size1, size2) -> size1 + size2).orElse(0L); assertEquals("Should choose the first 2 compactions which should result in a total IO of 690 MB", 1204,
assertEquals("Should choose the first 2 compactions which should result in a total IO of 690 MB", (long) returnedSize);
1204, (long) returnedSize);
} }
@Test @Test
@@ -118,10 +109,9 @@ public class TestHoodieCompactionStrategy {
sizesMap.put(100 * MB, Lists.newArrayList(MB)); sizesMap.put(100 * MB, Lists.newArrayList(MB));
sizesMap.put(90 * MB, Lists.newArrayList(1024 * MB)); sizesMap.put(90 * MB, Lists.newArrayList(1024 * MB));
DayBasedCompactionStrategy strategy = new DayBasedCompactionStrategy(); DayBasedCompactionStrategy strategy = new DayBasedCompactionStrategy();
HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp") HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp").withCompactionConfig(
.withCompactionConfig( HoodieCompactionConfig.newBuilder().withCompactionStrategy(strategy).withTargetIOPerCompactionInMB(400).build())
HoodieCompactionConfig.newBuilder().withCompactionStrategy(strategy) .build();
.withTargetIOPerCompactionInMB(400).build()).build();
List<CompactionOperation> operations = createCompactionOperations(writeConfig, sizesMap); List<CompactionOperation> operations = createCompactionOperations(writeConfig, sizesMap);
List<CompactionOperation> returned = strategy.orderAndFilter(writeConfig, operations); List<CompactionOperation> returned = strategy.orderAndFilter(writeConfig, operations);
@@ -130,8 +120,7 @@ public class TestHoodieCompactionStrategy {
int comparision = strategy.getComparator().compare(returned.get(returned.size() - 1), returned.get(0)); int comparision = strategy.getComparator().compare(returned.get(returned.size() - 1), returned.get(0));
// Either the partition paths are sorted in descending order or they are equal // Either the partition paths are sorted in descending order or they are equal
assertTrue("DayBasedCompactionStrategy should sort partitions in descending order", assertTrue("DayBasedCompactionStrategy should sort partitions in descending order", comparision >= 0);
comparision >= 0);
} }
private List<CompactionOperation> createCompactionOperations(HoodieWriteConfig config, private List<CompactionOperation> createCompactionOperations(HoodieWriteConfig config,
@@ -140,8 +129,7 @@ public class TestHoodieCompactionStrategy {
sizesMap.forEach((k, v) -> { sizesMap.forEach((k, v) -> {
operations.add(new CompactionOperation(TestHoodieDataFile.newDataFile(k), operations.add(new CompactionOperation(TestHoodieDataFile.newDataFile(k),
partitionPaths[new Random().nextInt(partitionPaths.length - 1)], partitionPaths[new Random().nextInt(partitionPaths.length - 1)],
v.stream().map(TestHoodieLogFile::newLogFile).collect( v.stream().map(TestHoodieLogFile::newLogFile).collect(Collectors.toList()), config));
Collectors.toList()), config));
}); });
return operations; return operations;
} }
@@ -28,6 +28,10 @@ public class TestHoodieDataFile extends HoodieDataFile {
this.size = size; this.size = size;
} }
public static HoodieDataFile newDataFile(long size) {
return new TestHoodieDataFile(size);
}
@Override @Override
public String getPath() { public String getPath() {
return "/tmp/test"; return "/tmp/test";
@@ -43,13 +47,8 @@ public class TestHoodieDataFile extends HoodieDataFile {
return "100"; return "100";
} }
@Override @Override
public long getFileSize() { public long getFileSize() {
return size; return size;
} }
public static HoodieDataFile newDataFile(long size) {
return new TestHoodieDataFile(size);
}
} }
@@ -29,6 +29,10 @@ public class TestHoodieLogFile extends HoodieLogFile {
this.size = size; this.size = size;
} }
public static HoodieLogFile newLogFile(long size) {
return new TestHoodieLogFile(size);
}
@Override @Override
public Path getPath() { public Path getPath() {
return new Path("/tmp/test-log"); return new Path("/tmp/test-log");
@@ -38,8 +42,4 @@ public class TestHoodieLogFile extends HoodieLogFile {
public Optional<Long> getFileSize() { public Optional<Long> getFileSize() {
return Optional.of(size); return Optional.of(size);
} }
public static HoodieLogFile newLogFile(long size) {
return new TestHoodieLogFile(size);
}
} }
@@ -40,7 +40,6 @@ public class TestHoodieMetrics {
@Test @Test
public void testRegisterGauge() { public void testRegisterGauge() {
metrics.registerGauge("metric1", 123L); metrics.registerGauge("metric1", 123L);
assertTrue(Metrics.getInstance().getRegistry().getGauges().get("metric1").getValue().toString() assertTrue(Metrics.getInstance().getRegistry().getGauges().get("metric1").getValue().toString().equals("123"));
.equals("123"));
} }
} }
@@ -89,14 +89,13 @@ public class TestCopyOnWriteTable {
String commitTime = HoodieTestUtils.makeNewCommitTime(); String commitTime = HoodieTestUtils.makeNewCommitTime();
HoodieWriteConfig config = makeHoodieClientConfig(); HoodieWriteConfig config = makeHoodieClientConfig();
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
basePath);
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config); HoodieTable table = HoodieTable.getHoodieTable(metaClient, config);
HoodieCreateHandle io = new HoodieCreateHandle(config, commitTime, table, partitionPath); HoodieCreateHandle io = new HoodieCreateHandle(config, commitTime, table, partitionPath);
Path newPath = io.makeNewPath(record.getPartitionPath(), unitNumber, fileName); Path newPath = io.makeNewPath(record.getPartitionPath(), unitNumber, fileName);
assertTrue(newPath.toString().equals(this.basePath + "/" + partitionPath + "/" + FSUtils assertTrue(newPath.toString().equals(
.makeDataFileName(commitTime, unitNumber, fileName))); this.basePath + "/" + partitionPath + "/" + FSUtils.makeDataFileName(commitTime, unitNumber, fileName)));
} }
private HoodieWriteConfig makeHoodieClientConfig() throws Exception { private HoodieWriteConfig makeHoodieClientConfig() throws Exception {
@@ -105,8 +104,7 @@ public class TestCopyOnWriteTable {
private HoodieWriteConfig.Builder makeHoodieClientConfigBuilder() throws Exception { private HoodieWriteConfig.Builder makeHoodieClientConfigBuilder() throws Exception {
// Prepare the AvroParquetIO // Prepare the AvroParquetIO
String schemaStr = IOUtils String schemaStr = IOUtils.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8");
.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8");
return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(schemaStr); return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(schemaStr);
} }
@@ -122,28 +120,27 @@ public class TestCopyOnWriteTable {
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata); HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
// Get some records belong to the same partition (2016/01/31) // Get some records belong to the same partition (2016/01/31)
String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
String recordStr4 = "{\"_row_key\":\"8eb5b87d-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":51}"; + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
String recordStr4 = "{\"_row_key\":\"8eb5b87d-1fej-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":51}";
List<HoodieRecord> records = new ArrayList<>(); List<HoodieRecord> records = new ArrayList<>();
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
records.add( records.add(new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1));
new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
rowChange1));
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
records.add( records.add(new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2));
new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
rowChange2));
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3); TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
records.add( records.add(new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));
new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
rowChange3));
// Insert new records // Insert new records
HoodieClientTestUtils.collectStatuses(table.handleInsert(firstCommitTime, records.iterator())); HoodieClientTestUtils.collectStatuses(table.handleInsert(firstCommitTime, records.iterator()));
// We should have a parquet file generated (TODO: better control # files after we revise AvroParquetIO) // We should have a parquet file generated (TODO: better control # files after we revise
// AvroParquetIO)
File parquetFile = null; File parquetFile = null;
for (File file : new File(this.basePath + partitionPath).listFiles()) { for (File file : new File(this.basePath + partitionPath).listFiles()) {
if (file.getName().endsWith(".parquet")) { if (file.getName().endsWith(".parquet")) {
@@ -155,18 +152,17 @@ public class TestCopyOnWriteTable {
// Read out the bloom filter and make sure filter can answer record exist or not // Read out the bloom filter and make sure filter can answer record exist or not
Path parquetFilePath = new Path(parquetFile.getAbsolutePath()); Path parquetFilePath = new Path(parquetFile.getAbsolutePath());
BloomFilter filter = ParquetUtils BloomFilter filter = ParquetUtils.readBloomFilterFromParquetMetadata(jsc.hadoopConfiguration(), parquetFilePath);
.readBloomFilterFromParquetMetadata(jsc.hadoopConfiguration(), parquetFilePath);
for (HoodieRecord record : records) { for (HoodieRecord record : records) {
assertTrue(filter.mightContain(record.getRecordKey())); assertTrue(filter.mightContain(record.getRecordKey()));
} }
// Create a commit file // Create a commit file
new File(this.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" new File(
+ FSUtils.getCommitTime(parquetFile.getName()) + ".commit").createNewFile(); this.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + FSUtils.getCommitTime(parquetFile.getName())
+ ".commit").createNewFile();
// Read the parquet file, check the record content // Read the parquet file, check the record content
List<GenericRecord> fileRecords = ParquetUtils List<GenericRecord> fileRecords = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), parquetFilePath);
.readAvroRecords(jsc.hadoopConfiguration(), parquetFilePath);
GenericRecord newRecord; GenericRecord newRecord;
int index = 0; int index = 0;
for (GenericRecord record : fileRecords) { for (GenericRecord record : fileRecords) {
@@ -175,13 +171,12 @@ public class TestCopyOnWriteTable {
} }
// We update the 1st record & add a new record // We update the 1st record & add a new record
String updateRecordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; String updateRecordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
TestRawTripPayload updateRowChanges1 = new TestRawTripPayload(updateRecordStr1); TestRawTripPayload updateRowChanges1 = new TestRawTripPayload(updateRecordStr1);
HoodieRecord updatedRecord1 = new HoodieRecord( HoodieRecord updatedRecord1 = new HoodieRecord(
new HoodieKey(updateRowChanges1.getRowKey(), updateRowChanges1.getPartitionPath()), new HoodieKey(updateRowChanges1.getRowKey(), updateRowChanges1.getPartitionPath()), updateRowChanges1);
updateRowChanges1); updatedRecord1.setCurrentLocation(new HoodieRecordLocation(null, FSUtils.getFileId(parquetFile.getName())));
updatedRecord1.setCurrentLocation(
new HoodieRecordLocation(null, FSUtils.getFileId(parquetFile.getName())));
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4); TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
HoodieRecord insertedRecord1 = new HoodieRecord( HoodieRecord insertedRecord1 = new HoodieRecord(
@@ -201,9 +196,8 @@ public class TestCopyOnWriteTable {
File updatedParquetFile = null; File updatedParquetFile = null;
for (File file : new File(basePath + "/2016/01/31").listFiles()) { for (File file : new File(basePath + "/2016/01/31").listFiles()) {
if (file.getName().endsWith(".parquet")) { if (file.getName().endsWith(".parquet")) {
if (FSUtils.getFileId(file.getName()) if (FSUtils.getFileId(file.getName()).equals(FSUtils.getFileId(parquetFile.getName()))
.equals(FSUtils.getFileId(parquetFile.getName())) && && HoodieTimeline.compareTimestamps(FSUtils.getCommitTime(file.getName()),
HoodieTimeline.compareTimestamps(FSUtils.getCommitTime(file.getName()),
FSUtils.getCommitTime(parquetFile.getName()), HoodieTimeline.GREATER)) { FSUtils.getCommitTime(parquetFile.getName()), HoodieTimeline.GREATER)) {
updatedParquetFile = file; updatedParquetFile = file;
break; break;
@@ -213,8 +207,8 @@ public class TestCopyOnWriteTable {
assertTrue(updatedParquetFile != null); assertTrue(updatedParquetFile != null);
// Check whether the record has been updated // Check whether the record has been updated
Path updatedParquetFilePath = new Path(updatedParquetFile.getAbsolutePath()); Path updatedParquetFilePath = new Path(updatedParquetFile.getAbsolutePath());
BloomFilter updatedFilter = ParquetUtils BloomFilter updatedFilter = ParquetUtils.readBloomFilterFromParquetMetadata(jsc.hadoopConfiguration(),
.readBloomFilterFromParquetMetadata(jsc.hadoopConfiguration(), updatedParquetFilePath); updatedParquetFilePath);
for (HoodieRecord record : records) { for (HoodieRecord record : records) {
// No change to the _row_key // No change to the _row_key
assertTrue(updatedFilter.mightContain(record.getRecordKey())); assertTrue(updatedFilter.mightContain(record.getRecordKey()));
@@ -223,8 +217,7 @@ public class TestCopyOnWriteTable {
assertTrue(updatedFilter.mightContain(insertedRecord1.getRecordKey())); assertTrue(updatedFilter.mightContain(insertedRecord1.getRecordKey()));
records.add(insertedRecord1);// add this so it can further check below records.add(insertedRecord1);// add this so it can further check below
ParquetReader updatedReader = ParquetReader ParquetReader updatedReader = ParquetReader.builder(new AvroReadSupport<>(), updatedParquetFilePath).build();
.builder(new AvroReadSupport<>(), updatedParquetFilePath).build();
index = 0; index = 0;
while ((newRecord = (GenericRecord) updatedReader.read()) != null) { while ((newRecord = (GenericRecord) updatedReader.read()) != null) {
assertTrue(newRecord.get("_row_key").toString().equals(records.get(index).getRecordKey())); assertTrue(newRecord.get("_row_key").toString().equals(records.get(index).getRecordKey()));
@@ -246,13 +239,9 @@ public class TestCopyOnWriteTable {
List<HoodieRecord> records = new ArrayList<>(); List<HoodieRecord> records = new ArrayList<>();
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
String recordStr = String.format("{\"_row_key\":\"%s\",\"time\":\"%s\",\"number\":%d}", String recordStr = String.format("{\"_row_key\":\"%s\",\"time\":\"%s\",\"number\":%d}",
UUID.randomUUID().toString(), UUID.randomUUID().toString(), time, i);
time,
i);
TestRawTripPayload rowChange = new TestRawTripPayload(recordStr); TestRawTripPayload rowChange = new TestRawTripPayload(recordStr);
records.add(new HoodieRecord( records.add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange));
new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()),
rowChange));
} }
return records; return records;
} }
@@ -261,31 +250,28 @@ public class TestCopyOnWriteTable {
@Test @Test
public void testMetadataAggregateFromWriteStatus() throws Exception { public void testMetadataAggregateFromWriteStatus() throws Exception {
// Prepare the AvroParquetIO // Prepare the AvroParquetIO
HoodieWriteConfig config = makeHoodieClientConfigBuilder() HoodieWriteConfig config = makeHoodieClientConfigBuilder().withWriteStatusClass(MetadataMergeWriteStatus.class)
.withWriteStatusClass(MetadataMergeWriteStatus.class).build(); .build();
String firstCommitTime = HoodieTestUtils.makeNewCommitTime(); String firstCommitTime = HoodieTestUtils.makeNewCommitTime();
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata); HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
// Get some records belong to the same partition (2016/01/31) // Get some records belong to the same partition (2016/01/31)
String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
List<HoodieRecord> records = new ArrayList<>(); List<HoodieRecord> records = new ArrayList<>();
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
records.add( records.add(new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1));
new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
rowChange1));
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
records.add( records.add(new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2));
new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
rowChange2));
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3); TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
records.add( records.add(new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));
new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
rowChange3));
// Insert new records // Insert new records
List<WriteStatus> writeStatuses = HoodieClientTestUtils List<WriteStatus> writeStatuses = HoodieClientTestUtils
@@ -293,7 +279,8 @@ public class TestCopyOnWriteTable {
Map<String, String> allWriteStatusMergedMetadataMap = MetadataMergeWriteStatus Map<String, String> allWriteStatusMergedMetadataMap = MetadataMergeWriteStatus
.mergeMetadataForWriteStatuses(writeStatuses); .mergeMetadataForWriteStatuses(writeStatuses);
assertTrue(allWriteStatusMergedMetadataMap.containsKey("InputRecordCount_1506582000")); assertTrue(allWriteStatusMergedMetadataMap.containsKey("InputRecordCount_1506582000"));
// For metadata key InputRecordCount_1506582000, value is 2 for each record. So sum of this should be 2 * 3 // For metadata key InputRecordCount_1506582000, value is 2 for each record. So sum of this
// should be 2 * 3
assertEquals("6", allWriteStatusMergedMetadataMap.get("InputRecordCount_1506582000")); assertEquals("6", allWriteStatusMergedMetadataMap.get("InputRecordCount_1506582000"));
} }
@@ -314,26 +301,19 @@ public class TestCopyOnWriteTable {
List<WriteStatus> statuses = HoodieClientTestUtils List<WriteStatus> statuses = HoodieClientTestUtils
.collectStatuses(table.handleInsert(commitTime, records.iterator())); .collectStatuses(table.handleInsert(commitTime, records.iterator()));
WriteStatus status = statuses.get(0); WriteStatus status = statuses.get(0);
Path partialFile = new Path(String.format("%s/%s/%s", Path partialFile = new Path(String.format("%s/%s/%s", basePath, status.getPartitionPath(),
basePath, FSUtils.makeDataFileName(commitTime, 0, status.getFileId())));
status.getPartitionPath(),
FSUtils.makeDataFileName(commitTime, 0, status.getFileId()))
);
assertTrue(fs.exists(partialFile)); assertTrue(fs.exists(partialFile));
// When we retry // When we retry
records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z"); records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z");
records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z")); records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z"));
statuses = HoodieClientTestUtils statuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator()));
.collectStatuses(table.handleInsert(commitTime, records.iterator()));
status = statuses.get(0); status = statuses.get(0);
Path retriedFIle = new Path(String.format("%s/%s/%s", Path retriedFIle = new Path(String.format("%s/%s/%s", basePath, status.getPartitionPath(),
basePath, FSUtils.makeDataFileName(commitTime, 0, status.getFileId())));
status.getPartitionPath(),
FSUtils.makeDataFileName(commitTime, 0, status.getFileId()))
);
assertTrue(fs.exists(retriedFIle)); assertTrue(fs.exists(retriedFIle));
assertFalse(fs.exists(partialFile)); assertFalse(fs.exists(partialFile));
} }
@@ -371,8 +351,7 @@ public class TestCopyOnWriteTable {
records.addAll(newHoodieRecords(1, "2016-02-02T03:16:41.415Z")); records.addAll(newHoodieRecords(1, "2016-02-02T03:16:41.415Z"));
// Insert new records // Insert new records
returnedStatuses = HoodieClientTestUtils returnedStatuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator()));
.collectStatuses(table.handleInsert(commitTime, records.iterator()));
assertEquals(3, returnedStatuses.size()); assertEquals(3, returnedStatuses.size());
assertEquals("2016/01/31", returnedStatuses.get(0).getPartitionPath()); assertEquals("2016/01/31", returnedStatuses.get(0).getPartitionPath());
@@ -389,8 +368,8 @@ public class TestCopyOnWriteTable {
@Test @Test
public void testFileSizeUpsertRecords() throws Exception { public void testFileSizeUpsertRecords() throws Exception {
HoodieWriteConfig config = makeHoodieClientConfigBuilder().withStorageConfig( HoodieWriteConfig config = makeHoodieClientConfigBuilder().withStorageConfig(
HoodieStorageConfig.newBuilder().limitFileSize(64 * 1024).parquetBlockSize(64 * 1024) HoodieStorageConfig.newBuilder().limitFileSize(64 * 1024).parquetBlockSize(64 * 1024).parquetPageSize(64 * 1024)
.parquetPageSize(64 * 1024).build()).build(); .build()).build();
String commitTime = HoodieTestUtils.makeNewCommitTime(); String commitTime = HoodieTestUtils.makeNewCommitTime();
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata); HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
@@ -398,12 +377,11 @@ public class TestCopyOnWriteTable {
List<HoodieRecord> records = new ArrayList<>(); List<HoodieRecord> records = new ArrayList<>();
// Approx 1150 records are written for block size of 64KB // Approx 1150 records are written for block size of 64KB
for (int i = 0; i < 2000; i++) { for (int i = 0; i < 2000; i++) {
String recordStr = "{\"_row_key\":\"" + UUID.randomUUID().toString() String recordStr =
+ "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":" + i + "}"; "{\"_row_key\":\"" + UUID.randomUUID().toString() + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":" + i
+ "}";
TestRawTripPayload rowChange = new TestRawTripPayload(recordStr); TestRawTripPayload rowChange = new TestRawTripPayload(recordStr);
records records.add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange));
.add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()),
rowChange));
} }
// Insert new records // Insert new records
@@ -412,39 +390,30 @@ public class TestCopyOnWriteTable {
// Check the updated file // Check the updated file
int counts = 0; int counts = 0;
for (File file : new File(basePath + "/2016/01/31").listFiles()) { for (File file : new File(basePath + "/2016/01/31").listFiles()) {
if (file.getName().endsWith(".parquet") && FSUtils.getCommitTime(file.getName()) if (file.getName().endsWith(".parquet") && FSUtils.getCommitTime(file.getName()).equals(commitTime)) {
.equals(commitTime)) {
System.out.println(file.getName() + "-" + file.length()); System.out.println(file.getName() + "-" + file.length());
counts++; counts++;
} }
} }
assertEquals( assertEquals("If the number of records are more than 1150, then there should be a new file", 3, counts);
"If the number of records are more than 1150, then there should be a new file", 3,
counts);
} }
private List<HoodieCopyOnWriteTable.InsertBucket> testUpsertPartitioner(int smallFileSize, private List<HoodieCopyOnWriteTable.InsertBucket> testUpsertPartitioner(int smallFileSize, int numInserts,
int numInserts, int numUpdates, int fileSize, boolean autoSplitInserts) throws Exception {
int numUpdates, final String testPartitionPath = "2016/09/26";
int fileSize, HoodieWriteConfig config = makeHoodieClientConfigBuilder().withCompactionConfig(
boolean autoSplitInserts) throws Exception { HoodieCompactionConfig.newBuilder().compactionSmallFileSize(smallFileSize).insertSplitSize(100)
final String TEST_PARTITION_PATH = "2016/09/26"; .autoTuneInsertSplits(autoSplitInserts).build()).withStorageConfig(
HoodieWriteConfig config = makeHoodieClientConfigBuilder() HoodieStorageConfig.newBuilder().limitFileSize(1000 * 1024).build()).build();
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
.compactionSmallFileSize(smallFileSize).insertSplitSize(100)
.autoTuneInsertSplits(autoSplitInserts).build())
.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1000 * 1024).build())
.build();
HoodieClientTestUtils.fakeCommitFile(basePath, "001"); HoodieClientTestUtils.fakeCommitFile(basePath, "001");
HoodieClientTestUtils.fakeDataFile(basePath, TEST_PARTITION_PATH, "001", "file1", fileSize); HoodieClientTestUtils.fakeDataFile(basePath, testPartitionPath, "001", "file1", fileSize);
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata); HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator( HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[] {testPartitionPath});
new String[]{TEST_PARTITION_PATH});
List<HoodieRecord> insertRecords = dataGenerator.generateInserts("001", numInserts); List<HoodieRecord> insertRecords = dataGenerator.generateInserts("001", numInserts);
List<HoodieRecord> updateRecords = dataGenerator.generateUpdates("001", numUpdates); List<HoodieRecord> updateRecords = dataGenerator.generateUpdates("001", numUpdates);
for (HoodieRecord updateRec : updateRecords) { for (HoodieRecord updateRec : updateRecords) {
@@ -454,8 +423,8 @@ public class TestCopyOnWriteTable {
records.addAll(insertRecords); records.addAll(insertRecords);
records.addAll(updateRecords); records.addAll(updateRecords);
WorkloadProfile profile = new WorkloadProfile(jsc.parallelize(records)); WorkloadProfile profile = new WorkloadProfile(jsc.parallelize(records));
HoodieCopyOnWriteTable.UpsertPartitioner partitioner = (HoodieCopyOnWriteTable.UpsertPartitioner) HoodieCopyOnWriteTable.UpsertPartitioner partitioner =
table.getUpsertPartitioner(profile); (HoodieCopyOnWriteTable.UpsertPartitioner) table.getUpsertPartitioner(profile);
assertEquals("Should have 3 partitions", 3, partitioner.numPartitions()); assertEquals("Should have 3 partitions", 3, partitioner.numPartitions());
assertEquals("Bucket 0 is UPDATE", HoodieCopyOnWriteTable.BucketType.UPDATE, assertEquals("Bucket 0 is UPDATE", HoodieCopyOnWriteTable.BucketType.UPDATE,
@@ -464,40 +433,35 @@ public class TestCopyOnWriteTable {
partitioner.getBucketInfo(1).bucketType); partitioner.getBucketInfo(1).bucketType);
assertEquals("Bucket 2 is INSERT", HoodieCopyOnWriteTable.BucketType.INSERT, assertEquals("Bucket 2 is INSERT", HoodieCopyOnWriteTable.BucketType.INSERT,
partitioner.getBucketInfo(2).bucketType); partitioner.getBucketInfo(2).bucketType);
assertEquals("Update record should have gone to the 1 update partiton", 0, assertEquals("Update record should have gone to the 1 update partiton", 0, partitioner.getPartition(
partitioner.getPartition(new Tuple2<>(updateRecords.get(0).getKey(), new Tuple2<>(updateRecords.get(0).getKey(), Option.apply(updateRecords.get(0).getCurrentLocation()))));
Option.apply(updateRecords.get(0).getCurrentLocation())))); return partitioner.getInsertBuckets(testPartitionPath);
return partitioner.getInsertBuckets(TEST_PARTITION_PATH);
} }
@Test @Test
public void testUpsertPartitioner() throws Exception { public void testUpsertPartitioner() throws Exception {
// Inserts + Updates... Check all updates go together & inserts subsplit // Inserts + Updates... Check all updates go together & inserts subsplit
List<HoodieCopyOnWriteTable.InsertBucket> insertBuckets = testUpsertPartitioner(0, 200, 100, List<HoodieCopyOnWriteTable.InsertBucket> insertBuckets = testUpsertPartitioner(0, 200, 100, 1024, false);
1024, false);
assertEquals("Total of 2 insert buckets", 2, insertBuckets.size()); assertEquals("Total of 2 insert buckets", 2, insertBuckets.size());
} }
@Test @Test
public void testUpsertPartitionerWithSmallInsertHandling() throws Exception { public void testUpsertPartitionerWithSmallInsertHandling() throws Exception {
// Inserts + Updates .. Check updates go together & inserts subsplit, after expanding smallest file // Inserts + Updates .. Check updates go together & inserts subsplit, after expanding
List<HoodieCopyOnWriteTable.InsertBucket> insertBuckets = testUpsertPartitioner(1000 * 1024, // smallest file
400, 100, 800 * 1024, false); List<HoodieCopyOnWriteTable.InsertBucket> insertBuckets = testUpsertPartitioner(1000 * 1024, 400, 100, 800 * 1024,
false);
assertEquals("Total of 3 insert buckets", 3, insertBuckets.size()); assertEquals("Total of 3 insert buckets", 3, insertBuckets.size());
assertEquals("First insert bucket must be same as update bucket", 0, assertEquals("First insert bucket must be same as update bucket", 0, insertBuckets.get(0).bucketNumber);
insertBuckets.get(0).bucketNumber); assertEquals("First insert bucket should have weight 0.5", 0.5, insertBuckets.get(0).weight, 0.01);
assertEquals("First insert bucket should have weight 0.5", 0.5, insertBuckets.get(0).weight,
0.01);
// Now with insert split size auto tuned // Now with insert split size auto tuned
insertBuckets = testUpsertPartitioner(1000 * 1024, 2400, 100, 800 * 1024, true); insertBuckets = testUpsertPartitioner(1000 * 1024, 2400, 100, 800 * 1024, true);
assertEquals("Total of 3 insert buckets", 3, insertBuckets.size()); assertEquals("Total of 3 insert buckets", 3, insertBuckets.size());
assertEquals("First insert bucket must be same as update bucket", 0, assertEquals("First insert bucket must be same as update bucket", 0, insertBuckets.get(0).bucketNumber);
insertBuckets.get(0).bucketNumber); assertEquals("First insert bucket should have weight 0.5", 200.0 / 2400, insertBuckets.get(0).weight, 0.01);
assertEquals("First insert bucket should have weight 0.5", 200.0 / 2400,
insertBuckets.get(0).weight, 0.01);
} }
@After @After
@@ -18,7 +18,6 @@
package com.uber.hoodie.table; package com.uber.hoodie.table;
import static com.uber.hoodie.common.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA; import static com.uber.hoodie.common.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA;
import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertFalse;
@@ -76,15 +75,14 @@ import org.junit.rules.TemporaryFolder;
public class TestMergeOnReadTable { public class TestMergeOnReadTable {
private transient JavaSparkContext jsc = null;
private transient SQLContext sqlContext;
private static String basePath = null; private static String basePath = null;
//NOTE : Be careful in using DFS (FileSystem.class) vs LocalFs(RawLocalFileSystem.class) //NOTE : Be careful in using DFS (FileSystem.class) vs LocalFs(RawLocalFileSystem.class)
//The implementation and guarantees of many APIs differ, for example check rename(src,dst) //The implementation and guarantees of many APIs differ, for example check rename(src,dst)
private static MiniDFSCluster dfsCluster; private static MiniDFSCluster dfsCluster;
private static DistributedFileSystem dfs; private static DistributedFileSystem dfs;
private static HdfsTestService hdfsTestService; private static HdfsTestService hdfsTestService;
private transient JavaSparkContext jsc = null;
private transient SQLContext sqlContext;
@AfterClass @AfterClass
public static void cleanUp() throws Exception { public static void cleanUp() throws Exception {
@@ -92,13 +90,15 @@ public class TestMergeOnReadTable {
hdfsTestService.stop(); hdfsTestService.stop();
dfsCluster.shutdown(); dfsCluster.shutdown();
} }
// Need to closeAll to clear FileSystem.Cache, required because DFS and LocalFS used in the same JVM // Need to closeAll to clear FileSystem.Cache, required because DFS and LocalFS used in the
// same JVM
FileSystem.closeAll(); FileSystem.closeAll();
} }
@BeforeClass @BeforeClass
public static void setUpDFS() throws IOException { public static void setUpDFS() throws IOException {
// Need to closeAll to clear FileSystem.Cache, required because DFS and LocalFS used in the same JVM // Need to closeAll to clear FileSystem.Cache, required because DFS and LocalFS used in the
// same JVM
FileSystem.closeAll(); FileSystem.closeAll();
if (hdfsTestService == null) { if (hdfsTestService == null) {
hdfsTestService = new HdfsTestService(); hdfsTestService = new HdfsTestService();
@@ -111,8 +111,7 @@ public class TestMergeOnReadTable {
@Before @Before
public void init() throws IOException { public void init() throws IOException {
// Initialize a local spark env // Initialize a local spark env
jsc = new JavaSparkContext( jsc = new JavaSparkContext(HoodieClientTestUtils.getSparkConfForTest("TestHoodieMergeOnReadTable"));
HoodieClientTestUtils.getSparkConfForTest("TestHoodieMergeOnReadTable"));
// Create a temp folder as the base path // Create a temp folder as the base path
TemporaryFolder folder = new TemporaryFolder(); TemporaryFolder folder = new TemporaryFolder();
@@ -154,28 +153,23 @@ public class TestMergeOnReadTable {
List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect(); List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect();
assertNoWriteErrors(statuses); assertNoWriteErrors(statuses);
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
cfg.getBasePath());
HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg); HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
Optional<HoodieInstant> deltaCommit = Optional<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
assertTrue(deltaCommit.isPresent()); assertTrue(deltaCommit.isPresent());
assertEquals("Delta commit should be 001", "001", deltaCommit.get().getTimestamp()); assertEquals("Delta commit should be 001", "001", deltaCommit.get().getTimestamp());
Optional<HoodieInstant> commit = Optional<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
assertFalse(commit.isPresent()); assertFalse(commit.isPresent());
FileStatus[] allFiles = HoodieTestUtils FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
TableFileSystemView.ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient, TableFileSystemView.ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient,
hoodieTable.getCommitTimeline().filterCompletedInstants(), allFiles); hoodieTable.getCommitTimeline().filterCompletedInstants(), allFiles);
Stream<HoodieDataFile> dataFilesToRead = roView.getLatestDataFiles(); Stream<HoodieDataFile> dataFilesToRead = roView.getLatestDataFiles();
assertTrue(!dataFilesToRead.findAny().isPresent()); assertTrue(!dataFilesToRead.findAny().isPresent());
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), allFiles);
allFiles);
dataFilesToRead = roView.getLatestDataFiles(); dataFilesToRead = roView.getLatestDataFiles();
assertTrue("RealtimeTableView should list the parquet files we wrote in the delta commit", assertTrue("RealtimeTableView should list the parquet files we wrote in the delta commit",
dataFilesToRead.findAny().isPresent()); dataFilesToRead.findAny().isPresent());
@@ -209,21 +203,17 @@ public class TestMergeOnReadTable {
client.compact(compactionCommitTime); client.compact(compactionCommitTime);
allFiles = HoodieTestUtils.listAllDataFilesInPath(dfs, cfg.getBasePath()); allFiles = HoodieTestUtils.listAllDataFilesInPath(dfs, cfg.getBasePath());
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), allFiles);
allFiles);
dataFilesToRead = roView.getLatestDataFiles(); dataFilesToRead = roView.getLatestDataFiles();
assertTrue(dataFilesToRead.findAny().isPresent()); assertTrue(dataFilesToRead.findAny().isPresent());
// verify that there is a commit // verify that there is a commit
HoodieTable table = HoodieTable.getHoodieTable( HoodieTable table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath(), true), new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath(), true), getConfig(false));
getConfig(false));
HoodieTimeline timeline = table.getCommitTimeline().filterCompletedInstants(); HoodieTimeline timeline = table.getCommitTimeline().filterCompletedInstants();
assertEquals("Expecting a single commit.", 1, assertEquals("Expecting a single commit.", 1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
String latestCompactionCommitTime = timeline.lastInstant().get().getTimestamp(); String latestCompactionCommitTime = timeline.lastInstant().get().getTimestamp();
assertTrue(HoodieTimeline assertTrue(HoodieTimeline.compareTimestamps("000", latestCompactionCommitTime, HoodieTimeline.LESSER));
.compareTimestamps("000", latestCompactionCommitTime, HoodieTimeline.LESSER));
assertEquals("Must contain 200 records", 200, assertEquals("Must contain 200 records", 200,
HoodieClientTestUtils.readSince(basePath, sqlContext, timeline, "000").count()); HoodieClientTestUtils.readSince(basePath, sqlContext, timeline, "000").count());
@@ -232,8 +222,7 @@ public class TestMergeOnReadTable {
// Check if record level metadata is aggregated properly at the end of write. // Check if record level metadata is aggregated properly at the end of write.
@Test @Test
public void testMetadataAggregateFromWriteStatus() throws Exception { public void testMetadataAggregateFromWriteStatus() throws Exception {
HoodieWriteConfig cfg = getConfigBuilder(false) HoodieWriteConfig cfg = getConfigBuilder(false).withWriteStatusClass(MetadataMergeWriteStatus.class).build();
.withWriteStatusClass(MetadataMergeWriteStatus.class).build();
HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
String newCommitTime = "001"; String newCommitTime = "001";
@@ -248,7 +237,8 @@ public class TestMergeOnReadTable {
Map<String, String> allWriteStatusMergedMetadataMap = MetadataMergeWriteStatus Map<String, String> allWriteStatusMergedMetadataMap = MetadataMergeWriteStatus
.mergeMetadataForWriteStatuses(statuses); .mergeMetadataForWriteStatuses(statuses);
assertTrue(allWriteStatusMergedMetadataMap.containsKey("InputRecordCount_1506582000")); assertTrue(allWriteStatusMergedMetadataMap.containsKey("InputRecordCount_1506582000"));
// For metadata key InputRecordCount_1506582000, value is 2 for each record. So sum of this should be 2 * records.size() // For metadata key InputRecordCount_1506582000, value is 2 for each record. So sum of this
// should be 2 * records.size()
assertEquals(String.valueOf(2 * records.size()), assertEquals(String.valueOf(2 * records.size()),
allWriteStatusMergedMetadataMap.get("InputRecordCount_1506582000")); allWriteStatusMergedMetadataMap.get("InputRecordCount_1506582000"));
} }
@@ -271,28 +261,23 @@ public class TestMergeOnReadTable {
List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect(); List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect();
assertNoWriteErrors(statuses); assertNoWriteErrors(statuses);
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
cfg.getBasePath());
HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg); HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
Optional<HoodieInstant> deltaCommit = Optional<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
assertTrue(deltaCommit.isPresent()); assertTrue(deltaCommit.isPresent());
assertEquals("Delta commit should be 001", "001", deltaCommit.get().getTimestamp()); assertEquals("Delta commit should be 001", "001", deltaCommit.get().getTimestamp());
Optional<HoodieInstant> commit = Optional<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
assertFalse(commit.isPresent()); assertFalse(commit.isPresent());
FileStatus[] allFiles = HoodieTestUtils FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
TableFileSystemView.ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient, TableFileSystemView.ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient,
hoodieTable.getCommitTimeline().filterCompletedInstants(), allFiles); hoodieTable.getCommitTimeline().filterCompletedInstants(), allFiles);
Stream<HoodieDataFile> dataFilesToRead = roView.getLatestDataFiles(); Stream<HoodieDataFile> dataFilesToRead = roView.getLatestDataFiles();
assertTrue(!dataFilesToRead.findAny().isPresent()); assertTrue(!dataFilesToRead.findAny().isPresent());
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), allFiles);
allFiles);
dataFilesToRead = roView.getLatestDataFiles(); dataFilesToRead = roView.getLatestDataFiles();
assertTrue("RealtimeTableView should list the parquet files we wrote in the delta commit", assertTrue("RealtimeTableView should list the parquet files we wrote in the delta commit",
dataFilesToRead.findAny().isPresent()); dataFilesToRead.findAny().isPresent());
@@ -329,15 +314,12 @@ public class TestMergeOnReadTable {
assertFalse(commit.isPresent()); assertFalse(commit.isPresent());
allFiles = HoodieTestUtils.listAllDataFilesInPath(dfs, cfg.getBasePath()); allFiles = HoodieTestUtils.listAllDataFilesInPath(dfs, cfg.getBasePath());
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), allFiles);
allFiles);
dataFilesToRead = roView.getLatestDataFiles(); dataFilesToRead = roView.getLatestDataFiles();
assertTrue(dataFilesToRead.findAny().isPresent()); assertTrue(dataFilesToRead.findAny().isPresent());
List<String> dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()) List<String> dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()).collect(Collectors.toList());
.collect(Collectors.toList()); List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles, basePath);
List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils
.getRecordsUsingInputFormat(dataFiles, basePath);
//Wrote 40 records and deleted 20 records, so remaining 40-20 = 20 //Wrote 40 records and deleted 20 records, so remaining 40-20 = 20
assertEquals("Must contain 20 records", 20, recordsRead.size()); assertEquals("Must contain 20 records", 20, recordsRead.size());
} }
@@ -365,10 +347,8 @@ public class TestMergeOnReadTable {
//verify there are no errors //verify there are no errors
assertNoWriteErrors(statuses); assertNoWriteErrors(statuses);
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
cfg.getBasePath()); Optional<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
Optional<HoodieInstant> commit =
metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
assertTrue(commit.isPresent()); assertTrue(commit.isPresent());
assertEquals("commit should be 001", "001", commit.get().getTimestamp()); assertEquals("commit should be 001", "001", commit.get().getTimestamp());
@@ -391,10 +371,8 @@ public class TestMergeOnReadTable {
client.rollback(newCommitTime); client.rollback(newCommitTime);
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath()); metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
HoodieTable hoodieTable = HoodieTable HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
.getHoodieTable(metaClient, cfg); FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
FileStatus[] allFiles = HoodieTestUtils
.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
HoodieTableFileSystemView roView = new HoodieTableFileSystemView(metaClient, HoodieTableFileSystemView roView = new HoodieTableFileSystemView(metaClient,
hoodieTable.getCompletedCommitTimeline(), allFiles); hoodieTable.getCompletedCommitTimeline(), allFiles);
@@ -428,28 +406,23 @@ public class TestMergeOnReadTable {
List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect(); List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect();
assertNoWriteErrors(statuses); assertNoWriteErrors(statuses);
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
cfg.getBasePath());
HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg); HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
Optional<HoodieInstant> deltaCommit = Optional<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
assertTrue(deltaCommit.isPresent()); assertTrue(deltaCommit.isPresent());
assertEquals("Delta commit should be 001", "001", deltaCommit.get().getTimestamp()); assertEquals("Delta commit should be 001", "001", deltaCommit.get().getTimestamp());
Optional<HoodieInstant> commit = Optional<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
assertFalse(commit.isPresent()); assertFalse(commit.isPresent());
FileStatus[] allFiles = HoodieTestUtils FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
TableFileSystemView.ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient, TableFileSystemView.ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient,
hoodieTable.getCommitTimeline().filterCompletedInstants(), allFiles); hoodieTable.getCommitTimeline().filterCompletedInstants(), allFiles);
Stream<HoodieDataFile> dataFilesToRead = roView.getLatestDataFiles(); Stream<HoodieDataFile> dataFilesToRead = roView.getLatestDataFiles();
assertTrue(!dataFilesToRead.findAny().isPresent()); assertTrue(!dataFilesToRead.findAny().isPresent());
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), allFiles);
allFiles);
dataFilesToRead = roView.getLatestDataFiles(); dataFilesToRead = roView.getLatestDataFiles();
assertTrue("RealtimeTableView should list the parquet files we wrote in the delta commit", assertTrue("RealtimeTableView should list the parquet files we wrote in the delta commit",
dataFilesToRead.findAny().isPresent()); dataFilesToRead.findAny().isPresent());
@@ -473,10 +446,8 @@ public class TestMergeOnReadTable {
commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant(); commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
assertFalse(commit.isPresent()); assertFalse(commit.isPresent());
List<String> dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()) List<String> dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()).collect(Collectors.toList());
.collect(Collectors.toList()); List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles, basePath);
List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils
.getRecordsUsingInputFormat(dataFiles, basePath);
assertEquals(recordsRead.size(), 200); assertEquals(recordsRead.size(), 200);
@@ -485,8 +456,7 @@ public class TestMergeOnReadTable {
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath()); metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg); hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), allFiles);
allFiles);
dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()).collect(Collectors.toList()); dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()).collect(Collectors.toList());
recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles, basePath); recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles, basePath);
@@ -512,11 +482,10 @@ public class TestMergeOnReadTable {
allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath()); allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath()); metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg); hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCommitsTimeline(), roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCommitsTimeline(), allFiles);
allFiles);
final String compactedCommitTime = metaClient.getActiveTimeline().reload() final String compactedCommitTime = metaClient.getActiveTimeline().reload().getCommitsTimeline().lastInstant().get()
.getCommitsTimeline().lastInstant().get().getTimestamp(); .getTimestamp();
assertTrue(roView.getLatestDataFiles().filter(file -> { assertTrue(roView.getLatestDataFiles().filter(file -> {
if (compactedCommitTime.equals(file.getCommitTime())) { if (compactedCommitTime.equals(file.getCommitTime())) {
@@ -531,8 +500,7 @@ public class TestMergeOnReadTable {
allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath()); allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath()); metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg); hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCommitsTimeline(), roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCommitsTimeline(), allFiles);
allFiles);
assertFalse(roView.getLatestDataFiles().filter(file -> { assertFalse(roView.getLatestDataFiles().filter(file -> {
if (compactedCommitTime.equals(file.getCommitTime())) { if (compactedCommitTime.equals(file.getCommitTime())) {
@@ -564,30 +532,28 @@ public class TestMergeOnReadTable {
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath()); HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg); HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
Optional<HoodieInstant> deltaCommit = Optional<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
assertTrue(deltaCommit.isPresent()); assertTrue(deltaCommit.isPresent());
assertEquals("Delta commit should be 001", "001", deltaCommit.get().getTimestamp()); assertEquals("Delta commit should be 001", "001", deltaCommit.get().getTimestamp());
Optional<HoodieInstant> commit = Optional<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
assertFalse(commit.isPresent()); assertFalse(commit.isPresent());
FileStatus[] allFiles = HoodieTestUtils FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
TableFileSystemView.ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient, TableFileSystemView.ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient,
hoodieTable.getCommitsTimeline().filterCompletedInstants(), allFiles); hoodieTable.getCommitsTimeline().filterCompletedInstants(), allFiles);
Stream<HoodieDataFile> dataFilesToRead = roView.getLatestDataFiles(); Stream<HoodieDataFile> dataFilesToRead = roView.getLatestDataFiles();
Map<String, Long> parquetFileIdToSize = dataFilesToRead.collect(Collectors.toMap(HoodieDataFile::getFileId, HoodieDataFile::getFileSize)); Map<String, Long> parquetFileIdToSize = dataFilesToRead.collect(
Collectors.toMap(HoodieDataFile::getFileId, HoodieDataFile::getFileSize));
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), allFiles);
allFiles);
dataFilesToRead = roView.getLatestDataFiles(); dataFilesToRead = roView.getLatestDataFiles();
assertTrue("RealtimeTableView should list the parquet files we wrote in the delta commit", assertTrue("RealtimeTableView should list the parquet files we wrote in the delta commit",
dataFilesToRead.findAny().isPresent()); dataFilesToRead.findAny().isPresent());
/** /**
* Write 2 (only updates + inserts, written to .log file + correction of existing parquet file size) * Write 2 (only updates + inserts, written to .log file + correction of existing parquet
* file size)
*/ */
newCommitTime = "002"; newCommitTime = "002";
client.startCommitWithTime(newCommitTime); client.startCommitWithTime(newCommitTime);
@@ -608,18 +574,17 @@ public class TestMergeOnReadTable {
assertFalse(commit.isPresent()); assertFalse(commit.isPresent());
allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath()); allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getActiveTimeline().reload() roView = new HoodieTableFileSystemView(metaClient,
.getCommitsTimeline().filterCompletedInstants(), allFiles); hoodieTable.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants(), allFiles);
dataFilesToRead = roView.getLatestDataFiles(); dataFilesToRead = roView.getLatestDataFiles();
Map<String, Long> parquetFileIdToNewSize = dataFilesToRead.collect(Collectors.toMap(HoodieDataFile::getFileId, HoodieDataFile::getFileSize)); Map<String, Long> parquetFileIdToNewSize = dataFilesToRead.collect(
Collectors.toMap(HoodieDataFile::getFileId, HoodieDataFile::getFileSize));
assertTrue(parquetFileIdToNewSize.entrySet().stream() assertTrue(parquetFileIdToNewSize.entrySet().stream()
.filter(entry -> parquetFileIdToSize.get(entry.getKey()) < entry.getValue()).count() > 0); .filter(entry -> parquetFileIdToSize.get(entry.getKey()) < entry.getValue()).count() > 0);
List<String> dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()) List<String> dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()).collect(Collectors.toList());
.collect(Collectors.toList()); List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles, basePath);
List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils
.getRecordsUsingInputFormat(dataFiles, basePath);
//Wrote 20 records in 2 batches //Wrote 20 records in 2 batches
assertEquals("Must contain 40 records", 40, recordsRead.size()); assertEquals("Must contain 40 records", 40, recordsRead.size());
} }
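Editor's note: the test above collects the latest data files into a map keyed by file id so that file-size growth can be asserted after the second write. Below is a minimal, self-contained sketch of that stream-to-map pattern, assuming only the Java standard library; `FileInfo` is an illustrative stand-in, not a Hudi class.

```java
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class FileSizeGrowthSketch {

  // Stand-in for a data file: just a file id and a size in bytes.
  static class FileInfo {
    final String fileId;
    final long fileSize;

    FileInfo(String fileId, long fileSize) {
      this.fileId = fileId;
      this.fileSize = fileSize;
    }
  }

  public static void main(String[] args) {
    List<FileInfo> beforeWrite = Arrays.asList(new FileInfo("f1", 100), new FileInfo("f2", 200));
    List<FileInfo> afterWrite = Arrays.asList(new FileInfo("f1", 150), new FileInfo("f2", 200));

    // Same pattern as the test: collect fileId -> fileSize.
    Map<String, Long> sizeBefore =
        beforeWrite.stream().collect(Collectors.toMap(f -> f.fileId, f -> f.fileSize));
    Map<String, Long> sizeAfter =
        afterWrite.stream().collect(Collectors.toMap(f -> f.fileId, f -> f.fileSize));

    // At least one file id should map to a larger size after the second write.
    long grownFiles = sizeAfter.entrySet().stream()
        .filter(e -> sizeBefore.get(e.getKey()) < e.getValue())
        .count();
    System.out.println("Files that grew: " + grownFiles); // prints 1
  }
}
```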
@@ -639,8 +604,7 @@ public class TestMergeOnReadTable {
List<WriteStatus> statuses = writeClient.insert(recordsRDD, newCommitTime).collect(); List<WriteStatus> statuses = writeClient.insert(recordsRDD, newCommitTime).collect();
// Update all the 100 records // Update all the 100 records
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
basePath);
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config); HoodieTable table = HoodieTable.getHoodieTable(metaClient, config);
newCommitTime = "101"; newCommitTime = "101";
@@ -653,19 +617,17 @@ public class TestMergeOnReadTable {
// Write them to corresponding avro logfiles // Write them to corresponding avro logfiles
HoodieTestUtils HoodieTestUtils
.writeRecordsToLogFiles(metaClient.getFs(), metaClient.getBasePath(), .writeRecordsToLogFiles(metaClient.getFs(), metaClient.getBasePath(), HoodieTestDataGenerator.avroSchema,
HoodieTestDataGenerator.avroSchema, updatedRecords); updatedRecords);
// Verify that each data file has one log file // Verify that each data file has one log file
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
table = HoodieTable.getHoodieTable(metaClient, config); table = HoodieTable.getHoodieTable(metaClient, config);
for (String partitionPath : dataGen.getPartitionPaths()) { for (String partitionPath : dataGen.getPartitionPaths()) {
List<FileSlice> groupedLogFiles = List<FileSlice> groupedLogFiles = table.getRTFileSystemView().getLatestFileSlices(partitionPath)
table.getRTFileSystemView().getLatestFileSlices(partitionPath)
.collect(Collectors.toList()); .collect(Collectors.toList());
for (FileSlice fileSlice : groupedLogFiles) { for (FileSlice fileSlice : groupedLogFiles) {
assertEquals("There should be 1 log file written for every data file", 1, assertEquals("There should be 1 log file written for every data file", 1, fileSlice.getLogFiles().count());
fileSlice.getLogFiles().count());
} }
} }
@@ -674,31 +636,27 @@ public class TestMergeOnReadTable {
table = HoodieTable.getHoodieTable(metaClient, config); table = HoodieTable.getHoodieTable(metaClient, config);
String commitTime = writeClient.startCompaction(); String commitTime = writeClient.startCompaction();
JavaRDD<WriteStatus> result = JavaRDD<WriteStatus> result = writeClient.compact(commitTime);
writeClient.compact(commitTime);
// Verify that recently written compacted data file has no log file // Verify that recently written compacted data file has no log file
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
table = HoodieTable.getHoodieTable(metaClient, config); table = HoodieTable.getHoodieTable(metaClient, config);
HoodieActiveTimeline timeline = metaClient.getActiveTimeline(); HoodieActiveTimeline timeline = metaClient.getActiveTimeline();
assertTrue("Compaction commit should be > than last insert", assertTrue("Compaction commit should be > than last insert", HoodieTimeline.compareTimestamps(
HoodieTimeline.compareTimestamps(timeline.lastInstant().get().getTimestamp(), newCommitTime, timeline.lastInstant().get().getTimestamp(), newCommitTime, HoodieTimeline.GREATER));
HoodieTimeline.GREATER));
for (String partitionPath : dataGen.getPartitionPaths()) { for (String partitionPath : dataGen.getPartitionPaths()) {
List<FileSlice> groupedLogFiles = table.getRTFileSystemView() List<FileSlice> groupedLogFiles = table.getRTFileSystemView().getLatestFileSlices(partitionPath)
.getLatestFileSlices(partitionPath)
.collect(Collectors.toList()); .collect(Collectors.toList());
for (FileSlice slice : groupedLogFiles) { for (FileSlice slice : groupedLogFiles) {
assertTrue( assertTrue("After compaction there should be no log files visible on a Realtime view",
"After compaction there should be no log files visible on a Realtime view",
slice.getLogFiles().collect(Collectors.toList()).isEmpty()); slice.getLogFiles().collect(Collectors.toList()).isEmpty());
} }
List<WriteStatus> writeStatuses = result.collect(); List<WriteStatus> writeStatuses = result.collect();
assertTrue(writeStatuses.stream() assertTrue(writeStatuses.stream()
.filter(writeStatus -> writeStatus.getStat().getPartitionPath() .filter(writeStatus -> writeStatus.getStat().getPartitionPath().contentEquals(partitionPath))
.contentEquals(partitionPath)).count() > 0); .count() > 0);
} }
} }
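Editor's note: the compaction check above relies on commit times such as "001", "002" and "101" comparing correctly as strings. A small sketch of that assumption follows, using plain `String.compareTo` as a stand-in for `HoodieTimeline.compareTimestamps`; it is not the actual implementation.

```java
public class TimestampOrderSketch {

  // Commit times in these tests are fixed-width numeric strings ("001", "002", "101"),
  // so lexicographic comparison matches numeric ordering.
  static boolean isGreater(String candidate, String other) {
    return candidate.compareTo(other) > 0;
  }

  public static void main(String[] args) {
    System.out.println(isGreater("101", "002")); // true: compaction commit came after the insert
    System.out.println(isGreater("002", "101")); // false
  }
}
```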
@@ -707,16 +665,13 @@ public class TestMergeOnReadTable {
} }
private HoodieWriteConfig.Builder getConfigBuilder(Boolean autoCommit) { private HoodieWriteConfig.Builder getConfigBuilder(Boolean autoCommit) {
return HoodieWriteConfig.newBuilder().withPath(basePath) return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
.withSchema(TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) .withAutoCommit(autoCommit).withAssumeDatePartitioning(true).withCompactionConfig(
.withAutoCommit(autoCommit) HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024 * 1024).withInlineCompaction(false)
.withAssumeDatePartitioning(true) .withMaxNumDeltaCommitsBeforeCompaction(1).build())
.withCompactionConfig(
HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024 * 1024)
.withInlineCompaction(false).withMaxNumDeltaCommitsBeforeCompaction(1).build())
.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024 * 1024).build()) .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024 * 1024).build())
.forTable("test-trip-table").withIndexConfig( .forTable("test-trip-table")
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()); .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build());
} }
private void assertNoWriteErrors(List<WriteStatus> statuses) { private void assertNoWriteErrors(List<WriteStatus> statuses) {

View File

@@ -33,10 +33,10 @@ public class HoodieAvroWriteSupport extends AvroWriteSupport {
private String maxRecordKey; private String maxRecordKey;
public final static String HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY = public static final String HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY =
"com.uber.hoodie.bloomfilter"; "com.uber.hoodie.bloomfilter";
public final static String HOODIE_MIN_RECORD_KEY_FOOTER = "hoodie_min_record_key"; public static final String HOODIE_MIN_RECORD_KEY_FOOTER = "hoodie_min_record_key";
public final static String HOODIE_MAX_RECORD_KEY_FOOTER = "hoodie_max_record_key"; public static final String HOODIE_MAX_RECORD_KEY_FOOTER = "hoodie_max_record_key";
public HoodieAvroWriteSupport(MessageType schema, Schema avroSchema, BloomFilter bloomFilter) { public HoodieAvroWriteSupport(MessageType schema, Schema avroSchema, BloomFilter bloomFilter) {
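Editor's note: the change above reorders `public final static` to `public static final`, the modifier order recommended by the Java Language Specification and enforced by Checkstyle's ModifierOrder rule. A minimal illustration with hypothetical constants, not taken from this file:

```java
public class ModifierOrderSketch {

  // Checkstyle-friendly order: access modifier, then static, then final.
  public static final String MIN_KEY_FOOTER = "min_record_key";
  public static final String MAX_KEY_FOOTER = "max_record_key";

  // The old form compiled fine but violates the ModifierOrder check:
  // public final static String MIN_KEY_FOOTER = "min_record_key";
}
```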

View File

@@ -16,6 +16,7 @@
package com.uber.hoodie.avro; package com.uber.hoodie.avro;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
@@ -24,11 +25,11 @@ import java.util.Map;
import org.apache.avro.Schema; import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.GenericRecord;
import com.fasterxml.jackson.databind.ObjectMapper;
/** /**
* Majority of this is copied from https://github.com/jwills/avro-json/blob/master/src/main/java/com/cloudera/science/avro/common/JsonConverter.java * Majority of this is copied from
* Adjusted for expected behavior of our use cases * https://github.com/jwills/avro-json/blob/master/src/main/java/com/cloudera/science/avro/
* common/JsonConverter.java Adjusted for expected behavior of our use cases
*/ */
public class MercifulJsonConverter { public class MercifulJsonConverter {
@@ -132,10 +133,10 @@ public class MercifulJsonConverter {
} }
private boolean isOptional(Schema schema) { private boolean isOptional(Schema schema) {
return schema.getType().equals(Schema.Type.UNION) && return schema.getType().equals(Schema.Type.UNION)
schema.getTypes().size() == 2 && && schema.getTypes().size() == 2
(schema.getTypes().get(0).getType().equals(Schema.Type.NULL) || && (schema.getTypes().get(0).getType().equals(Schema.Type.NULL)
schema.getTypes().get(1).getType().equals(Schema.Type.NULL)); || schema.getTypes().get(1).getType().equals(Schema.Type.NULL));
} }
private Schema getNonNull(Schema schema) { private Schema getNonNull(Schema schema) {
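Editor's note: the `isOptional` rewrite above moves `&&` and `||` to the start of each continuation line, following the Google-style convention of breaking before the operator. A standalone sketch of the same wrapping, using generic types with no Avro dependency:

```java
import java.util.Arrays;
import java.util.List;

public class OperatorWrapSketch {

  // A union is treated as optional when it has exactly two branches and one of them is "null".
  static boolean isOptional(List<String> unionTypes) {
    return unionTypes.size() == 2
        && (unionTypes.get(0).equals("null")
            || unionTypes.get(1).equals("null"));
  }

  public static void main(String[] args) {
    System.out.println(isOptional(Arrays.asList("null", "string"))); // true
    System.out.println(isOptional(Arrays.asList("int", "string")));  // false
  }
}
```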

View File

@@ -113,9 +113,8 @@ public class HoodieCleanStat implements Serializable {
} }
public Builder withEarliestCommitRetained(Optional<HoodieInstant> earliestCommitToRetain) { public Builder withEarliestCommitRetained(Optional<HoodieInstant> earliestCommitToRetain) {
this.earliestCommitToRetain = (earliestCommitToRetain.isPresent()) ? this.earliestCommitToRetain = (earliestCommitToRetain.isPresent())
earliestCommitToRetain.get().getTimestamp() : ? earliestCommitToRetain.get().getTimestamp() : "-1";
"-1";
return this; return this;
} }

View File

@@ -210,12 +210,18 @@ public class HoodieCommitMetadata implements Serializable {
@Override @Override
public boolean equals(Object o) { public boolean equals(Object o) {
if (this == o) return true; if (this == o) {
if (o == null || getClass() != o.getClass()) return false; return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
HoodieCommitMetadata that = (HoodieCommitMetadata) o; HoodieCommitMetadata that = (HoodieCommitMetadata) o;
if (!partitionToWriteStats.equals(that.partitionToWriteStats)) return false; if (!partitionToWriteStats.equals(that.partitionToWriteStats)) {
return false;
}
return compacted.equals(that.compacted); return compacted.equals(that.compacted);
} }
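Editor's note: the `equals` change above is driven by Checkstyle's NeedBraces rule: single-statement `if` bodies must still be wrapped in braces. A self-contained sketch of an `equals`/`hashCode` pair written that way, using a hypothetical class rather than anything from this commit:

```java
import java.util.Objects;

public class BracedEqualsSketch {

  private final String name;

  public BracedEqualsSketch(String name) {
    this.name = name;
  }

  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    if (o == null || getClass() != o.getClass()) {
      return false;
    }
    BracedEqualsSketch that = (BracedEqualsSketch) o;
    return Objects.equals(name, that.name);
  }

  @Override
  public int hashCode() {
    return Objects.hash(name);
  }
}
```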

View File

@@ -39,7 +39,6 @@ public class HoodieFileGroup implements Serializable {
}; };
} }
/** /**
* Partition containing the file group. * Partition containing the file group.
*/ */
@@ -107,8 +106,8 @@ public class HoodieFileGroup implements Serializable {
*/ */
private boolean isFileSliceCommitted(FileSlice slice) { private boolean isFileSliceCommitted(FileSlice slice) {
String maxCommitTime = lastInstant.get().getTimestamp(); String maxCommitTime = lastInstant.get().getTimestamp();
return timeline.containsOrBeforeTimelineStarts(slice.getBaseCommitTime()) && return timeline.containsOrBeforeTimelineStarts(slice.getBaseCommitTime())
HoodieTimeline.compareTimestamps(slice.getBaseCommitTime(), && HoodieTimeline.compareTimestamps(slice.getBaseCommitTime(),
maxCommitTime, maxCommitTime,
HoodieTimeline.LESSER_OR_EQUAL); HoodieTimeline.LESSER_OR_EQUAL);
@@ -128,7 +127,7 @@ public class HoodieFileGroup implements Serializable {
/** /**
* Gets the latest slice - this can contain either * Gets the latest slice - this can contain either
* * <p>
* - just the log files without data file - (or) data file with 0 or more log files * - just the log files without data file - (or) data file with 0 or more log files
*/ */
public Optional<FileSlice> getLatestFileSlice() { public Optional<FileSlice> getLatestFileSlice() {

View File

@@ -21,7 +21,7 @@ import java.io.Serializable;
/** /**
* HoodieKey consists of * HoodieKey consists of
* * <p>
* - recordKey : a recordKey that acts as primary key for a record - partitionPath : path to the * - recordKey : a recordKey that acts as primary key for a record - partitionPath : path to the
* partition that contains the record * partition that contains the record
*/ */
@@ -54,8 +54,8 @@ public class HoodieKey implements Serializable {
return false; return false;
} }
HoodieKey otherKey = (HoodieKey) o; HoodieKey otherKey = (HoodieKey) o;
return Objects.equal(recordKey, otherKey.recordKey) && return Objects.equal(recordKey, otherKey.recordKey)
Objects.equal(partitionPath, otherKey.partitionPath); && Objects.equal(partitionPath, otherKey.partitionPath);
} }
@Override @Override

View File

@@ -30,7 +30,7 @@ import org.apache.hadoop.fs.Path;
/** /**
* Abstracts a single log file. Contains methods to extract metadata like the fileId, version and * Abstracts a single log file. Contains methods to extract metadata like the fileId, version and
* extension from the log file path. * extension from the log file path.
* * <p>
* Also contains logic to roll-over the log file * Also contains logic to roll-over the log file
*/ */
public class HoodieLogFile implements Serializable { public class HoodieLogFile implements Serializable {
@@ -103,8 +103,12 @@ public class HoodieLogFile implements Serializable {
@Override @Override
public boolean equals(Object o) { public boolean equals(Object o) {
if (this == o) return true; if (this == o) {
if (o == null || getClass() != o.getClass()) return false; return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
HoodieLogFile that = (HoodieLogFile) o; HoodieLogFile that = (HoodieLogFile) o;
return path != null ? path.equals(that.path) : that.path == null; return path != null ? path.equals(that.path) : that.path == null;
} }

View File

@@ -101,9 +101,8 @@ public class HoodiePartitionMetadata {
} }
} catch (IOException ioe) { } catch (IOException ioe) {
log.warn( log.warn(
"Error trying to save partition metadata (this is okay, as long as atleast 1 of these succced), " "Error trying to save partition metadata (this is okay, as long as "
+ + "atleast 1 of these succced), " + partitionPath, ioe);
partitionPath, ioe);
} finally { } finally {
if (!metafileExists) { if (!metafileExists) {
try { try {

View File

@@ -118,10 +118,10 @@ public class HoodieRecord<T extends HoodieRecordPayload> implements Serializable
return false; return false;
} }
HoodieRecord that = (HoodieRecord) o; HoodieRecord that = (HoodieRecord) o;
return Objects.equal(key, that.key) && return Objects.equal(key, that.key)
Objects.equal(data, that.data) && && Objects.equal(data, that.data)
Objects.equal(currentLocation, that.currentLocation) && && Objects.equal(currentLocation, that.currentLocation)
Objects.equal(newLocation, that.newLocation); && Objects.equal(newLocation, that.newLocation);
} }
@Override @Override

View File

@@ -42,8 +42,8 @@ public class HoodieRecordLocation implements Serializable {
return false; return false;
} }
HoodieRecordLocation otherLoc = (HoodieRecordLocation) o; HoodieRecordLocation otherLoc = (HoodieRecordLocation) o;
return Objects.equal(commitTime, otherLoc.commitTime) && return Objects.equal(commitTime, otherLoc.commitTime)
Objects.equal(fileId, otherLoc.fileId); && Objects.equal(fileId, otherLoc.fileId);
} }
@Override @Override

View File

@@ -38,7 +38,7 @@ public interface HoodieRecordPayload<T extends HoodieRecordPayload> extends Seri
/** /**
* This methods lets you write custom merging/combining logic to produce new values as a function * This methods lets you write custom merging/combining logic to produce new values as a function
* of current value on storage and whats contained in this object. * of current value on storage and whats contained in this object.
* * <p>
* eg: 1) You are updating counters, you may want to add counts to currentValue and write back * eg: 1) You are updating counters, you may want to add counts to currentValue and write back
* updated counts 2) You may be reading DB redo logs, and merge them with current image for a * updated counts 2) You may be reading DB redo logs, and merge them with current image for a
* database row on storage * database row on storage
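Editor's note: the Javadoc above describes payloads that merge an incoming value with the value already on storage, for example adding counter deltas. A toy sketch of that merging idea follows in plain Java; it deliberately does not implement the real `HoodieRecordPayload` interface or reuse its signatures.

```java
import java.util.Optional;

public class CounterMergeSketch {

  // Incoming delta for a counter record.
  private final long delta;

  public CounterMergeSketch(long delta) {
    this.delta = delta;
  }

  // Combine with the current value on storage; an absent value means the record is new.
  public long combineWithCurrent(Optional<Long> currentValue) {
    return currentValue.orElse(0L) + delta;
  }

  public static void main(String[] args) {
    CounterMergeSketch payload = new CounterMergeSketch(5);
    System.out.println(payload.combineWithCurrent(Optional.of(37L))); // 42
    System.out.println(payload.combineWithCurrent(Optional.empty())); // 5
  }
}
```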

View File

@@ -18,16 +18,16 @@ package com.uber.hoodie.common.model;
/** /**
* Type of the Hoodie Table. * Type of the Hoodie Table.
* * <p>
* Currently, 1 type is supported * Currently, 1 type is supported
* * <p>
* COPY_ON_WRITE - Performs upserts by versioning entire files, with later versions containing newer * COPY_ON_WRITE - Performs upserts by versioning entire files, with later versions containing newer
* value of a record. * value of a record.
* * <p>
* In the future, following might be added. * In the future, following might be added.
* * <p>
* MERGE_ON_READ - Speeds up upserts, by delaying merge until enough work piles up. * MERGE_ON_READ - Speeds up upserts, by delaying merge until enough work piles up.
* * <p>
* SIMPLE_LSM - A simple 2 level LSM tree. * SIMPLE_LSM - A simple 2 level LSM tree.
*/ */
public enum HoodieTableType { public enum HoodieTableType {
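Editor's note: the edits above add `<p>` tags between Javadoc paragraphs, following the Google-style convention that each paragraph after the first starts with `<p>`; blank comment lines alone are collapsed when the Javadoc is rendered. A small sketch with a hypothetical enum:

```java
/**
 * Type of storage strategy for a table.
 * <p>
 * Each paragraph after the first starts with a {@code <p>} tag so the rendered
 * Javadoc keeps the intended paragraph breaks.
 */
public enum StorageStrategySketch {
  COPY_ON_WRITE,
  MERGE_ON_READ
}
```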

Some files were not shown because too many files have changed in this diff.