
Reformatting code per Google Code Style all over

Vinoth Chandar authored 2017-11-12 22:54:56 -08:00
committed by vinoth chandar
parent 5a62480a92
commit e45679f5e2
254 changed files with 21580 additions and 21108 deletions
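The hunks below apply Google Java Style conventions across the CLI module: imports are collapsed into a single block in ASCII order (java.* alongside org.* rather than in a separate trailing group), lines are wrapped at 100 columns, a space follows control-flow keywords such as if, for and switch, and array initializers drop the space before the opening brace. A minimal before/after sketch of these transformations, composed from changes that appear in the hunks below rather than taken verbatim from any single file:

// Before
if(fs == null || force) {
  fs = FileSystem.get(conf);
}
rows[i] = new String[] {commit.getTimestamp()};

// After (Google Java Style: space after the keyword, no space before the
// array-initializer brace)
if (fs == null || force) {
  fs = FileSystem.get(conf);
}
rows[i] = new String[]{commit.getTimestamp()};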

View File

@@ -15,7 +15,9 @@
   ~ limitations under the License.
   -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
   <parent>
     <artifactId>hoodie</artifactId>
     <groupId>com.uber.hoodie</groupId>

View File

@@ -17,12 +17,12 @@
 package com.uber.hoodie.cli;
 
 import com.uber.hoodie.common.table.HoodieTableMetaClient;
+import java.io.IOException;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
-import java.io.IOException;
 
 public class HoodieCLI {
 
   public static Configuration conf;
   public static FileSystem fs;
   public static CLIState state = CLIState.INIT;
@@ -43,7 +43,7 @@ public class HoodieCLI {
   }
 
   public static void initFS(boolean force) throws IOException {
-    if(fs == null || force) {
+    if (fs == null || force) {
       fs = FileSystem.get(conf);
     }
   }

View File

@@ -17,7 +17,6 @@
 package com.uber.hoodie.cli;
 
 import dnl.utils.text.table.TextTable;
 import java.io.ByteArrayOutputStream;
 import java.io.PrintStream;
 import java.nio.charset.Charset;

View File

@@ -16,7 +16,6 @@
 package com.uber.hoodie.cli;
 
-import com.uber.hoodie.common.table.HoodieTableConfig;
 import org.springframework.core.Ordered;
 import org.springframework.core.annotation.Order;
 import org.springframework.shell.plugin.support.DefaultPromptProvider;

View File

@@ -22,9 +22,13 @@ import org.springframework.shell.plugin.support.DefaultBannerProvider;
 import org.springframework.shell.support.util.OsUtils;
 import org.springframework.stereotype.Component;
 
-@Component @Order(Ordered.HIGHEST_PRECEDENCE) public class HoodieSplashScreen
+@Component
+@Order(Ordered.HIGHEST_PRECEDENCE)
+public class HoodieSplashScreen
     extends DefaultBannerProvider {
 
-  private static String screen = "============================================" + OsUtils.LINE_SEPARATOR +
+  private static String screen =
+      "============================================" + OsUtils.LINE_SEPARATOR +
       "* *" + OsUtils.LINE_SEPARATOR +
       "* _ _ _ _ *" + OsUtils.LINE_SEPARATOR +
       "* | | | | | (_) *" + OsUtils.LINE_SEPARATOR +
@@ -49,7 +53,8 @@ import org.springframework.stereotype.Component;
     return "Welcome to Hoodie CLI. Please type help if you are looking for help. ";
   }
 
-  @Override public String getProviderName() {
+  @Override
+  public String getProviderName() {
     return "Hoodie Banner";
   }
 }

View File

@@ -16,16 +16,14 @@
 package com.uber.hoodie.cli;
 
+import java.io.IOException;
 import org.springframework.shell.Bootstrap;
-import java.io.IOException;
 
 public class Main {
 
   /**
-   * Main class that delegates to Spring Shell's Bootstrap class in order to simplify debugging inside an IDE
-   *
-   * @param args
-   * @throws IOException
+   * Main class that delegates to Spring Shell's Bootstrap class in order to simplify debugging
+   * inside an IDE
    */
   public static void main(String[] args) throws IOException {
     Bootstrap.main(args);

View File

@@ -24,6 +24,10 @@ import com.uber.hoodie.common.table.HoodieTimeline;
 import com.uber.hoodie.common.table.log.HoodieLogFormat;
 import com.uber.hoodie.common.table.log.block.HoodieAvroDataBlock;
 import com.uber.hoodie.common.util.FSUtils;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.stream.Collectors;
 import org.apache.avro.generic.GenericRecord;
 import org.apache.avro.generic.IndexedRecord;
 import org.apache.hadoop.fs.FileStatus;
@@ -34,11 +38,6 @@ import org.springframework.shell.core.annotation.CliCommand;
 import org.springframework.shell.core.annotation.CliOption;
 import org.springframework.stereotype.Component;
 
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.stream.Collectors;
 
 @Component
 public class ArchivedCommitsCommand implements CommandMarker {
@@ -49,13 +48,16 @@ public class ArchivedCommitsCommand implements CommandMarker {
   @CliCommand(value = "show archived commits", help = "Read commits from archived files and show details")
   public String showCommits(
-      @CliOption(key = {"limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10")
+      @CliOption(key = {
+          "limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10")
       final Integer limit) throws IOException {
-    System.out.println("===============> Showing only " + limit + " archived commits <===============");
-    FileStatus [] fsStatuses = FSUtils.getFs().globStatus(new Path(HoodieCLI.tableMetadata.getBasePath() + "/.hoodie/.commits_.archive*"));
+    System.out
+        .println("===============> Showing only " + limit + " archived commits <===============");
+    FileStatus[] fsStatuses = FSUtils.getFs().globStatus(
+        new Path(HoodieCLI.tableMetadata.getBasePath() + "/.hoodie/.commits_.archive*"));
     List<String[]> allCommits = new ArrayList<>();
-    for(FileStatus fs : fsStatuses) {
+    for (FileStatus fs : fsStatuses) {
       //read the archived file
       HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(),
           new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema(), false);
@@ -67,11 +69,13 @@ public class ArchivedCommitsCommand implements CommandMarker {
         List<IndexedRecord> records = blk.getRecords();
         readRecords.addAll(records);
       }
-      List<String[]> readCommits = readRecords.stream().map(r -> (GenericRecord)r).map(r -> readCommit(r)).limit(limit).collect(Collectors.toList());
+      List<String[]> readCommits = readRecords.stream().map(r -> (GenericRecord) r)
+          .map(r -> readCommit(r)).limit(limit).collect(Collectors.toList());
       allCommits.addAll(readCommits);
     }
     return HoodiePrintHelper.print(
-        new String[] {"CommitTime", "CommitType", "CommitDetails"}, allCommits.toArray(new String[allCommits.size()][]));
+        new String[]{"CommitTime", "CommitType", "CommitDetails"},
+        allCommits.toArray(new String[allCommits.size()][]));
   }
 
   private String[] readCommit(GenericRecord record) {

View File

@@ -24,21 +24,21 @@ import com.uber.hoodie.common.table.HoodieTimeline;
 import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
 import com.uber.hoodie.common.table.timeline.HoodieInstant;
 import com.uber.hoodie.common.util.AvroUtils;
-import org.springframework.shell.core.CommandMarker;
-import org.springframework.shell.core.annotation.CliAvailabilityIndicator;
-import org.springframework.shell.core.annotation.CliCommand;
-import org.springframework.shell.core.annotation.CliOption;
-import org.springframework.stereotype.Component;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
 import java.util.Map;
 import java.util.stream.Collectors;
+import org.springframework.shell.core.CommandMarker;
+import org.springframework.shell.core.annotation.CliAvailabilityIndicator;
+import org.springframework.shell.core.annotation.CliCommand;
+import org.springframework.shell.core.annotation.CliOption;
+import org.springframework.stereotype.Component;
 
 @Component
 public class CleansCommand implements CommandMarker {
 
   @CliAvailabilityIndicator({"cleans show"})
   public boolean isShowAvailable() {
     return HoodieCLI.tableMetadata != null;
@@ -65,12 +65,12 @@ public class CleansCommand implements CommandMarker {
       HoodieInstant clean = cleans.get(i);
       HoodieCleanMetadata cleanMetadata =
           AvroUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(clean).get());
-      rows[i] = new String[] {clean.getTimestamp(), cleanMetadata.getEarliestCommitToRetain(),
+      rows[i] = new String[]{clean.getTimestamp(), cleanMetadata.getEarliestCommitToRetain(),
           String.valueOf(cleanMetadata.getTotalFilesDeleted()),
           String.valueOf(cleanMetadata.getTimeTakenInMillis())};
     }
     return HoodiePrintHelper.print(
-        new String[] {"CleanTime", "EarliestCommandRetained", "Total Files Deleted",
+        new String[]{"CleanTime", "EarliestCommandRetained", "Total Files Deleted",
             "Total Time Taken"}, rows);
   }
@@ -97,16 +97,17 @@ public class CleansCommand implements CommandMarker {
     HoodieCleanMetadata cleanMetadata =
         AvroUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(cleanInstant).get());
     List<String[]> rows = new ArrayList<>();
-    for (Map.Entry<String, HoodieCleanPartitionMetadata> entry : cleanMetadata.getPartitionMetadata().entrySet()) {
+    for (Map.Entry<String, HoodieCleanPartitionMetadata> entry : cleanMetadata
+        .getPartitionMetadata().entrySet()) {
       String path = entry.getKey();
       HoodieCleanPartitionMetadata stats = entry.getValue();
       String policy = stats.getPolicy();
       String totalSuccessDeletedFiles = String.valueOf(stats.getSuccessDeleteFiles().size());
       String totalFailedDeletedFiles = String.valueOf(stats.getFailedDeleteFiles().size());
-      rows.add(new String[] {path, policy, totalSuccessDeletedFiles, totalFailedDeletedFiles});
+      rows.add(new String[]{path, policy, totalSuccessDeletedFiles, totalFailedDeletedFiles});
     }
     return HoodiePrintHelper.print(
-        new String[] {"Partition Path", "Cleaning policy", "Total Files Successfully Deleted",
+        new String[]{"Partition Path", "Cleaning policy", "Total Files Successfully Deleted",
             "Total Failed Deletions"}, rows.toArray(new String[rows.size()][]));
   }
 }

View File

@@ -27,7 +27,12 @@ import com.uber.hoodie.common.table.HoodieTimeline;
 import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
 import com.uber.hoodie.common.table.timeline.HoodieInstant;
 import com.uber.hoodie.common.util.NumericUtils;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
 import org.apache.spark.launcher.SparkLauncher;
 import org.springframework.shell.core.CommandMarker;
 import org.springframework.shell.core.annotation.CliAvailabilityIndicator;
@@ -35,15 +40,9 @@ import org.springframework.shell.core.annotation.CliCommand;
 import org.springframework.shell.core.annotation.CliOption;
 import org.springframework.stereotype.Component;
 
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
-import java.util.stream.Collectors;
 
 @Component
 public class CommitsCommand implements CommandMarker {
 
   @CliAvailabilityIndicator({"commits show"})
   public boolean isShowAvailable() {
     return HoodieCLI.tableMetadata != null;
@@ -70,7 +69,8 @@ public class CommitsCommand implements CommandMarker {
           "limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10")
       final Integer limit) throws IOException {
     HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
-    HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline().filterCompletedInstants();
+    HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline()
+        .filterCompletedInstants();
     List<HoodieInstant> commits = timeline.getInstants().collect(Collectors.toList());
     String[][] rows = new String[commits.size()][];
     Collections.reverse(commits);
@@ -78,7 +78,7 @@ public class CommitsCommand implements CommandMarker {
       HoodieInstant commit = commits.get(i);
       HoodieCommitMetadata commitMetadata =
           HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(commit).get());
-      rows[i] = new String[] {commit.getTimestamp(),
+      rows[i] = new String[]{commit.getTimestamp(),
          NumericUtils.humanReadableByteCount(commitMetadata.fetchTotalBytesWritten()),
          String.valueOf(commitMetadata.fetchTotalFilesInsert()),
          String.valueOf(commitMetadata.fetchTotalFilesUpdated()),
@@ -88,7 +88,7 @@ public class CommitsCommand implements CommandMarker {
          String.valueOf(commitMetadata.fetchTotalWriteErrors())};
     }
     return HoodiePrintHelper.print(
-        new String[] {"CommitTime", "Total Written (B)", "Total Files Added",
+        new String[]{"CommitTime", "Total Written (B)", "Total Files Added",
             "Total Files Updated", "Total Partitions Written", "Total Records Written",
             "Total Update Records Written", "Total Errors"}, rows);
   }
@@ -108,8 +108,10 @@ public class CommitsCommand implements CommandMarker {
       @CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path")
       final String sparkPropertiesPath) throws Exception {
     HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
-    HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline().filterCompletedInstants();
-    HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
+    HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline()
+        .filterCompletedInstants();
+    HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION,
+        commitTime);
 
     if (!timeline.containsInstant(commitInstant)) {
       return "Commit " + commitTime + " not found in Commits " + timeline;
@@ -135,8 +137,10 @@ public class CommitsCommand implements CommandMarker {
       @CliOption(key = {"commit"}, help = "Commit to show")
       final String commitTime) throws Exception {
     HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
-    HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline().filterCompletedInstants();
-    HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
+    HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline()
+        .filterCompletedInstants();
+    HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION,
+        commitTime);
 
     if (!timeline.containsInstant(commitInstant)) {
       return "Commit " + commitTime + " not found in Commits " + timeline;
@@ -165,7 +169,7 @@ public class CommitsCommand implements CommandMarker {
         totalBytesWritten += stat.getTotalWriteBytes();
         totalWriteErrors += stat.getTotalWriteErrors();
       }
-      rows.add(new String[] {path, String.valueOf(totalFilesAdded),
+      rows.add(new String[]{path, String.valueOf(totalFilesAdded),
           String.valueOf(totalFilesUpdated), String.valueOf(totalRecordsInserted),
          String.valueOf(totalRecordsUpdated),
          NumericUtils.humanReadableByteCount(totalBytesWritten),
@@ -173,7 +177,7 @@ public class CommitsCommand implements CommandMarker {
     }
     return HoodiePrintHelper.print(
-        new String[] {"Partition Path", "Total Files Added", "Total Files Updated",
+        new String[]{"Partition Path", "Total Files Added", "Total Files Updated",
            "Total Records Inserted", "Total Records Updated", "Total Bytes Written",
            "Total Errors"}, rows.toArray(new String[rows.size()][]));
   }
@@ -183,8 +187,10 @@ public class CommitsCommand implements CommandMarker {
       @CliOption(key = {"commit"}, help = "Commit to show")
       final String commitTime) throws Exception {
     HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
-    HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline().filterCompletedInstants();
-    HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
+    HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline()
+        .filterCompletedInstants();
+    HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION,
+        commitTime);
 
     if (!timeline.containsInstant(commitInstant)) {
       return "Commit " + commitTime + " not found in Commits " + timeline;
@@ -197,14 +203,14 @@ public class CommitsCommand implements CommandMarker {
       String path = entry.getKey();
       List<HoodieWriteStat> stats = entry.getValue();
       for (HoodieWriteStat stat : stats) {
-        rows.add(new String[] {path, stat.getFileId(), stat.getPrevCommit(),
+        rows.add(new String[]{path, stat.getFileId(), stat.getPrevCommit(),
            String.valueOf(stat.getNumUpdateWrites()), String.valueOf(stat.getNumWrites()),
            String.valueOf(stat.getTotalWriteBytes()),
            String.valueOf(stat.getTotalWriteErrors())});
       }
     }
     return HoodiePrintHelper.print(
-        new String[] {"Partition Path", "File ID", "Previous Commit", "Total Records Updated",
+        new String[]{"Partition Path", "File ID", "Previous Commit", "Total Records Updated",
            "Total Records Written", "Total Bytes Written", "Total Errors"},
         rows.toArray(new String[rows.size()][]));
   }
@@ -219,16 +225,23 @@ public class CommitsCommand implements CommandMarker {
       @CliOption(key = {"path"}, help = "Path of the dataset to compare to")
       final String path) throws Exception {
     HoodieTableMetaClient target = new HoodieTableMetaClient(HoodieCLI.fs, path);
-    HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants();;
+    HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsAndCompactionsTimeline()
+        .filterCompletedInstants();
+    ;
     HoodieTableMetaClient source = HoodieCLI.tableMetadata;
-    HoodieTimeline sourceTimeline = source.getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants();;
+    HoodieTimeline sourceTimeline = source.getActiveTimeline().getCommitsAndCompactionsTimeline()
+        .filterCompletedInstants();
+    ;
     String targetLatestCommit =
-        targetTimeline.getInstants().iterator().hasNext() ? "0" : targetTimeline.lastInstant().get().getTimestamp();
+        targetTimeline.getInstants().iterator().hasNext() ? "0"
+            : targetTimeline.lastInstant().get().getTimestamp();
     String sourceLatestCommit =
-        sourceTimeline.getInstants().iterator().hasNext() ? "0" : sourceTimeline.lastInstant().get().getTimestamp();
+        sourceTimeline.getInstants().iterator().hasNext() ? "0"
+            : sourceTimeline.lastInstant().get().getTimestamp();
     if (sourceLatestCommit != null &&
-        HoodieTimeline.compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) {
+        HoodieTimeline
+            .compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) {
       // source is behind the target
       List<String> commitsToCatchup =
           targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE)

View File

@@ -18,15 +18,15 @@ package com.uber.hoodie.cli.commands;
 import com.uber.hoodie.cli.HoodieCLI;
 import com.uber.hoodie.common.table.HoodieTableMetaClient;
+import java.io.IOException;
 import org.springframework.shell.core.CommandMarker;
 import org.springframework.shell.core.annotation.CliCommand;
 import org.springframework.shell.core.annotation.CliOption;
 import org.springframework.stereotype.Component;
-import java.io.IOException;
 
 @Component
 public class DatasetsCommand implements CommandMarker {
 
   @CliCommand(value = "connect", help = "Connect to a hoodie dataset")
   public String connect(
       @CliOption(key = {"path"}, mandatory = true, help = "Base Path of the dataset")

View File

@@ -68,7 +68,8 @@ public class HDFSParquetImportCommand implements CommandMarker {
     boolean initialized = HoodieCLI.initConf();
     HoodieCLI.initFS(initialized);
     String sparkPropertiesPath = Utils
-        .getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
+        .getDefaultPropertiesFile(
+            scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
     SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
     sparkLauncher.addAppArgs(SparkCommand.IMPORT.toString(), srcPath, targetPath, tableName,

View File

@@ -16,23 +16,23 @@
 package com.uber.hoodie.cli.commands;
 
+import com.uber.hoodie.cli.HoodieCLI;
 import com.uber.hoodie.cli.utils.CommitUtil;
 import com.uber.hoodie.cli.utils.HiveUtil;
-import com.uber.hoodie.cli.HoodieCLI;
 import com.uber.hoodie.common.table.HoodieTableMetaClient;
 import com.uber.hoodie.common.table.HoodieTimeline;
 import com.uber.hoodie.common.table.timeline.HoodieInstant;
+import java.util.List;
+import java.util.stream.Collectors;
 import org.springframework.shell.core.CommandMarker;
 import org.springframework.shell.core.annotation.CliAvailabilityIndicator;
 import org.springframework.shell.core.annotation.CliCommand;
 import org.springframework.shell.core.annotation.CliOption;
 import org.springframework.stereotype.Component;
-import java.util.List;
-import java.util.stream.Collectors;
 
 @Component
 public class HoodieSyncCommand implements CommandMarker {
 
   @CliAvailabilityIndicator({"sync validate"})
   public boolean isSyncVerificationAvailable() {
     return HoodieCLI.tableMetadata != null && HoodieCLI.syncTableMetadata != null;
@@ -70,14 +70,18 @@ public class HoodieSyncCommand implements CommandMarker {
       sourceCount = HiveUtil.countRecords(hiveServerUrl, source, srcDb, hiveUser, hivePass);
       targetCount = HiveUtil.countRecords(hiveServerUrl, target, tgtDb, hiveUser, hivePass);
     } else if ("latestPartitions".equals(mode)) {
-      sourceCount = HiveUtil.countRecords(hiveServerUrl, source, srcDb, partitionCount, hiveUser, hivePass);
-      targetCount = HiveUtil.countRecords(hiveServerUrl, target, tgtDb, partitionCount, hiveUser, hivePass);
+      sourceCount = HiveUtil
+          .countRecords(hiveServerUrl, source, srcDb, partitionCount, hiveUser, hivePass);
+      targetCount = HiveUtil
+          .countRecords(hiveServerUrl, target, tgtDb, partitionCount, hiveUser, hivePass);
     }
     String targetLatestCommit =
-        targetTimeline.getInstants().iterator().hasNext() ? "0" : targetTimeline.lastInstant().get().getTimestamp();
+        targetTimeline.getInstants().iterator().hasNext() ? "0"
+            : targetTimeline.lastInstant().get().getTimestamp();
     String sourceLatestCommit =
-        sourceTimeline.getInstants().iterator().hasNext() ? "0" : sourceTimeline.lastInstant().get().getTimestamp();
+        sourceTimeline.getInstants().iterator().hasNext() ? "0"
+            : sourceTimeline.lastInstant().get().getTimestamp();
     if (sourceLatestCommit != null && HoodieTimeline
         .compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) {

View File

@@ -22,7 +22,8 @@ import com.uber.hoodie.cli.utils.InputStreamConsumer;
 import com.uber.hoodie.cli.utils.SparkUtil;
 import com.uber.hoodie.common.model.HoodiePartitionMetadata;
 import com.uber.hoodie.common.util.FSUtils;
+import java.io.IOException;
+import java.util.List;
 import org.apache.hadoop.fs.Path;
 import org.apache.spark.launcher.SparkLauncher;
 import org.springframework.shell.core.CommandMarker;
@@ -31,9 +32,6 @@ import org.springframework.shell.core.annotation.CliCommand;
 import org.springframework.shell.core.annotation.CliOption;
 import org.springframework.stereotype.Component;
 
-import java.io.IOException;
-import java.util.List;
 
 @Component
 public class RepairsCommand implements CommandMarker {
@@ -52,7 +50,8 @@ public class RepairsCommand implements CommandMarker {
       @CliOption(key = {
           "duplicatedPartitionPath"}, help = "Partition Path containing the duplicates", mandatory = true)
       final String duplicatedPartitionPath,
-      @CliOption(key = {"repairedOutputPath"}, help = "Location to place the repaired files", mandatory = true)
+      @CliOption(key = {
+          "repairedOutputPath"}, help = "Location to place the repaired files", mandatory = true)
       final String repairedOutputPath,
       @CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path", mandatory = true)
       final String sparkPropertiesPath) throws Exception {
@@ -71,7 +70,6 @@ public class RepairsCommand implements CommandMarker {
   }
 
   @CliCommand(value = "repair addpartitionmeta", help = "Add partition metadata to a dataset, if not present")
   public String addPartitionMeta(
       @CliOption(key = {"dryrun"},
@@ -79,17 +77,20 @@
           unspecifiedDefaultValue = "true")
       final boolean dryRun) throws IOException {
-    String latestCommit = HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline().lastInstant().get().getTimestamp();
+    String latestCommit = HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline()
+        .lastInstant().get().getTimestamp();
     List<String> partitionPaths = FSUtils.getAllFoldersThreeLevelsDown(HoodieCLI.fs,
         HoodieCLI.tableMetadata.getBasePath());
     Path basePath = new Path(HoodieCLI.tableMetadata.getBasePath());
     String[][] rows = new String[partitionPaths.size() + 1][];
     int ind = 0;
-    for (String partition: partitionPaths) {
+    for (String partition : partitionPaths) {
       Path partitionPath = new Path(basePath, partition);
       String[] row = new String[3];
-      row[0] = partition; row[1] = "Yes"; row[2] = "None";
+      row[0] = partition;
+      row[1] = "Yes";
+      row[2] = "None";
       if (!HoodiePartitionMetadata.hasPartitionMetadata(HoodieCLI.fs, partitionPath)) {
         row[1] = "No";
         if (!dryRun) {
@@ -105,6 +106,6 @@ public class RepairsCommand implements CommandMarker {
     }
     return HoodiePrintHelper.print(
-        new String[] {"Partition Path", "Metadata Present?", "Action"}, rows);
+        new String[]{"Partition Path", "Metadata Present?", "Action"}, rows);
   }
 }

View File

@@ -27,6 +27,10 @@ import com.uber.hoodie.common.table.timeline.HoodieInstant;
 import com.uber.hoodie.config.HoodieIndexConfig;
 import com.uber.hoodie.config.HoodieWriteConfig;
 import com.uber.hoodie.index.HoodieIndex;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.List;
+import java.util.stream.Collectors;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.launcher.SparkLauncher;
 import org.springframework.shell.core.CommandMarker;
@@ -35,13 +39,9 @@ import org.springframework.shell.core.annotation.CliCommand;
 import org.springframework.shell.core.annotation.CliOption;
 import org.springframework.stereotype.Component;
 
-import java.io.IOException;
-import java.util.Collections;
-import java.util.List;
-import java.util.stream.Collectors;
 
 @Component
 public class SavepointsCommand implements CommandMarker {
 
   @CliAvailabilityIndicator({"savepoints show"})
   public boolean isShowAvailable() {
     return HoodieCLI.tableMetadata != null;
@@ -60,7 +60,8 @@ public class SavepointsCommand implements CommandMarker {
   @CliAvailabilityIndicator({"savepoint rollback"})
   public boolean isRollbackToSavepointAvailable() {
-    return HoodieCLI.tableMetadata != null && !HoodieCLI.tableMetadata.getActiveTimeline().getSavePointTimeline().filterCompletedInstants().empty();
+    return HoodieCLI.tableMetadata != null && !HoodieCLI.tableMetadata.getActiveTimeline()
+        .getSavePointTimeline().filterCompletedInstants().empty();
   }
 
   @CliCommand(value = "savepoints show", help = "Show the savepoints")
@@ -72,9 +73,9 @@ public class SavepointsCommand implements CommandMarker {
     Collections.reverse(commits);
     for (int i = 0; i < commits.size(); i++) {
       HoodieInstant commit = commits.get(i);
-      rows[i] = new String[] {commit.getTimestamp()};
+      rows[i] = new String[]{commit.getTimestamp()};
     }
-    return HoodiePrintHelper.print(new String[] {"SavepointTime"}, rows);
+    return HoodiePrintHelper.print(new String[]{"SavepointTime"}, rows);
   }
 
   @CliCommand(value = "savepoint create", help = "Savepoint a commit")
@@ -152,5 +153,4 @@ public class SavepointsCommand implements CommandMarker {
     }
   }

View File

@@ -52,7 +52,7 @@ public class SparkMain {
     JavaSparkContext jsc = SparkUtil.initJavaSparkConf("hoodie-cli-" + command);
     int returnCode = 0;
-    switch(cmd) {
+    switch (cmd) {
       case ROLLBACK:
         assert (args.length == 3);
         returnCode = rollback(jsc, args[1], args[2]);
@@ -98,7 +98,7 @@ public class SparkMain {
       String basePath)
       throws Exception {
     DedupeSparkJob job = new DedupeSparkJob(basePath,
-        duplicatedPartitionPath,repairedOutputPath,new SQLContext(jsc), FSUtils.getFs());
+        duplicatedPartitionPath, repairedOutputPath, new SQLContext(jsc), FSUtils.getFs());
     job.fixDuplicates(true);
     return 0;
   }
@@ -115,7 +115,8 @@ public class SparkMain {
     }
   }
 
-  private static int rollbackToSavepoint(JavaSparkContext jsc, String savepointTime, String basePath)
+  private static int rollbackToSavepoint(JavaSparkContext jsc, String savepointTime,
+      String basePath)
       throws Exception {
     HoodieWriteClient client = createHoodieClient(jsc, basePath);
     if (client.rollbackToSavepoint(savepointTime)) {

View File

@@ -28,7 +28,10 @@ import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
 import com.uber.hoodie.common.table.timeline.HoodieInstant;
 import com.uber.hoodie.common.util.FSUtils;
 import com.uber.hoodie.common.util.NumericUtils;
+import java.io.IOException;
+import java.text.DecimalFormat;
+import java.util.HashMap;
+import java.util.stream.Collectors;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -38,13 +41,9 @@ import org.springframework.shell.core.annotation.CliCommand;
 import org.springframework.shell.core.annotation.CliOption;
 import org.springframework.stereotype.Component;
 
-import java.io.IOException;
-import java.text.DecimalFormat;
-import java.util.HashMap;
-import java.util.stream.Collectors;
 
 @Component
 public class StatsCommand implements CommandMarker {
 
   @CliAvailabilityIndicator({"stats wa"})
   public boolean isWriteAmpAvailable() {
     return HoodieCLI.tableMetadata != null;
@@ -64,13 +63,14 @@ public class StatsCommand implements CommandMarker {
     for (HoodieInstant commitTime : timeline.getInstants().collect(
         Collectors.toList())) {
       String waf = "0";
-      HoodieCommitMetadata commit = HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(commitTime).get());
+      HoodieCommitMetadata commit = HoodieCommitMetadata
+          .fromBytes(activeTimeline.getInstantDetails(commitTime).get());
       if (commit.fetchTotalUpdateRecordsWritten() > 0) {
         waf = df.format(
             (float) commit.fetchTotalRecordsWritten() / commit
                 .fetchTotalUpdateRecordsWritten());
       }
-      rows[i++] = new String[] {commitTime.getTimestamp(),
+      rows[i++] = new String[]{commitTime.getTimestamp(),
          String.valueOf(commit.fetchTotalUpdateRecordsWritten()),
          String.valueOf(commit.fetchTotalRecordsWritten()), waf};
       totalRecordsUpserted += commit.fetchTotalUpdateRecordsWritten();
@@ -80,10 +80,10 @@ public class StatsCommand implements CommandMarker {
     if (totalRecordsUpserted > 0) {
       waf = df.format((float) totalRecordsWritten / totalRecordsUpserted);
     }
-    rows[i] = new String[] {"Total", String.valueOf(totalRecordsUpserted),
+    rows[i] = new String[]{"Total", String.valueOf(totalRecordsUpserted),
        String.valueOf(totalRecordsWritten), waf};
     return HoodiePrintHelper.print(
-        new String[] {"CommitTime", "Total Upserted", "Total Written",
+        new String[]{"CommitTime", "Total Upserted", "Total Written",
            "Write Amplifiation Factor"}, rows);
   }
@@ -105,7 +105,8 @@ public class StatsCommand implements CommandMarker {
   @CliCommand(value = "stats filesizes", help = "File Sizes. Display summary stats on sizes of files")
   public String fileSizeStats(
-      @CliOption(key = {"partitionPath"}, help = "regex to select files, eg: 2016/08/02", unspecifiedDefaultValue = "*/*/*")
+      @CliOption(key = {
+          "partitionPath"}, help = "regex to select files, eg: 2016/08/02", unspecifiedDefaultValue = "*/*/*")
       final String globRegex) throws IOException {
     FileSystem fs = HoodieCLI.fs;
@@ -118,7 +119,7 @@ public class StatsCommand implements CommandMarker {
     final int MAX_FILES = 1000000;
     Histogram globalHistogram = new Histogram(new UniformReservoir(MAX_FILES));
     HashMap<String, Histogram> commitHistoMap = new HashMap<String, Histogram>();
-    for (FileStatus fileStatus: statuses) {
+    for (FileStatus fileStatus : statuses) {
       String commitTime = FSUtils.getCommitTime(fileStatus.getPath().getName());
       long sz = fileStatus.getLen();
       if (!commitHistoMap.containsKey(commitTime)) {
@@ -130,7 +131,7 @@ public class StatsCommand implements CommandMarker {
     String[][] rows = new String[commitHistoMap.size() + 1][];
     int ind = 0;
-    for (String commitTime: commitHistoMap.keySet()) {
+    for (String commitTime : commitHistoMap.keySet()) {
       Snapshot s = commitHistoMap.get(commitTime).getSnapshot();
       rows[ind++] = printFileSizeHistogram(commitTime, s);
     }
@@ -138,6 +139,7 @@ public class StatsCommand implements CommandMarker {
     rows[ind++] = printFileSizeHistogram("ALL", s);
     return HoodiePrintHelper.print(
-        new String[] {"CommitTime", "Min", "10th", "50th", "avg", "95th", "Max", "NumFiles", "StdDev"}, rows);
+        new String[]{"CommitTime", "Min", "10th", "50th", "avg", "95th", "Max", "NumFiles",
+            "StdDev"}, rows);
   }
 }

View File

@@ -23,9 +23,10 @@ import org.springframework.stereotype.Component;
 @Component
 public class UtilsCommand implements CommandMarker {
 
-  @CliCommand(value = "utils loadClass", help = "Load a class" )
+  @CliCommand(value = "utils loadClass", help = "Load a class")
   public String loadClass(
-      @CliOption(key = {"class"}, help = "Check mode" ) final String clazz
+      @CliOption(key = {"class"}, help = "Check mode") final String clazz
   ) throws Exception {
     Class klass = Class.forName(clazz);
     return klass.getProtectionDomain().getCodeSource().getLocation().toExternalForm();

View File

@@ -20,16 +20,17 @@ import com.uber.hoodie.common.model.HoodieCommitMetadata;
 import com.uber.hoodie.common.table.HoodieTableMetaClient;
 import com.uber.hoodie.common.table.HoodieTimeline;
 import com.uber.hoodie.common.table.timeline.HoodieInstant;
 import java.io.IOException;
 import java.util.List;
 
 public class CommitUtil {
 
   public static long countNewRecords(HoodieTableMetaClient target, List<String> commitsToCatchup)
       throws IOException {
     long totalNew = 0;
-    HoodieTimeline timeline = target.getActiveTimeline().reload().getCommitTimeline().filterCompletedInstants();
-    for(String commit:commitsToCatchup) {
+    HoodieTimeline timeline = target.getActiveTimeline().reload().getCommitTimeline()
+        .filterCompletedInstants();
+    for (String commit : commitsToCatchup) {
       HoodieCommitMetadata c = HoodieCommitMetadata.fromBytes(timeline
           .getInstantDetails(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commit))
           .get());

View File

@@ -17,16 +17,16 @@
 package com.uber.hoodie.cli.utils;
 
 import com.uber.hoodie.common.table.HoodieTableMetaClient;
-import org.apache.commons.dbcp.BasicDataSource;
-import org.joda.time.DateTime;
-import javax.sql.DataSource;
 import java.sql.Connection;
 import java.sql.ResultSet;
 import java.sql.SQLException;
 import java.sql.Statement;
+import javax.sql.DataSource;
+import org.apache.commons.dbcp.BasicDataSource;
+import org.joda.time.DateTime;
 
 public class HiveUtil {
 
   private static String driverName = "org.apache.hive.jdbc.HiveDriver";
 
   static {
@@ -39,7 +39,8 @@ public class HiveUtil {
   private static Connection connection;
 
-  private static Connection getConnection(String jdbcUrl, String user, String pass) throws SQLException {
+  private static Connection getConnection(String jdbcUrl, String user, String pass)
+      throws SQLException {
     DataSource ds = getDatasource(jdbcUrl, user, pass);
     return ds.getConnection();
   }
@@ -53,22 +54,25 @@ public class HiveUtil {
     return ds;
   }
 
-  public static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String dbName, String user, String pass) throws SQLException {
+  public static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String dbName,
+      String user, String pass) throws SQLException {
     Connection conn = HiveUtil.getConnection(jdbcUrl, user, pass);
     ResultSet rs = null;
     Statement stmt = conn.createStatement();
     try {
       //stmt.execute("set mapred.job.queue.name=<queue_name>");
-      stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat" );
-      stmt.execute("set hive.stats.autogather=false" );
+      stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat");
+      stmt.execute("set hive.stats.autogather=false");
       rs = stmt.executeQuery(
-          "select count(`_hoodie_commit_time`) as cnt from " + dbName + "." + source.getTableConfig()
+          "select count(`_hoodie_commit_time`) as cnt from " + dbName + "." + source
+              .getTableConfig()
               .getTableName());
       long count = -1;
-      if(rs.next()) {
+      if (rs.next()) {
        count = rs.getLong("cnt");
      }
-      System.out.println("Total records in " + source.getTableConfig().getTableName() + " is " + count);
+      System.out
+          .println("Total records in " + source.getTableConfig().getTableName() + " is " + count);
      return count;
    } finally {
      if (rs != null) {
@@ -94,7 +98,8 @@ public class HiveUtil {
     return countRecords(jdbcUrl, source, srcDb, startDateStr, endDateStr, user, pass);
   }
 
-  private static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String srcDb, String startDateStr,
+  private static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String srcDb,
+      String startDateStr,
       String endDateStr, String user, String pass) throws SQLException {
     Connection conn = HiveUtil.getConnection(jdbcUrl, user, pass);
     ResultSet rs = null;
@@ -107,7 +112,7 @@ public class HiveUtil {
           "select count(`_hoodie_commit_time`) as cnt from " + srcDb + "." + source.getTableConfig()
               .getTableName() + " where datestr>'" + startDateStr + "' and datestr<='"
               + endDateStr + "'");
-      if(rs.next()) {
+      if (rs.next()) {
         return rs.getLong("cnt");
       }
       return -1;

View File

@@ -23,8 +23,10 @@ import java.io.InputStreamReader;
 import java.util.logging.Logger;
 
 public class InputStreamConsumer extends Thread {
 
   protected final static Logger LOG = Logger.getLogger(InputStreamConsumer.class.getName());
   private InputStream is;
 
   public InputStreamConsumer(InputStream is) {
     this.is = is;
   }
@@ -35,8 +37,9 @@ public class InputStreamConsumer extends Thread {
       InputStreamReader isr = new InputStreamReader(is);
       BufferedReader br = new BufferedReader(isr);
       String line;
-      while ( (line = br.readLine()) != null)
+      while ((line = br.readLine()) != null) {
         LOG.info(line);
+      }
     } catch (IOException ioe) {
       LOG.severe(ioe.toString());
       ioe.printStackTrace();

View File

@@ -18,26 +18,20 @@ package com.uber.hoodie.cli.utils;
 import com.uber.hoodie.HoodieWriteClient;
 import com.uber.hoodie.cli.commands.SparkMain;
+import java.io.File;
+import java.net.URISyntaxException;
 import org.apache.log4j.Logger;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.launcher.SparkLauncher;
-import java.io.File;
-import java.net.URISyntaxException;
 
 public class SparkUtil {
 
   public static Logger logger = Logger.getLogger(SparkUtil.class);
   public static final String DEFUALT_SPARK_MASTER = "yarn-client";
 
   /**
-   *
    * TODO: Need to fix a bunch of hardcoded stuff here eg: history server, spark distro
-   *
-   * @return
-   * @throws URISyntaxException
    */
   public static SparkLauncher initLauncher(String propertiesFile) throws URISyntaxException {
     String currentJar = new File(
@@ -65,7 +59,8 @@ public class SparkUtil {
     // Configure hadoop conf
     sparkConf.set("spark.hadoop.mapred.output.compress", "true");
     sparkConf.set("spark.hadoop.mapred.output.compression.codec", "true");
-    sparkConf.set("spark.hadoop.mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
+    sparkConf.set("spark.hadoop.mapred.output.compression.codec",
+        "org.apache.hadoop.io.compress.GzipCodec");
     sparkConf.set("spark.hadoop.mapred.output.compression.type", "BLOCK");
 
     sparkConf = HoodieWriteClient.registerClasses(sparkConf);

View File

@@ -21,6 +21,6 @@
        xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans.xsd
        http://www.springframework.org/schema/context http://www.springframework.org/schema/context/spring-context.xsd">
 
-  <context:component-scan base-package="com.uber.hoodie.cli" />
+  <context:component-scan base-package="com.uber.hoodie.cli"/>
 
 </beans>

View File

@@ -34,7 +34,7 @@ import scala.collection.mutable._
/** /**
* Spark job to de-duplicate data present in a partition path * Spark job to de-duplicate data present in a partition path
*/ */
class DedupeSparkJob (basePath: String, class DedupeSparkJob(basePath: String,
duplicatedPartitionPath: String, duplicatedPartitionPath: String,
repairOutputPath: String, repairOutputPath: String,
sqlContext: SQLContext, sqlContext: SQLContext,
@@ -50,8 +50,9 @@ class DedupeSparkJob (basePath: String,
* @param tblName * @param tblName
* @return * @return
*/ */
def getDupeKeyDF(tblName: String) : DataFrame = { def getDupeKeyDF(tblName: String): DataFrame = {
val dupeSql = s""" val dupeSql =
s"""
select `${HoodieRecord.RECORD_KEY_METADATA_FIELD}` as dupe_key, select `${HoodieRecord.RECORD_KEY_METADATA_FIELD}` as dupe_key,
count(*) as dupe_cnt count(*) as dupe_cnt
from ${tblName} from ${tblName}
@@ -69,7 +70,7 @@ class DedupeSparkJob (basePath: String,
* *
* @return * @return
*/ */
private def planDuplicateFix() : HashMap[String, HashSet[String]] = { private def planDuplicateFix(): HashMap[String, HashSet[String]] = {
val tmpTableName = s"htbl_${System.currentTimeMillis()}" val tmpTableName = s"htbl_${System.currentTimeMillis()}"
val dedupeTblName = s"${tmpTableName}_dupeKeys" val dedupeTblName = s"${tmpTableName}_dupeKeys"
@@ -78,17 +79,18 @@ class DedupeSparkJob (basePath: String,
val allFiles = fs.listStatus(new org.apache.hadoop.fs.Path(s"${basePath}/${duplicatedPartitionPath}")) val allFiles = fs.listStatus(new org.apache.hadoop.fs.Path(s"${basePath}/${duplicatedPartitionPath}"))
val fsView = new HoodieTableFileSystemView(metadata, metadata.getActiveTimeline.getCommitTimeline.filterCompletedInstants(), allFiles) val fsView = new HoodieTableFileSystemView(metadata, metadata.getActiveTimeline.getCommitTimeline.filterCompletedInstants(), allFiles)
val latestFiles:java.util.List[HoodieDataFile] = fsView.getLatestDataFiles().collect(Collectors.toList[HoodieDataFile]()) val latestFiles: java.util.List[HoodieDataFile] = fsView.getLatestDataFiles().collect(Collectors.toList[HoodieDataFile]())
val filteredStatuses = latestFiles.map(f => f.getPath) val filteredStatuses = latestFiles.map(f => f.getPath)
LOG.info(s" List of files under partition: ${} => ${filteredStatuses.mkString(" ")}") LOG.info(s" List of files under partition: ${} => ${filteredStatuses.mkString(" ")}")
val df = sqlContext.parquetFile(filteredStatuses:_*) val df = sqlContext.parquetFile(filteredStatuses: _*)
df.registerTempTable(tmpTableName) df.registerTempTable(tmpTableName)
val dupeKeyDF = getDupeKeyDF(tmpTableName) val dupeKeyDF = getDupeKeyDF(tmpTableName)
dupeKeyDF.registerTempTable(dedupeTblName) dupeKeyDF.registerTempTable(dedupeTblName)
// Obtain necessary satellite information for duplicate rows // Obtain necessary satellite information for duplicate rows
val dupeDataSql = s""" val dupeDataSql =
s"""
SELECT `_hoodie_record_key`, `_hoodie_partition_path`, `_hoodie_file_name`, `_hoodie_commit_time` SELECT `_hoodie_record_key`, `_hoodie_partition_path`, `_hoodie_file_name`, `_hoodie_commit_time`
FROM ${tmpTableName} h FROM ${tmpTableName} h
JOIN ${dedupeTblName} d JOIN ${dedupeTblName} d
@@ -111,9 +113,9 @@ class DedupeSparkJob (basePath: String,
rows.foreach(r => { rows.foreach(r => {
val c = r(3).asInstanceOf[String].toLong val c = r(3).asInstanceOf[String].toLong
if (c != maxCommit){ if (c != maxCommit) {
val f = r(2).asInstanceOf[String].split("_")(0) val f = r(2).asInstanceOf[String].split("_")(0)
if (!fileToDeleteKeyMap.contains(f)){ if (!fileToDeleteKeyMap.contains(f)) {
fileToDeleteKeyMap(f) = HashSet[String]() fileToDeleteKeyMap(f) = HashSet[String]()
} }
fileToDeleteKeyMap(f).add(key) fileToDeleteKeyMap(f).add(key)
@@ -130,28 +132,30 @@ class DedupeSparkJob (basePath: String,
val allFiles = fs.listStatus(new Path(s"${basePath}/${duplicatedPartitionPath}")) val allFiles = fs.listStatus(new Path(s"${basePath}/${duplicatedPartitionPath}"))
val fsView = new HoodieTableFileSystemView(metadata, metadata.getActiveTimeline.getCommitTimeline.filterCompletedInstants(), allFiles) val fsView = new HoodieTableFileSystemView(metadata, metadata.getActiveTimeline.getCommitTimeline.filterCompletedInstants(), allFiles)
val latestFiles:java.util.List[HoodieDataFile] = fsView.getLatestDataFiles().collect(Collectors.toList[HoodieDataFile]()) val latestFiles: java.util.List[HoodieDataFile] = fsView.getLatestDataFiles().collect(Collectors.toList[HoodieDataFile]())
val fileNameToPathMap = latestFiles.map(f => (f.getFileId, new Path(f.getPath))).toMap val fileNameToPathMap = latestFiles.map(f => (f.getFileId, new Path(f.getPath))).toMap
val dupeFixPlan = planDuplicateFix() val dupeFixPlan = planDuplicateFix()
// 1. Copy all latest files into the temp fix path // 1. Copy all latest files into the temp fix path
fileNameToPathMap.foreach{ case(fileName, filePath) => { fileNameToPathMap.foreach { case (fileName, filePath) => {
val badSuffix = if (dupeFixPlan.contains(fileName)) ".bad" else "" val badSuffix = if (dupeFixPlan.contains(fileName)) ".bad" else ""
val dstPath = new Path(s"${repairOutputPath}/${filePath.getName}${badSuffix}") val dstPath = new Path(s"${repairOutputPath}/${filePath.getName}${badSuffix}")
LOG.info(s"Copying from ${filePath} to ${dstPath}") LOG.info(s"Copying from ${filePath} to ${dstPath}")
FileUtil.copy(fs, filePath, fs, dstPath, false, true, fs.getConf) FileUtil.copy(fs, filePath, fs, dstPath, false, true, fs.getConf)
}} }
}
// 2. Remove duplicates from the bad files // 2. Remove duplicates from the bad files
dupeFixPlan.foreach{case(fileName, keysToSkip) => { dupeFixPlan.foreach { case (fileName, keysToSkip) => {
val commitTime = FSUtils.getCommitTime(fileNameToPathMap(fileName).getName) val commitTime = FSUtils.getCommitTime(fileNameToPathMap(fileName).getName)
val badFilePath = new Path(s"${repairOutputPath}/${fileNameToPathMap(fileName).getName}.bad") val badFilePath = new Path(s"${repairOutputPath}/${fileNameToPathMap(fileName).getName}.bad")
val newFilePath = new Path(s"${repairOutputPath}/${fileNameToPathMap(fileName).getName}") val newFilePath = new Path(s"${repairOutputPath}/${fileNameToPathMap(fileName).getName}")
LOG.info(" Skipping and writing new file for : " + fileName) LOG.info(" Skipping and writing new file for : " + fileName)
SparkHelpers.skipKeysAndWriteNewFile(commitTime, fs, badFilePath, newFilePath, dupeFixPlan(fileName)) SparkHelpers.skipKeysAndWriteNewFile(commitTime, fs, badFilePath, newFilePath, dupeFixPlan(fileName))
fs.delete(badFilePath, false) fs.delete(badFilePath, false)
}} }
}
// 3. Check that there are no duplicates anymore. // 3. Check that there are no duplicates anymore.
val df = sqlContext.read.parquet(s"${repairOutputPath}/*.parquet") val df = sqlContext.read.parquet(s"${repairOutputPath}/*.parquet")
@@ -186,6 +190,7 @@ class DedupeSparkJob (basePath: String,
LOG.info(s"[FOR REAL!!!] Copying from ${srcPath} to ${dstPath}") LOG.info(s"[FOR REAL!!!] Copying from ${srcPath} to ${dstPath}")
FileUtil.copy(fs, srcPath, fs, dstPath, false, true, fs.getConf) FileUtil.copy(fs, srcPath, fs, dstPath, false, true, fs.getConf)
} }
}} }
}
} }
} }
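The three numbered steps above all hang off one planning rule: for a duplicated key, the copy written by the latest commit wins, and the key is scheduled for deletion from every other file. A minimal Java sketch of that rule, assuming the duplicate rows have already been collected as (recordKey, fileId, commitTime) triples:

import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

class DedupePlanSketch {
  // Mirrors planDuplicateFix above: keep the copy with the highest commit time per key,
  // and mark that key for deletion in every other file.
  static Map<String, Set<String>> plan(List<String[]> rows) { // rows: {key, fileId, commitTime}
    Map<String, Long> maxCommit = new HashMap<>();
    for (String[] r : rows) {
      maxCommit.merge(r[0], Long.parseLong(r[2]), Math::max);
    }
    Map<String, Set<String>> fileToDeleteKeys = new HashMap<>();
    for (String[] r : rows) {
      if (Long.parseLong(r[2]) != maxCommit.get(r[0])) {
        fileToDeleteKeys.computeIfAbsent(r[1], f -> new HashSet<>()).add(r[0]);
      }
    }
    return fileToDeleteKeys;
  }
}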

View File

@@ -17,9 +17,9 @@
package com.uber.hoodie.cli package com.uber.hoodie.cli
import com.uber.hoodie.avro.HoodieAvroWriteSupport import com.uber.hoodie.avro.HoodieAvroWriteSupport
import com.uber.hoodie.common.{BloomFilter, HoodieJsonPayload}
import com.uber.hoodie.common.model.HoodieRecord import com.uber.hoodie.common.model.HoodieRecord
import com.uber.hoodie.common.util.ParquetUtils import com.uber.hoodie.common.util.ParquetUtils
import com.uber.hoodie.common.{BloomFilter, HoodieJsonPayload}
import com.uber.hoodie.config.{HoodieIndexConfig, HoodieStorageConfig} import com.uber.hoodie.config.{HoodieIndexConfig, HoodieStorageConfig}
import com.uber.hoodie.io.storage.{HoodieParquetConfig, HoodieParquetWriter} import com.uber.hoodie.io.storage.{HoodieParquetConfig, HoodieParquetWriter}
import org.apache.avro.Schema import org.apache.avro.Schema
@@ -107,7 +107,7 @@ class SparkHelper(sqlContext: SQLContext, fs: FileSystem) {
* @param file * @param file
* @param sqlContext * @param sqlContext
*/ */
def getKeyCount(file: String, sqlContext: org.apache.spark.sql.SQLContext) ={ def getKeyCount(file: String, sqlContext: org.apache.spark.sql.SQLContext) = {
println(getRowKeyDF(file).collect().size) println(getRowKeyDF(file).collect().size)
} }
@@ -122,7 +122,7 @@ class SparkHelper(sqlContext: SQLContext, fs: FileSystem) {
* @param file * @param file
* @return * @return
*/ */
def fileKeysAgainstBF(conf: Configuration, sqlContext: SQLContext, file: String) : Boolean = { def fileKeysAgainstBF(conf: Configuration, sqlContext: SQLContext, file: String): Boolean = {
val bfStr = SparkHelpers.getBloomFilter(file, conf) val bfStr = SparkHelpers.getBloomFilter(file, conf)
val bf = new com.uber.hoodie.common.BloomFilter(bfStr) val bf = new com.uber.hoodie.common.BloomFilter(bfStr)
val foundCount = sqlContext.parquetFile(file) val foundCount = sqlContext.parquetFile(file)
@@ -134,7 +134,7 @@ class SparkHelper(sqlContext: SQLContext, fs: FileSystem) {
totalCount == foundCount totalCount == foundCount
} }
def getDistinctKeyDF(paths: List[String]) : DataFrame = { def getDistinctKeyDF(paths: List[String]): DataFrame = {
sqlContext.read.parquet(paths:_*).select(s"`${HoodieRecord.RECORD_KEY_METADATA_FIELD}`").distinct() sqlContext.read.parquet(paths: _*).select(s"`${HoodieRecord.RECORD_KEY_METADATA_FIELD}`").distinct()
} }
} }

View File

@@ -15,7 +15,9 @@
~ limitations under the License. ~ limitations under the License.
--> -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent> <parent>
<artifactId>hoodie</artifactId> <artifactId>hoodie</artifactId>
<groupId>com.uber.hoodie</groupId> <groupId>com.uber.hoodie</groupId>

View File

@@ -17,25 +17,19 @@
package com.uber.hoodie; package com.uber.hoodie;
import com.google.common.base.Optional; import com.google.common.base.Optional;
import com.uber.hoodie.common.model.HoodieCommitMetadata;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.TableFileSystemView;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.table.view.HoodieTableFileSystemView;
import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieException;
import com.uber.hoodie.index.bloom.HoodieBloomIndex; import com.uber.hoodie.index.bloom.HoodieBloomIndex;
import com.uber.hoodie.table.HoodieTable; import com.uber.hoodie.table.HoodieTable;
import java.io.Serializable;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager; import org.apache.log4j.LogManager;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
@@ -46,21 +40,10 @@ import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row; import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext; import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.StructType; import org.apache.spark.sql.types.StructType;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import scala.Tuple2; import scala.Tuple2;
/** /**
* Provides an RDD based API for accessing/filtering Hoodie tables, based on keys. * Provides an RDD based API for accessing/filtering Hoodie tables, based on keys.
*
*/ */
public class HoodieReadClient implements Serializable { public class HoodieReadClient implements Serializable {
@@ -70,8 +53,8 @@ public class HoodieReadClient implements Serializable {
private transient final FileSystem fs; private transient final FileSystem fs;
/** /**
* TODO: We need to persist the index type into hoodie.properties and be able to access the * TODO: We need to persist the index type into hoodie.properties and be able to access the index
* index just with a simple basepath pointing to the dataset. Until then, just always assume a * just with a simple basepath pointing to the dataset. Until then, just always assume a
* BloomIndex * BloomIndex
*/ */
private transient final HoodieBloomIndex index; private transient final HoodieBloomIndex index;
@@ -117,7 +100,8 @@ public class HoodieReadClient implements Serializable {
private void assertSqlContext() { private void assertSqlContext() {
if (!sqlContextOpt.isPresent()) { if (!sqlContextOpt.isPresent()) {
throw new IllegalStateException("SQLContext must be set, when performing dataframe operations"); throw new IllegalStateException(
"SQLContext must be set, when performing dataframe operations");
} }
} }
@@ -158,10 +142,10 @@ public class HoodieReadClient implements Serializable {
} }
/** /**
* Checks if the given [Keys] exists in the hoodie table and returns [Key, * Checks if the given [Keys] exists in the hoodie table and returns [Key, Optional[FullFilePath]]
* Optional[FullFilePath]] If the optional FullFilePath value is not present, then the key is * If the optional FullFilePath value is not present, then the key is not found. If the
* not found. If the FullFilePath value is present, it is the path component (without scheme) of * FullFilePath value is present, it is the path component (without scheme) of the URI underlying
* the URI underlying the file * the file
*/ */
public JavaPairRDD<HoodieKey, Optional<String>> checkExists(JavaRDD<HoodieKey> hoodieKeys) { public JavaPairRDD<HoodieKey, Optional<String>> checkExists(JavaRDD<HoodieKey> hoodieKeys) {
return index.fetchRecordLocation(hoodieKeys, hoodieTable); return index.fetchRecordLocation(hoodieKeys, hoodieTable);
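A minimal usage sketch for the API described above, assuming readClient was constructed for the dataset's base path (constructors are not shown in this hunk) and hoodieKeys is a JavaRDD<HoodieKey> of candidate keys; Optional here is Guava's, per the import above.

JavaPairRDD<HoodieKey, Optional<String>> existence = readClient.checkExists(hoodieKeys);
// Keep only keys that were found, paired with the data file path that contains them.
JavaPairRDD<HoodieKey, String> found = existence
    .filter(kv -> kv._2().isPresent())
    .mapToPair(kv -> new Tuple2<>(kv._1(), kv._2().get()));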

View File

@@ -50,10 +50,21 @@ import com.uber.hoodie.func.BulkInsertMapFunction;
import com.uber.hoodie.index.HoodieIndex; import com.uber.hoodie.index.HoodieIndex;
import com.uber.hoodie.io.HoodieCommitArchiveLog; import com.uber.hoodie.io.HoodieCommitArchiveLog;
import com.uber.hoodie.metrics.HoodieMetrics; import com.uber.hoodie.metrics.HoodieMetrics;
import com.uber.hoodie.table.UserDefinedBulkInsertPartitioner;
import com.uber.hoodie.table.HoodieTable; import com.uber.hoodie.table.HoodieTable;
import com.uber.hoodie.table.UserDefinedBulkInsertPartitioner;
import com.uber.hoodie.table.WorkloadProfile; import com.uber.hoodie.table.WorkloadProfile;
import com.uber.hoodie.table.WorkloadStat; import com.uber.hoodie.table.WorkloadStat;
import java.io.IOException;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.LogManager; import org.apache.log4j.LogManager;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
@@ -66,25 +77,12 @@ import org.apache.spark.storage.StorageLevel;
import scala.Option; import scala.Option;
import scala.Tuple2; import scala.Tuple2;
import java.io.IOException;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
/** /**
* Hoodie Write Client helps you build datasets on HDFS [insert()] and then * Hoodie Write Client helps you build datasets on HDFS [insert()] and then perform efficient
* perform efficient mutations on a HDFS dataset [upsert()] * mutations on a HDFS dataset [upsert()]
*
* Note that, at any given time, there can only be one Spark job performing
* these operations on a Hoodie dataset.
* *
* Note that, at any given time, there can only be one Spark job performing these operations on a
* Hoodie dataset.
*/ */
public class HoodieWriteClient<T extends HoodieRecordPayload> implements Serializable { public class HoodieWriteClient<T extends HoodieRecordPayload> implements Serializable {
@@ -102,7 +100,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
* @param clientConfig * @param clientConfig
* @throws Exception * @throws Exception
*/ */
public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig) throws Exception { public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig)
throws Exception {
this(jsc, clientConfig, false); this(jsc, clientConfig, false);
} }
@@ -111,7 +110,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
* @param clientConfig * @param clientConfig
* @param rollbackInFlight * @param rollbackInFlight
*/ */
public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig, boolean rollbackInFlight) { public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig,
boolean rollbackInFlight) {
this.fs = FSUtils.getFs(); this.fs = FSUtils.getFs();
this.jsc = jsc; this.jsc = jsc;
this.config = clientConfig; this.config = clientConfig;
@@ -170,8 +170,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
* Inserts the given HoodieRecords, into the table. This API is intended to be used for normal * Inserts the given HoodieRecords, into the table. This API is intended to be used for normal
* writes. * writes.
* *
* This implementation skips the index check and is able to leverage benefits such as * This implementation skips the index check and is able to leverage benefits such as small file
* small file handling/blocking alignment, as with upsert(), by profiling the workload * handling/blocking alignment, as with upsert(), by profiling the workload
* *
* @param records HoodieRecords to insert * @param records HoodieRecords to insert
* @param commitTime Commit Time handle * @param commitTime Commit Time handle
@@ -210,7 +210,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
* @param commitTime Commit Time handle * @param commitTime Commit Time handle
* @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts * @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts
*/ */
public JavaRDD<WriteStatus> bulkInsert(JavaRDD<HoodieRecord<T>> records, final String commitTime) { public JavaRDD<WriteStatus> bulkInsert(JavaRDD<HoodieRecord<T>> records,
final String commitTime) {
return bulkInsert(records, commitTime, Option.empty()); return bulkInsert(records, commitTime, Option.empty());
} }
@@ -221,16 +222,18 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
* *
* This implementation uses sortBy (which does range partitioning based on reservoir sampling) and * This implementation uses sortBy (which does range partitioning based on reservoir sampling) and
* attempts to control the numbers of files with less memory compared to the {@link * attempts to control the numbers of files with less memory compared to the {@link
* HoodieWriteClient#insert(JavaRDD, String)}. Optionally it allows users to specify their own partitioner. If * HoodieWriteClient#insert(JavaRDD, String)}. Optionally it allows users to specify their own
* specified then it will be used for repartitioning records. See {@link UserDefinedBulkInsertPartitioner}. * partitioner. If specified then it will be used for repartitioning records. See {@link
* UserDefinedBulkInsertPartitioner}.
* *
* @param records HoodieRecords to insert * @param records HoodieRecords to insert
* @param commitTime Commit Time handle * @param commitTime Commit Time handle
* @param bulkInsertPartitioner If specified then it will be used to partition input records before they are * @param bulkInsertPartitioner If specified then it will be used to partition input records
* inserted into hoodie. * before they are inserted into hoodie.
* @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts * @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts
*/ */
public JavaRDD<WriteStatus> bulkInsert(JavaRDD<HoodieRecord<T>> records, final String commitTime, public JavaRDD<WriteStatus> bulkInsert(JavaRDD<HoodieRecord<T>> records,
final String commitTime,
Option<UserDefinedBulkInsertPartitioner> bulkInsertPartitioner) { Option<UserDefinedBulkInsertPartitioner> bulkInsertPartitioner) {
writeContext = metrics.getCommitCtx(); writeContext = metrics.getCommitCtx();
// Create a Hoodie table which encapsulated the commits and files visible // Create a Hoodie table which encapsulated the commits and files visible
@@ -240,7 +243,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
try { try {
// De-dupe/merge if needed // De-dupe/merge if needed
JavaRDD<HoodieRecord<T>> dedupedRecords = JavaRDD<HoodieRecord<T>> dedupedRecords =
combineOnCondition(config.shouldCombineBeforeInsert(), records, config.getInsertShuffleParallelism()); combineOnCondition(config.shouldCombineBeforeInsert(), records,
config.getInsertShuffleParallelism());
final JavaRDD<HoodieRecord<T>> repartitionedRecords; final JavaRDD<HoodieRecord<T>> repartitionedRecords;
if (bulkInsertPartitioner.isDefined()) { if (bulkInsertPartitioner.isDefined()) {
@@ -259,7 +263,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
}, true, config.getBulkInsertShuffleParallelism()); }, true, config.getBulkInsertShuffleParallelism());
} }
JavaRDD<WriteStatus> writeStatusRDD = repartitionedRecords JavaRDD<WriteStatus> writeStatusRDD = repartitionedRecords
.mapPartitionsWithIndex(new BulkInsertMapFunction<T>(commitTime, config, table), true) .mapPartitionsWithIndex(new BulkInsertMapFunction<T>(commitTime, config, table),
true)
.flatMap(writeStatuses -> writeStatuses.iterator()); .flatMap(writeStatuses -> writeStatuses.iterator());
return updateIndexAndCommitIfNeeded(writeStatusRDD, table, commitTime); return updateIndexAndCommitIfNeeded(writeStatusRDD, table, commitTime);
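To make the overloads above concrete, a hedged sketch of a bulk insert followed by an explicit commit. It assumes auto-commit is disabled in the config, that commit(commitTime, statuses) is exposed publicly as the auto-commit path suggests, and that jsc, config and records already exist; the commit time literal is only a placeholder.

HoodieWriteClient<HoodieAvroPayload> client = new HoodieWriteClient<>(jsc, config);
String commitTime = "20171112225456"; // placeholder; obtained from the client's commit bookkeeping
// No custom partitioner, so this is equivalent to the two-argument overload.
JavaRDD<WriteStatus> statuses = client.bulkInsert(records, commitTime, Option.empty());
// With auto-commit off, publish the commit explicitly once the statuses look clean.
boolean committed = client.commit(commitTime, statuses);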
@@ -267,12 +272,13 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
if (e instanceof HoodieInsertException) { if (e instanceof HoodieInsertException) {
throw e; throw e;
} }
throw new HoodieInsertException("Failed to bulk insert for commit time " + commitTime, e); throw new HoodieInsertException("Failed to bulk insert for commit time " + commitTime,
e);
} }
} }
private void commitOnAutoCommit(String commitTime, JavaRDD<WriteStatus> resultRDD) { private void commitOnAutoCommit(String commitTime, JavaRDD<WriteStatus> resultRDD) {
if(config.shouldAutoCommit()) { if (config.shouldAutoCommit()) {
logger.info("Auto commit enabled: Committing " + commitTime); logger.info("Auto commit enabled: Committing " + commitTime);
boolean commitResult = commit(commitTime, resultRDD); boolean commitResult = commit(commitTime, resultRDD);
if (!commitResult) { if (!commitResult) {
@@ -286,24 +292,22 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
private JavaRDD<HoodieRecord<T>> combineOnCondition(boolean condition, private JavaRDD<HoodieRecord<T>> combineOnCondition(boolean condition,
JavaRDD<HoodieRecord<T>> records, JavaRDD<HoodieRecord<T>> records,
int parallelism) { int parallelism) {
if(condition) { if (condition) {
return deduplicateRecords(records, parallelism); return deduplicateRecords(records, parallelism);
} }
return records; return records;
} }
/** /**
* * Save the workload profile in an intermediate file (here re-using commit files) This is useful
* Save the workload profile in an intermediate file (here re-using commit files) * when performing rollback for MOR datasets. Only updates are recorded in the workload profile
* This is useful when performing rollback for MOR datasets. Only updates are recorded * metadata since updates to log blocks are unknown across batches. Inserts (which are new parquet
* in the workload profile metadata since updates to log blocks are unknown across batches. * files) are rolled back based on commit time. // TODO : Create a new WorkloadProfile metadata
* Inserts (which are new parquet files) are rolled back based on commit time. * file instead of using HoodieCommitMetadata
* // TODO : Create a new WorkloadProfile metadata file instead of using HoodieCommitMetadata
* @param profile
* @param commitTime
* @throws HoodieCommitException
*/ */
private void saveWorkloadProfileMetadataToInflight(WorkloadProfile profile, HoodieTable<T> table, String commitTime) throws HoodieCommitException { private void saveWorkloadProfileMetadataToInflight(WorkloadProfile profile,
HoodieTable<T> table,
String commitTime) throws HoodieCommitException {
try { try {
HoodieCommitMetadata metadata = new HoodieCommitMetadata(); HoodieCommitMetadata metadata = new HoodieCommitMetadata();
profile.getPartitionPaths().stream().forEach(path -> { profile.getPartitionPaths().stream().forEach(path -> {
@@ -320,8 +324,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
Optional<HoodieInstant> instant = activeTimeline.filterInflights().lastInstant(); Optional<HoodieInstant> instant = activeTimeline.filterInflights().lastInstant();
activeTimeline.saveToInflight(instant.get(), activeTimeline.saveToInflight(instant.get(),
Optional.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8))); Optional.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
} catch(IOException io) { } catch (IOException io) {
throw new HoodieCommitException("Failed to commit " + commitTime + " unable to save inflight metadata ", io); throw new HoodieCommitException(
"Failed to commit " + commitTime + " unable to save inflight metadata ", io);
} }
} }
@@ -358,7 +363,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
return updateIndexAndCommitIfNeeded(writeStatusRDD, hoodieTable, commitTime); return updateIndexAndCommitIfNeeded(writeStatusRDD, hoodieTable, commitTime);
} }
private Partitioner getPartitioner(HoodieTable table, boolean isUpsert, WorkloadProfile profile) { private Partitioner getPartitioner(HoodieTable table, boolean isUpsert,
WorkloadProfile profile) {
if (isUpsert) { if (isUpsert) {
return table.getUpsertPartitioner(profile); return table.getUpsertPartitioner(profile);
} else { } else {
@@ -366,7 +372,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
} }
} }
private JavaRDD<WriteStatus> updateIndexAndCommitIfNeeded(JavaRDD<WriteStatus> writeStatusRDD, HoodieTable<T> table, String commitTime) { private JavaRDD<WriteStatus> updateIndexAndCommitIfNeeded(JavaRDD<WriteStatus> writeStatusRDD,
HoodieTable<T> table, String commitTime) {
// Update the index back // Update the index back
JavaRDD<WriteStatus> statuses = index.updateLocation(writeStatusRDD, table); JavaRDD<WriteStatus> statuses = index.updateLocation(writeStatusRDD, table);
// Trigger the insert and collect statuses // Trigger the insert and collect statuses
@@ -375,10 +382,13 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
return statuses; return statuses;
} }
private JavaRDD<HoodieRecord<T>> partition(JavaRDD<HoodieRecord<T>> dedupedRecords, Partitioner partitioner) { private JavaRDD<HoodieRecord<T>> partition(JavaRDD<HoodieRecord<T>> dedupedRecords,
Partitioner partitioner) {
return dedupedRecords return dedupedRecords
.mapToPair(record -> .mapToPair(record ->
new Tuple2<>(new Tuple2<>(record.getKey(), Option.apply(record.getCurrentLocation())), record)) new Tuple2<>(
new Tuple2<>(record.getKey(), Option.apply(record.getCurrentLocation())),
record))
.partitionBy(partitioner) .partitionBy(partitioner)
.map(tuple -> tuple._2()); .map(tuple -> tuple._2());
} }
@@ -438,7 +448,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
// We cannot have unbounded commit files. Archive commits if we have to archive // We cannot have unbounded commit files. Archive commits if we have to archive
archiveLog.archiveIfRequired(); archiveLog.archiveIfRequired();
if(config.isAutoClean()) { if (config.isAutoClean()) {
// Call clean to cleanup if there is anything to cleanup after the commit, // Call clean to cleanup if there is anything to cleanup after the commit,
logger.info("Auto cleaning is enabled. Running cleaner now"); logger.info("Auto cleaning is enabled. Running cleaner now");
clean(commitTime); clean(commitTime);
@@ -465,12 +475,12 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
} }
/** /**
* Savepoint a specific commit. Latest version of data files as of the passed in commitTime * Savepoint a specific commit. Latest version of data files as of the passed in commitTime will
* will be referenced in the savepoint and will never be cleaned. The savepointed commit * be referenced in the savepoint and will never be cleaned. The savepointed commit will never be
* will never be rolled back or archived. * rolled back or archived.
* *
* This gives an option to rollback the state to the savepoint anytime. * This gives an option to rollback the state to the savepoint anytime. Savepoint needs to be
* Savepoint needs to be manually created and deleted. * manually created and deleted.
* *
* Savepoint should be on a commit that could not have been cleaned. * Savepoint should be on a commit that could not have been cleaned.
* *
@@ -491,12 +501,12 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
} }
/** /**
* Savepoint a specific commit. Latest version of data files as of the passed in commitTime * Savepoint a specific commit. Latest version of data files as of the passed in commitTime will
* will be referenced in the savepoint and will never be cleaned. The savepointed commit * be referenced in the savepoint and will never be cleaned. The savepointed commit will never be
* will never be rolled back or archived. * rolled back or archived.
* *
* This gives an option to rollback the state to the savepoint anytime. * This gives an option to rollback the state to the savepoint anytime. Savepoint needs to be
* Savepoint needs to be manually created and deleted. * manually created and deleted.
* *
* Savepoint should be on a commit that could not have been cleaned. * Savepoint should be on a commit that could not have been cleaned.
* *
@@ -510,9 +520,11 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config); .getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
Optional<HoodieInstant> cleanInstant = table.getCompletedCleanTimeline().lastInstant(); Optional<HoodieInstant> cleanInstant = table.getCompletedCleanTimeline().lastInstant();
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime); HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION,
if(!table.getCompletedCommitTimeline().containsInstant(commitInstant)) { commitTime);
throw new HoodieSavepointException("Could not savepoint non-existing commit " + commitInstant); if (!table.getCompletedCommitTimeline().containsInstant(commitInstant)) {
throw new HoodieSavepointException(
"Could not savepoint non-existing commit " + commitInstant);
} }
try { try {
@@ -534,7 +546,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
+ lastCommitRetained); + lastCommitRetained);
Map<String, List<String>> latestFilesMap = jsc.parallelize( Map<String, List<String>> latestFilesMap = jsc.parallelize(
FSUtils.getAllPartitionPaths(fs, table.getMetaClient().getBasePath(), config.shouldAssumeDatePartitioning())) FSUtils.getAllPartitionPaths(fs, table.getMetaClient().getBasePath(),
config.shouldAssumeDatePartitioning()))
.mapToPair((PairFunction<String, String, List<String>>) partitionPath -> { .mapToPair((PairFunction<String, String, List<String>>) partitionPath -> {
// Scan all partitions files with this commit time // Scan all partitions files with this commit time
logger.info("Collecting latest files in partition path " + partitionPath); logger.info("Collecting latest files in partition path " + partitionPath);
@@ -559,8 +572,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
} }
/** /**
* Delete a savepoint that was created. Once the savepoint is deleted, the commit can be rolled back * Delete a savepoint that was created. Once the savepoint is deleted, the commit can be
* and cleaner may clean up data files. * rolled back and cleaner may clean up data files.
* *
* @param savepointTime - delete the savepoint * @param savepointTime - delete the savepoint
* @return true if the savepoint was deleted successfully * @return true if the savepoint was deleted successfully
@@ -586,9 +599,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
} }
/** /**
* Rollback the state to the savepoint. * Rollback the state to the savepoint. WARNING: This rolls back recent commits and deletes data
* WARNING: This rolls back recent commits and deletes data files. Queries accessing the files * files. Queries accessing the files will mostly fail. This should be done during a downtime.
* will mostly fail. This should be done during a downtime.
* *
* @param savepointTime - savepoint time to rollback to * @param savepointTime - savepoint time to rollback to
* @return true if the savepoint was rolled back to successfully * @return true if the savepoint was rolled back to successfully
@@ -616,7 +628,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
// Make sure the rollback was successful // Make sure the rollback was successful
Optional<HoodieInstant> lastInstant = Optional<HoodieInstant> lastInstant =
activeTimeline.reload().getCommitsAndCompactionsTimeline().filterCompletedInstants().lastInstant(); activeTimeline.reload().getCommitsAndCompactionsTimeline().filterCompletedInstants()
.lastInstant();
Preconditions.checkArgument(lastInstant.isPresent()); Preconditions.checkArgument(lastInstant.isPresent());
Preconditions.checkArgument(lastInstant.get().getTimestamp().equals(savepointTime), Preconditions.checkArgument(lastInstant.get().getTimestamp().equals(savepointTime),
savepointTime + "is not the last commit after rolling back " + commitsToRollback savepointTime + "is not the last commit after rolling back " + commitsToRollback
@@ -625,12 +638,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
} }
/** /**
* Rollback the (inflight/committed) record changes with the given commit time. * Rollback the (inflight/committed) record changes with the given commit time. Three steps: (1)
* Three steps: * Atomically unpublish this commit (2) clean indexing data, (3) clean new generated parquet
* (1) Atomically unpublish this commit * files. (4) Finally delete .commit or .inflight file,
* (2) clean indexing data,
* (3) clean new generated parquet files.
* (4) Finally delete .commit or .inflight file,
*/ */
public boolean rollback(final String commitTime) throws HoodieRollbackException { public boolean rollback(final String commitTime) throws HoodieRollbackException {
rollback(Lists.newArrayList(commitTime)); rollback(Lists.newArrayList(commitTime));
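A short sketch of how a caller might recover from a failed write using the public rollback(commitTime) entry point above; client is assumed to be the same HoodieWriteClient that attempted the write, and logger is an assumed log4j Logger.

try {
  // Unpublishes the commit, cleans index entries, and deletes the newly written parquet files.
  client.rollback(failedCommitTime);
} catch (HoodieRollbackException e) {
  logger.error("Could not rollback " + failedCommitTime, e);
  throw e;
}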
@@ -638,7 +648,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
} }
private void rollback(List<String> commits) { private void rollback(List<String> commits) {
if(commits.isEmpty()) { if (commits.isEmpty()) {
logger.info("List of commits to rollback is empty"); logger.info("List of commits to rollback is empty");
return; return;
} }
@@ -702,7 +712,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
Optional<Long> durationInMs = Optional.empty(); Optional<Long> durationInMs = Optional.empty();
if (context != null) { if (context != null) {
durationInMs = Optional.of(metrics.getDurationInMs(context.stop())); durationInMs = Optional.of(metrics.getDurationInMs(context.stop()));
Long numFilesDeleted = stats.stream().mapToLong(stat -> stat.getSuccessDeleteFiles().size()).sum(); Long numFilesDeleted = stats.stream()
.mapToLong(stat -> stat.getSuccessDeleteFiles().size())
.sum();
metrics.updateRollbackMetrics(durationInMs.get(), numFilesDeleted); metrics.updateRollbackMetrics(durationInMs.get(), numFilesDeleted);
} }
HoodieRollbackMetadata rollbackMetadata = HoodieRollbackMetadata rollbackMetadata =
@@ -733,9 +745,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
} }
/** /**
* Clean up any stale/old files/data lying around (either on file storage or index storage) * Clean up any stale/old files/data lying around (either on file storage or index storage) based
* based on the configurations and CleaningPolicy used. (typically files that no longer can be used * on the configurations and CleaningPolicy used. (typically files that no longer can be used by a
* by a running query can be cleaned) * running query can be cleaned)
*/ */
public void clean() throws HoodieIOException { public void clean() throws HoodieIOException {
String startCleanTime = HoodieActiveTimeline.createNewCommitTime(); String startCleanTime = HoodieActiveTimeline.createNewCommitTime();
@@ -743,9 +755,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
} }
/** /**
* Clean up any stale/old files/data lying around (either on file storage or index storage) * Clean up any stale/old files/data lying around (either on file storage or index storage) based
* based on the configurations and CleaningPolicy used. (typically files that no longer can be used * on the configurations and CleaningPolicy used. (typically files that no longer can be used by a
* by a running query can be cleaned) * running query can be cleaned)
*/ */
private void clean(String startCleanTime) throws HoodieIOException { private void clean(String startCleanTime) throws HoodieIOException {
try { try {
@@ -811,14 +823,16 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
} }
public static SparkConf registerClasses(SparkConf conf) { public static SparkConf registerClasses(SparkConf conf) {
conf.registerKryoClasses(new Class[]{HoodieWriteConfig.class, HoodieRecord.class, HoodieKey.class}); conf.registerKryoClasses(
new Class[]{HoodieWriteConfig.class, HoodieRecord.class, HoodieKey.class});
return conf; return conf;
} }
/** /**
* Deduplicate Hoodie records, using the given deduplication function. * Deduplicate Hoodie records, using the given deduplication function.
*/ */
private JavaRDD<HoodieRecord<T>> deduplicateRecords(JavaRDD<HoodieRecord<T>> records, int parallelism) { private JavaRDD<HoodieRecord<T>> deduplicateRecords(JavaRDD<HoodieRecord<T>> records,
int parallelism) {
return records return records
.mapToPair(record -> new Tuple2<>(record.getKey(), record)) .mapToPair(record -> new Tuple2<>(record.getKey(), record))
.reduceByKey((rec1, rec2) -> { .reduceByKey((rec1, rec2) -> {
@@ -833,8 +847,6 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
/** /**
* Cleanup all inflight commits * Cleanup all inflight commits
*
* @throws IOException
*/ */
private void rollbackInflightCommits() { private void rollbackInflightCommits() {
HoodieTable<T> table = HoodieTable HoodieTable<T> table = HoodieTable

View File

@@ -19,7 +19,6 @@ package com.uber.hoodie;
import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieWriteStat; import com.uber.hoodie.common.model.HoodieWriteStat;
import java.io.Serializable; import java.io.Serializable;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
@@ -50,12 +49,14 @@ public class WriteStatus implements Serializable {
private long totalErrorRecords = 0; private long totalErrorRecords = 0;
/** /**
* Mark write as success, optionally using given parameters for the purpose of calculating * Mark write as success, optionally using given parameters for the purpose of calculating some
* some aggregate metrics. This method is not meant to cache passed arguments, since WriteStatus * aggregate metrics. This method is not meant to cache passed arguments, since WriteStatus
* objects are collected in Spark Driver. * objects are collected in Spark Driver.
* *
* @param record deflated {@code HoodieRecord} containing information that uniquely identifies it. * @param record deflated {@code HoodieRecord} containing information that uniquely identifies
* @param optionalRecordMetadata optional metadata related to data contained in {@link HoodieRecord} before deflation. * it.
* @param optionalRecordMetadata optional metadata related to data contained in {@link
* HoodieRecord} before deflation.
*/ */
public void markSuccess(HoodieRecord record, public void markSuccess(HoodieRecord record,
Optional<Map<String, String>> optionalRecordMetadata) { Optional<Map<String, String>> optionalRecordMetadata) {
@@ -64,12 +65,14 @@ public class WriteStatus implements Serializable {
} }
/** /**
* Mark write as failed, optionally using given parameters for the purpose of calculating * Mark write as failed, optionally using given parameters for the purpose of calculating some
* some aggregate metrics. This method is not meant to cache passed arguments, since WriteStatus * aggregate metrics. This method is not meant to cache passed arguments, since WriteStatus
* objects are collected in Spark Driver. * objects are collected in Spark Driver.
* *
* @param record deflated {@code HoodieRecord} containing information that uniquely identifies it. * @param record deflated {@code HoodieRecord} containing information that uniquely identifies
* @param optionalRecordMetadata optional metadata related to data contained in {@link HoodieRecord} before deflation. * it.
* @param optionalRecordMetadata optional metadata related to data contained in {@link
* HoodieRecord} before deflation.
*/ */
public void markFailure(HoodieRecord record, Throwable t, public void markFailure(HoodieRecord record, Throwable t,
Optional<Map<String, String>> optionalRecordMetadata) { Optional<Map<String, String>> optionalRecordMetadata) {
@@ -139,7 +142,9 @@ public class WriteStatus implements Serializable {
return totalRecords; return totalRecords;
} }
public long getTotalErrorRecords() { return totalErrorRecords; } public long getTotalErrorRecords() {
return totalErrorRecords;
}
@Override @Override
public String toString() { public String toString() {
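Because WriteStatus objects are collected back on the Spark driver (as the Javadoc above notes), post-write error checks stay cheap. A minimal sketch, assuming statuses is the JavaRDD<WriteStatus> returned by a write call:

long totalErrors = statuses
    .map(WriteStatus::getTotalErrorRecords)
    .reduce(Long::sum);
if (totalErrors > 0) {
  // Per-record failures were recorded on the executors via markFailure(record, throwable, metadata).
  throw new RuntimeException(totalErrors + " records failed to write");
}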

View File

@@ -17,14 +17,15 @@
package com.uber.hoodie.config; package com.uber.hoodie.config;
import java.io.Serializable; import java.io.Serializable;
import java.util.Map;
import java.util.Properties; import java.util.Properties;
/** /**
* Default Way to load Hoodie config through a java.util.Properties * Default Way to load Hoodie config through a java.util.Properties
*/ */
public class DefaultHoodieConfig implements Serializable { public class DefaultHoodieConfig implements Serializable {
protected final Properties props; protected final Properties props;
public DefaultHoodieConfig(Properties props) { public DefaultHoodieConfig(Properties props) {
this.props = props; this.props = props;
} }
@@ -40,7 +41,8 @@ public class DefaultHoodieConfig implements Serializable {
} }
} }
public static void setDefaultOnCondition(Properties props, boolean condition, DefaultHoodieConfig config) { public static void setDefaultOnCondition(Properties props, boolean condition,
DefaultHoodieConfig config) {
if (condition) { if (condition) {
props.putAll(config.getProps()); props.putAll(config.getProps());
} }
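For illustration, a small sketch of the key/value form of setDefaultOnCondition as the config builders later in this commit use it. The four-argument overload is assumed to live on DefaultHoodieConfig alongside the overload shown above; the cleaner-parallelism key and default come from HoodieCompactionConfig.

Properties props = new Properties();
// Fills in the default only when the user has not already set the key.
DefaultHoodieConfig.setDefaultOnCondition(props, !props.containsKey("hoodie.cleaner.parallelism"),
    "hoodie.cleaner.parallelism", "200");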

View File

@@ -19,21 +19,20 @@ package com.uber.hoodie.config;
import com.google.common.base.Preconditions; import com.google.common.base.Preconditions;
import com.uber.hoodie.common.model.HoodieAvroPayload; import com.uber.hoodie.common.model.HoodieAvroPayload;
import com.uber.hoodie.common.model.HoodieCleaningPolicy; import com.uber.hoodie.common.model.HoodieCleaningPolicy;
import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.io.compact.strategy.CompactionStrategy; import com.uber.hoodie.io.compact.strategy.CompactionStrategy;
import com.uber.hoodie.io.compact.strategy.LogFileSizeBasedCompactionStrategy; import com.uber.hoodie.io.compact.strategy.LogFileSizeBasedCompactionStrategy;
import javax.annotation.concurrent.Immutable;
import java.io.File; import java.io.File;
import java.io.FileReader; import java.io.FileReader;
import java.io.IOException; import java.io.IOException;
import java.util.Properties; import java.util.Properties;
import javax.annotation.concurrent.Immutable;
/** /**
* Compaction related config * Compaction related config
*/ */
@Immutable @Immutable
public class HoodieCompactionConfig extends DefaultHoodieConfig { public class HoodieCompactionConfig extends DefaultHoodieConfig {
public static final String CLEANER_POLICY_PROP = "hoodie.cleaner.policy"; public static final String CLEANER_POLICY_PROP = "hoodie.cleaner.policy";
private static final String DEFAULT_CLEANER_POLICY = private static final String DEFAULT_CLEANER_POLICY =
HoodieCleaningPolicy.KEEP_LATEST_COMMITS.name(); HoodieCleaningPolicy.KEEP_LATEST_COMMITS.name();
@@ -66,7 +65,9 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
public static final String DEFAULT_PARQUET_SMALL_FILE_LIMIT_BYTES = String.valueOf(0); public static final String DEFAULT_PARQUET_SMALL_FILE_LIMIT_BYTES = String.valueOf(0);
/** Configs related to specific table types **/ /**
* Configs related to specific table types
**/
// Number of inserts, that will be put each partition/bucket for writing // Number of inserts, that will be put each partition/bucket for writing
public static final String COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE = "hoodie.copyonwrite.insert.split.size"; public static final String COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE = "hoodie.copyonwrite.insert.split.size";
// The rationale to pick the insert parallelism is the following. Writing out 100MB files, // The rationale to pick the insert parallelism is the following. Writing out 100MB files,
@@ -82,7 +83,8 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
// This value is used as a guesstimate for the record size, if we can't determine this from previous commits // This value is used as a guesstimate for the record size, if we can't determine this from previous commits
public static final String COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = "hoodie.copyonwrite.record.size.estimate"; public static final String COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = "hoodie.copyonwrite.record.size.estimate";
// Used to determine how much more can be packed into a small file, before it exceeds the size limit. // Used to determine how much more can be packed into a small file, before it exceeds the size limit.
public static final String DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = String.valueOf(1024); public static final String DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = String
.valueOf(1024);
public static final String CLEANER_PARALLELISM = "hoodie.cleaner.parallelism"; public static final String CLEANER_PARALLELISM = "hoodie.cleaner.parallelism";
public static final String DEFAULT_CLEANER_PARALLELISM = String.valueOf(200); public static final String DEFAULT_CLEANER_PARALLELISM = String.valueOf(200);
@@ -93,7 +95,8 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
public static final String COMPACTION_STRATEGY_PROP = "hoodie.compaction.strategy"; public static final String COMPACTION_STRATEGY_PROP = "hoodie.compaction.strategy";
// 200GB of target IO per compaction // 200GB of target IO per compaction
public static final String DEFAULT_COMPACTION_STRATEGY = LogFileSizeBasedCompactionStrategy.class.getName(); public static final String DEFAULT_COMPACTION_STRATEGY = LogFileSizeBasedCompactionStrategy.class
.getName();
// used to merge records written to log file // used to merge records written to log file
public static final String DEFAULT_PAYLOAD_CLASS = HoodieAvroPayload.class.getName(); public static final String DEFAULT_PAYLOAD_CLASS = HoodieAvroPayload.class.getName();
@@ -108,6 +111,7 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
} }
public static class Builder { public static class Builder {
private final Properties props = new Properties(); private final Properties props = new Properties();
public Builder fromFile(File propertiesFile) throws IOException { public Builder fromFile(File propertiesFile) throws IOException {
@@ -174,12 +178,14 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
} }
public Builder autoTuneInsertSplits(boolean autoTuneInsertSplits) { public Builder autoTuneInsertSplits(boolean autoTuneInsertSplits) {
props.setProperty(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS, String.valueOf(autoTuneInsertSplits)); props.setProperty(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS,
String.valueOf(autoTuneInsertSplits));
return this; return this;
} }
public Builder approxRecordSize(int recordSizeEstimate) { public Builder approxRecordSize(int recordSizeEstimate) {
props.setProperty(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE, String.valueOf(recordSizeEstimate)); props.setProperty(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE,
String.valueOf(recordSizeEstimate));
return this; return this;
} }
@@ -199,7 +205,8 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
} }
public Builder withTargetIOPerCompactionInMB(long targetIOPerCompactionInMB) { public Builder withTargetIOPerCompactionInMB(long targetIOPerCompactionInMB) {
props.setProperty(TARGET_IO_PER_COMPACTION_IN_MB_PROP, String.valueOf(targetIOPerCompactionInMB)); props.setProperty(TARGET_IO_PER_COMPACTION_IN_MB_PROP,
String.valueOf(targetIOPerCompactionInMB));
return this; return this;
} }
@@ -228,7 +235,8 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS), setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS),
COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS, DEFAULT_COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS); COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS, DEFAULT_COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS);
setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE), setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE),
COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE, DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE); COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE,
DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE);
setDefaultOnCondition(props, !props.containsKey(CLEANER_PARALLELISM), setDefaultOnCondition(props, !props.containsKey(CLEANER_PARALLELISM),
CLEANER_PARALLELISM, DEFAULT_CLEANER_PARALLELISM); CLEANER_PARALLELISM, DEFAULT_CLEANER_PARALLELISM);
setDefaultOnCondition(props, !props.containsKey(COMPACTION_STRATEGY_PROP), setDefaultOnCondition(props, !props.containsKey(COMPACTION_STRATEGY_PROP),
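Putting the builder methods touched in this hunk together, a hedged sketch of constructing a compaction config; build() is assumed to follow the same builder convention as the rest of these config classes, and the numeric values are illustrative.

HoodieCompactionConfig compactionConfig = new HoodieCompactionConfig.Builder()
    .autoTuneInsertSplits(true)                  // size insert splits from prior commit metadata
    .approxRecordSize(1024)                      // fallback per-record size estimate in bytes
    .withTargetIOPerCompactionInMB(200 * 1024L)  // roughly 200GB of IO per compaction run
    .build();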

View File

@@ -16,14 +16,12 @@
package com.uber.hoodie.config; package com.uber.hoodie.config;
import com.google.common.base.Preconditions;
import com.uber.hoodie.index.HoodieIndex; import com.uber.hoodie.index.HoodieIndex;
import javax.annotation.concurrent.Immutable;
import java.io.File; import java.io.File;
import java.io.FileReader; import java.io.FileReader;
import java.io.IOException; import java.io.IOException;
import java.util.Properties; import java.util.Properties;
import javax.annotation.concurrent.Immutable;
/** /**
* Indexing related config * Indexing related config
@@ -64,6 +62,7 @@ public class HoodieIndexConfig extends DefaultHoodieConfig {
} }
public static class Builder { public static class Builder {
private final Properties props = new Properties(); private final Properties props = new Properties();
public Builder fromFile(File propertiesFile) throws IOException { public Builder fromFile(File propertiesFile) throws IOException {

View File

@@ -17,12 +17,11 @@
package com.uber.hoodie.config; package com.uber.hoodie.config;
import com.uber.hoodie.metrics.MetricsReporterType; import com.uber.hoodie.metrics.MetricsReporterType;
import javax.annotation.concurrent.Immutable;
import java.io.File; import java.io.File;
import java.io.FileReader; import java.io.FileReader;
import java.io.IOException; import java.io.IOException;
import java.util.Properties; import java.util.Properties;
import javax.annotation.concurrent.Immutable;
/** /**
* Fetch the configurations used by the Metrics system. * Fetch the configurations used by the Metrics system.
@@ -56,6 +55,7 @@ public class HoodieMetricsConfig extends DefaultHoodieConfig {
} }
public static class Builder { public static class Builder {
private final Properties props = new Properties(); private final Properties props = new Properties();
public Builder fromFile(File propertiesFile) throws IOException { public Builder fromFile(File propertiesFile) throws IOException {

View File

@@ -16,17 +16,18 @@
package com.uber.hoodie.config; package com.uber.hoodie.config;
import javax.annotation.concurrent.Immutable;
import java.io.File; import java.io.File;
import java.io.FileReader; import java.io.FileReader;
import java.io.IOException; import java.io.IOException;
import java.util.Properties; import java.util.Properties;
import javax.annotation.concurrent.Immutable;
/** /**
* Storage related config * Storage related config
*/ */
@Immutable @Immutable
public class HoodieStorageConfig extends DefaultHoodieConfig { public class HoodieStorageConfig extends DefaultHoodieConfig {
public static final String PARQUET_FILE_MAX_BYTES = "hoodie.parquet.max.file.size"; public static final String PARQUET_FILE_MAX_BYTES = "hoodie.parquet.max.file.size";
public static final String DEFAULT_PARQUET_FILE_MAX_BYTES = String.valueOf(120 * 1024 * 1024); public static final String DEFAULT_PARQUET_FILE_MAX_BYTES = String.valueOf(120 * 1024 * 1024);
public static final String PARQUET_BLOCK_SIZE_BYTES = "hoodie.parquet.block.size"; public static final String PARQUET_BLOCK_SIZE_BYTES = "hoodie.parquet.block.size";
@@ -43,6 +44,7 @@ public class HoodieStorageConfig extends DefaultHoodieConfig {
} }
public static class Builder { public static class Builder {
private final Properties props = new Properties(); private final Properties props = new Properties();
public Builder fromFile(File propertiesFile) throws IOException { public Builder fromFile(File propertiesFile) throws IOException {

View File

@@ -24,21 +24,21 @@ import com.uber.hoodie.common.util.ReflectionUtils;
import com.uber.hoodie.index.HoodieIndex; import com.uber.hoodie.index.HoodieIndex;
import com.uber.hoodie.io.compact.strategy.CompactionStrategy; import com.uber.hoodie.io.compact.strategy.CompactionStrategy;
import com.uber.hoodie.metrics.MetricsReporterType; import com.uber.hoodie.metrics.MetricsReporterType;
import org.apache.spark.storage.StorageLevel;
import javax.annotation.concurrent.Immutable;
import java.io.File; import java.io.File;
import java.io.FileReader; import java.io.FileReader;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.util.Map; import java.util.Map;
import java.util.Properties; import java.util.Properties;
import javax.annotation.concurrent.Immutable;
import org.apache.spark.storage.StorageLevel;
/** /**
* Class storing configs for the {@link com.uber.hoodie.HoodieWriteClient} * Class storing configs for the {@link com.uber.hoodie.HoodieWriteClient}
*/ */
@Immutable @Immutable
public class HoodieWriteConfig extends DefaultHoodieConfig { public class HoodieWriteConfig extends DefaultHoodieConfig {
private static final String BASE_PATH_PROP = "hoodie.base.path"; private static final String BASE_PATH_PROP = "hoodie.base.path";
private static final String AVRO_SCHEMA = "hoodie.avro.schema"; private static final String AVRO_SCHEMA = "hoodie.avro.schema";
public static final String TABLE_NAME = "hoodie.table.name"; public static final String TABLE_NAME = "hoodie.table.name";
@@ -141,7 +141,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
} }
public int getParquetSmallFileLimit() { public int getParquetSmallFileLimit() {
return Integer.parseInt(props.getProperty(HoodieCompactionConfig.PARQUET_SMALL_FILE_LIMIT_BYTES)); return Integer
.parseInt(props.getProperty(HoodieCompactionConfig.PARQUET_SMALL_FILE_LIMIT_BYTES));
} }
public int getCopyOnWriteInsertSplitSize() { public int getCopyOnWriteInsertSplitSize() {
@@ -177,11 +178,13 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
} }
public CompactionStrategy getCompactionStrategy() { public CompactionStrategy getCompactionStrategy() {
return ReflectionUtils.loadClass(props.getProperty(HoodieCompactionConfig.COMPACTION_STRATEGY_PROP)); return ReflectionUtils
.loadClass(props.getProperty(HoodieCompactionConfig.COMPACTION_STRATEGY_PROP));
} }
public Long getTargetIOPerCompactionInMB() { public Long getTargetIOPerCompactionInMB() {
return Long.parseLong(props.getProperty(HoodieCompactionConfig.TARGET_IO_PER_COMPACTION_IN_MB_PROP)); return Long
.parseLong(props.getProperty(HoodieCompactionConfig.TARGET_IO_PER_COMPACTION_IN_MB_PROP));
} }
/** /**
@@ -216,7 +219,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
} }
public boolean getBloomIndexPruneByRanges() { public boolean getBloomIndexPruneByRanges() {
return Boolean.parseBoolean(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_PRUNE_BY_RANGES_PROP)); return Boolean
.parseBoolean(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_PRUNE_BY_RANGES_PROP));
} }
public boolean getBloomIndexUseCaching() { public boolean getBloomIndexUseCaching() {
@@ -271,8 +275,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
} }
public static class Builder { public static class Builder {
private final Properties props = new Properties(); private final Properties props = new Properties();
private boolean isIndexConfigSet = false; private boolean isIndexConfigSet = false;
private boolean isStorageConfigSet = false; private boolean isStorageConfigSet = false;
@@ -371,7 +375,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
} }
public Builder withAssumeDatePartitioning(boolean assumeDatePartitioning) { public Builder withAssumeDatePartitioning(boolean assumeDatePartitioning) {
props.setProperty(HOODIE_ASSUME_DATE_PARTITIONING_PROP, String.valueOf(assumeDatePartitioning)); props.setProperty(HOODIE_ASSUME_DATE_PARTITIONING_PROP,
String.valueOf(assumeDatePartitioning));
return this; return this;
} }
@@ -386,7 +391,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
Preconditions.checkArgument(config.getBasePath() != null); Preconditions.checkArgument(config.getBasePath() != null);
setDefaultOnCondition(props, !props.containsKey(INSERT_PARALLELISM), INSERT_PARALLELISM, setDefaultOnCondition(props, !props.containsKey(INSERT_PARALLELISM), INSERT_PARALLELISM,
DEFAULT_PARALLELISM); DEFAULT_PARALLELISM);
setDefaultOnCondition(props, !props.containsKey(BULKINSERT_PARALLELISM), BULKINSERT_PARALLELISM, setDefaultOnCondition(props, !props.containsKey(BULKINSERT_PARALLELISM),
BULKINSERT_PARALLELISM,
DEFAULT_PARALLELISM); DEFAULT_PARALLELISM);
setDefaultOnCondition(props, !props.containsKey(UPSERT_PARALLELISM), UPSERT_PARALLELISM, setDefaultOnCondition(props, !props.containsKey(UPSERT_PARALLELISM), UPSERT_PARALLELISM,
DEFAULT_PARALLELISM); DEFAULT_PARALLELISM);
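
The build() hunk above defaults the parallelism settings only when the caller has not set them. A minimal, self-contained sketch of that conditional-default pattern follows; the helper body, property names, and values here are assumptions for illustration, not the project's actual ones.

import java.util.Properties;

// Hypothetical stand-in for the setDefaultOnCondition(...) helper used above (the real one
// is inherited from DefaultHoodieConfig and is not part of this hunk): write the default
// only when the caller's condition says the key is still unset.
public class DefaultingSketch {

  static void setDefaultOnCondition(Properties props, boolean condition, String key, String value) {
    if (condition) {
      props.setProperty(key, value);
    }
  }

  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("insert.parallelism", "200");  // pretend the user set only this key

    // Same pattern as the build() hunk above: default only what the user left unset.
    setDefaultOnCondition(props, !props.containsKey("insert.parallelism"),
        "insert.parallelism", "1500");
    setDefaultOnCondition(props, !props.containsKey("bulkinsert.parallelism"),
        "bulkinsert.parallelism", "1500");

    System.out.println(props.getProperty("insert.parallelism"));     // 200 (kept)
    System.out.println(props.getProperty("bulkinsert.parallelism")); // 1500 (defaulted)
  }
}
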

View File

@@ -17,11 +17,11 @@
package com.uber.hoodie.exception; package com.uber.hoodie.exception;
/** /**
* <p> * <p> Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a delta
* Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a delta commit * commit </p>
* </p>
*/ */
public class HoodieAppendException extends HoodieException { public class HoodieAppendException extends HoodieException {
public HoodieAppendException(String msg, Throwable e) { public HoodieAppendException(String msg, Throwable e) {
super(msg, e); super(msg, e);
} }

View File

@@ -17,11 +17,11 @@
package com.uber.hoodie.exception; package com.uber.hoodie.exception;
/** /**
* <p> * <p> Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a Commit
* Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a Commit
* </p> * </p>
*/ */
public class HoodieCommitException extends HoodieException { public class HoodieCommitException extends HoodieException {
public HoodieCommitException(String msg) { public HoodieCommitException(String msg) {
super(msg); super(msg);
} }

View File

@@ -17,6 +17,7 @@
package com.uber.hoodie.exception; package com.uber.hoodie.exception;
public class HoodieCompactionException extends HoodieException { public class HoodieCompactionException extends HoodieException {
public HoodieCompactionException(String msg) { public HoodieCompactionException(String msg) {
super(msg); super(msg);
} }

View File

@@ -18,11 +18,10 @@ package com.uber.hoodie.exception;
/** /**
* <p> * <p> Exception thrown when dependent system is not available </p>
* Exception thrown when dependent system is not available
* </p>
*/ */
public class HoodieDependentSystemUnavailableException extends HoodieException { public class HoodieDependentSystemUnavailableException extends HoodieException {
public static final String HBASE = "HBASE"; public static final String HBASE = "HBASE";
public HoodieDependentSystemUnavailableException(String system, String connectURL) { public HoodieDependentSystemUnavailableException(String system, String connectURL) {

View File

@@ -16,14 +16,12 @@
package com.uber.hoodie.exception; package com.uber.hoodie.exception;
import java.io.IOException;
/** /**
* <p> * <p> Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a bulk
* Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a bulk insert * insert </p>
* </p>
*/ */
public class HoodieInsertException extends HoodieException { public class HoodieInsertException extends HoodieException {
public HoodieInsertException(String msg, Throwable e) { public HoodieInsertException(String msg, Throwable e) {
super(msg, e); super(msg, e);
} }

View File

@@ -17,11 +17,11 @@
package com.uber.hoodie.exception; package com.uber.hoodie.exception;
/** /**
* <p> * <p> Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a
* Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a incremental upsert * incremental upsert </p>
* </p>
*/ */
public class HoodieUpsertException extends HoodieException { public class HoodieUpsertException extends HoodieException {
public HoodieUpsertException(String msg, Throwable e) { public HoodieUpsertException(String msg, Throwable e) {
super(msg, e); super(msg, e);
} }

View File

@@ -16,16 +16,14 @@
package com.uber.hoodie.func; package com.uber.hoodie.func;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.WriteStatus; import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordPayload; import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.table.HoodieTable; import com.uber.hoodie.table.HoodieTable;
import org.apache.spark.api.java.function.Function2;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
import org.apache.spark.api.java.function.Function2;
/** /**
@@ -46,7 +44,8 @@ public class BulkInsertMapFunction<T extends HoodieRecordPayload>
} }
@Override @Override
public Iterator<List<WriteStatus>> call(Integer partition, Iterator<HoodieRecord<T>> sortedRecordItr) public Iterator<List<WriteStatus>> call(Integer partition,
Iterator<HoodieRecord<T>> sortedRecordItr)
throws Exception { throws Exception {
return new LazyInsertIterable<>(sortedRecordItr, config, commitTime, hoodieTable); return new LazyInsertIterable<>(sortedRecordItr, config, commitTime, hoodieTable);
} }

View File

@@ -16,27 +16,26 @@
package com.uber.hoodie.func; package com.uber.hoodie.func;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.WriteStatus; import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordPayload; import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.io.HoodieIOHandle;
import com.uber.hoodie.io.HoodieCreateHandle; import com.uber.hoodie.io.HoodieCreateHandle;
import com.uber.hoodie.io.HoodieIOHandle;
import com.uber.hoodie.table.HoodieTable; import com.uber.hoodie.table.HoodieTable;
import org.apache.spark.TaskContext;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashSet; import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.Set; import java.util.Set;
import org.apache.spark.TaskContext;
/** /**
* Lazy Iterable, that writes a stream of HoodieRecords sorted by the partitionPath, * Lazy Iterable, that writes a stream of HoodieRecords sorted by the partitionPath, into new
* into new files. * files.
*/ */
public class LazyInsertIterable<T extends HoodieRecordPayload> extends LazyIterableIterator<HoodieRecord<T>, List<WriteStatus>> { public class LazyInsertIterable<T extends HoodieRecordPayload> extends
LazyIterableIterator<HoodieRecord<T>, List<WriteStatus>> {
private final HoodieWriteConfig hoodieConfig; private final HoodieWriteConfig hoodieConfig;
private final String commitTime; private final String commitTime;
@@ -53,11 +52,13 @@ public class LazyInsertIterable<T extends HoodieRecordPayload> extends LazyItera
this.hoodieTable = hoodieTable; this.hoodieTable = hoodieTable;
} }
@Override protected void start() { @Override
protected void start() {
} }
@Override protected List<WriteStatus> computeNext() { @Override
protected List<WriteStatus> computeNext() {
List<WriteStatus> statuses = new ArrayList<>(); List<WriteStatus> statuses = new ArrayList<>();
while (inputItr.hasNext()) { while (inputItr.hasNext()) {
@@ -108,7 +109,8 @@ public class LazyInsertIterable<T extends HoodieRecordPayload> extends LazyItera
return statuses; return statuses;
} }
@Override protected void end() { @Override
protected void end() {
} }
} }

View File

@@ -31,6 +31,7 @@ import java.util.Iterator;
* responsible for calling inputIterator.next() and doing the processing in computeNext() * responsible for calling inputIterator.next() and doing the processing in computeNext()
*/ */
public abstract class LazyIterableIterator<I, O> implements Iterable<O>, Iterator<O> { public abstract class LazyIterableIterator<I, O> implements Iterable<O>, Iterator<O> {
protected Iterator<I> inputItr = null; protected Iterator<I> inputItr = null;
private boolean consumed = false; private boolean consumed = false;
private boolean startCalled = false; private boolean startCalled = false;
@@ -56,7 +57,6 @@ public abstract class LazyIterableIterator<I, O> implements Iterable<O>, Iterato
*/ */
protected abstract void end(); protected abstract void end();
////////////////// //////////////////
// iterable implementation // iterable implementation
@@ -87,8 +87,9 @@ public abstract class LazyIterableIterator<I, O> implements Iterable<O>, Iterato
@Override @Override
public Iterator<O> iterator() { public Iterator<O> iterator() {
//check for consumed inputItr //check for consumed inputItr
if (consumed) if (consumed) {
throw new RuntimeException("Invalid repeated inputItr consumption."); throw new RuntimeException("Invalid repeated inputItr consumption.");
}
//hand out self as inputItr exactly once (note: do not hand out the input //hand out self as inputItr exactly once (note: do not hand out the input
//inputItr since it is consumed by the self inputItr implementation) //inputItr since it is consumed by the self inputItr implementation)
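
For orientation on the hooks being reformatted above, a minimal subclass sketch. The hook names, the protected inputItr field, and the single-consumption guard are the ones visible in this diff; the subclass itself and the assumption noted in the comment are mine.

import com.uber.hoodie.func.LazyIterableIterator;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;

// Minimal subclass sketch: start()/end() are no-op lifecycle hooks, computeNext() pulls
// from the protected inputItr, and iterator() may be called exactly once (a second call
// hits the "Invalid repeated inputItr consumption" guard shown above).
class SquaringIterator extends LazyIterableIterator<Integer, List<Integer>> {

  SquaringIterator(Iterator<Integer> in) {
    super(in);
  }

  @Override
  protected void start() {}

  @Override
  protected List<Integer> computeNext() {
    // Assumes the base class only asks for more output while input remains, the same way
    // LazyInsertIterable above loops on inputItr.hasNext().
    if (!inputItr.hasNext()) {
      return Collections.emptyList();
    }
    int i = inputItr.next();
    return Collections.singletonList(i * i);
  }

  @Override
  protected void end() {}
}
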

View File

@@ -17,29 +17,26 @@
package com.uber.hoodie.index; package com.uber.hoodie.index;
import com.google.common.base.Optional; import com.google.common.base.Optional;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.WriteStatus; import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieIndexException; import com.uber.hoodie.exception.HoodieIndexException;
import com.uber.hoodie.index.bloom.HoodieBloomIndex; import com.uber.hoodie.index.bloom.HoodieBloomIndex;
import com.uber.hoodie.index.bucketed.BucketedIndex; import com.uber.hoodie.index.bucketed.BucketedIndex;
import com.uber.hoodie.index.hbase.HBaseIndex; import com.uber.hoodie.index.hbase.HBaseIndex;
import com.uber.hoodie.table.HoodieTable; import com.uber.hoodie.table.HoodieTable;
import java.io.Serializable;
import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import java.io.Serializable;
/** /**
* Base class for different types of indexes to determine the mapping from uuid * Base class for different types of indexes to determine the mapping from uuid
*
*/ */
public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Serializable { public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Serializable {
protected transient JavaSparkContext jsc = null; protected transient JavaSparkContext jsc = null;
public enum IndexType { public enum IndexType {
@@ -58,12 +55,9 @@ public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Seri
/** /**
* Checks if the given [Keys] exists in the hoodie table and returns [Key, Optional[FullFilePath]] * Checks if the given [Keys] exists in the hoodie table and returns [Key, Optional[FullFilePath]]
* If the optional FullFilePath value is not present, then the key is not found. If the FullFilePath * If the optional FullFilePath value is not present, then the key is not found. If the
* value is present, it is the path component (without scheme) of the URI underlying file * FullFilePath value is present, it is the path component (without scheme) of the URI underlying
* * file
* @param hoodieKeys
* @param table
* @return
*/ */
public abstract JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation( public abstract JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(
JavaRDD<HoodieKey> hoodieKeys, final HoodieTable<T> table); JavaRDD<HoodieKey> hoodieKeys, final HoodieTable<T> table);
@@ -89,17 +83,17 @@ public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Seri
public abstract boolean rollbackCommit(String commitTime); public abstract boolean rollbackCommit(String commitTime);
/** /**
* An index is `global` if {@link HoodieKey} to fileID mapping, does not depend on the `partitionPath`. * An index is `global` if {@link HoodieKey} to fileID mapping, does not depend on the
* Such an implementation is able to obtain the same mapping, for two hoodie keys with same `recordKey` * `partitionPath`. Such an implementation is able to obtain the same mapping, for two hoodie keys
* but different `partitionPath` * with same `recordKey` but different `partitionPath`
* *
* @return whether or not, the index implementation is global in nature * @return whether or not, the index implementation is global in nature
*/ */
public abstract boolean isGlobal(); public abstract boolean isGlobal();
/** /**
* This is used by storage to determine, if its safe to send inserts, straight to the log, * This is used by storage to determine, if its safe to send inserts, straight to the log, i.e
* i.e having a {@link com.uber.hoodie.common.model.FileSlice}, with no data file. * having a {@link com.uber.hoodie.common.model.FileSlice}, with no data file.
* *
* @return Returns true/false depending on whether the impl has this capability * @return Returns true/false depending on whether the impl has this capability
*/ */
@@ -107,12 +101,8 @@ public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Seri
/** /**
*
* An index is "implicit" with respect to storage, if just writing new data to a file slice, * An index is "implicit" with respect to storage, if just writing new data to a file slice,
* updates the index as well. This is used by storage, to save memory footprint in * updates the index as well. This is used by storage, to save memory footprint in certain cases.
* certain cases.
*
* @return
*/ */
public abstract boolean isImplicitWithStorage(); public abstract boolean isImplicitWithStorage();
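
A tiny illustration of the `global` contract described above; the key values are invented.

import com.uber.hoodie.common.model.HoodieKey;

// Two keys sharing a recordKey under different partition paths. A global index must map
// both to the same file group; a non-global index such as HoodieBloomIndex below
// (isGlobal() == false) may resolve them independently, since it prunes candidate files
// by partitionPath before checking record keys.
public class GlobalContractSketch {
  public static void main(String[] args) {
    HoodieKey first = new HoodieKey("uuid-1234", "2017/11/11");
    HoodieKey second = new HoodieKey("uuid-1234", "2017/11/12");
    System.out.println(first.getRecordKey().equals(second.getRecordKey())); // true
  }
}
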

View File

@@ -17,32 +17,27 @@
package com.uber.hoodie.index; package com.uber.hoodie.index;
import com.google.common.base.Optional; import com.google.common.base.Optional;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.WriteStatus; import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordLocation; import com.uber.hoodie.common.model.HoodieRecordLocation;
import com.uber.hoodie.common.model.HoodieRecordPayload; import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.table.HoodieTable; import com.uber.hoodie.table.HoodieTable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2; import org.apache.spark.api.java.function.Function2;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
/** /**
* Hoodie Index implementation backed by an in-memory Hash map. * Hoodie Index implementation backed by an in-memory Hash map. <p> ONLY USE FOR LOCAL TESTING
* <p>
* ONLY USE FOR LOCAL TESTING
*/ */
public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> { public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
@@ -64,6 +59,7 @@ public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieInde
*/ */
class LocationTagFunction class LocationTagFunction
implements Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<HoodieRecord<T>>> { implements Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<HoodieRecord<T>>> {
@Override @Override
public Iterator<HoodieRecord<T>> call(Integer partitionNum, public Iterator<HoodieRecord<T>> call(Integer partitionNum,
Iterator<HoodieRecord<T>> hoodieRecordIterator) { Iterator<HoodieRecord<T>> hoodieRecordIterator) {
@@ -115,8 +111,6 @@ public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieInde
/** /**
* Only looks up by recordKey * Only looks up by recordKey
*
* @return
*/ */
@Override @Override
public boolean isGlobal() { public boolean isGlobal() {
@@ -125,8 +119,6 @@ public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieInde
/** /**
* Mapping is available in HBase already. * Mapping is available in HBase already.
*
* @return
*/ */
@Override @Override
public boolean canIndexLogFiles() { public boolean canIndexLogFiles() {
@@ -135,8 +127,6 @@ public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieInde
/** /**
* Index needs to be explicitly updated after storage write. * Index needs to be explicitly updated after storage write.
*
* @return
*/ */
@Override @Override
public boolean isImplicitWithStorage() { public boolean isImplicitWithStorage() {
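
The class above is described only as "backed by an in-memory Hash map, ONLY USE FOR LOCAL TESTING"; its field layout is not visible in these hunks. A rough sketch of the idea, with invented names:

import java.util.Optional;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;

// Sketch only: a process-local, thread-safe map from record key to file id. Being JVM-local
// is exactly why the index above is suitable only for local testing.
public class InMemoryIndexSketch {
  private static final ConcurrentMap<String, String> KEY_TO_FILE_ID = new ConcurrentHashMap<>();

  public static void main(String[] args) {
    KEY_TO_FILE_ID.put("uuid-1234", "file-group-7");                             // like updateLocation(...)
    Optional<String> hit = Optional.ofNullable(KEY_TO_FILE_ID.get("uuid-1234")); // like tagLocation(...)
    System.out.println(hit.orElse("not indexed"));
  }
}
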

View File

@@ -19,7 +19,6 @@
package com.uber.hoodie.index.bloom; package com.uber.hoodie.index.bloom;
import com.google.common.base.Objects; import com.google.common.base.Objects;
import java.io.Serializable; import java.io.Serializable;
/** /**
@@ -63,8 +62,6 @@ public class BloomIndexFileInfo implements Serializable {
/** /**
* Does the given key fall within the range (inclusive) * Does the given key fall within the range (inclusive)
* @param recordKey
* @return
*/ */
public boolean isKeyInRange(String recordKey) { public boolean isKeyInRange(String recordKey) {
return minRecordKey.compareTo(recordKey) <= 0 && return minRecordKey.compareTo(recordKey) <= 0 &&
@@ -73,8 +70,12 @@ public class BloomIndexFileInfo implements Serializable {
@Override @Override
public boolean equals(Object o) { public boolean equals(Object o) {
if (this == o) return true; if (this == o) {
if (o == null || getClass() != o.getClass()) return false; return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
BloomIndexFileInfo that = (BloomIndexFileInfo) o; BloomIndexFileInfo that = (BloomIndexFileInfo) o;
return Objects.equal(that.fileName, fileName) && return Objects.equal(that.fileName, fileName) &&
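
Worked example of the inclusive range check above. The three-argument constructor and isKeyInRange(...) are the members visible in this commit; the file name and keys are made up, and the constructor is assumed accessible as it is when called from HoodieBloomIndex below.

import com.uber.hoodie.index.bloom.BloomIndexFileInfo;

// The comparison is plain lexicographic String ordering, inclusive on both bounds.
public class KeyRangeSketch {
  public static void main(String[] args) {
    BloomIndexFileInfo info = new BloomIndexFileInfo("f1.parquet", "key_0100", "key_0500");
    System.out.println(info.isKeyInRange("key_0100")); // true  - min bound is inclusive
    System.out.println(info.isKeyInRange("key_0500")); // true  - max bound is inclusive
    System.out.println(info.isKeyInRange("key_0501")); // false - lexicographically past max
  }
}
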

View File

@@ -18,9 +18,12 @@
package com.uber.hoodie.index.bloom; package com.uber.hoodie.index.bloom;
import static java.util.stream.Collectors.groupingBy;
import static java.util.stream.Collectors.mapping;
import static java.util.stream.Collectors.toList;
import com.google.common.annotations.VisibleForTesting; import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Optional; import com.google.common.base.Optional;
import com.uber.hoodie.WriteStatus; import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.model.HoodieDataFile; import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodieKey;
@@ -34,7 +37,10 @@ import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.MetadataNotFoundException; import com.uber.hoodie.exception.MetadataNotFoundException;
import com.uber.hoodie.index.HoodieIndex; import com.uber.hoodie.index.HoodieIndex;
import com.uber.hoodie.table.HoodieTable; import com.uber.hoodie.table.HoodieTable;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager; import org.apache.log4j.LogManager;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
@@ -42,16 +48,8 @@ import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.storage.StorageLevel; import org.apache.spark.storage.StorageLevel;
import scala.Tuple2; import scala.Tuple2;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import static java.util.stream.Collectors.*;
/** /**
* Indexing mechanism based on bloom filter. Each parquet file includes its row_key bloom filter in * Indexing mechanism based on bloom filter. Each parquet file includes its row_key bloom filter in
* its metadata. * its metadata.
@@ -64,14 +62,16 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
private static final int SPARK_MAXIMUM_BYTES_PER_PARTITION = 1500 * 1024 * 1024; private static final int SPARK_MAXIMUM_BYTES_PER_PARTITION = 1500 * 1024 * 1024;
// this is how much a triplet of (partitionPath, fileId, recordKey) costs. // this is how much a triplet of (partitionPath, fileId, recordKey) costs.
private static final int BYTES_PER_PARTITION_FILE_KEY_TRIPLET = 300; private static final int BYTES_PER_PARTITION_FILE_KEY_TRIPLET = 300;
private static int MAX_ITEMS_PER_SHUFFLE_PARTITION = SPARK_MAXIMUM_BYTES_PER_PARTITION / BYTES_PER_PARTITION_FILE_KEY_TRIPLET; private static int MAX_ITEMS_PER_SHUFFLE_PARTITION =
SPARK_MAXIMUM_BYTES_PER_PARTITION / BYTES_PER_PARTITION_FILE_KEY_TRIPLET;
public HoodieBloomIndex(HoodieWriteConfig config, JavaSparkContext jsc) { public HoodieBloomIndex(HoodieWriteConfig config, JavaSparkContext jsc) {
super(config, jsc); super(config, jsc);
} }
@Override @Override
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, final HoodieTable<T> hoodieTable) { public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD,
final HoodieTable<T> hoodieTable) {
// Step 0: cache the input record RDD // Step 0: cache the input record RDD
if (config.getBloomIndexUseCaching()) { if (config.getBloomIndexUseCaching()) {
@@ -83,7 +83,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
.mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey())); .mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey()));
// Lookup indexes for all the partition/recordkey pair // Lookup indexes for all the partition/recordkey pair
JavaPairRDD<String, String> rowKeyFilenamePairRDD = lookupIndex(partitionRecordKeyPairRDD, hoodieTable); JavaPairRDD<String, String> rowKeyFilenamePairRDD = lookupIndex(partitionRecordKeyPairRDD,
hoodieTable);
// Cache the result, for subsequent stages. // Cache the result, for subsequent stages.
if (config.getBloomIndexUseCaching()) { if (config.getBloomIndexUseCaching()) {
@@ -96,7 +97,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
// Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys // Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys
// Cost: 4 sec. // Cost: 4 sec.
JavaRDD<HoodieRecord<T>> taggedRecordRDD = tagLocationBacktoRecords(rowKeyFilenamePairRDD, recordRDD); JavaRDD<HoodieRecord<T>> taggedRecordRDD = tagLocationBacktoRecords(rowKeyFilenamePairRDD,
recordRDD);
if (config.getBloomIndexUseCaching()) { if (config.getBloomIndexUseCaching()) {
recordRDD.unpersist(); // unpersist the input Record RDD recordRDD.unpersist(); // unpersist the input Record RDD
@@ -135,8 +137,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
} }
/** /**
* Lookup the location for each record key and return the pair<record_key,location> for all * Lookup the location for each record key and return the pair<record_key,location> for all record
* record keys already present and drop the record keys if not present * keys already present and drop the record keys if not present
*/ */
private JavaPairRDD<String, String> lookupIndex( private JavaPairRDD<String, String> lookupIndex(
JavaPairRDD<String, String> partitionRecordKeyPairRDD, final HoodieTable<T> hoodieTable) { JavaPairRDD<String, String> partitionRecordKeyPairRDD, final HoodieTable<T> hoodieTable) {
@@ -145,25 +147,27 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
List<String> affectedPartitionPathList = new ArrayList<>(recordsPerPartition.keySet()); List<String> affectedPartitionPathList = new ArrayList<>(recordsPerPartition.keySet());
// Step 2: Load all involved files as <Partition, filename> pairs // Step 2: Load all involved files as <Partition, filename> pairs
List<Tuple2<String, BloomIndexFileInfo>> fileInfoList = loadInvolvedFiles(affectedPartitionPathList, hoodieTable); List<Tuple2<String, BloomIndexFileInfo>> fileInfoList = loadInvolvedFiles(
affectedPartitionPathList, hoodieTable);
final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo = fileInfoList.stream() final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo = fileInfoList.stream()
.collect(groupingBy(Tuple2::_1, mapping(Tuple2::_2, toList()))); .collect(groupingBy(Tuple2::_1, mapping(Tuple2::_2, toList())));
// Step 3: Obtain a RDD, for each incoming record, that already exists, with the file id, that contains it. // Step 3: Obtain a RDD, for each incoming record, that already exists, with the file id, that contains it.
int parallelism = autoComputeParallelism(recordsPerPartition, partitionToFileInfo, partitionRecordKeyPairRDD); int parallelism = autoComputeParallelism(recordsPerPartition, partitionToFileInfo,
return findMatchingFilesForRecordKeys(partitionToFileInfo, partitionRecordKeyPairRDD, parallelism); partitionRecordKeyPairRDD);
return findMatchingFilesForRecordKeys(partitionToFileInfo, partitionRecordKeyPairRDD,
parallelism);
} }
/** /**
* The index lookup can be skewed in three dimensions : #files, #partitions, #records * The index lookup can be skewed in three dimensions : #files, #partitions, #records
* *
* To be able to smoothly handle skews, we need to compute how to split each partitions into * To be able to smoothly handle skews, we need to compute how to split each partitions into
* subpartitions. We do it here, in a way that keeps the amount of each Spark join partition to * subpartitions. We do it here, in a way that keeps the amount of each Spark join partition to <
* < 2GB. * 2GB.
*
* If {@link com.uber.hoodie.config.HoodieIndexConfig#BLOOM_INDEX_PARALLELISM_PROP} is specified as a NON-zero number,
* then that is used explicitly.
* *
* If {@link com.uber.hoodie.config.HoodieIndexConfig#BLOOM_INDEX_PARALLELISM_PROP} is specified
* as a NON-zero number, then that is used explicitly.
*/ */
private int autoComputeParallelism(final Map<String, Long> recordsPerPartition, private int autoComputeParallelism(final Map<String, Long> recordsPerPartition,
final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo, final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo,
@@ -172,7 +176,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
long totalComparisons = 0; long totalComparisons = 0;
if (config.getBloomIndexPruneByRanges()) { if (config.getBloomIndexPruneByRanges()) {
// we will just try exploding the input and then count to determine comparisons // we will just try exploding the input and then count to determine comparisons
totalComparisons = explodeRecordRDDWithFileComparisons(partitionToFileInfo, partitionRecordKeyPairRDD).count(); totalComparisons = explodeRecordRDDWithFileComparisons(partitionToFileInfo,
partitionRecordKeyPairRDD).count();
} else { } else {
// if not pruning by ranges, then each file in a partition needs to compared against all // if not pruning by ranges, then each file in a partition needs to compared against all
// records for a partition. // records for a partition.
@@ -181,30 +186,36 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
long totalFiles = 0, totalRecords = 0; long totalFiles = 0, totalRecords = 0;
for (String partitionPath : recordsPerPartition.keySet()) { for (String partitionPath : recordsPerPartition.keySet()) {
long numRecords = recordsPerPartition.get(partitionPath); long numRecords = recordsPerPartition.get(partitionPath);
long numFiles = filesPerPartition.containsKey(partitionPath) ? filesPerPartition.get(partitionPath) : 1L; long numFiles =
filesPerPartition.containsKey(partitionPath) ? filesPerPartition.get(partitionPath)
: 1L;
totalComparisons += numFiles * numRecords; totalComparisons += numFiles * numRecords;
totalFiles += filesPerPartition.containsKey(partitionPath) ? filesPerPartition.get(partitionPath) : 0L; totalFiles +=
filesPerPartition.containsKey(partitionPath) ? filesPerPartition.get(partitionPath)
: 0L;
totalRecords += numRecords; totalRecords += numRecords;
} }
logger.info("TotalRecords: " + totalRecords + ", TotalFiles: " + totalFiles + ", TotalAffectedPartitions:" + recordsPerPartition.size()); logger.info("TotalRecords: " + totalRecords + ", TotalFiles: " + totalFiles
+ ", TotalAffectedPartitions:" + recordsPerPartition.size());
} }
// each partition will have an item per comparison. // each partition will have an item per comparison.
int parallelism = (int) (totalComparisons/ MAX_ITEMS_PER_SHUFFLE_PARTITION + 1); int parallelism = (int) (totalComparisons / MAX_ITEMS_PER_SHUFFLE_PARTITION + 1);
logger.info("Auto computed parallelism :" + parallelism + ", totalComparisons: " + totalComparisons); logger.info(
"Auto computed parallelism :" + parallelism + ", totalComparisons: " + totalComparisons);
return parallelism; return parallelism;
} }
/** /**
* Its crucial to pick the right parallelism. * Its crucial to pick the right parallelism.
* *
* totalSubPartitions : this is deemed safe limit, to be nice with Spark. * totalSubPartitions : this is deemed safe limit, to be nice with Spark. inputParallelism :
* inputParallelism : typically number of input file splits * typically number of input file splits
* *
* We pick the max such that, we are always safe, but go higher if say a there are a lot of * We pick the max such that, we are always safe, but go higher if say a there are a lot of input
* input files. (otherwise, we will fallback to number of partitions in input and end up with * files. (otherwise, we will fallback to number of partitions in input and end up with slow
* slow performance) * performance)
*/ */
private int determineParallelism(int inputParallelism, int totalSubPartitions) { private int determineParallelism(int inputParallelism, int totalSubPartitions) {
// If bloom index parallelism is set, use it to to check against the input parallelism and take the max // If bloom index parallelism is set, use it to to check against the input parallelism and take the max
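
A worked instance of the sizing math in the hunks above, using the two constants defined earlier in this file; the comparison counts are invented.

// 1500 * 1024 * 1024 bytes per Spark shuffle partition, 300 bytes per
// (partitionPath, fileId, recordKey) triplet - both taken from this file.
public class ParallelismMath {
  public static void main(String[] args) {
    long maxItemsPerShufflePartition = (1500L * 1024 * 1024) / 300;                 // 5,242,880 triplets

    // Suppose range pruning leaves 40 candidate files for 2,000,000 incoming keys.
    long totalComparisons = 40L * 2_000_000;                                        // 80,000,000

    int autoComputed = (int) (totalComparisons / maxItemsPerShufflePartition + 1);  // 16

    // determineParallelism(...) then takes the max with the input parallelism so small
    // inputs stay safe; per the javadoc above, a non-zero configured bloom index
    // parallelism is used explicitly instead.
    int inputParallelism = 8;
    System.out.println(Math.max(inputParallelism, autoComputed));                   // 16
  }
}
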
@@ -221,9 +232,11 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
* Load all involved files as <Partition, filename> pair RDD. * Load all involved files as <Partition, filename> pair RDD.
*/ */
@VisibleForTesting @VisibleForTesting
List<Tuple2<String, BloomIndexFileInfo>> loadInvolvedFiles(List<String> partitions, final HoodieTable<T> hoodieTable) { List<Tuple2<String, BloomIndexFileInfo>> loadInvolvedFiles(List<String> partitions,
final HoodieTable<T> hoodieTable) {
// Obtain the latest data files from all the partitions. // Obtain the latest data files from all the partitions.
List<Tuple2<String, HoodieDataFile>> dataFilesList = jsc.parallelize(partitions, Math.max(partitions.size(), 1)) List<Tuple2<String, HoodieDataFile>> dataFilesList = jsc
.parallelize(partitions, Math.max(partitions.size(), 1))
.flatMapToPair(partitionPath -> { .flatMapToPair(partitionPath -> {
java.util.Optional<HoodieInstant> latestCommitTime = java.util.Optional<HoodieInstant> latestCommitTime =
hoodieTable.getCommitTimeline().filterCompletedInstants().lastInstant(); hoodieTable.getCommitTimeline().filterCompletedInstants().lastInstant();
@@ -243,8 +256,10 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
return jsc.parallelize(dataFilesList, Math.max(dataFilesList.size(), 1)) return jsc.parallelize(dataFilesList, Math.max(dataFilesList.size(), 1))
.mapToPair(ft -> { .mapToPair(ft -> {
try { try {
String[] minMaxKeys = ParquetUtils.readMinMaxRecordKeys(ft._2().getFileStatus().getPath()); String[] minMaxKeys = ParquetUtils
return new Tuple2<>(ft._1(), new BloomIndexFileInfo(ft._2().getFileName(), minMaxKeys[0], minMaxKeys[1])); .readMinMaxRecordKeys(ft._2().getFileStatus().getPath());
return new Tuple2<>(ft._1(),
new BloomIndexFileInfo(ft._2().getFileName(), minMaxKeys[0], minMaxKeys[1]));
} catch (MetadataNotFoundException me) { } catch (MetadataNotFoundException me) {
logger.warn("Unable to find range metadata in file :" + ft._2()); logger.warn("Unable to find range metadata in file :" + ft._2());
return new Tuple2<>(ft._1(), new BloomIndexFileInfo(ft._2().getFileName())); return new Tuple2<>(ft._1(), new BloomIndexFileInfo(ft._2().getFileName()));
@@ -266,8 +281,6 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
/** /**
* This is not global, since we depend on the partitionPath to do the lookup * This is not global, since we depend on the partitionPath to do the lookup
*
* @return
*/ */
@Override @Override
public boolean isGlobal() { public boolean isGlobal() {
@@ -276,8 +289,6 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
/** /**
* No indexes into log files yet. * No indexes into log files yet.
*
* @return
*/ */
@Override @Override
public boolean canIndexLogFiles() { public boolean canIndexLogFiles() {
@@ -286,8 +297,6 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
/** /**
* Bloom filters are stored, into the same data files. * Bloom filters are stored, into the same data files.
*
* @return
*/ */
@Override @Override
public boolean isImplicitWithStorage() { public boolean isImplicitWithStorage() {
@@ -295,12 +304,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
} }
/** /**
* if we dont have key ranges, then also we need to compare against the file. no other choice * if we dont have key ranges, then also we need to compare against the file. no other choice if
* if we do, then only compare the file if the record key falls in range. * we do, then only compare the file if the record key falls in range.
* @param indexInfo
* @param recordKey
* @return
*/ */
private boolean shouldCompareWithFile(BloomIndexFileInfo indexInfo, String recordKey) { private boolean shouldCompareWithFile(BloomIndexFileInfo indexInfo, String recordKey) {
return !indexInfo.hasKeyRanges() || indexInfo.isKeyInRange(recordKey); return !indexInfo.hasKeyRanges() || indexInfo.isKeyInRange(recordKey);
@@ -308,19 +313,16 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
/** /**
* For each incoming record, produce N output records, 1 each for each file against which the record's key * For each incoming record, produce N output records, 1 each for each file against which the
* needs to be checked. For datasets, where the keys have a definite insert order (e.g: timestamp as prefix), * record's key needs to be checked. For datasets, where the keys have a definite insert order
* the number of files to be compared gets cut down a lot from range pruning. * (e.g: timestamp as prefix), the number of files to be compared gets cut down a lot from range
* * pruning.
*
* @param partitionToFileIndexInfo
* @param partitionRecordKeyPairRDD
* @return
*/ */
// sub-partition to ensure the records can be looked up against files & also prune file<=>record comparisons based on recordKey // sub-partition to ensure the records can be looked up against files & also prune file<=>record comparisons based on recordKey
// ranges in the index info. // ranges in the index info.
@VisibleForTesting @VisibleForTesting
JavaPairRDD<String, Tuple2<String, HoodieKey>> explodeRecordRDDWithFileComparisons(final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo, JavaPairRDD<String, Tuple2<String, HoodieKey>> explodeRecordRDDWithFileComparisons(
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
JavaPairRDD<String, String> partitionRecordKeyPairRDD) { JavaPairRDD<String, String> partitionRecordKeyPairRDD) {
return partitionRecordKeyPairRDD return partitionRecordKeyPairRDD
.map(partitionRecordKeyPair -> { .map(partitionRecordKeyPair -> {
@@ -329,13 +331,15 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
List<BloomIndexFileInfo> indexInfos = partitionToFileIndexInfo.get(partitionPath); List<BloomIndexFileInfo> indexInfos = partitionToFileIndexInfo.get(partitionPath);
List<Tuple2<String, Tuple2<String, HoodieKey>>> recordComparisons = new ArrayList<>(); List<Tuple2<String, Tuple2<String, HoodieKey>>> recordComparisons = new ArrayList<>();
if (indexInfos != null) { // could be null, if there are no files in a given partition yet. if (indexInfos
!= null) { // could be null, if there are no files in a given partition yet.
// for each candidate file in partition, that needs to be compared. // for each candidate file in partition, that needs to be compared.
for (BloomIndexFileInfo indexInfo : indexInfos) { for (BloomIndexFileInfo indexInfo : indexInfos) {
if (shouldCompareWithFile(indexInfo, recordKey)) { if (shouldCompareWithFile(indexInfo, recordKey)) {
recordComparisons.add( recordComparisons.add(
new Tuple2<>(String.format("%s#%s", indexInfo.getFileName(), recordKey), new Tuple2<>(String.format("%s#%s", indexInfo.getFileName(), recordKey),
new Tuple2<>(indexInfo.getFileName(), new HoodieKey(recordKey, partitionPath)))); new Tuple2<>(indexInfo.getFileName(),
new HoodieKey(recordKey, partitionPath))));
} }
} }
} }
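
A small, concrete instance of the explode step described above; the partition, key, and file names are invented.

import java.util.Arrays;
import java.util.List;

// One incoming record key that falls inside the key range of two candidate files in its
// partition produces two entries keyed "<fileName>#<recordKey>", so the later sortByKey
// lands all checks for a given file in the same place.
public class ExplodeSketch {
  public static void main(String[] args) {
    String partitionPath = "2017/11/12";
    String recordKey = "uuid-1234";
    List<String> candidateFiles = Arrays.asList("f1.parquet", "f2.parquet");

    for (String fileName : candidateFiles) {
      String sortKey = String.format("%s#%s", fileName, recordKey);
      System.out.println(sortKey + " -> (" + fileName + ", " + recordKey + " @ " + partitionPath + ")");
    }
  }
}
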
@@ -347,22 +351,23 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
/** /**
* Find out <RowKey, filename> pair. All workload grouped by file-level. * Find out <RowKey, filename> pair. All workload grouped by file-level.
* *
* Join PairRDD(PartitionPath, RecordKey) and PairRDD(PartitionPath, File) & then repartition * Join PairRDD(PartitionPath, RecordKey) and PairRDD(PartitionPath, File) & then repartition such
* such that each RDD partition is a file, then for each file, we do * that each RDD partition is a file, then for each file, we do (1) load bloom filter, (2) load
* (1) load bloom filter, * rowKeys, (3) Tag rowKey
* (2) load rowKeys,
* (3) Tag rowKey
* *
* Make sure the parallelism is atleast the groupby parallelism for tagging location * Make sure the parallelism is atleast the groupby parallelism for tagging location
*/ */
@VisibleForTesting @VisibleForTesting
JavaPairRDD<String, String> findMatchingFilesForRecordKeys(final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo, JavaPairRDD<String, String> findMatchingFilesForRecordKeys(
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
JavaPairRDD<String, String> partitionRecordKeyPairRDD, JavaPairRDD<String, String> partitionRecordKeyPairRDD,
int totalSubpartitions) { int totalSubpartitions) {
int joinParallelism = determineParallelism(partitionRecordKeyPairRDD.partitions().size(), totalSubpartitions); int joinParallelism = determineParallelism(partitionRecordKeyPairRDD.partitions().size(),
totalSubpartitions);
JavaPairRDD<String, Tuple2<String, HoodieKey>> fileSortedTripletRDD = explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD) JavaPairRDD<String, Tuple2<String, HoodieKey>> fileSortedTripletRDD = explodeRecordRDDWithFileComparisons(
partitionToFileIndexInfo, partitionRecordKeyPairRDD)
// sort further based on filename, such that all checking for the file can happen within a single partition, on-the-fly // sort further based on filename, such that all checking for the file can happen within a single partition, on-the-fly
.sortByKey(true, joinParallelism); .sortByKey(true, joinParallelism);
@@ -382,7 +387,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
/** /**
* Tag the <rowKey, filename> back to the original HoodieRecord RDD. * Tag the <rowKey, filename> back to the original HoodieRecord RDD.
*/ */
private JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(JavaPairRDD<String, String> rowKeyFilenamePairRDD, private JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(
JavaPairRDD<String, String> rowKeyFilenamePairRDD,
JavaRDD<HoodieRecord<T>> recordRDD) { JavaRDD<HoodieRecord<T>> recordRDD) {
JavaPairRDD<String, HoodieRecord<T>> rowKeyRecordPairRDD = recordRDD JavaPairRDD<String, HoodieRecord<T>> rowKeyRecordPairRDD = recordRDD
.mapToPair(record -> new Tuple2<>(record.getRecordKey(), record)); .mapToPair(record -> new Tuple2<>(record.getRecordKey(), record));
@@ -404,7 +410,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
} }
@Override @Override
public JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD, HoodieTable<T> hoodieTable) { public JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD,
HoodieTable<T> hoodieTable) {
return writeStatusRDD; return writeStatusRDD;
} }
} }

View File

@@ -24,24 +24,22 @@ import com.uber.hoodie.common.util.ParquetUtils;
import com.uber.hoodie.exception.HoodieException; import com.uber.hoodie.exception.HoodieException;
import com.uber.hoodie.exception.HoodieIndexException; import com.uber.hoodie.exception.HoodieIndexException;
import com.uber.hoodie.func.LazyIterableIterator; import com.uber.hoodie.func.LazyIterableIterator;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.function.Function2;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.Set; import java.util.Set;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.function.Function2;
import scala.Tuple2; import scala.Tuple2;
/** /**
* Function performing actual checking of RDD parition containing (fileId, hoodieKeys) against the * Function performing actual checking of RDD parition containing (fileId, hoodieKeys) against the
* actual files * actual files
*/ */
public class HoodieBloomIndexCheckFunction implements Function2<Integer, Iterator<Tuple2<String, Tuple2<String, HoodieKey>>>, Iterator<List<IndexLookupResult>>> { public class HoodieBloomIndexCheckFunction implements
Function2<Integer, Iterator<Tuple2<String, Tuple2<String, HoodieKey>>>, Iterator<List<IndexLookupResult>>> {
private static Logger logger = LogManager.getLogger(HoodieBloomIndexCheckFunction.class); private static Logger logger = LogManager.getLogger(HoodieBloomIndexCheckFunction.class);
@@ -54,7 +52,8 @@ public class HoodieBloomIndexCheckFunction implements Function2<Integer, Iterato
/** /**
* Given a list of row keys and one file, return only row keys existing in that file. * Given a list of row keys and one file, return only row keys existing in that file.
*/ */
public static List<String> checkCandidatesAgainstFile(List<String> candidateRecordKeys, Path filePath) throws HoodieIndexException { public static List<String> checkCandidatesAgainstFile(List<String> candidateRecordKeys,
Path filePath) throws HoodieIndexException {
List<String> foundRecordKeys = new ArrayList<>(); List<String> foundRecordKeys = new ArrayList<>();
try { try {
// Load all rowKeys from the file, to double-confirm // Load all rowKeys from the file, to double-confirm
@@ -69,18 +68,20 @@ public class HoodieBloomIndexCheckFunction implements Function2<Integer, Iterato
foundRecordKeys.add(rowKey); foundRecordKeys.add(rowKey);
} }
} }
logger.info("After checking with row keys, we have " + foundRecordKeys.size() + " results, for file " + filePath + " => " + foundRecordKeys); logger.info("After checking with row keys, we have " + foundRecordKeys.size()
+ " results, for file " + filePath + " => " + foundRecordKeys);
if (logger.isDebugEnabled()) { if (logger.isDebugEnabled()) {
logger.debug("Keys matching for file " + filePath + " => " + foundRecordKeys); logger.debug("Keys matching for file " + filePath + " => " + foundRecordKeys);
} }
} }
} catch (Exception e){ } catch (Exception e) {
throw new HoodieIndexException("Error checking candidate keys against file.", e); throw new HoodieIndexException("Error checking candidate keys against file.", e);
} }
return foundRecordKeys; return foundRecordKeys;
} }
class LazyKeyCheckIterator extends LazyIterableIterator<Tuple2<String, Tuple2<String, HoodieKey>>, List<IndexLookupResult>> { class LazyKeyCheckIterator extends
LazyIterableIterator<Tuple2<String, Tuple2<String, HoodieKey>>, List<IndexLookupResult>> {
private List<String> candidateRecordKeys; private List<String> candidateRecordKeys;
@@ -90,7 +91,8 @@ public class HoodieBloomIndexCheckFunction implements Function2<Integer, Iterato
private String currentParitionPath; private String currentParitionPath;
LazyKeyCheckIterator(Iterator<Tuple2<String, Tuple2<String, HoodieKey>>> fileParitionRecordKeyTripletItr) { LazyKeyCheckIterator(
Iterator<Tuple2<String, Tuple2<String, HoodieKey>>> fileParitionRecordKeyTripletItr) {
super(fileParitionRecordKeyTripletItr); super(fileParitionRecordKeyTripletItr);
currentFile = null; currentFile = null;
candidateRecordKeys = new ArrayList<>(); candidateRecordKeys = new ArrayList<>();
@@ -144,11 +146,15 @@ public class HoodieBloomIndexCheckFunction implements Function2<Integer, Iterato
} else { } else {
// do the actual checking of file & break out // do the actual checking of file & break out
Path filePath = new Path(basePath + "/" + currentParitionPath + "/" + currentFile); Path filePath = new Path(basePath + "/" + currentParitionPath + "/" + currentFile);
logger.info("#1 After bloom filter, the candidate row keys is reduced to " + candidateRecordKeys.size() + " for " + filePath); logger.info(
"#1 After bloom filter, the candidate row keys is reduced to " + candidateRecordKeys
.size() + " for " + filePath);
if (logger.isDebugEnabled()) { if (logger.isDebugEnabled()) {
logger.debug("#The candidate row keys for " + filePath + " => " + candidateRecordKeys); logger
.debug("#The candidate row keys for " + filePath + " => " + candidateRecordKeys);
} }
ret.add(new IndexLookupResult(currentFile, checkCandidatesAgainstFile(candidateRecordKeys, filePath))); ret.add(new IndexLookupResult(currentFile,
checkCandidatesAgainstFile(candidateRecordKeys, filePath)));
initState(fileName, partitionPath); initState(fileName, partitionPath);
if (bloomFilter.mightContain(recordKey)) { if (bloomFilter.mightContain(recordKey)) {
@@ -164,11 +170,14 @@ public class HoodieBloomIndexCheckFunction implements Function2<Integer, Iterato
// handle case, where we ran out of input, finish pending work, update return val // handle case, where we ran out of input, finish pending work, update return val
if (!inputItr.hasNext()) { if (!inputItr.hasNext()) {
Path filePath = new Path(basePath + "/" + currentParitionPath + "/" + currentFile); Path filePath = new Path(basePath + "/" + currentParitionPath + "/" + currentFile);
logger.info("#2 After bloom filter, the candidate row keys is reduced to " + candidateRecordKeys.size() + " for " + filePath); logger.info(
"#2 After bloom filter, the candidate row keys is reduced to " + candidateRecordKeys
.size() + " for " + filePath);
if (logger.isDebugEnabled()) { if (logger.isDebugEnabled()) {
logger.debug("#The candidate row keys for " + filePath + " => " + candidateRecordKeys); logger.debug("#The candidate row keys for " + filePath + " => " + candidateRecordKeys);
} }
ret.add(new IndexLookupResult(currentFile, checkCandidatesAgainstFile(candidateRecordKeys, filePath))); ret.add(new IndexLookupResult(currentFile,
checkCandidatesAgainstFile(candidateRecordKeys, filePath)));
} }
} catch (Throwable e) { } catch (Throwable e) {
@@ -189,7 +198,8 @@ public class HoodieBloomIndexCheckFunction implements Function2<Integer, Iterato
@Override @Override
public Iterator<List<IndexLookupResult>> call(Integer partition, public Iterator<List<IndexLookupResult>> call(Integer partition,
Iterator<Tuple2<String, Tuple2<String, HoodieKey>>> fileParitionRecordKeyTripletItr) throws Exception { Iterator<Tuple2<String, Tuple2<String, HoodieKey>>> fileParitionRecordKeyTripletItr)
throws Exception {
return new LazyKeyCheckIterator(fileParitionRecordKeyTripletItr); return new LazyKeyCheckIterator(fileParitionRecordKeyTripletItr);
} }
} }
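
Usage sketch for the static checkCandidatesAgainstFile helper whose signature appears in the hunks above: once the bloom filter has narrowed the candidates, the parquet file's keys are re-read to drop false positives. The file path is hypothetical, and the class's package is assumed to match the rest of this bloom index code.

import com.uber.hoodie.index.bloom.HoodieBloomIndexCheckFunction;
import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.fs.Path;

// Sketch only: with a real parquet file at filePath, the returned list would contain just
// the candidate keys actually present in that file.
public class CandidateCheckSketch {
  public static void main(String[] args) throws Exception {
    List<String> candidates = Arrays.asList("uuid-1234", "uuid-5678");
    Path filePath = new Path("/tmp/hoodie/2017/11/12/f1.parquet");
    List<String> confirmed =
        HoodieBloomIndexCheckFunction.checkCandidatesAgainstFile(candidates, filePath);
    System.out.println(confirmed);
  }
}
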

View File

@@ -19,7 +19,6 @@
package com.uber.hoodie.index.bucketed; package com.uber.hoodie.index.bucketed;
import com.google.common.base.Optional; import com.google.common.base.Optional;
import com.uber.hoodie.WriteStatus; import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecord;
@@ -29,29 +28,22 @@ import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieIndexException; import com.uber.hoodie.exception.HoodieIndexException;
import com.uber.hoodie.index.HoodieIndex; import com.uber.hoodie.index.HoodieIndex;
import com.uber.hoodie.table.HoodieTable; import com.uber.hoodie.table.HoodieTable;
import org.apache.log4j.LogManager; import org.apache.log4j.LogManager;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2; import scala.Tuple2;
/** /**
* An `stateless` index implementation that will using a deterministic mapping function to * An `stateless` index implementation that will using a deterministic mapping function to determine
* determine the fileID for a given record. * the fileID for a given record.
*
* Pros:
* - Fast
*
* Cons :
* - Need to tune the number of buckets per partition path manually (FIXME: Need to autotune this)
* - Could increase write amplification on copy-on-write storage since inserts always rewrite files
* - Not global.
*
* *
* Pros: - Fast
* *
* Cons : - Need to tune the number of buckets per partition path manually (FIXME: Need to autotune
* this) - Could increase write amplification on copy-on-write storage since inserts always rewrite
* files - Not global.
*/ */
public class BucketedIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> { public class BucketedIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
@@ -66,12 +58,14 @@ public class BucketedIndex<T extends HoodieRecordPayload> extends HoodieIndex<T>
} }
@Override @Override
public JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(JavaRDD<HoodieKey> hoodieKeys, HoodieTable<T> table) { public JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(JavaRDD<HoodieKey> hoodieKeys,
HoodieTable<T> table) {
return hoodieKeys.mapToPair(hk -> new Tuple2<>(hk, Optional.of(getBucket(hk.getRecordKey())))); return hoodieKeys.mapToPair(hk -> new Tuple2<>(hk, Optional.of(getBucket(hk.getRecordKey()))));
} }
@Override @Override
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, HoodieTable<T> hoodieTable) throws HoodieIndexException { public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD,
HoodieTable<T> hoodieTable) throws HoodieIndexException {
return recordRDD.map(record -> { return recordRDD.map(record -> {
String bucket = getBucket(record.getRecordKey()); String bucket = getBucket(record.getRecordKey());
//HACK(vc) a non-existent commit is provided here. //HACK(vc) a non-existent commit is provided here.
@@ -81,7 +75,8 @@ public class BucketedIndex<T extends HoodieRecordPayload> extends HoodieIndex<T>
} }
@Override @Override
public JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD, HoodieTable<T> hoodieTable) throws HoodieIndexException { public JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD,
HoodieTable<T> hoodieTable) throws HoodieIndexException {
return writeStatusRDD; return writeStatusRDD;
} }
@@ -93,8 +88,6 @@ public class BucketedIndex<T extends HoodieRecordPayload> extends HoodieIndex<T>
/** /**
* Bucketing is still done within each partition. * Bucketing is still done within each partition.
*
* @return
*/ */
@Override @Override
public boolean isGlobal() { public boolean isGlobal() {
@@ -102,10 +95,8 @@ public class BucketedIndex<T extends HoodieRecordPayload> extends HoodieIndex<T>
} }
/** /**
* Since indexing is just a deterministic hash, we can identify file group correctly even without an index * Since indexing is just a deterministic hash, we can identify file group correctly even without
* on the actual log file. * an index on the actual log file.
*
* @return
*/ */
@Override @Override
public boolean canIndexLogFiles() { public boolean canIndexLogFiles() {
@@ -114,8 +105,6 @@ public class BucketedIndex<T extends HoodieRecordPayload> extends HoodieIndex<T>
/** /**
* Indexing is just a hash function. * Indexing is just a hash function.
*
* @return
*/ */
@Override @Override
public boolean isImplicitWithStorage() { public boolean isImplicitWithStorage() {
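The BucketedIndex javadoc above describes a stateless index that derives a record's file group purely from a deterministic function of its key, so no lookup state has to be maintained. A minimal, JDK-only sketch of that idea follows; the class name, the numBucketsPerPartition knob and the hashing choice are illustrative assumptions, not the actual Hudi API.

import java.util.UUID;

// Illustrative sketch of a deterministic key-to-bucket mapping, in the spirit of BucketedIndex above.
// numBucketsPerPartition is an assumed tuning knob; the real index reads its bucket count from config.
public class DeterministicBucketer {

  private final int numBucketsPerPartition;

  public DeterministicBucketer(int numBucketsPerPartition) {
    this.numBucketsPerPartition = numBucketsPerPartition;
  }

  /** The same record key always maps to the same bucket, so no lookup state is needed. */
  public int bucketFor(String recordKey) {
    // Math.floorMod keeps the result non-negative even when hashCode() is negative.
    return Math.floorMod(recordKey.hashCode(), numBucketsPerPartition);
  }

  public static void main(String[] args) {
    DeterministicBucketer bucketer = new DeterministicBucketer(8);
    String key = UUID.randomUUID().toString();
    // Calling twice shows the mapping is stable for a given key.
    System.out.println(bucketer.bucketFor(key) == bucketer.bucketFor(key)); // true
  }
}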

View File

@@ -19,24 +19,33 @@
package com.uber.hoodie.index.hbase; package com.uber.hoodie.index.hbase;
import com.google.common.base.Optional; import com.google.common.base.Optional;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.WriteStatus; import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordLocation; import com.uber.hoodie.common.model.HoodieRecordLocation;
import com.uber.hoodie.common.model.HoodieRecordPayload; import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.config.HoodieIndexConfig; import com.uber.hoodie.config.HoodieIndexConfig;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieDependentSystemUnavailableException; import com.uber.hoodie.exception.HoodieDependentSystemUnavailableException;
import com.uber.hoodie.exception.HoodieIndexException; import com.uber.hoodie.exception.HoodieIndexException;
import com.uber.hoodie.index.HoodieIndex; import com.uber.hoodie.index.HoodieIndex;
import com.uber.hoodie.table.HoodieTable; import com.uber.hoodie.table.HoodieTable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*; import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.Bytes;
import org.apache.log4j.LogManager; import org.apache.log4j.LogManager;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
@@ -45,15 +54,11 @@ import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2; import org.apache.spark.api.java.function.Function2;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/** /**
* Hoodie Index implementation backed by HBase * Hoodie Index implementation backed by HBase
*/ */
public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> { public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
private final static byte[] SYSTEM_COLUMN_FAMILY = Bytes.toBytes("_s"); private final static byte[] SYSTEM_COLUMN_FAMILY = Bytes.toBytes("_s");
private final static byte[] COMMIT_TS_COLUMN = Bytes.toBytes("commit_ts"); private final static byte[] COMMIT_TS_COLUMN = Bytes.toBytes("commit_ts");
private final static byte[] FILE_NAME_COLUMN = Bytes.toBytes("file_name"); private final static byte[] FILE_NAME_COLUMN = Bytes.toBytes("file_name");
@@ -144,9 +149,7 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
} catch (IOException e) { } catch (IOException e) {
throw new HoodieIndexException( throw new HoodieIndexException(
"Failed to Tag indexed locations because of exception with HBase Client", e); "Failed to Tag indexed locations because of exception with HBase Client", e);
} } finally {
finally {
if (hTable != null) { if (hTable != null) {
try { try {
hTable.close(); hTable.close();
@@ -161,11 +164,14 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
} }
@Override @Override
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, HoodieTable<T> hoodieTable) { public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD,
HoodieTable<T> hoodieTable) {
return recordRDD.mapPartitionsWithIndex(this.new LocationTagFunction(hoodieTable), true); return recordRDD.mapPartitionsWithIndex(this.new LocationTagFunction(hoodieTable), true);
} }
class UpdateLocationTask implements Function2<Integer, Iterator<WriteStatus>, Iterator<WriteStatus>> { class UpdateLocationTask implements
Function2<Integer, Iterator<WriteStatus>, Iterator<WriteStatus>> {
@Override @Override
public Iterator<WriteStatus> call(Integer partition, Iterator<WriteStatus> statusIterator) { public Iterator<WriteStatus> call(Integer partition, Iterator<WriteStatus> statusIterator) {
@@ -187,7 +193,7 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
for (HoodieRecord rec : writeStatus.getWrittenRecords()) { for (HoodieRecord rec : writeStatus.getWrittenRecords()) {
if (!writeStatus.isErrored(rec.getKey())) { if (!writeStatus.isErrored(rec.getKey())) {
java.util.Optional<HoodieRecordLocation> loc = rec.getNewLocation(); java.util.Optional<HoodieRecordLocation> loc = rec.getNewLocation();
if(loc.isPresent()) { if (loc.isPresent()) {
Put put = new Put(Bytes.toBytes(rec.getRecordKey())); Put put = new Put(Bytes.toBytes(rec.getRecordKey()));
put.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN, put.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN,
Bytes.toBytes(loc.get().getCommitTime())); Bytes.toBytes(loc.get().getCommitTime()));
@@ -244,8 +250,6 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
/** /**
* Only looks up by recordKey * Only looks up by recordKey
*
* @return
*/ */
@Override @Override
public boolean isGlobal() { public boolean isGlobal() {
@@ -254,8 +258,6 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
/** /**
* Mapping is available in HBase already. * Mapping is available in HBase already.
*
* @return
*/ */
@Override @Override
public boolean canIndexLogFiles() { public boolean canIndexLogFiles() {
@@ -264,8 +266,6 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
/** /**
* Index needs to be explicitly updated after storage write. * Index needs to be explicitly updated after storage write.
*
* @return
*/ */
@Override @Override
public boolean isImplicitWithStorage() { public boolean isImplicitWithStorage() {
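The HBaseIndex hunks above tag and update record locations by writing a commit_ts and a file_name cell per record key under the "_s" column family. A hedged sketch of that update path is below; it reuses the client calls visible in the diff (ConnectionFactory, Put, Bytes) but assumes a reasonably recent HBase client, and the "hoodie_index" table name and the string-triple layout are placeholders for illustration only.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

// Hedged sketch: persists (recordKey -> commitTime, fileId) mappings, mirroring the
// system columns used by HBaseIndex above. "hoodie_index" is a placeholder table name.
public class RecordLocationWriter {

  private static final byte[] SYSTEM_COLUMN_FAMILY = Bytes.toBytes("_s");
  private static final byte[] COMMIT_TS_COLUMN = Bytes.toBytes("commit_ts");
  private static final byte[] FILE_NAME_COLUMN = Bytes.toBytes("file_name");

  public static void writeLocations(List<String[]> keyCommitFileTriples) throws IOException {
    Configuration conf = HBaseConfiguration.create();
    try (Connection connection = ConnectionFactory.createConnection(conf);
        Table table = connection.getTable(TableName.valueOf("hoodie_index"))) {
      List<Put> puts = new ArrayList<>();
      for (String[] triple : keyCommitFileTriples) {
        // triple = {recordKey, commitTime, fileId}
        Put put = new Put(Bytes.toBytes(triple[0]));
        put.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN, Bytes.toBytes(triple[1]));
        put.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN, Bytes.toBytes(triple[2]));
        puts.add(put);
      }
      table.put(puts); // batched mutation, same idea as UpdateLocationTask above
    }
  }
}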

View File

@@ -36,13 +36,6 @@ import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieAppendException; import com.uber.hoodie.exception.HoodieAppendException;
import com.uber.hoodie.exception.HoodieUpsertException; import com.uber.hoodie.exception.HoodieUpsertException;
import com.uber.hoodie.table.HoodieTable; import com.uber.hoodie.table.HoodieTable;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.TaskContext;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Iterator; import java.util.Iterator;
@@ -50,13 +43,18 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Optional; import java.util.Optional;
import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicLong;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.TaskContext;
/** /**
* IO Operation to append data onto an existing file. * IO Operation to append data onto an existing file.
*
* @param <T>
*/ */
public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOHandle<T> { public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOHandle<T> {
private static Logger logger = LogManager.getLogger(HoodieMergeHandle.class); private static Logger logger = LogManager.getLogger(HoodieMergeHandle.class);
private static AtomicLong recordIndex = new AtomicLong(1); private static AtomicLong recordIndex = new AtomicLong(1);
@@ -133,7 +131,7 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOH
try { try {
Optional<IndexedRecord> avroRecord = hoodieRecord.getData().getInsertValue(schema); Optional<IndexedRecord> avroRecord = hoodieRecord.getData().getInsertValue(schema);
if(avroRecord.isPresent()) { if (avroRecord.isPresent()) {
String seqId = HoodieRecord.generateSequenceId(commitTime, TaskContext.getPartitionId(), String seqId = HoodieRecord.generateSequenceId(commitTime, TaskContext.getPartitionId(),
recordIndex.getAndIncrement()); recordIndex.getAndIncrement());
HoodieAvroUtils HoodieAvroUtils
@@ -164,18 +162,19 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOH
metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME, commitTime); metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME, commitTime);
records.stream().forEach(record -> { records.stream().forEach(record -> {
Optional<IndexedRecord> indexedRecord = getIndexedRecord(record); Optional<IndexedRecord> indexedRecord = getIndexedRecord(record);
if(indexedRecord.isPresent()) { if (indexedRecord.isPresent()) {
recordList.add(indexedRecord.get()); recordList.add(indexedRecord.get());
} else { } else {
keysToDelete.add(record.getRecordKey()); keysToDelete.add(record.getRecordKey());
} }
}); });
try { try {
if(recordList.size() > 0) { if (recordList.size() > 0) {
writer = writer.appendBlock(new HoodieAvroDataBlock(recordList, schema, metadata)); writer = writer.appendBlock(new HoodieAvroDataBlock(recordList, schema, metadata));
} }
if(keysToDelete.size() > 0) { if (keysToDelete.size() > 0) {
writer = writer.appendBlock(new HoodieDeleteBlock(keysToDelete.stream().toArray(String[]::new), metadata)); writer = writer.appendBlock(
new HoodieDeleteBlock(keysToDelete.stream().toArray(String[]::new), metadata));
} }
} catch (Exception e) { } catch (Exception e) {
throw new HoodieAppendException( throw new HoodieAppendException(
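HoodieAppendHandle above buffers each incoming record either as an Avro record destined for a HoodieAvroDataBlock or, when its payload resolves to empty, as a key destined for a HoodieDeleteBlock. A JDK-only sketch of that split is below; the planner class and its payload type are stand-ins for illustration, not Hudi types.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;

// JDK-only sketch of the split performed above: records whose payload resolves to a value are
// buffered for an Avro data block, records that resolve to empty become delete keys.
public class AppendBatchPlanner {

  public final List<Object> toAppend = new ArrayList<>();
  public final List<String> keysToDelete = new ArrayList<>();

  public void plan(Map<String, Optional<Object>> resolvedPayloadsByKey) {
    resolvedPayloadsByKey.forEach((key, payload) -> {
      if (payload.isPresent()) {
        toAppend.add(payload.get());   // would be added to a HoodieAvroDataBlock
      } else {
        keysToDelete.add(key);         // would become a HoodieDeleteBlock entry
      }
    });
  }

  public static void main(String[] args) {
    Map<String, Optional<Object>> resolved = new HashMap<>();
    resolved.put("key-1", Optional.<Object>of("payload-1"));
    resolved.put("key-2", Optional.empty());
    AppendBatchPlanner planner = new AppendBatchPlanner();
    planner.plan(resolved);
    // Expect: 1 append, 1 delete
    System.out.println(planner.toAppend.size() + " appends, " + planner.keysToDelete.size() + " deletes");
  }
}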

View File

@@ -27,27 +27,24 @@ import com.uber.hoodie.common.table.TableFileSystemView;
import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.table.HoodieTable; import com.uber.hoodie.table.HoodieTable;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.Optional; import java.util.Optional;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
/** /**
* Cleaner is responsible for garbage collecting older files in a given partition path, such that * Cleaner is responsible for garbage collecting older files in a given partition path, such that
* <p> * <p> 1) It provides sufficient time for existing queries running on older versions, to finish <p>
* 1) It provides sufficient time for existing queries running on older versions, to finish * 2) It bounds the growth of the files in the file system <p> TODO: Should all cleaning be done
* <p> * based on {@link com.uber.hoodie.common.model.HoodieCommitMetadata}
* 2) It bounds the growth of the files in the file system
* <p>
* TODO: Should all cleaning be done based on {@link com.uber.hoodie.common.model.HoodieCommitMetadata}
*/ */
public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> { public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
private static Logger logger = LogManager.getLogger(HoodieCleanHelper.class); private static Logger logger = LogManager.getLogger(HoodieCleanHelper.class);
private final TableFileSystemView fileSystemView; private final TableFileSystemView fileSystemView;
@@ -66,13 +63,9 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
/** /**
* Selects the older versions of files for cleaning, such that it bounds the number of versions of each file. * Selects the older versions of files for cleaning, such that it bounds the number of versions of
* This policy is useful, if you are simply interested in querying the table, and you don't want too many * each file. This policy is useful, if you are simply interested in querying the table, and you
* versions for a single file (i.e run it with versionsRetained = 1) * don't want too many versions for a single file (i.e run it with versionsRetained = 1)
*
* @param partitionPath
* @return
* @throws IOException
*/ */
private List<String> getFilesToCleanKeepingLatestVersions(String partitionPath) private List<String> getFilesToCleanKeepingLatestVersions(String partitionPath)
throws IOException { throws IOException {
@@ -93,7 +86,7 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
// Skip this most recent version // Skip this most recent version
FileSlice nextSlice = fileSliceIterator.next(); FileSlice nextSlice = fileSliceIterator.next();
HoodieDataFile dataFile = nextSlice.getDataFile().get(); HoodieDataFile dataFile = nextSlice.getDataFile().get();
if(savepointedFiles.contains(dataFile.getFileName())) { if (savepointedFiles.contains(dataFile.getFileName())) {
// do not clean up a savepoint data file // do not clean up a savepoint data file
continue; continue;
} }
@@ -118,22 +111,15 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
/** /**
* Selects the versions of files for cleaning, such that it * Selects the versions of files for cleaning, such that it <p> - Leaves the latest version of the
* <p> * file untouched - For older versions, - It leaves all the commits untouched which has occured in
* - Leaves the latest version of the file untouched * file untouched - For older versions, - It leaves all the commits untouched which have occurred in
* - For older versions, * window. We assume that the max(query execution time) == commit_batch_time *
* - It leaves all the commits untouched which have occurred in last <code>config.getCleanerCommitsRetained()</code> commits * window. We assume that the max(query execution time) == commit_batch_time *
* - It leaves ONE commit before this window. We assume that the max(query execution time) == commit_batch_time * config.getCleanerCommitsRetained(). This is 12 hours by default. * file used by the query thats running for the max time. <p> This provides the effect of having
* This is essential to leave the file used by the query that's running for the max time. * file used by the query that's running for the max time. <p> This provides the effect of having
* <p> * and commit batch time is 30 mins, then you have 12 hrs of lookback) <p> This policy is the
* This provides the effect of having lookback into all changes that happened in the last X * default.
* commits. (eg: if you retain 24 commits, and commit batch time is 30 mins, then you have 12 hrs of lookback)
* <p>
* This policy is the default.
*
* @param partitionPath
* @return
* @throws IOException
*/ */
private List<String> getFilesToCleanKeepingLatestCommits(String partitionPath) private List<String> getFilesToCleanKeepingLatestCommits(String partitionPath)
throws IOException { throws IOException {
@@ -164,7 +150,7 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
for (FileSlice aSlice : fileSliceList) { for (FileSlice aSlice : fileSliceList) {
HoodieDataFile aFile = aSlice.getDataFile().get(); HoodieDataFile aFile = aSlice.getDataFile().get();
String fileCommitTime = aFile.getCommitTime(); String fileCommitTime = aFile.getCommitTime();
if(savepointedFiles.contains(aFile.getFileName())) { if (savepointedFiles.contains(aFile.getFileName())) {
// do not clean up a savepoint data file // do not clean up a savepoint data file
continue; continue;
} }
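The KEEP_LATEST_COMMITS javadoc above reasons about a retention window: keep everything needed by the last cleanerCommitsRetained commits plus one commit before that window, so a query that started up to commit_batch_time * commitsRetained ago (12 hours with the defaults) still finds its files. A hedged, JDK-only sketch of that window arithmetic follows; the commit-time strings and the helper name are illustrative, and the real cleaner works over file slices rather than bare strings.

import java.util.List;
import java.util.TreeSet;

// Hedged, JDK-only sketch of the KEEP_LATEST_COMMITS reasoning described above: a file version is
// a cleaning candidate only if a newer version of the same file group already existed at the
// earliest commit we must retain, so no retained query can still need the old version.
public class CommitRetentionWindow {

  /** Commit times are lexicographically ordered strings, as in Hudi's timeline. */
  public static String earliestRetainedCommit(List<String> completedCommits, int commitsRetained) {
    TreeSet<String> ordered = new TreeSet<>(completedCommits);
    if (ordered.size() <= commitsRetained) {
      return ordered.isEmpty() ? null : ordered.first(); // everything is retained, nothing to clean
    }
    // Keep only the last `commitsRetained` commits; the first of those is the window boundary.
    return ordered.toArray(new String[0])[ordered.size() - commitsRetained];
  }

  public static void main(String[] args) {
    // 24 retained commits at a 30 minute cadence gives roughly a 12 hour lookback, as the javadoc notes.
    int commitsRetained = 24;
    int commitBatchMinutes = 30;
    System.out.println("lookback hours = " + (commitsRetained * commitBatchMinutes) / 60); // 12
    System.out.println(earliestRetainedCommit(
        List.of("20171110_0900", "20171110_0930", "20171110_1000"), 2)); // 20171110_0930
  }
}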

View File

@@ -39,6 +39,12 @@ import com.uber.hoodie.exception.HoodieCommitException;
import com.uber.hoodie.exception.HoodieException; import com.uber.hoodie.exception.HoodieException;
import com.uber.hoodie.exception.HoodieIOException; import com.uber.hoodie.exception.HoodieIOException;
import com.uber.hoodie.table.HoodieTable; import com.uber.hoodie.table.HoodieTable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.avro.Schema; import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord; import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
@@ -46,17 +52,11 @@ import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager; import org.apache.log4j.LogManager;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/** /**
* Archiver to bound the growth of <action>.commit files * Archiver to bound the growth of <action>.commit files
*/ */
public class HoodieCommitArchiveLog { public class HoodieCommitArchiveLog {
private static Logger log = LogManager.getLogger(HoodieCommitArchiveLog.class); private static Logger log = LogManager.getLogger(HoodieCommitArchiveLog.class);
private final Path archiveFilePath; private final Path archiveFilePath;
@@ -73,7 +73,7 @@ public class HoodieCommitArchiveLog {
private HoodieLogFormat.Writer openWriter() { private HoodieLogFormat.Writer openWriter() {
try { try {
if(this.writer == null) { if (this.writer == null) {
return HoodieLogFormat.newWriterBuilder() return HoodieLogFormat.newWriterBuilder()
.onParentPath(archiveFilePath.getParent()) .onParentPath(archiveFilePath.getParent())
.withFileId(archiveFilePath.getName()) .withFileId(archiveFilePath.getName())
@@ -83,17 +83,17 @@ public class HoodieCommitArchiveLog {
} else { } else {
return this.writer; return this.writer;
} }
} catch(InterruptedException | IOException e) { } catch (InterruptedException | IOException e) {
throw new HoodieException("Unable to initialize HoodieLogFormat writer", e); throw new HoodieException("Unable to initialize HoodieLogFormat writer", e);
} }
} }
private void close() { private void close() {
try { try {
if(this.writer != null) { if (this.writer != null) {
this.writer.close(); this.writer.close();
} }
} catch(IOException e) { } catch (IOException e) {
throw new HoodieException("Unable to close HoodieLogFormat writer", e); throw new HoodieException("Unable to close HoodieLogFormat writer", e);
} }
} }
@@ -125,10 +125,12 @@ public class HoodieCommitArchiveLog {
int maxCommitsToKeep = config.getMaxCommitsToKeep(); int maxCommitsToKeep = config.getMaxCommitsToKeep();
int minCommitsToKeep = config.getMinCommitsToKeep(); int minCommitsToKeep = config.getMinCommitsToKeep();
HoodieTable table = HoodieTable.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config); HoodieTable table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
// GroupBy each action and limit each action timeline to maxCommitsToKeep // GroupBy each action and limit each action timeline to maxCommitsToKeep
HoodieTimeline cleanAndRollbackTimeline = table.getActiveTimeline().getTimelineOfActions(Sets.newHashSet(HoodieTimeline.CLEAN_ACTION, HoodieTimeline cleanAndRollbackTimeline = table.getActiveTimeline()
.getTimelineOfActions(Sets.newHashSet(HoodieTimeline.CLEAN_ACTION,
HoodieTimeline.ROLLBACK_ACTION)); HoodieTimeline.ROLLBACK_ACTION));
Stream<HoodieInstant> instants = cleanAndRollbackTimeline.getInstants() Stream<HoodieInstant> instants = cleanAndRollbackTimeline.getInstants()
.collect(Collectors.groupingBy(s -> s.getAction())) .collect(Collectors.groupingBy(s -> s.getAction()))
@@ -198,7 +200,7 @@ public class HoodieCommitArchiveLog {
} }
HoodieAvroDataBlock block = new HoodieAvroDataBlock(records, wrapperSchema); HoodieAvroDataBlock block = new HoodieAvroDataBlock(records, wrapperSchema);
this.writer = writer.appendBlock(block); this.writer = writer.appendBlock(block);
} catch(Exception e) { } catch (Exception e) {
throw new HoodieCommitException("Failed to archive commits", e); throw new HoodieCommitException("Failed to archive commits", e);
} }
} }
@@ -207,40 +209,48 @@ public class HoodieCommitArchiveLog {
return archiveFilePath; return archiveFilePath;
} }
private IndexedRecord convertToAvroRecord(HoodieTimeline commitTimeline, HoodieInstant hoodieInstant) throws IOException { private IndexedRecord convertToAvroRecord(HoodieTimeline commitTimeline,
HoodieInstant hoodieInstant) throws IOException {
HoodieArchivedMetaEntry archivedMetaWrapper = new HoodieArchivedMetaEntry(); HoodieArchivedMetaEntry archivedMetaWrapper = new HoodieArchivedMetaEntry();
archivedMetaWrapper.setCommitTime(hoodieInstant.getTimestamp()); archivedMetaWrapper.setCommitTime(hoodieInstant.getTimestamp());
switch(hoodieInstant.getAction()) { switch (hoodieInstant.getAction()) {
case HoodieTimeline.CLEAN_ACTION:{ case HoodieTimeline.CLEAN_ACTION: {
archivedMetaWrapper.setHoodieCleanMetadata(AvroUtils.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(), HoodieCleanMetadata.class)); archivedMetaWrapper.setHoodieCleanMetadata(AvroUtils
.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(),
HoodieCleanMetadata.class));
archivedMetaWrapper.setActionType(ActionType.clean.name()); archivedMetaWrapper.setActionType(ActionType.clean.name());
break; break;
} }
case HoodieTimeline.COMMIT_ACTION:{ case HoodieTimeline.COMMIT_ACTION: {
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
.fromBytes(commitTimeline.getInstantDetails(hoodieInstant).get()); .fromBytes(commitTimeline.getInstantDetails(hoodieInstant).get());
archivedMetaWrapper.setHoodieCommitMetadata(commitMetadataConverter(commitMetadata)); archivedMetaWrapper.setHoodieCommitMetadata(commitMetadataConverter(commitMetadata));
archivedMetaWrapper.setActionType(ActionType.commit.name()); archivedMetaWrapper.setActionType(ActionType.commit.name());
break; break;
} }
case HoodieTimeline.COMPACTION_ACTION:{ case HoodieTimeline.COMPACTION_ACTION: {
com.uber.hoodie.common.model.HoodieCompactionMetadata compactionMetadata = com.uber.hoodie.common.model.HoodieCompactionMetadata com.uber.hoodie.common.model.HoodieCompactionMetadata compactionMetadata = com.uber.hoodie.common.model.HoodieCompactionMetadata
.fromBytes(commitTimeline.getInstantDetails(hoodieInstant).get()); .fromBytes(commitTimeline.getInstantDetails(hoodieInstant).get());
archivedMetaWrapper.setHoodieCompactionMetadata(compactionMetadataConverter(compactionMetadata)); archivedMetaWrapper
.setHoodieCompactionMetadata(compactionMetadataConverter(compactionMetadata));
archivedMetaWrapper.setActionType(ActionType.compaction.name()); archivedMetaWrapper.setActionType(ActionType.compaction.name());
break; break;
} }
case HoodieTimeline.ROLLBACK_ACTION:{ case HoodieTimeline.ROLLBACK_ACTION: {
archivedMetaWrapper.setHoodieRollbackMetadata(AvroUtils.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(), HoodieRollbackMetadata.class)); archivedMetaWrapper.setHoodieRollbackMetadata(AvroUtils
.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(),
HoodieRollbackMetadata.class));
archivedMetaWrapper.setActionType(ActionType.rollback.name()); archivedMetaWrapper.setActionType(ActionType.rollback.name());
break; break;
} }
case HoodieTimeline.SAVEPOINT_ACTION:{ case HoodieTimeline.SAVEPOINT_ACTION: {
archivedMetaWrapper.setHoodieSavePointMetadata(AvroUtils.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(), HoodieSavepointMetadata.class)); archivedMetaWrapper.setHoodieSavePointMetadata(AvroUtils
.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(),
HoodieSavepointMetadata.class));
archivedMetaWrapper.setActionType(ActionType.savepoint.name()); archivedMetaWrapper.setActionType(ActionType.savepoint.name());
break; break;
} }
case HoodieTimeline.DELTA_COMMIT_ACTION:{ case HoodieTimeline.DELTA_COMMIT_ACTION: {
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
.fromBytes(commitTimeline.getInstantDetails(hoodieInstant).get()); .fromBytes(commitTimeline.getInstantDetails(hoodieInstant).get());
archivedMetaWrapper.setHoodieCommitMetadata(commitMetadataConverter(commitMetadata)); archivedMetaWrapper.setHoodieCommitMetadata(commitMetadataConverter(commitMetadata));
@@ -251,19 +261,23 @@ public class HoodieCommitArchiveLog {
return archivedMetaWrapper; return archivedMetaWrapper;
} }
private com.uber.hoodie.avro.model.HoodieCommitMetadata commitMetadataConverter(HoodieCommitMetadata hoodieCommitMetadata) { private com.uber.hoodie.avro.model.HoodieCommitMetadata commitMetadataConverter(
HoodieCommitMetadata hoodieCommitMetadata) {
ObjectMapper mapper = new ObjectMapper(); ObjectMapper mapper = new ObjectMapper();
//Need this to ignore other public get() methods //Need this to ignore other public get() methods
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
com.uber.hoodie.avro.model.HoodieCommitMetadata avroMetaData = com.uber.hoodie.avro.model.HoodieCommitMetadata avroMetaData =
mapper.convertValue(hoodieCommitMetadata, com.uber.hoodie.avro.model.HoodieCommitMetadata.class); mapper.convertValue(hoodieCommitMetadata,
com.uber.hoodie.avro.model.HoodieCommitMetadata.class);
return avroMetaData; return avroMetaData;
} }
private com.uber.hoodie.avro.model.HoodieCompactionMetadata compactionMetadataConverter(HoodieCompactionMetadata hoodieCompactionMetadata) { private com.uber.hoodie.avro.model.HoodieCompactionMetadata compactionMetadataConverter(
HoodieCompactionMetadata hoodieCompactionMetadata) {
ObjectMapper mapper = new ObjectMapper(); ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
com.uber.hoodie.avro.model.HoodieCompactionMetadata avroMetaData = mapper.convertValue(hoodieCompactionMetadata, com.uber.hoodie.avro.model.HoodieCompactionMetadata avroMetaData = mapper
.convertValue(hoodieCompactionMetadata,
com.uber.hoodie.avro.model.HoodieCompactionMetadata.class); com.uber.hoodie.avro.model.HoodieCompactionMetadata.class);
return avroMetaData; return avroMetaData;
} }
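commitMetadataConverter and compactionMetadataConverter above rely on Jackson's convertValue() with FAIL_ON_UNKNOWN_PROPERTIES disabled to map the in-memory metadata POJOs onto their Avro-generated counterparts while ignoring extra getters. A small self-contained sketch of that pattern, using two placeholder classes instead of Hudi's models:

import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;

// Hedged sketch of the converter pattern used above: Jackson's convertValue() maps one POJO onto
// another shape field-by-field, and FAIL_ON_UNKNOWN_PROPERTIES is disabled so properties that only
// exist on the source class are simply ignored. Both classes here are stand-ins for illustration.
public class MetadataConverterSketch {

  public static class SourceMetadata {
    public String commitTime = "20171112225456";
    public long recordsWritten = 42L;
    public String somethingOnlyTheSourceHas = "ignored by the target";
  }

  public static class TargetMetadata {
    public String commitTime;
    public long recordsWritten;
  }

  public static void main(String[] args) {
    ObjectMapper mapper = new ObjectMapper();
    mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
    TargetMetadata target = mapper.convertValue(new SourceMetadata(), TargetMetadata.class);
    System.out.println(target.commitTime + " / " + target.recordsWritten);
  }
}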

View File

@@ -29,17 +29,17 @@ import com.uber.hoodie.exception.HoodieInsertException;
import com.uber.hoodie.io.storage.HoodieStorageWriter; import com.uber.hoodie.io.storage.HoodieStorageWriter;
import com.uber.hoodie.io.storage.HoodieStorageWriterFactory; import com.uber.hoodie.io.storage.HoodieStorageWriterFactory;
import com.uber.hoodie.table.HoodieTable; import com.uber.hoodie.table.HoodieTable;
import java.io.IOException;
import java.util.Optional;
import java.util.UUID;
import org.apache.avro.generic.IndexedRecord; import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager; import org.apache.log4j.LogManager;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.apache.spark.TaskContext; import org.apache.spark.TaskContext;
import java.io.IOException;
import java.util.Optional;
import java.util.UUID;
public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOHandle<T> { public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOHandle<T> {
private static Logger logger = LogManager.getLogger(HoodieCreateHandle.class); private static Logger logger = LogManager.getLogger(HoodieCreateHandle.class);
private final WriteStatus status; private final WriteStatus status;
@@ -63,7 +63,8 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOH
new Path(config.getBasePath(), partitionPath)); new Path(config.getBasePath(), partitionPath));
partitionMetadata.trySave(TaskContext.getPartitionId()); partitionMetadata.trySave(TaskContext.getPartitionId());
this.storageWriter = this.storageWriter =
HoodieStorageWriterFactory.getStorageWriter(commitTime, path, hoodieTable, config, schema); HoodieStorageWriterFactory
.getStorageWriter(commitTime, path, hoodieTable, config, schema);
} catch (IOException e) { } catch (IOException e) {
throw new HoodieInsertException( throw new HoodieInsertException(
"Failed to initialize HoodieStorageWriter for path " + path, e); "Failed to initialize HoodieStorageWriter for path " + path, e);
@@ -74,10 +75,8 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOH
/** /**
* Determines whether we can accept the incoming records, into the current file, depending on * Determines whether we can accept the incoming records, into the current file, depending on
* *
* - Whether it belongs to the same partitionPath as existing records * - Whether it belongs to the same partitionPath as existing records - Whether the current file
* - Whether the current file written bytes are less than the max file size * written bytes are less than the max file size
*
* @return
*/ */
public boolean canWrite(HoodieRecord record) { public boolean canWrite(HoodieRecord record) {
return storageWriter.canWrite() && record.getPartitionPath() return storageWriter.canWrite() && record.getPartitionPath()
@@ -86,15 +85,13 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOH
/** /**
* Perform the actual writing of the given record into the backing file. * Perform the actual writing of the given record into the backing file.
*
* @param record
*/ */
public void write(HoodieRecord record) { public void write(HoodieRecord record) {
Optional recordMetadata = record.getData().getMetadata(); Optional recordMetadata = record.getData().getMetadata();
try { try {
Optional<IndexedRecord> avroRecord = record.getData().getInsertValue(schema); Optional<IndexedRecord> avroRecord = record.getData().getInsertValue(schema);
if(avroRecord.isPresent()) { if (avroRecord.isPresent()) {
storageWriter.writeAvroWithMetadata(avroRecord.get(), record); storageWriter.writeAvroWithMetadata(avroRecord.get(), record);
// update the new location of record, so we know where to find it next // update the new location of record, so we know where to find it next
record.setNewLocation(new HoodieRecordLocation(commitTime, status.getFileId())); record.setNewLocation(new HoodieRecordLocation(commitTime, status.getFileId()));
@@ -114,8 +111,6 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOH
/** /**
* Performs actions to durably, persist the current changes and returns a WriteStatus object * Performs actions to durably, persist the current changes and returns a WriteStatus object
*
* @return
*/ */
public WriteStatus close() { public WriteStatus close() {
logger.info( logger.info(
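The canWrite() contract described above for HoodieCreateHandle is a two-part gate: the record must target the same partitionPath the handle was opened for, and the file must still be under its size budget. A JDK-only sketch of that gate; the field and method names are illustrative, and the real handle delegates the byte count to its storage writer.

// JDK-only sketch of the canWrite() check described above: a new file handle keeps accepting
// records only while they target the same partition and the file is still under its size budget.
public class CreateHandleGate {

  private final String partitionPath;
  private final long maxFileSizeBytes;
  private long bytesWritten;

  public CreateHandleGate(String partitionPath, long maxFileSizeBytes) {
    this.partitionPath = partitionPath;
    this.maxFileSizeBytes = maxFileSizeBytes;
  }

  public boolean canWrite(String recordPartitionPath) {
    return bytesWritten < maxFileSizeBytes && partitionPath.equals(recordPartitionPath);
  }

  public void recordWritten(long approxRecordSizeBytes) {
    bytesWritten += approxRecordSizeBytes;
  }

  public static void main(String[] args) {
    CreateHandleGate gate = new CreateHandleGate("2017/11/12", 1024);
    System.out.println(gate.canWrite("2017/11/12")); // true: empty file, same partition
    gate.recordWritten(2048);
    System.out.println(gate.canWrite("2017/11/12")); // false: over the size budget
    System.out.println(gate.canWrite("2017/11/13")); // false: different partition
  }
}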

View File

@@ -24,6 +24,7 @@ import com.uber.hoodie.common.util.HoodieAvroUtils;
import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieIOException; import com.uber.hoodie.exception.HoodieIOException;
import com.uber.hoodie.table.HoodieTable; import com.uber.hoodie.table.HoodieTable;
import java.io.IOException;
import org.apache.avro.Schema; import org.apache.avro.Schema;
import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
@@ -31,9 +32,8 @@ import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager; import org.apache.log4j.LogManager;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import java.io.IOException;
public abstract class HoodieIOHandle<T extends HoodieRecordPayload> { public abstract class HoodieIOHandle<T extends HoodieRecordPayload> {
private static Logger logger = LogManager.getLogger(HoodieIOHandle.class); private static Logger logger = LogManager.getLogger(HoodieIOHandle.class);
protected final String commitTime; protected final String commitTime;
protected final HoodieWriteConfig config; protected final HoodieWriteConfig config;

View File

@@ -16,19 +16,23 @@
package com.uber.hoodie.io; package com.uber.hoodie.io;
import com.uber.hoodie.common.model.HoodiePartitionMetadata;
import com.uber.hoodie.common.util.ReflectionUtils;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.WriteStatus; import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.model.HoodiePartitionMetadata;
import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordLocation; import com.uber.hoodie.common.model.HoodieRecordLocation;
import com.uber.hoodie.common.model.HoodieRecordPayload; import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.common.model.HoodieWriteStat; import com.uber.hoodie.common.model.HoodieWriteStat;
import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.ReflectionUtils;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieUpsertException; import com.uber.hoodie.exception.HoodieUpsertException;
import com.uber.hoodie.io.storage.HoodieStorageWriter; import com.uber.hoodie.io.storage.HoodieStorageWriter;
import com.uber.hoodie.io.storage.HoodieStorageWriterFactory; import com.uber.hoodie.io.storage.HoodieStorageWriterFactory;
import com.uber.hoodie.table.HoodieTable; import com.uber.hoodie.table.HoodieTable;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Optional;
import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord; import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
@@ -36,13 +40,9 @@ import org.apache.log4j.LogManager;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.apache.spark.TaskContext; import org.apache.spark.TaskContext;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Optional;
@SuppressWarnings("Duplicates") @SuppressWarnings("Duplicates")
public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHandle<T> { public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHandle<T> {
private static Logger logger = LogManager.getLogger(HoodieMergeHandle.class); private static Logger logger = LogManager.getLogger(HoodieMergeHandle.class);
private WriteStatus writeStatus; private WriteStatus writeStatus;
@@ -94,7 +94,7 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
oldFilePath = new Path( oldFilePath = new Path(
config.getBasePath() + "/" + record.getPartitionPath() + "/" config.getBasePath() + "/" + record.getPartitionPath() + "/"
+ latestValidFilePath); + latestValidFilePath);
String relativePath = new Path( record.getPartitionPath() + "/" + FSUtils String relativePath = new Path(record.getPartitionPath() + "/" + FSUtils
.makeDataFileName(commitTime, TaskContext.getPartitionId(), fileId)).toString(); .makeDataFileName(commitTime, TaskContext.getPartitionId(), fileId)).toString();
newFilePath = new Path(config.getBasePath(), relativePath); newFilePath = new Path(config.getBasePath(), relativePath);
@@ -129,10 +129,11 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
} }
private boolean writeUpdateRecord(HoodieRecord<T> hoodieRecord, Optional<IndexedRecord> indexedRecord) { private boolean writeUpdateRecord(HoodieRecord<T> hoodieRecord,
Optional<IndexedRecord> indexedRecord) {
Optional recordMetadata = hoodieRecord.getData().getMetadata(); Optional recordMetadata = hoodieRecord.getData().getMetadata();
try { try {
if(indexedRecord.isPresent()) { if (indexedRecord.isPresent()) {
storageWriter.writeAvroWithMetadata(indexedRecord.get(), hoodieRecord); storageWriter.writeAvroWithMetadata(indexedRecord.get(), hoodieRecord);
recordsWritten++; recordsWritten++;
updatedRecordsWritten++; updatedRecordsWritten++;
@@ -144,14 +145,15 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
writeStatus.markSuccess(hoodieRecord, recordMetadata); writeStatus.markSuccess(hoodieRecord, recordMetadata);
return true; return true;
} catch (Exception e) { } catch (Exception e) {
logger.error("Error writing record "+ hoodieRecord, e); logger.error("Error writing record " + hoodieRecord, e);
writeStatus.markFailure(hoodieRecord, e, recordMetadata); writeStatus.markFailure(hoodieRecord, e, recordMetadata);
} }
return false; return false;
} }
/** /**
* Go through an old record. Here if we detect a newer version shows up, we write the new one to the file. * Go through an old record. Here if we detect a newer version shows up, we write the new one to
* the file.
*/ */
public void write(GenericRecord oldRecord) { public void write(GenericRecord oldRecord) {
String key = oldRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); String key = oldRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
@@ -159,7 +161,8 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
boolean copyOldRecord = true; boolean copyOldRecord = true;
if (keyToNewRecords.containsKey(key)) { if (keyToNewRecords.containsKey(key)) {
try { try {
Optional<IndexedRecord> combinedAvroRecord = hoodieRecord.getData().combineAndGetUpdateValue(oldRecord, schema); Optional<IndexedRecord> combinedAvroRecord = hoodieRecord.getData()
.combineAndGetUpdateValue(oldRecord, schema);
if (writeUpdateRecord(hoodieRecord, combinedAvroRecord)) { if (writeUpdateRecord(hoodieRecord, combinedAvroRecord)) {
/* ONLY WHEN /* ONLY WHEN
* 1) we have an update for this key AND * 1) we have an update for this key AND
@@ -171,7 +174,8 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
} }
keyToNewRecords.remove(key); keyToNewRecords.remove(key);
} catch (Exception e) { } catch (Exception e) {
throw new HoodieUpsertException("Failed to combine/merge new record with old value in storage, for new record {" throw new HoodieUpsertException(
"Failed to combine/merge new record with old value in storage, for new record {"
+ keyToNewRecords.get(key) + "}, old value {" + oldRecord + "}", e); + keyToNewRecords.get(key) + "}, old value {" + oldRecord + "}", e);
} }
} }
@@ -193,7 +197,7 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
+ getOldFilePath() + " to new file " + newFilePath, e); + getOldFilePath() + " to new file " + newFilePath, e);
throw new HoodieUpsertException(errMsg, e); throw new HoodieUpsertException(errMsg, e);
} }
recordsWritten ++; recordsWritten++;
} }
} }
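HoodieMergeHandle's write(GenericRecord) above walks every record of the old file, writes the combined value when an incoming update exists for that key, copies the old record through otherwise, and flushes whatever remains in the incoming map at the end. A JDK-only sketch of that merge-by-key pass, with plain strings standing in for Avro records:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

// JDK-only sketch of the merge pass described above: walk every record in the existing file,
// replace it when an incoming update exists for its key, copy it through otherwise, and finally
// append incoming records whose keys were never seen in the old file.
public class MergeByKeySketch {

  public static List<String> merge(Map<String, String> oldRecordsByKey,
      Map<String, String> incomingByKey) {
    Map<String, String> pendingInserts = new HashMap<>(incomingByKey);
    List<String> merged = new ArrayList<>();
    for (Map.Entry<String, String> old : oldRecordsByKey.entrySet()) {
      String update = pendingInserts.remove(old.getKey());
      // Write the combined/updated value when present, otherwise keep the old record as-is.
      merged.add(update != null ? update : old.getValue());
    }
    merged.addAll(pendingInserts.values()); // keys only present in the incoming batch
    return merged;
  }

  public static void main(String[] args) {
    Map<String, String> oldRecords = new LinkedHashMap<>();
    oldRecords.put("k1", "old-1");
    oldRecords.put("k2", "old-2");
    Map<String, String> incoming = new LinkedHashMap<>();
    incoming.put("k2", "new-2");
    incoming.put("k3", "new-3");
    System.out.println(merge(oldRecords, incoming)); // [old-1, new-2, new-3]
  }
}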

View File

@@ -18,7 +18,6 @@ package com.uber.hoodie.io.compact;
import com.uber.hoodie.common.model.HoodieDataFile; import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodieLogFile; import com.uber.hoodie.common.model.HoodieLogFile;
import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.io.compact.strategy.CompactionStrategy; import com.uber.hoodie.io.compact.strategy.CompactionStrategy;
import java.io.Serializable; import java.io.Serializable;
@@ -27,8 +26,8 @@ import java.util.Map;
import java.util.stream.Collectors; import java.util.stream.Collectors;
/** /**
* Encapsulates all the needed information about a compaction * Encapsulates all the needed information about a compaction and makes a decision whether this
* and makes a decision whether this compaction is effective or not * compaction is effective or not
* *
* @see CompactionStrategy * @see CompactionStrategy
*/ */

View File

@@ -22,18 +22,17 @@ import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.table.HoodieTable; import com.uber.hoodie.table.HoodieTable;
import org.apache.spark.api.java.JavaSparkContext;
import java.io.Serializable; import java.io.Serializable;
import java.util.Date; import java.util.Date;
import org.apache.spark.api.java.JavaSparkContext;
/** /**
* A HoodieCompactor runs compaction on a hoodie table * A HoodieCompactor runs compaction on a hoodie table
*/ */
public interface HoodieCompactor extends Serializable { public interface HoodieCompactor extends Serializable {
/** /**
* Compact the delta files with the data files * Compact the delta files with the data files
* @throws Exception
*/ */
HoodieCompactionMetadata compact(JavaSparkContext jsc, final HoodieWriteConfig config, HoodieCompactionMetadata compact(JavaSparkContext jsc, final HoodieWriteConfig config,
HoodieTable hoodieTable) throws Exception; HoodieTable hoodieTable) throws Exception;

View File

@@ -16,14 +16,14 @@
package com.uber.hoodie.io.compact; package com.uber.hoodie.io.compact;
import static java.util.stream.Collectors.toList;
import com.google.common.base.Preconditions; import com.google.common.base.Preconditions;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import com.uber.hoodie.WriteStatus; import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.model.CompactionWriteStat; import com.uber.hoodie.common.model.CompactionWriteStat;
import com.uber.hoodie.common.model.HoodieAvroPayload;
import com.uber.hoodie.common.model.HoodieCompactionMetadata; import com.uber.hoodie.common.model.HoodieCompactionMetadata;
import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.common.model.HoodieTableType; import com.uber.hoodie.common.model.HoodieTableType;
import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.table.HoodieTimeline;
@@ -36,7 +36,12 @@ import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieCompactionException; import com.uber.hoodie.exception.HoodieCompactionException;
import com.uber.hoodie.table.HoodieCopyOnWriteTable; import com.uber.hoodie.table.HoodieCopyOnWriteTable;
import com.uber.hoodie.table.HoodieTable; import com.uber.hoodie.table.HoodieTable;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Collection; import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.StreamSupport; import java.util.stream.StreamSupport;
import org.apache.avro.Schema; import org.apache.avro.Schema;
@@ -46,18 +51,10 @@ import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.api.java.function.FlatMapFunction;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import static java.util.stream.Collectors.*;
/** /**
* HoodieRealtimeTableCompactor compacts a hoodie table with merge on read storage. * HoodieRealtimeTableCompactor compacts a hoodie table with merge on read storage. Computes all
* Computes all possible compactions, passes them through a CompactionFilter and executes * possible compactions, passes them through a CompactionFilter and executes all the compactions and
* all the compactions and writes a new version of base files and makes a normal commit * writes a new version of base files and makes a normal commit
* *
* @see HoodieCompactor * @see HoodieCompactor
*/ */
@@ -80,7 +77,8 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
String compactionCommit = startCompactionCommit(hoodieTable); String compactionCommit = startCompactionCommit(hoodieTable);
log.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommit); log.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommit);
List<String> partitionPaths = List<String> partitionPaths =
FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(), config.shouldAssumeDatePartitioning()); FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(),
config.shouldAssumeDatePartitioning());
log.info("Compaction looking for files to compact in " + partitionPaths + " partitions"); log.info("Compaction looking for files to compact in " + partitionPaths + " partitions");
List<CompactionOperation> operations = List<CompactionOperation> operations =
@@ -156,7 +154,8 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
HoodieTimeline.DELTA_COMMIT_ACTION)) HoodieTimeline.DELTA_COMMIT_ACTION))
.filterCompletedInstants().lastInstant().get().getTimestamp(); .filterCompletedInstants().lastInstant().get().getTimestamp();
HoodieCompactedLogRecordScanner scanner = new HoodieCompactedLogRecordScanner(fs, metaClient.getBasePath(), HoodieCompactedLogRecordScanner scanner = new HoodieCompactedLogRecordScanner(fs,
metaClient.getBasePath(),
operation.getDeltaFilePaths(), readerSchema, maxInstantTime); operation.getDeltaFilePaths(), readerSchema, maxInstantTime);
if (!scanner.iterator().hasNext()) { if (!scanner.iterator().hasNext()) {
return Lists.newArrayList(); return Lists.newArrayList();
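HoodieRealtimeTableCompactor above builds one compaction operation per file group by pairing a base file with the log files that accumulated against it, then scans those logs and rewrites the base file. A hedged, JDK-only sketch of the grouping step; the simplified "fileId.N.log" naming is an assumption made only for this illustration.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// JDK-only sketch of how compaction candidates are formed, in the spirit of the compactor above:
// group the delta (log) files of a partition by file id, so each group can be paired with its base file.
public class CompactionPlanSketch {

  public static Map<String, List<String>> groupLogFilesByFileId(List<String> logFiles) {
    Map<String, List<String>> byFileId = new HashMap<>();
    for (String logFile : logFiles) {
      String fileId = logFile.substring(0, logFile.indexOf('.'));
      byFileId.computeIfAbsent(fileId, k -> new ArrayList<>()).add(logFile);
    }
    return byFileId;
  }

  public static void main(String[] args) {
    List<String> logs = List.of("f1.1.log", "f1.2.log", "f2.1.log");
    // Each entry becomes one compaction operation: base file f1.parquet + [f1.1.log, f1.2.log], etc.
    System.out.println(groupLogFilesByFileId(logs));
  }
}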

View File

@@ -28,8 +28,8 @@ import java.util.Map;
import java.util.Optional; import java.util.Optional;
/** /**
* CompactionStrategy which looks at total IO to be done for the compaction (read + write) * CompactionStrategy which looks at total IO to be done for the compaction (read + write) and
* and limits the list of compactions to be under a configured limit on the IO * limits the list of compactions to be under a configured limit on the IO
* *
* @see CompactionStrategy * @see CompactionStrategy
*/ */
@@ -62,7 +62,8 @@ public class BoundedIOCompactionStrategy implements CompactionStrategy {
} }
@Override @Override
public List<CompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig, List<CompactionOperation> operations) { public List<CompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig,
List<CompactionOperation> operations) {
// Iterate through the operations in order and accept operations as long as we are within the IO limit // Iterate through the operations in order and accept operations as long as we are within the IO limit
// Preserves the original ordering of compactions // Preserves the original ordering of compactions
List<CompactionOperation> finalOperations = Lists.newArrayList(); List<CompactionOperation> finalOperations = Lists.newArrayList();
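The orderAndFilter hunk above accepts compaction operations in their given order as long as the cumulative IO stays within the configured limit. A JDK-only sketch of that bounded-IO filter; the long[] {readMb, writeMb} pairs are a stand-in for the metrics a CompactionOperation would carry.

import java.util.ArrayList;
import java.util.List;

// JDK-only sketch of the bounded-IO filtering described above: accept compaction operations in
// their original order until the configured IO budget (read + write MB) would be exceeded.
public class BoundedIoFilterSketch {

  public static List<long[]> filterWithinBudget(List<long[]> operations, long targetIoMb) {
    List<long[]> accepted = new ArrayList<>();
    long plannedIoMb = 0;
    for (long[] op : operations) {
      long opIoMb = op[0] + op[1];
      if (plannedIoMb + opIoMb > targetIoMb) {
        break; // ordering is preserved; everything after this waits for a later compaction run
      }
      plannedIoMb += opIoMb;
      accepted.add(op);
    }
    return accepted;
  }

  public static void main(String[] args) {
    List<long[]> ops = List.of(new long[]{100, 120}, new long[]{80, 90}, new long[]{300, 400});
    System.out.println(filterWithinBudget(ops, 500).size()); // 2 operations fit in a 500 MB budget
  }
}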

View File

@@ -25,12 +25,12 @@ import java.util.List;
import java.util.Map; import java.util.Map;
/** /**
* Strategy for compaction. Pluggable implementation to define how compaction should be done. * Strategy for compaction. Pluggable implementation to define how compaction should be done. The
* The implementations of this interface can capture the relevant metrics to order and filter * implementations of this interface can capture the relevant metrics to order and filter the final
* the final list of compaction operations to run in a single compaction. * list of compaction operations to run in a single compaction.
* *
* Implementations of CompactionStrategy cannot hold any state. * Implementations of CompactionStrategy cannot hold any state. Different instantiations can be
* Different instantiations can be passed in every time * passed in every time
* *
* @see com.uber.hoodie.io.compact.HoodieRealtimeTableCompactor * @see com.uber.hoodie.io.compact.HoodieRealtimeTableCompactor
* @see CompactionOperation * @see CompactionOperation
@@ -38,8 +38,8 @@ import java.util.Map;
public interface CompactionStrategy extends Serializable { public interface CompactionStrategy extends Serializable {
/** /**
* Callback hook when a CompactionOperation is created. Individual strategies can * Callback hook when a CompactionOperation is created. Individual strategies can capture the
* capture the metrics they need to decide on the priority. * metrics they need to decide on the priority.
* *
* @param dataFile - Base file to compact * @param dataFile - Base file to compact
* @param partitionPath - Partition path * @param partitionPath - Partition path
@@ -50,8 +50,8 @@ public interface CompactionStrategy extends Serializable {
List<HoodieLogFile> logFiles); List<HoodieLogFile> logFiles);
/** /**
* Order and Filter the list of compactions. Use the metrics captured with the * Order and Filter the list of compactions. Use the metrics captured with the captureMetrics to
* captureMetrics to order and filter out compactions * order and filter out compactions
* *
* @param writeConfig - HoodieWriteConfig - config for this compaction is passed in * @param writeConfig - HoodieWriteConfig - config for this compaction is passed in
* @param operations - list of compactions collected * @param operations - list of compactions collected

View File

@@ -27,8 +27,8 @@ import java.util.Optional;
import java.util.stream.Collectors; import java.util.stream.Collectors;
/** /**
* LogFileSizeBasedCompactionStrategy orders the compactions based on the total log files size * LogFileSizeBasedCompactionStrategy orders the compactions based on the total log files size and
* and limits the compactions within a configured IO bound * limits the compactions within a configured IO bound
* *
* @see BoundedIOCompactionStrategy * @see BoundedIOCompactionStrategy
* @see CompactionStrategy * @see CompactionStrategy
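LogFileSizeBasedCompactionStrategy above ranks compaction candidates by the total size of their log files before the IO bound is applied, so the file groups carrying the most uncompacted data are served first. A tiny JDK-only sketch of that ordering; the bare size totals stand in for the metrics captured per operation.

import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;

// JDK-only sketch of the ordering idea above: candidates with the largest total log size come
// first, so the most "compaction debt" is paid down within whatever IO budget remains.
public class LogSizeOrderingSketch {

  public static List<Long> orderByTotalLogSizeDesc(List<Long> totalLogSizesMb) {
    return totalLogSizesMb.stream()
        .sorted(Comparator.reverseOrder())
        .collect(Collectors.toList());
  }

  public static void main(String[] args) {
    System.out.println(orderByTotalLogSizeDesc(List.of(120L, 900L, 40L))); // [900, 120, 40]
  }
}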

View File

@@ -25,9 +25,9 @@ import java.util.List;
import java.util.Map; import java.util.Map;
/** /**
* UnBoundedCompactionStrategy will not change ordering or filter any compaction. * UnBoundedCompactionStrategy will not change ordering or filter any compaction. It is a
* It is a pass-through and will compact all the base files which have a log file. * pass-through and will compact all the base files which have a log file. This usually means
* This usually means no-intelligence on compaction. * no-intelligence on compaction.
* *
* @see CompactionStrategy * @see CompactionStrategy
*/ */

View File

@@ -17,11 +17,11 @@
package com.uber.hoodie.io.storage; package com.uber.hoodie.io.storage;
import com.uber.hoodie.avro.HoodieAvroWriteSupport; import com.uber.hoodie.avro.HoodieAvroWriteSupport;
import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.parquet.hadoop.metadata.CompressionCodecName;
public class HoodieParquetConfig { public class HoodieParquetConfig {
private HoodieAvroWriteSupport writeSupport; private HoodieAvroWriteSupport writeSupport;
private CompressionCodecName compressionCodecName; private CompressionCodecName compressionCodecName;
private int blockSize; private int blockSize;

View File

@@ -20,6 +20,8 @@ import com.uber.hoodie.avro.HoodieAvroWriteSupport;
import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordPayload; import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.common.util.HoodieAvroUtils; import com.uber.hoodie.common.util.HoodieAvroUtils;
import java.io.IOException;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.avro.Schema; import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord; import org.apache.avro.generic.IndexedRecord;
@@ -30,17 +32,13 @@ import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.ParquetWriter; import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.spark.TaskContext; import org.apache.spark.TaskContext;
import java.io.IOException;
import java.util.concurrent.atomic.AtomicLong;
/** /**
* HoodieParquetWriter extends the ParquetWriter to help limit the size of underlying file. * HoodieParquetWriter extends the ParquetWriter to help limit the size of underlying file. Provides
* Provides a way to check if the current file can take more records with the <code>canWrite()</code> * a way to check if the current file can take more records with the <code>canWrite()</code>
*
* @param <T>
*/ */
public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends IndexedRecord> public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends IndexedRecord>
extends ParquetWriter<IndexedRecord> implements HoodieStorageWriter<R> { extends ParquetWriter<IndexedRecord> implements HoodieStorageWriter<R> {
private static double STREAM_COMPRESSION_RATIO = 0.1; private static double STREAM_COMPRESSION_RATIO = 0.1;
private static AtomicLong recordIndex = new AtomicLong(1); private static AtomicLong recordIndex = new AtomicLong(1);
@@ -101,7 +99,8 @@ public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends Indexe
return fs.getBytesWritten(file) < maxFileSize; return fs.getBytesWritten(file) < maxFileSize;
} }
@Override public void writeAvro(String key, IndexedRecord object) throws IOException { @Override
public void writeAvro(String key, IndexedRecord object) throws IOException {
super.write(object); super.write(object);
writeSupport.add(key); writeSupport.add(key);
} }
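HoodieParquetWriter above exposes canWrite() so callers stop routing records to a file once it is expected to reach its size limit; the diff shows a STREAM_COMPRESSION_RATIO of 0.1 used to turn buffered bytes into an on-disk estimate. A hedged, JDK-only sketch of one way such a check can work; the real writer asks a wrapper FileSystem for the bytes written, and everything else here is illustrative.

// JDK-only sketch of the size-bounded writer check described above: since Parquet buffers and
// compresses in memory, the bytes seen on the stream are scaled by an assumed compression ratio
// to estimate the final file size, and writing stops once the estimate reaches the limit.
public class SizeBoundedWriterSketch {

  private static final double ASSUMED_COMPRESSION_RATIO = 0.1; // mirrors the constant above

  private final long maxFileSizeBytes;
  private long uncompressedBytesBuffered;

  public SizeBoundedWriterSketch(long maxFileSizeBytes) {
    this.maxFileSizeBytes = maxFileSizeBytes;
  }

  public void buffered(long recordBytes) {
    uncompressedBytesBuffered += recordBytes;
  }

  public boolean canWrite() {
    long estimatedOnDiskBytes = (long) (uncompressedBytesBuffered * ASSUMED_COMPRESSION_RATIO);
    return estimatedOnDiskBytes < maxFileSizeBytes;
  }

  public static void main(String[] args) {
    SizeBoundedWriterSketch writer = new SizeBoundedWriterSketch(10 * 1024 * 1024); // 10 MB target
    writer.buffered(50L * 1024 * 1024);    // ~5 MB estimated on disk -> still writable
    System.out.println(writer.canWrite()); // true
    writer.buffered(80L * 1024 * 1024);    // ~13 MB estimated on disk -> stop
    System.out.println(writer.canWrite()); // false
  }
}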

View File

@@ -17,13 +17,16 @@
package com.uber.hoodie.io.storage; package com.uber.hoodie.io.storage;
import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecord;
import java.io.IOException;
import org.apache.avro.generic.IndexedRecord; import org.apache.avro.generic.IndexedRecord;
import java.io.IOException;
public interface HoodieStorageWriter<R extends IndexedRecord> { public interface HoodieStorageWriter<R extends IndexedRecord> {
void writeAvroWithMetadata(R newRecord, HoodieRecord record) throws IOException; void writeAvroWithMetadata(R newRecord, HoodieRecord record) throws IOException;
boolean canWrite(); boolean canWrite();
void close() throws IOException; void close() throws IOException;
void writeAvro(String key, R oldRecord) throws IOException; void writeAvro(String key, R oldRecord) throws IOException;
} }

View File

@@ -16,24 +16,24 @@
package com.uber.hoodie.io.storage; package com.uber.hoodie.io.storage;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.avro.HoodieAvroWriteSupport; import com.uber.hoodie.avro.HoodieAvroWriteSupport;
import com.uber.hoodie.common.BloomFilter; import com.uber.hoodie.common.BloomFilter;
import com.uber.hoodie.common.model.HoodieRecordPayload; import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.table.HoodieTable; import com.uber.hoodie.table.HoodieTable;
import java.io.IOException;
import org.apache.avro.Schema; import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord; import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroSchemaConverter; import org.apache.parquet.avro.AvroSchemaConverter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import java.io.IOException;
public class HoodieStorageWriterFactory { public class HoodieStorageWriterFactory {
public static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieStorageWriter<R> getStorageWriter( public static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieStorageWriter<R> getStorageWriter(
String commitTime, Path path, HoodieTable<T> hoodieTable, HoodieWriteConfig config, Schema schema) String commitTime, Path path, HoodieTable<T> hoodieTable, HoodieWriteConfig config,
Schema schema)
throws IOException { throws IOException {
//TODO - based on the metadata choose the implementation of HoodieStorageWriter //TODO - based on the metadata choose the implementation of HoodieStorageWriter
// Currently only parquet is supported // Currently only parquet is supported
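
The TODO above says the writer implementation should eventually be chosen from table metadata; today only the parquet branch exists. A rough sketch of what that dispatch could look like, with the storageFormat parameter and the pickWriter name being assumptions rather than anything in this commit:

    // Hypothetical metadata-driven dispatch around the existing parquet-only factory method.
    static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieStorageWriter<R> pickWriter(
        String storageFormat, String commitTime, Path path, HoodieTable<T> hoodieTable,
        HoodieWriteConfig config, Schema schema) throws IOException {
      if ("parquet".equals(storageFormat)) {
        // delegate to the only branch that exists in this commit
        return getStorageWriter(commitTime, path, hoodieTable, config, schema);
      }
      throw new UnsupportedOperationException("Unsupported storage format: " + storageFormat);
    }
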

View File

@@ -16,17 +16,6 @@
package com.uber.hoodie.io.storage; package com.uber.hoodie.io.storage;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.fs.permission.AclEntry;
import org.apache.hadoop.fs.permission.AclStatus;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.Progressable;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.io.IOException; import java.io.IOException;
import java.net.URI; import java.net.URI;
@@ -38,13 +27,41 @@ import java.util.Map;
import java.util.Set; import java.util.Set;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap; import java.util.concurrent.ConcurrentMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.CreateFlag;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.fs.FileChecksum;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FsServerDefaults;
import org.apache.hadoop.fs.FsStatus;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Options;
import org.apache.hadoop.fs.ParentNotDirectoryException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.fs.UnsupportedFileSystemException;
import org.apache.hadoop.fs.XAttrSetFlag;
import org.apache.hadoop.fs.permission.AclEntry;
import org.apache.hadoop.fs.permission.AclStatus;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.Progressable;
/** /**
* HoodieWrapperFileSystem wraps the default file system. * HoodieWrapperFileSystem wraps the default file system. It holds state about the open streams in
* It holds state about the open streams in the file system to support getting the * the file system to support getting the written size to each of the open streams.
* written size to each of the open streams.
*/ */
public class HoodieWrapperFileSystem extends FileSystem { public class HoodieWrapperFileSystem extends FileSystem {
private static final Set<String> SUPPORT_SCHEMES; private static final Set<String> SUPPORT_SCHEMES;
public static final String HOODIE_SCHEME_PREFIX = "hoodie-"; public static final String HOODIE_SCHEME_PREFIX = "hoodie-";
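
HOODIE_SCHEME_PREFIX is what lets the wrapper register under a "hoodie-" scheme while delegating to the real file system; the convertToDefaultPath/convertToHoodiePath calls used throughout the methods below strip or add that prefix. Their bodies are not shown in this diff, so the following is only a sketch of the implied translation:

    // Sketch of the path translation implied by HOODIE_SCHEME_PREFIX (real bodies not shown here).
    private Path convertToDefaultPath(Path hoodiePath) {
      // "hoodie-hdfs://ns/table/file.parquet" -> "hdfs://ns/table/file.parquet"
      return new Path(hoodiePath.toString().replaceFirst(HOODIE_SCHEME_PREFIX, ""));
    }

    private Path convertToHoodiePath(Path defaultPath) {
      // "hdfs://ns/table/file.parquet" -> "hoodie-hdfs://ns/table/file.parquet"
      return new Path(HOODIE_SCHEME_PREFIX + defaultPath.toString());
    }
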
@@ -65,7 +82,8 @@ public class HoodieWrapperFileSystem extends FileSystem {
private FileSystem fileSystem; private FileSystem fileSystem;
private URI uri; private URI uri;
@Override public void initialize(URI uri, Configuration conf) throws IOException { @Override
public void initialize(URI uri, Configuration conf) throws IOException {
// Get the default filesystem to decorate // Get the default filesystem to decorate
fileSystem = FileSystem.get(conf); fileSystem = FileSystem.get(conf);
// Do not need to explicitly initialize the default filesystem, its done already in the above FileSystem.get // Do not need to explicitly initialize the default filesystem, its done already in the above FileSystem.get
@@ -74,15 +92,18 @@ public class HoodieWrapperFileSystem extends FileSystem {
this.uri = uri; this.uri = uri;
} }
@Override public URI getUri() { @Override
public URI getUri() {
return uri; return uri;
} }
@Override public FSDataInputStream open(Path f, int bufferSize) throws IOException { @Override
public FSDataInputStream open(Path f, int bufferSize) throws IOException {
return fileSystem.open(convertToDefaultPath(f), bufferSize); return fileSystem.open(convertToDefaultPath(f), bufferSize);
} }
@Override public FSDataOutputStream create(Path f, FsPermission permission, boolean overwrite, @Override
public FSDataOutputStream create(Path f, FsPermission permission, boolean overwrite,
int bufferSize, short replication, long blockSize, Progressable progress) int bufferSize, short replication, long blockSize, Progressable progress)
throws IOException { throws IOException {
final Path translatedPath = convertToDefaultPath(f); final Path translatedPath = convertToDefaultPath(f);
@@ -99,7 +120,8 @@ public class HoodieWrapperFileSystem extends FileSystem {
SizeAwareFSDataOutputStream os = SizeAwareFSDataOutputStream os =
new SizeAwareFSDataOutputStream(fsDataOutputStream, new Runnable() { new SizeAwareFSDataOutputStream(fsDataOutputStream, new Runnable() {
@Override public void run() { @Override
public void run() {
openStreams.remove(path.getName()); openStreams.remove(path.getName());
} }
}); });
@@ -107,33 +129,40 @@ public class HoodieWrapperFileSystem extends FileSystem {
return os; return os;
} }
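
The wrapping above is what makes the class comment's promise possible: every stream created through the wrapper is registered in openStreams and de-registered by the close callback. The getBytesWritten(...) lookup that HoodieParquetWriter.canWrite() calls earlier in this diff would then be roughly the following; the method body and the getBytesWritten() accessor on the size-aware stream are assumptions, only the map and the wrapper stream come from this commit.

    // Sketch of the per-file size lookup backing canWrite() in HoodieParquetWriter.
    public long getBytesWritten(Path file) {
      SizeAwareFSDataOutputStream stream = openStreams.get(file.getName());
      if (stream == null) {
        throw new IllegalArgumentException(file + " is not an open stream on this wrapper file system");
      }
      return stream.getBytesWritten();   // running total maintained by the size-aware stream
    }
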
@Override public FSDataOutputStream create(Path f, boolean overwrite) throws IOException { @Override
public FSDataOutputStream create(Path f, boolean overwrite) throws IOException {
return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f), overwrite)); return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f), overwrite));
} }
@Override public FSDataOutputStream create(Path f) throws IOException { @Override
public FSDataOutputStream create(Path f) throws IOException {
return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f))); return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f)));
} }
@Override public FSDataOutputStream create(Path f, Progressable progress) throws IOException { @Override
public FSDataOutputStream create(Path f, Progressable progress) throws IOException {
return fileSystem.create(convertToDefaultPath(f), progress); return fileSystem.create(convertToDefaultPath(f), progress);
} }
@Override public FSDataOutputStream create(Path f, short replication) throws IOException { @Override
public FSDataOutputStream create(Path f, short replication) throws IOException {
return fileSystem.create(convertToDefaultPath(f), replication); return fileSystem.create(convertToDefaultPath(f), replication);
} }
@Override public FSDataOutputStream create(Path f, short replication, Progressable progress) @Override
public FSDataOutputStream create(Path f, short replication, Progressable progress)
throws IOException { throws IOException {
return fileSystem.create(convertToDefaultPath(f), replication, progress); return fileSystem.create(convertToDefaultPath(f), replication, progress);
} }
@Override public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize) @Override
public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize)
throws IOException { throws IOException {
return fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize); return fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize);
} }
@Override public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, @Override
public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize,
Progressable progress) throws IOException { Progressable progress) throws IOException {
return fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize, progress); return fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize, progress);
} }
@@ -173,91 +202,112 @@ public class HoodieWrapperFileSystem extends FileSystem {
} }
@Override public FSDataOutputStream append(Path f, int bufferSize, Progressable progress) @Override
public FSDataOutputStream append(Path f, int bufferSize, Progressable progress)
throws IOException { throws IOException {
return fileSystem.append(convertToDefaultPath(f), bufferSize, progress); return fileSystem.append(convertToDefaultPath(f), bufferSize, progress);
} }
@Override public boolean rename(Path src, Path dst) throws IOException { @Override
public boolean rename(Path src, Path dst) throws IOException {
return fileSystem.rename(convertToDefaultPath(src), convertToDefaultPath(dst)); return fileSystem.rename(convertToDefaultPath(src), convertToDefaultPath(dst));
} }
@Override public boolean delete(Path f, boolean recursive) throws IOException { @Override
public boolean delete(Path f, boolean recursive) throws IOException {
return fileSystem.delete(convertToDefaultPath(f), recursive); return fileSystem.delete(convertToDefaultPath(f), recursive);
} }
@Override public FileStatus[] listStatus(Path f) throws FileNotFoundException, IOException { @Override
public FileStatus[] listStatus(Path f) throws FileNotFoundException, IOException {
return fileSystem.listStatus(convertToDefaultPath(f)); return fileSystem.listStatus(convertToDefaultPath(f));
} }
@Override public void setWorkingDirectory(Path new_dir) { @Override
public void setWorkingDirectory(Path new_dir) {
fileSystem.setWorkingDirectory(convertToDefaultPath(new_dir)); fileSystem.setWorkingDirectory(convertToDefaultPath(new_dir));
} }
@Override public Path getWorkingDirectory() { @Override
public Path getWorkingDirectory() {
return convertToHoodiePath(fileSystem.getWorkingDirectory()); return convertToHoodiePath(fileSystem.getWorkingDirectory());
} }
@Override public boolean mkdirs(Path f, FsPermission permission) throws IOException { @Override
public boolean mkdirs(Path f, FsPermission permission) throws IOException {
return fileSystem.mkdirs(convertToDefaultPath(f), permission); return fileSystem.mkdirs(convertToDefaultPath(f), permission);
} }
@Override public FileStatus getFileStatus(Path f) throws IOException { @Override
public FileStatus getFileStatus(Path f) throws IOException {
return fileSystem.getFileStatus(convertToDefaultPath(f)); return fileSystem.getFileStatus(convertToDefaultPath(f));
} }
@Override public String getScheme() { @Override
public String getScheme() {
return uri.getScheme(); return uri.getScheme();
} }
@Override public String getCanonicalServiceName() { @Override
public String getCanonicalServiceName() {
return fileSystem.getCanonicalServiceName(); return fileSystem.getCanonicalServiceName();
} }
@Override public String getName() { @Override
public String getName() {
return fileSystem.getName(); return fileSystem.getName();
} }
@Override public Path makeQualified(Path path) { @Override
public Path makeQualified(Path path) {
return convertToHoodiePath(fileSystem.makeQualified(convertToDefaultPath(path))); return convertToHoodiePath(fileSystem.makeQualified(convertToDefaultPath(path)));
} }
@Override public Token<?> getDelegationToken(String renewer) throws IOException { @Override
public Token<?> getDelegationToken(String renewer) throws IOException {
return fileSystem.getDelegationToken(renewer); return fileSystem.getDelegationToken(renewer);
} }
@Override public Token<?>[] addDelegationTokens(String renewer, Credentials credentials) @Override
public Token<?>[] addDelegationTokens(String renewer, Credentials credentials)
throws IOException { throws IOException {
return fileSystem.addDelegationTokens(renewer, credentials); return fileSystem.addDelegationTokens(renewer, credentials);
} }
@Override public FileSystem[] getChildFileSystems() { @Override
public FileSystem[] getChildFileSystems() {
return fileSystem.getChildFileSystems(); return fileSystem.getChildFileSystems();
} }
@Override public BlockLocation[] getFileBlockLocations(FileStatus file, long start, long len) @Override
public BlockLocation[] getFileBlockLocations(FileStatus file, long start, long len)
throws IOException { throws IOException {
return fileSystem.getFileBlockLocations(file, start, len); return fileSystem.getFileBlockLocations(file, start, len);
} }
@Override public BlockLocation[] getFileBlockLocations(Path p, long start, long len) @Override
public BlockLocation[] getFileBlockLocations(Path p, long start, long len)
throws IOException { throws IOException {
return fileSystem.getFileBlockLocations(convertToDefaultPath(p), start, len); return fileSystem.getFileBlockLocations(convertToDefaultPath(p), start, len);
} }
@Override public FsServerDefaults getServerDefaults() throws IOException { @Override
public FsServerDefaults getServerDefaults() throws IOException {
return fileSystem.getServerDefaults(); return fileSystem.getServerDefaults();
} }
@Override public FsServerDefaults getServerDefaults(Path p) throws IOException { @Override
public FsServerDefaults getServerDefaults(Path p) throws IOException {
return fileSystem.getServerDefaults(convertToDefaultPath(p)); return fileSystem.getServerDefaults(convertToDefaultPath(p));
} }
@Override public Path resolvePath(Path p) throws IOException { @Override
public Path resolvePath(Path p) throws IOException {
return convertToHoodiePath(fileSystem.resolvePath(convertToDefaultPath(p))); return convertToHoodiePath(fileSystem.resolvePath(convertToDefaultPath(p)));
} }
@Override public FSDataInputStream open(Path f) throws IOException { @Override
public FSDataInputStream open(Path f) throws IOException {
return fileSystem.open(convertToDefaultPath(f)); return fileSystem.open(convertToDefaultPath(f));
} }
@@ -278,7 +328,8 @@ public class HoodieWrapperFileSystem extends FileSystem {
replication, blockSize, progress); replication, blockSize, progress);
} }
@Override public FSDataOutputStream createNonRecursive(Path f, FsPermission permission, @Override
public FSDataOutputStream createNonRecursive(Path f, FsPermission permission,
EnumSet<CreateFlag> flags, int bufferSize, short replication, long blockSize, EnumSet<CreateFlag> flags, int bufferSize, short replication, long blockSize,
Progressable progress) throws IOException { Progressable progress) throws IOException {
return fileSystem return fileSystem
@@ -286,122 +337,150 @@ public class HoodieWrapperFileSystem extends FileSystem {
blockSize, progress); blockSize, progress);
} }
@Override public boolean createNewFile(Path f) throws IOException { @Override
public boolean createNewFile(Path f) throws IOException {
return fileSystem.createNewFile(convertToDefaultPath(f)); return fileSystem.createNewFile(convertToDefaultPath(f));
} }
@Override public FSDataOutputStream append(Path f) throws IOException { @Override
public FSDataOutputStream append(Path f) throws IOException {
return fileSystem.append(convertToDefaultPath(f)); return fileSystem.append(convertToDefaultPath(f));
} }
@Override public FSDataOutputStream append(Path f, int bufferSize) throws IOException { @Override
public FSDataOutputStream append(Path f, int bufferSize) throws IOException {
return fileSystem.append(convertToDefaultPath(f), bufferSize); return fileSystem.append(convertToDefaultPath(f), bufferSize);
} }
@Override public void concat(Path trg, Path[] psrcs) throws IOException { @Override
public void concat(Path trg, Path[] psrcs) throws IOException {
Path[] psrcsNew = convertDefaults(psrcs); Path[] psrcsNew = convertDefaults(psrcs);
fileSystem.concat(convertToDefaultPath(trg), psrcsNew); fileSystem.concat(convertToDefaultPath(trg), psrcsNew);
} }
@Override public short getReplication(Path src) throws IOException { @Override
public short getReplication(Path src) throws IOException {
return fileSystem.getReplication(convertToDefaultPath(src)); return fileSystem.getReplication(convertToDefaultPath(src));
} }
@Override public boolean setReplication(Path src, short replication) throws IOException { @Override
public boolean setReplication(Path src, short replication) throws IOException {
return fileSystem.setReplication(convertToDefaultPath(src), replication); return fileSystem.setReplication(convertToDefaultPath(src), replication);
} }
@Override public boolean delete(Path f) throws IOException { @Override
public boolean delete(Path f) throws IOException {
return fileSystem.delete(convertToDefaultPath(f)); return fileSystem.delete(convertToDefaultPath(f));
} }
@Override public boolean deleteOnExit(Path f) throws IOException { @Override
public boolean deleteOnExit(Path f) throws IOException {
return fileSystem.deleteOnExit(convertToDefaultPath(f)); return fileSystem.deleteOnExit(convertToDefaultPath(f));
} }
@Override public boolean cancelDeleteOnExit(Path f) { @Override
public boolean cancelDeleteOnExit(Path f) {
return fileSystem.cancelDeleteOnExit(convertToDefaultPath(f)); return fileSystem.cancelDeleteOnExit(convertToDefaultPath(f));
} }
@Override public boolean exists(Path f) throws IOException { @Override
public boolean exists(Path f) throws IOException {
return fileSystem.exists(convertToDefaultPath(f)); return fileSystem.exists(convertToDefaultPath(f));
} }
@Override public boolean isDirectory(Path f) throws IOException { @Override
public boolean isDirectory(Path f) throws IOException {
return fileSystem.isDirectory(convertToDefaultPath(f)); return fileSystem.isDirectory(convertToDefaultPath(f));
} }
@Override public boolean isFile(Path f) throws IOException { @Override
public boolean isFile(Path f) throws IOException {
return fileSystem.isFile(convertToDefaultPath(f)); return fileSystem.isFile(convertToDefaultPath(f));
} }
@Override public long getLength(Path f) throws IOException { @Override
public long getLength(Path f) throws IOException {
return fileSystem.getLength(convertToDefaultPath(f)); return fileSystem.getLength(convertToDefaultPath(f));
} }
@Override public ContentSummary getContentSummary(Path f) throws IOException { @Override
public ContentSummary getContentSummary(Path f) throws IOException {
return fileSystem.getContentSummary(convertToDefaultPath(f)); return fileSystem.getContentSummary(convertToDefaultPath(f));
} }
@Override public RemoteIterator<Path> listCorruptFileBlocks(Path path) throws IOException { @Override
public RemoteIterator<Path> listCorruptFileBlocks(Path path) throws IOException {
return fileSystem.listCorruptFileBlocks(convertToDefaultPath(path)); return fileSystem.listCorruptFileBlocks(convertToDefaultPath(path));
} }
@Override public FileStatus[] listStatus(Path f, PathFilter filter) @Override
public FileStatus[] listStatus(Path f, PathFilter filter)
throws FileNotFoundException, IOException { throws FileNotFoundException, IOException {
return fileSystem.listStatus(convertToDefaultPath(f), filter); return fileSystem.listStatus(convertToDefaultPath(f), filter);
} }
@Override public FileStatus[] listStatus(Path[] files) @Override
public FileStatus[] listStatus(Path[] files)
throws FileNotFoundException, IOException { throws FileNotFoundException, IOException {
return fileSystem.listStatus(convertDefaults(files)); return fileSystem.listStatus(convertDefaults(files));
} }
@Override public FileStatus[] listStatus(Path[] files, PathFilter filter) @Override
public FileStatus[] listStatus(Path[] files, PathFilter filter)
throws FileNotFoundException, IOException { throws FileNotFoundException, IOException {
return fileSystem.listStatus(convertDefaults(files), filter); return fileSystem.listStatus(convertDefaults(files), filter);
} }
@Override public FileStatus[] globStatus(Path pathPattern) throws IOException { @Override
public FileStatus[] globStatus(Path pathPattern) throws IOException {
return fileSystem.globStatus(convertToDefaultPath(pathPattern)); return fileSystem.globStatus(convertToDefaultPath(pathPattern));
} }
@Override public FileStatus[] globStatus(Path pathPattern, PathFilter filter) @Override
public FileStatus[] globStatus(Path pathPattern, PathFilter filter)
throws IOException { throws IOException {
return fileSystem.globStatus(convertToDefaultPath(pathPattern), filter); return fileSystem.globStatus(convertToDefaultPath(pathPattern), filter);
} }
@Override public RemoteIterator<LocatedFileStatus> listLocatedStatus(Path f) @Override
public RemoteIterator<LocatedFileStatus> listLocatedStatus(Path f)
throws FileNotFoundException, IOException { throws FileNotFoundException, IOException {
return fileSystem.listLocatedStatus(convertToDefaultPath(f)); return fileSystem.listLocatedStatus(convertToDefaultPath(f));
} }
@Override public RemoteIterator<LocatedFileStatus> listFiles(Path f, boolean recursive) @Override
public RemoteIterator<LocatedFileStatus> listFiles(Path f, boolean recursive)
throws FileNotFoundException, IOException { throws FileNotFoundException, IOException {
return fileSystem.listFiles(convertToDefaultPath(f), recursive); return fileSystem.listFiles(convertToDefaultPath(f), recursive);
} }
@Override public Path getHomeDirectory() { @Override
public Path getHomeDirectory() {
return convertToHoodiePath(fileSystem.getHomeDirectory()); return convertToHoodiePath(fileSystem.getHomeDirectory());
} }
@Override public boolean mkdirs(Path f) throws IOException { @Override
public boolean mkdirs(Path f) throws IOException {
return fileSystem.mkdirs(convertToDefaultPath(f)); return fileSystem.mkdirs(convertToDefaultPath(f));
} }
@Override public void copyFromLocalFile(Path src, Path dst) throws IOException { @Override
public void copyFromLocalFile(Path src, Path dst) throws IOException {
fileSystem.copyFromLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst)); fileSystem.copyFromLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst));
} }
@Override public void moveFromLocalFile(Path[] srcs, Path dst) throws IOException { @Override
public void moveFromLocalFile(Path[] srcs, Path dst) throws IOException {
fileSystem.moveFromLocalFile(convertDefaults(srcs), convertToDefaultPath(dst)); fileSystem.moveFromLocalFile(convertDefaults(srcs), convertToDefaultPath(dst));
} }
@Override public void moveFromLocalFile(Path src, Path dst) throws IOException { @Override
public void moveFromLocalFile(Path src, Path dst) throws IOException {
fileSystem.moveFromLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst)); fileSystem.moveFromLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst));
} }
@Override public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws IOException { @Override
public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws IOException {
fileSystem.copyFromLocalFile(delSrc, convertToDefaultPath(src), convertToDefaultPath(dst)); fileSystem.copyFromLocalFile(delSrc, convertToDefaultPath(src), convertToDefaultPath(dst));
} }
@@ -412,21 +491,25 @@ public class HoodieWrapperFileSystem extends FileSystem {
.copyFromLocalFile(delSrc, overwrite, convertDefaults(srcs), convertToDefaultPath(dst)); .copyFromLocalFile(delSrc, overwrite, convertDefaults(srcs), convertToDefaultPath(dst));
} }
@Override public void copyFromLocalFile(boolean delSrc, boolean overwrite, Path src, Path dst) @Override
public void copyFromLocalFile(boolean delSrc, boolean overwrite, Path src, Path dst)
throws IOException { throws IOException {
fileSystem.copyFromLocalFile(delSrc, overwrite, convertToDefaultPath(src), fileSystem.copyFromLocalFile(delSrc, overwrite, convertToDefaultPath(src),
convertToDefaultPath(dst)); convertToDefaultPath(dst));
} }
@Override public void copyToLocalFile(Path src, Path dst) throws IOException { @Override
public void copyToLocalFile(Path src, Path dst) throws IOException {
fileSystem.copyToLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst)); fileSystem.copyToLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst));
} }
@Override public void moveToLocalFile(Path src, Path dst) throws IOException { @Override
public void moveToLocalFile(Path src, Path dst) throws IOException {
fileSystem.moveToLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst)); fileSystem.moveToLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst));
} }
@Override public void copyToLocalFile(boolean delSrc, Path src, Path dst) throws IOException { @Override
public void copyToLocalFile(boolean delSrc, Path src, Path dst) throws IOException {
fileSystem.copyToLocalFile(delSrc, convertToDefaultPath(src), convertToDefaultPath(dst)); fileSystem.copyToLocalFile(delSrc, convertToDefaultPath(src), convertToDefaultPath(dst));
} }
@@ -437,193 +520,237 @@ public class HoodieWrapperFileSystem extends FileSystem {
useRawLocalFileSystem); useRawLocalFileSystem);
} }
@Override public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile) @Override
public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile)
throws IOException { throws IOException {
return convertToHoodiePath(fileSystem.startLocalOutput(convertToDefaultPath(fsOutputFile), return convertToHoodiePath(fileSystem.startLocalOutput(convertToDefaultPath(fsOutputFile),
convertToDefaultPath(tmpLocalFile))); convertToDefaultPath(tmpLocalFile)));
} }
@Override public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile) @Override
public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile)
throws IOException { throws IOException {
fileSystem.completeLocalOutput(convertToDefaultPath(fsOutputFile), fileSystem.completeLocalOutput(convertToDefaultPath(fsOutputFile),
convertToDefaultPath(tmpLocalFile)); convertToDefaultPath(tmpLocalFile));
} }
@Override public void close() throws IOException { @Override
public void close() throws IOException {
fileSystem.close(); fileSystem.close();
} }
@Override public long getUsed() throws IOException { @Override
public long getUsed() throws IOException {
return fileSystem.getUsed(); return fileSystem.getUsed();
} }
@Override public long getBlockSize(Path f) throws IOException { @Override
public long getBlockSize(Path f) throws IOException {
return fileSystem.getBlockSize(convertToDefaultPath(f)); return fileSystem.getBlockSize(convertToDefaultPath(f));
} }
@Override public long getDefaultBlockSize() { @Override
public long getDefaultBlockSize() {
return fileSystem.getDefaultBlockSize(); return fileSystem.getDefaultBlockSize();
} }
@Override public long getDefaultBlockSize(Path f) { @Override
public long getDefaultBlockSize(Path f) {
return fileSystem.getDefaultBlockSize(convertToDefaultPath(f)); return fileSystem.getDefaultBlockSize(convertToDefaultPath(f));
} }
@Override public short getDefaultReplication() { @Override
public short getDefaultReplication() {
return fileSystem.getDefaultReplication(); return fileSystem.getDefaultReplication();
} }
@Override public short getDefaultReplication(Path path) { @Override
public short getDefaultReplication(Path path) {
return fileSystem.getDefaultReplication(convertToDefaultPath(path)); return fileSystem.getDefaultReplication(convertToDefaultPath(path));
} }
@Override public void access(Path path, FsAction mode) @Override
public void access(Path path, FsAction mode)
throws AccessControlException, FileNotFoundException, IOException { throws AccessControlException, FileNotFoundException, IOException {
fileSystem.access(convertToDefaultPath(path), mode); fileSystem.access(convertToDefaultPath(path), mode);
} }
@Override public void createSymlink(Path target, Path link, boolean createParent) @Override
public void createSymlink(Path target, Path link, boolean createParent)
throws AccessControlException, FileAlreadyExistsException, FileNotFoundException, throws AccessControlException, FileAlreadyExistsException, FileNotFoundException,
ParentNotDirectoryException, UnsupportedFileSystemException, IOException { ParentNotDirectoryException, UnsupportedFileSystemException, IOException {
fileSystem fileSystem
.createSymlink(convertToDefaultPath(target), convertToDefaultPath(link), createParent); .createSymlink(convertToDefaultPath(target), convertToDefaultPath(link), createParent);
} }
@Override public FileStatus getFileLinkStatus(Path f) @Override
public FileStatus getFileLinkStatus(Path f)
throws AccessControlException, FileNotFoundException, UnsupportedFileSystemException, throws AccessControlException, FileNotFoundException, UnsupportedFileSystemException,
IOException { IOException {
return fileSystem.getFileLinkStatus(convertToDefaultPath(f)); return fileSystem.getFileLinkStatus(convertToDefaultPath(f));
} }
@Override public boolean supportsSymlinks() { @Override
public boolean supportsSymlinks() {
return fileSystem.supportsSymlinks(); return fileSystem.supportsSymlinks();
} }
@Override public Path getLinkTarget(Path f) throws IOException { @Override
public Path getLinkTarget(Path f) throws IOException {
return convertToHoodiePath(fileSystem.getLinkTarget(convertToDefaultPath(f))); return convertToHoodiePath(fileSystem.getLinkTarget(convertToDefaultPath(f)));
} }
@Override public FileChecksum getFileChecksum(Path f) throws IOException { @Override
public FileChecksum getFileChecksum(Path f) throws IOException {
return fileSystem.getFileChecksum(convertToDefaultPath(f)); return fileSystem.getFileChecksum(convertToDefaultPath(f));
} }
@Override public FileChecksum getFileChecksum(Path f, long length) throws IOException { @Override
public FileChecksum getFileChecksum(Path f, long length) throws IOException {
return fileSystem.getFileChecksum(convertToDefaultPath(f), length); return fileSystem.getFileChecksum(convertToDefaultPath(f), length);
} }
@Override public void setVerifyChecksum(boolean verifyChecksum) { @Override
public void setVerifyChecksum(boolean verifyChecksum) {
fileSystem.setVerifyChecksum(verifyChecksum); fileSystem.setVerifyChecksum(verifyChecksum);
} }
@Override public void setWriteChecksum(boolean writeChecksum) { @Override
public void setWriteChecksum(boolean writeChecksum) {
fileSystem.setWriteChecksum(writeChecksum); fileSystem.setWriteChecksum(writeChecksum);
} }
@Override public FsStatus getStatus() throws IOException { @Override
public FsStatus getStatus() throws IOException {
return fileSystem.getStatus(); return fileSystem.getStatus();
} }
@Override public FsStatus getStatus(Path p) throws IOException { @Override
public FsStatus getStatus(Path p) throws IOException {
return fileSystem.getStatus(convertToDefaultPath(p)); return fileSystem.getStatus(convertToDefaultPath(p));
} }
@Override public void setPermission(Path p, FsPermission permission) throws IOException { @Override
public void setPermission(Path p, FsPermission permission) throws IOException {
fileSystem.setPermission(convertToDefaultPath(p), permission); fileSystem.setPermission(convertToDefaultPath(p), permission);
} }
@Override public void setOwner(Path p, String username, String groupname) throws IOException { @Override
public void setOwner(Path p, String username, String groupname) throws IOException {
fileSystem.setOwner(convertToDefaultPath(p), username, groupname); fileSystem.setOwner(convertToDefaultPath(p), username, groupname);
} }
@Override public void setTimes(Path p, long mtime, long atime) throws IOException { @Override
public void setTimes(Path p, long mtime, long atime) throws IOException {
fileSystem.setTimes(convertToDefaultPath(p), mtime, atime); fileSystem.setTimes(convertToDefaultPath(p), mtime, atime);
} }
@Override public Path createSnapshot(Path path, String snapshotName) throws IOException { @Override
public Path createSnapshot(Path path, String snapshotName) throws IOException {
return convertToHoodiePath( return convertToHoodiePath(
fileSystem.createSnapshot(convertToDefaultPath(path), snapshotName)); fileSystem.createSnapshot(convertToDefaultPath(path), snapshotName));
} }
@Override public void renameSnapshot(Path path, String snapshotOldName, String snapshotNewName) @Override
public void renameSnapshot(Path path, String snapshotOldName, String snapshotNewName)
throws IOException { throws IOException {
fileSystem.renameSnapshot(convertToDefaultPath(path), snapshotOldName, snapshotNewName); fileSystem.renameSnapshot(convertToDefaultPath(path), snapshotOldName, snapshotNewName);
} }
@Override public void deleteSnapshot(Path path, String snapshotName) throws IOException { @Override
public void deleteSnapshot(Path path, String snapshotName) throws IOException {
fileSystem.deleteSnapshot(convertToDefaultPath(path), snapshotName); fileSystem.deleteSnapshot(convertToDefaultPath(path), snapshotName);
} }
@Override public void modifyAclEntries(Path path, List<AclEntry> aclSpec) throws IOException { @Override
public void modifyAclEntries(Path path, List<AclEntry> aclSpec) throws IOException {
fileSystem.modifyAclEntries(convertToDefaultPath(path), aclSpec); fileSystem.modifyAclEntries(convertToDefaultPath(path), aclSpec);
} }
@Override public void removeAclEntries(Path path, List<AclEntry> aclSpec) throws IOException { @Override
public void removeAclEntries(Path path, List<AclEntry> aclSpec) throws IOException {
fileSystem.removeAclEntries(convertToDefaultPath(path), aclSpec); fileSystem.removeAclEntries(convertToDefaultPath(path), aclSpec);
} }
@Override public void removeDefaultAcl(Path path) throws IOException { @Override
public void removeDefaultAcl(Path path) throws IOException {
fileSystem.removeDefaultAcl(convertToDefaultPath(path)); fileSystem.removeDefaultAcl(convertToDefaultPath(path));
} }
@Override public void removeAcl(Path path) throws IOException { @Override
public void removeAcl(Path path) throws IOException {
fileSystem.removeAcl(convertToDefaultPath(path)); fileSystem.removeAcl(convertToDefaultPath(path));
} }
@Override public void setAcl(Path path, List<AclEntry> aclSpec) throws IOException { @Override
public void setAcl(Path path, List<AclEntry> aclSpec) throws IOException {
fileSystem.setAcl(convertToDefaultPath(path), aclSpec); fileSystem.setAcl(convertToDefaultPath(path), aclSpec);
} }
@Override public AclStatus getAclStatus(Path path) throws IOException { @Override
public AclStatus getAclStatus(Path path) throws IOException {
return fileSystem.getAclStatus(convertToDefaultPath(path)); return fileSystem.getAclStatus(convertToDefaultPath(path));
} }
@Override public void setXAttr(Path path, String name, byte[] value) throws IOException { @Override
public void setXAttr(Path path, String name, byte[] value) throws IOException {
fileSystem.setXAttr(convertToDefaultPath(path), name, value); fileSystem.setXAttr(convertToDefaultPath(path), name, value);
} }
@Override public void setXAttr(Path path, String name, byte[] value, EnumSet<XAttrSetFlag> flag) @Override
public void setXAttr(Path path, String name, byte[] value, EnumSet<XAttrSetFlag> flag)
throws IOException { throws IOException {
fileSystem.setXAttr(convertToDefaultPath(path), name, value, flag); fileSystem.setXAttr(convertToDefaultPath(path), name, value, flag);
} }
@Override public byte[] getXAttr(Path path, String name) throws IOException { @Override
public byte[] getXAttr(Path path, String name) throws IOException {
return fileSystem.getXAttr(convertToDefaultPath(path), name); return fileSystem.getXAttr(convertToDefaultPath(path), name);
} }
@Override public Map<String, byte[]> getXAttrs(Path path) throws IOException { @Override
public Map<String, byte[]> getXAttrs(Path path) throws IOException {
return fileSystem.getXAttrs(convertToDefaultPath(path)); return fileSystem.getXAttrs(convertToDefaultPath(path));
} }
@Override public Map<String, byte[]> getXAttrs(Path path, List<String> names) @Override
public Map<String, byte[]> getXAttrs(Path path, List<String> names)
throws IOException { throws IOException {
return fileSystem.getXAttrs(convertToDefaultPath(path), names); return fileSystem.getXAttrs(convertToDefaultPath(path), names);
} }
@Override public List<String> listXAttrs(Path path) throws IOException { @Override
public List<String> listXAttrs(Path path) throws IOException {
return fileSystem.listXAttrs(convertToDefaultPath(path)); return fileSystem.listXAttrs(convertToDefaultPath(path));
} }
@Override public void removeXAttr(Path path, String name) throws IOException { @Override
public void removeXAttr(Path path, String name) throws IOException {
fileSystem.removeXAttr(convertToDefaultPath(path), name); fileSystem.removeXAttr(convertToDefaultPath(path), name);
} }
@Override public void setConf(Configuration conf) { @Override
public void setConf(Configuration conf) {
// ignore this. we will set conf on init // ignore this. we will set conf on init
} }
@Override public Configuration getConf() { @Override
public Configuration getConf() {
return fileSystem.getConf(); return fileSystem.getConf();
} }
@Override public int hashCode() { @Override
public int hashCode() {
return fileSystem.hashCode(); return fileSystem.hashCode();
} }
@Override public boolean equals(Object obj) { @Override
public boolean equals(Object obj) {
return fileSystem.equals(obj); return fileSystem.equals(obj);
} }
@Override public String toString() { @Override
public String toString() {
return fileSystem.toString(); return fileSystem.toString();
} }

View File

@@ -16,16 +16,16 @@
package com.uber.hoodie.io.storage; package com.uber.hoodie.io.storage;
import org.apache.hadoop.fs.FSDataOutputStream;
import java.io.IOException; import java.io.IOException;
import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicLong;
import org.apache.hadoop.fs.FSDataOutputStream;
/** /**
* Wrapper over <code>FSDataOutputStream</code> to keep track of the size of the written bytes. * Wrapper over <code>FSDataOutputStream</code> to keep track of the size of the written bytes. This
* This gives a cheap way to check on the underlying file size. * gives a cheap way to check on the underlying file size.
*/ */
public class SizeAwareFSDataOutputStream extends FSDataOutputStream { public class SizeAwareFSDataOutputStream extends FSDataOutputStream {
// A callback to call when the output stream is closed. // A callback to call when the output stream is closed.
private final Runnable closeCallback; private final Runnable closeCallback;
// Keep track of the bytes written // Keep track of the bytes written
@@ -37,17 +37,20 @@ public class SizeAwareFSDataOutputStream extends FSDataOutputStream {
this.closeCallback = closeCallback; this.closeCallback = closeCallback;
} }
@Override public synchronized void write(byte[] b, int off, int len) throws IOException { @Override
public synchronized void write(byte[] b, int off, int len) throws IOException {
bytesWritten.addAndGet(len); bytesWritten.addAndGet(len);
super.write(b, off, len); super.write(b, off, len);
} }
@Override public void write(byte[] b) throws IOException { @Override
public void write(byte[] b) throws IOException {
bytesWritten.addAndGet(b.length); bytesWritten.addAndGet(b.length);
super.write(b); super.write(b);
} }
@Override public void close() throws IOException { @Override
public void close() throws IOException {
super.close(); super.close();
closeCallback.run(); closeCallback.run();
} }
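
A small usage sketch of the class above, assuming a getBytesWritten() accessor over the AtomicLong counter (the accessor is not shown in this hunk) and an enclosing context that handles IOException:

    // Wrap a raw stream, write, read back the running byte count, then close (running the callback).
    FSDataOutputStream raw = fs.create(new Path("/tmp/size-aware-demo"));
    SizeAwareFSDataOutputStream sized =
        new SizeAwareFSDataOutputStream(raw, () -> System.out.println("stream closed"));
    sized.write(new byte[] {1, 2, 3, 4, 5});   // bytesWritten grows by 5
    long written = sized.getBytesWritten();    // assumed accessor over the AtomicLong counter
    sized.close();                             // runs the close callback exactly once
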

View File

@@ -22,7 +22,6 @@ import com.codahale.metrics.Timer;
import com.google.common.annotations.VisibleForTesting; import com.google.common.annotations.VisibleForTesting;
import com.uber.hoodie.common.model.HoodieCommitMetadata; import com.uber.hoodie.common.model.HoodieCommitMetadata;
import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.config.HoodieWriteConfig;
import org.apache.log4j.LogManager; import org.apache.log4j.LogManager;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
@@ -30,6 +29,7 @@ import org.apache.log4j.Logger;
* Wrapper for metrics-related operations. * Wrapper for metrics-related operations.
*/ */
public class HoodieMetrics { public class HoodieMetrics {
private HoodieWriteConfig config = null; private HoodieWriteConfig config = null;
private String tableName = null; private String tableName = null;
private static Logger logger = LogManager.getLogger(HoodieMetrics.class); private static Logger logger = LogManager.getLogger(HoodieMetrics.class);
@@ -77,7 +77,8 @@ public class HoodieMetrics {
return commitTimer == null ? null : commitTimer.time(); return commitTimer == null ? null : commitTimer.time();
} }
public void updateCommitMetrics(long commitEpochTimeInMs, long durationInMs, HoodieCommitMetadata metadata) { public void updateCommitMetrics(long commitEpochTimeInMs, long durationInMs,
HoodieCommitMetadata metadata) {
if (config.isMetricsOn()) { if (config.isMetricsOn()) {
long totalPartitionsWritten = metadata.fetchTotalPartitionsWritten(); long totalPartitionsWritten = metadata.fetchTotalPartitionsWritten();
long totalFilesInsert = metadata.fetchTotalFilesInsert(); long totalFilesInsert = metadata.fetchTotalFilesInsert();
@@ -91,8 +92,10 @@ public class HoodieMetrics {
registerGauge(getMetricsName("commit", "totalFilesInsert"), totalFilesInsert); registerGauge(getMetricsName("commit", "totalFilesInsert"), totalFilesInsert);
registerGauge(getMetricsName("commit", "totalFilesUpdate"), totalFilesUpdate); registerGauge(getMetricsName("commit", "totalFilesUpdate"), totalFilesUpdate);
registerGauge(getMetricsName("commit", "totalRecordsWritten"), totalRecordsWritten); registerGauge(getMetricsName("commit", "totalRecordsWritten"), totalRecordsWritten);
registerGauge(getMetricsName("commit", "totalUpdateRecordsWritten"), totalUpdateRecordsWritten); registerGauge(getMetricsName("commit", "totalUpdateRecordsWritten"),
registerGauge(getMetricsName("commit", "totalInsertRecordsWritten"), totalInsertRecordsWritten); totalUpdateRecordsWritten);
registerGauge(getMetricsName("commit", "totalInsertRecordsWritten"),
totalInsertRecordsWritten);
registerGauge(getMetricsName("commit", "totalBytesWritten"), totalBytesWritten); registerGauge(getMetricsName("commit", "totalBytesWritten"), totalBytesWritten);
registerGauge(getMetricsName("commit", "commitTime"), commitEpochTimeInMs); registerGauge(getMetricsName("commit", "commitTime"), commitEpochTimeInMs);
} }
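
registerGauge(name, value) is used for every figure above. With the Dropwizard metrics library this commit depends on, such a helper typically publishes a constant gauge on the shared registry; the accessors Metrics.getInstance().getRegistry() and the com.codahale.metrics.Gauge import are assumptions in this sketch.

    // Typical Dropwizard-style shape of the registerGauge(...) helper used above (illustrative only).
    void registerGauge(String metricName, final long value) {
      try {
        MetricRegistry registry = Metrics.getInstance().getRegistry();   // assumed accessors
        registry.register(metricName, (Gauge<Long>) () -> value);       // constant gauge for this commit
      } catch (IllegalArgumentException e) {
        // the registry rejects duplicate names; repeated commits would need remove-then-register
      }
    }
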
@@ -139,8 +142,7 @@ public class HoodieMetrics {
} }
/** /**
* By default, the timer context returns duration with nano seconds. * By default, the timer context returns duration with nano seconds. Convert it to millisecond.
* Convert it to millisecond.
*/ */
public long getDurationInMs(long ctxDuration) { public long getDurationInMs(long ctxDuration) {
return ctxDuration / 1000000; return ctxDuration / 1000000;

View File

@@ -22,6 +22,7 @@ import java.io.Closeable;
* Used for testing. * Used for testing.
*/ */
public class InMemoryMetricsReporter extends MetricsReporter { public class InMemoryMetricsReporter extends MetricsReporter {
@Override @Override
public void start() { public void start() {
} }

View File

@@ -19,16 +19,15 @@ package com.uber.hoodie.metrics;
import com.codahale.metrics.MetricRegistry; import com.codahale.metrics.MetricRegistry;
import com.google.common.io.Closeables; import com.google.common.io.Closeables;
import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.config.HoodieMetricsConfig;
import com.uber.hoodie.exception.HoodieException; import com.uber.hoodie.exception.HoodieException;
import org.apache.commons.configuration.ConfigurationException;
import java.io.Closeable; import java.io.Closeable;
import org.apache.commons.configuration.ConfigurationException;
/** /**
* This is the main class of the metrics system. * This is the main class of the metrics system.
*/ */
public class Metrics { public class Metrics {
private static volatile boolean initialized = false; private static volatile boolean initialized = false;
private static Metrics metrics = null; private static Metrics metrics = null;
private final MetricRegistry registry; private final MetricRegistry registry;
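
The fields above imply a lazily initialized, process-wide singleton guarding a single MetricRegistry. The initialization method is not visible in this hunk; a plausible shape, with the method name and constructor behaviour as assumptions, is:

    // Hypothetical lazy initialization implied by the initialized/metrics/registry fields above.
    public static synchronized void init(HoodieWriteConfig metricConfig) {
      if (!initialized) {
        metrics = new Metrics(metricConfig);   // builds the registry and starts the configured reporter
        initialized = true;
      }
    }
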

View File

@@ -21,19 +21,18 @@ import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.graphite.Graphite; import com.codahale.metrics.graphite.Graphite;
import com.codahale.metrics.graphite.GraphiteReporter; import com.codahale.metrics.graphite.GraphiteReporter;
import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.config.HoodieWriteConfig;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import java.io.Closeable; import java.io.Closeable;
import java.net.InetSocketAddress; import java.net.InetSocketAddress;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
/** /**
* Implementation of Graphite reporter, which connects to the Graphite server, * Implementation of Graphite reporter, which connects to the Graphite server, and send metrics to
* and send metrics to that server. * that server.
*/ */
public class MetricsGraphiteReporter extends MetricsReporter { public class MetricsGraphiteReporter extends MetricsReporter {
private final MetricRegistry registry; private final MetricRegistry registry;
private final GraphiteReporter graphiteReporter; private final GraphiteReporter graphiteReporter;
private final HoodieWriteConfig config; private final HoodieWriteConfig config;
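
With the Dropwizard graphite classes imported above, the reporter held in these fields is normally built as follows; the HoodieWriteConfig getters, prefix value, and 30-second interval shown here are placeholders, not the project's actual defaults.

    // Standard Dropwizard construction of a Graphite reporter over the shared registry.
    Graphite graphite = new Graphite(
        new InetSocketAddress(config.getGraphiteServerHost(), config.getGraphiteServerPort()));
    GraphiteReporter graphiteReporter = GraphiteReporter.forRegistry(registry)
        .prefixedWith(config.getGraphiteMetricPrefix())     // e.g. "hoodie.<table_name>" (assumed)
        .convertRatesTo(TimeUnit.SECONDS)
        .convertDurationsTo(TimeUnit.MILLISECONDS)
        .build(graphite);
    graphiteReporter.start(30, TimeUnit.SECONDS);           // push metrics every 30 seconds (assumed)
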

View File

@@ -22,6 +22,7 @@ import java.io.Closeable;
* Interface for implementing a Reporter. * Interface for implementing a Reporter.
*/ */
public abstract class MetricsReporter { public abstract class MetricsReporter {
/** /**
* Push out metrics at scheduled intervals * Push out metrics at scheduled intervals
*/ */

View File

@@ -18,7 +18,6 @@ package com.uber.hoodie.metrics;
import com.codahale.metrics.MetricRegistry; import com.codahale.metrics.MetricRegistry;
import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.config.HoodieWriteConfig;
import org.apache.log4j.LogManager; import org.apache.log4j.LogManager;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
@@ -26,6 +25,7 @@ import org.apache.log4j.Logger;
* Factory class for creating MetricsReporter. * Factory class for creating MetricsReporter.
*/ */
public class MetricsReporterFactory { public class MetricsReporterFactory {
private static Logger logger = LogManager.getLogger(MetricsReporterFactory.class); private static Logger logger = LogManager.getLogger(MetricsReporterFactory.class);
public static MetricsReporter createReporter(HoodieWriteConfig config, public static MetricsReporter createReporter(HoodieWriteConfig config,

View File

@@ -17,8 +17,8 @@
package com.uber.hoodie.metrics; package com.uber.hoodie.metrics;
/** /**
* Types of the reporter. Right now we only support Graphite. * Types of the reporter. Right now we only support Graphite. We can include JMX and CSV in the
* We can include JMX and CSV in the future. * future.
*/ */
public enum MetricsReporterType { public enum MetricsReporterType {
GRAPHITE, GRAPHITE,

View File

@@ -70,28 +70,16 @@ import org.apache.spark.api.java.function.PairFlatMapFunction;
import scala.Option; import scala.Option;
import scala.Tuple2; import scala.Tuple2;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
/** /**
* Implementation of a very heavily read-optimized Hoodie Table where * Implementation of a very heavily read-optimized Hoodie Table where
* *
* INSERTS - Produce new files, block aligned to desired size (or) * INSERTS - Produce new files, block aligned to desired size (or) Merge with the smallest existing
* Merge with the smallest existing file, to expand it * file, to expand it
* *
* UPDATES - Produce a new version of the file, just replacing the updated records with new values * UPDATES - Produce a new version of the file, just replacing the updated records with new values
*
*/ */
public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends HoodieTable<T> { public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends HoodieTable<T> {
public HoodieCopyOnWriteTable(HoodieWriteConfig config, HoodieTableMetaClient metaClient) { public HoodieCopyOnWriteTable(HoodieWriteConfig config, HoodieTableMetaClient metaClient) {
super(config, metaClient); super(config, metaClient);
} }
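
The class comment above describes the copy-on-write behaviour: inserts either create new size-aligned files or top up the smallest existing file, while updates produce a new version of the affected file. A simplified driver sketch of how the pieces later in this file fit together; handleUpdate and the partitioner classes appear in this diff, while getBucketInfo(...), handleInsert(...) and recordsForBucket(...) are assumed stand-ins for the Spark plumbing around them.

    // Illustrative upsert driver: assign each record to an UPDATE or INSERT bucket, then
    // hand every bucket to the matching handler.
    UpsertPartitioner partitioner = new UpsertPartitioner(workloadProfile);     // profiled incoming records
    for (int bucket = 0; bucket < partitioner.numPartitions(); bucket++) {
      BucketInfo info = partitioner.getBucketInfo(bucket);                      // assumed accessor
      if (info.bucketType == BucketType.UPDATE) {
        handleUpdate(commitTime, info.fileLoc, recordsForBucket(bucket));       // rewrite one existing file
      } else {
        handleInsert(commitTime, recordsForBucket(bucket));                     // write a brand new file
      }
    }
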
@@ -107,6 +95,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
* Helper class for a small file's location and its actual size on disk * Helper class for a small file's location and its actual size on disk
*/ */
class SmallFile implements Serializable { class SmallFile implements Serializable {
HoodieRecordLocation location; HoodieRecordLocation location;
long sizeBytes; long sizeBytes;
@@ -121,11 +110,11 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
} }
/** /**
   * Helper class for an insert bucket along with the weight [0.0, 1.0]    * Helper class for an insert bucket along with the weight [0.0, 1.0] that defines the amount of
   * that defines the amount of incoming inserts that should be allocated to    * incoming inserts that should be allocated to the bucket
   * the bucket
*/ */
class InsertBucket implements Serializable { class InsertBucket implements Serializable {
int bucketNumber; int bucketNumber;
// fraction of total inserts, that should go into this bucket // fraction of total inserts, that should go into this bucket
double weight; double weight;
@@ -144,6 +133,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
* Helper class for a bucket's type (INSERT and UPDATE) and its file location * Helper class for a bucket's type (INSERT and UPDATE) and its file location
*/ */
class BucketInfo implements Serializable { class BucketInfo implements Serializable {
BucketType bucketType; BucketType bucketType;
String fileLoc; String fileLoc;
@@ -164,8 +154,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
class UpsertPartitioner extends Partitioner { class UpsertPartitioner extends Partitioner {
/** /**
* Total number of RDD partitions, is determined by total buckets we want to * Total number of RDD partitions, is determined by total buckets we want to pack the incoming
* pack the incoming workload into * workload into
*/ */
private int totalBuckets = 0; private int totalBuckets = 0;
@@ -181,8 +171,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
/** /**
* Helps us pack inserts into 1 or more buckets depending on number of * Helps us pack inserts into 1 or more buckets depending on number of incoming records.
* incoming records.
*/ */
private HashMap<String, List<InsertBucket>> partitionPathToInsertBuckets; private HashMap<String, List<InsertBucket>> partitionPathToInsertBuckets;
@@ -236,24 +225,28 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
if (pStat.getNumInserts() > 0) { if (pStat.getNumInserts() > 0) {
List<SmallFile> smallFiles = getSmallFiles(partitionPath); List<SmallFile> smallFiles = getSmallFiles(partitionPath);
logger.info("For partitionPath : "+ partitionPath + " Small Files => " + smallFiles); logger.info("For partitionPath : " + partitionPath + " Small Files => " + smallFiles);
long totalUnassignedInserts = pStat.getNumInserts(); long totalUnassignedInserts = pStat.getNumInserts();
List<Integer> bucketNumbers = new ArrayList<>(); List<Integer> bucketNumbers = new ArrayList<>();
List<Long> recordsPerBucket = new ArrayList<>(); List<Long> recordsPerBucket = new ArrayList<>();
// first try packing this into one of the smallFiles // first try packing this into one of the smallFiles
for (SmallFile smallFile: smallFiles) { for (SmallFile smallFile : smallFiles) {
long recordsToAppend = Math.min((config.getParquetMaxFileSize() - smallFile.sizeBytes)/ averageRecordSize, totalUnassignedInserts); long recordsToAppend = Math
if (recordsToAppend > 0 && totalUnassignedInserts > 0){ .min((config.getParquetMaxFileSize() - smallFile.sizeBytes) / averageRecordSize,
totalUnassignedInserts);
if (recordsToAppend > 0 && totalUnassignedInserts > 0) {
// create a new bucket or re-use an existing bucket // create a new bucket or re-use an existing bucket
int bucket; int bucket;
if (updateLocationToBucket.containsKey(smallFile.location.getFileId())) { if (updateLocationToBucket.containsKey(smallFile.location.getFileId())) {
bucket = updateLocationToBucket.get(smallFile.location.getFileId()); bucket = updateLocationToBucket.get(smallFile.location.getFileId());
logger.info("Assigning " + recordsToAppend + " inserts to existing update bucket "+ bucket); logger.info("Assigning " + recordsToAppend + " inserts to existing update bucket "
+ bucket);
} else { } else {
bucket = addUpdateBucket(smallFile.location.getFileId()); bucket = addUpdateBucket(smallFile.location.getFileId());
logger.info("Assigning " + recordsToAppend + " inserts to new update bucket "+ bucket); logger.info(
"Assigning " + recordsToAppend + " inserts to new update bucket " + bucket);
} }
bucketNumbers.add(bucket); bucketNumbers.add(bucket);
recordsPerBucket.add(recordsToAppend); recordsPerBucket.add(recordsToAppend);
@@ -265,16 +258,17 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
if (totalUnassignedInserts > 0) { if (totalUnassignedInserts > 0) {
long insertRecordsPerBucket = config.getCopyOnWriteInsertSplitSize(); long insertRecordsPerBucket = config.getCopyOnWriteInsertSplitSize();
if (config.shouldAutoTuneInsertSplits()) { if (config.shouldAutoTuneInsertSplits()) {
insertRecordsPerBucket = config.getParquetMaxFileSize()/averageRecordSize; insertRecordsPerBucket = config.getParquetMaxFileSize() / averageRecordSize;
} }
int insertBuckets = (int) Math.max(totalUnassignedInserts / insertRecordsPerBucket, 1L); int insertBuckets = (int) Math.max(totalUnassignedInserts / insertRecordsPerBucket, 1L);
logger.info("After small file assignment: unassignedInserts => " + totalUnassignedInserts logger
.info("After small file assignment: unassignedInserts => " + totalUnassignedInserts
+ ", totalInsertBuckets => " + insertBuckets + ", totalInsertBuckets => " + insertBuckets
+ ", recordsPerBucket => " + insertRecordsPerBucket); + ", recordsPerBucket => " + insertRecordsPerBucket);
for (int b = 0; b < insertBuckets; b++) { for (int b = 0; b < insertBuckets; b++) {
bucketNumbers.add(totalBuckets); bucketNumbers.add(totalBuckets);
recordsPerBucket.add(totalUnassignedInserts/insertBuckets); recordsPerBucket.add(totalUnassignedInserts / insertBuckets);
BucketInfo bucketInfo = new BucketInfo(); BucketInfo bucketInfo = new BucketInfo();
bucketInfo.bucketType = BucketType.INSERT; bucketInfo.bucketType = BucketType.INSERT;
bucketInfoMap.put(totalBuckets, bucketInfo); bucketInfoMap.put(totalBuckets, bucketInfo);
@@ -287,10 +281,11 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
for (int i = 0; i < bucketNumbers.size(); i++) { for (int i = 0; i < bucketNumbers.size(); i++) {
InsertBucket bkt = new InsertBucket(); InsertBucket bkt = new InsertBucket();
bkt.bucketNumber = bucketNumbers.get(i); bkt.bucketNumber = bucketNumbers.get(i);
bkt.weight = (1.0 * recordsPerBucket.get(i))/pStat.getNumInserts(); bkt.weight = (1.0 * recordsPerBucket.get(i)) / pStat.getNumInserts();
insertBuckets.add(bkt); insertBuckets.add(bkt);
} }
logger.info("Total insert buckets for partition path "+ partitionPath + " => " + insertBuckets); logger.info(
"Total insert buckets for partition path " + partitionPath + " => " + insertBuckets);
partitionPathToInsertBuckets.put(partitionPath, insertBuckets); partitionPathToInsertBuckets.put(partitionPath, insertBuckets);
} }
} }
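
A worked example of the assignment arithmetic above, with assumed numbers: a 120 MB parquet max file size, one 90 MB small file in the partition, a 1 KB average record size, and 100,000 incoming inserts.

    // Worked example of assignInserts(...) above; every constant here is an assumption.
    long maxFileSize = 120L * 1024 * 1024;       // config.getParquetMaxFileSize()
    long smallFileSize = 90L * 1024 * 1024;      // size of the one small file found
    long averageRecordSize = 1024L;              // averageBytesPerRecord() estimate
    long numInserts = 100_000L;                  // pStat.getNumInserts()

    // 1) top up the small file: (120 MB - 90 MB) / 1 KB = 30,720 records join its update bucket
    long recordsToAppend = Math.min((maxFileSize - smallFileSize) / averageRecordSize, numInserts);
    long unassigned = numInserts - recordsToAppend;                    // 69,280 records left

    // 2) split the remainder into new insert buckets
    long insertRecordsPerBucket = maxFileSize / averageRecordSize;     // auto-tuned: 122,880
    int insertBuckets = (int) Math.max(unassigned / insertRecordsPerBucket, 1L);   // = 1

    // 3) weights are fractions of all inserts for the partition path, so they sum to 1.0
    double smallFileWeight = (1.0 * recordsToAppend) / numInserts;     // 0.3072
    double newBucketWeight = (1.0 * unassigned) / numInserts;          // 0.6928

getPartition(), shown further down in this file, then maps the md5 hash of each record key into [0, 1) and walks these weights cumulatively, so roughly 31% of this partition's inserts land in the small file's bucket and the rest go to the new file.
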
@@ -299,9 +294,6 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
/** /**
* Returns a list of small files in the given partition path * Returns a list of small files in the given partition path
*
* @param partitionPath
* @return
*/ */
private List<SmallFile> getSmallFiles(String partitionPath) { private List<SmallFile> getSmallFiles(String partitionPath) {
List<SmallFile> smallFileLocations = new ArrayList<>(); List<SmallFile> smallFileLocations = new ArrayList<>();
@@ -330,10 +322,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
} }
/** /**
* Obtains the average record size based on records written during last commit. * Obtains the average record size based on records written during last commit. Used for
* Used for estimating how many records pack into one file. * estimating how many records pack into one file.
*
* @return
*/ */
private long averageBytesPerRecord() { private long averageBytesPerRecord() {
long avgSize = 0L; long avgSize = 0L;
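Note: the body of averageBytesPerRecord is elided by the hunk; a hedged sketch of the estimate the javadoc describes, assuming the last commit's total bytes and record counts are handed in directly (the lookup and the fallback value are illustrative, not the actual metadata API):

// Sketch only: estimate the average record size from the previous commit.
final class AverageRecordSizeSketch {
  static long averageBytesPerRecord(long totalBytesWritten, long totalRecordsWritten) {
    long fallback = 1024L; // illustrative default when there is no usable history
    if (totalBytesWritten <= 0 || totalRecordsWritten <= 0) {
      return fallback;
    }
    return Math.max(totalBytesWritten / totalRecordsWritten, 1L);
  }
}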
@@ -375,13 +365,15 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
HoodieRecordLocation location = keyLocation._2().get(); HoodieRecordLocation location = keyLocation._2().get();
return updateLocationToBucket.get(location.getFileId()); return updateLocationToBucket.get(location.getFileId());
} else { } else {
List<InsertBucket> targetBuckets = partitionPathToInsertBuckets.get(keyLocation._1().getPartitionPath()); List<InsertBucket> targetBuckets = partitionPathToInsertBuckets
.get(keyLocation._1().getPartitionPath());
// pick the target bucket to use based on the weights. // pick the target bucket to use based on the weights.
double totalWeight = 0.0; double totalWeight = 0.0;
final long totalInserts = Math.max(1, globalStat.getNumInserts()); final long totalInserts = Math.max(1, globalStat.getNumInserts());
final long hashOfKey = Hashing.md5().hashString(keyLocation._1().getRecordKey(), StandardCharsets.UTF_8).asLong(); final long hashOfKey = Hashing.md5()
.hashString(keyLocation._1().getRecordKey(), StandardCharsets.UTF_8).asLong();
final double r = 1.0 * Math.floorMod(hashOfKey, totalInserts) / totalInserts; final double r = 1.0 * Math.floorMod(hashOfKey, totalInserts) / totalInserts;
for (InsertBucket insertBucket: targetBuckets) { for (InsertBucket insertBucket : targetBuckets) {
totalWeight += insertBucket.weight; totalWeight += insertBucket.weight;
if (r <= totalWeight) { if (r <= totalWeight) {
return insertBucket.bucketNumber; return insertBucket.bucketNumber;
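Note: the weighted pick above, isolated into a self-contained sketch; the record key is hashed onto [0, 1) and the insert buckets are walked by cumulative weight until the hash value is covered:

import com.google.common.hash.Hashing;
import java.nio.charset.StandardCharsets;
import java.util.List;

// Sketch only: the deterministic weighted pick above. weights.get(i) is the
// fraction of the partition's inserts owned by bucketNumbers.get(i).
final class WeightedBucketPickSketch {
  static int pickBucket(String recordKey, long totalInserts,
      List<Double> weights, List<Integer> bucketNumbers) {
    long safeTotal = Math.max(1L, totalInserts);
    long hashOfKey = Hashing.md5().hashString(recordKey, StandardCharsets.UTF_8).asLong();
    // floorMod keeps the value non-negative, so r lands in [0, 1).
    double r = 1.0 * Math.floorMod(hashOfKey, safeTotal) / safeTotal;
    double totalWeight = 0.0;
    for (int i = 0; i < weights.size(); i++) {
      totalWeight += weights.get(i);
      if (r <= totalWeight) {
        return bucketNumbers.get(i);
      }
    }
    // Guard against floating point round-off by falling back to the last bucket.
    return bucketNumbers.get(bucketNumbers.size() - 1);
  }
}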
@@ -413,14 +405,14 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
} }
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileLoc,
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileLoc, Iterator<HoodieRecord<T>> recordItr) Iterator<HoodieRecord<T>> recordItr)
throws IOException { throws IOException {
// these are updates // these are updates
HoodieMergeHandle upsertHandle = getUpdateHandle(commitTime, fileLoc, recordItr); HoodieMergeHandle upsertHandle = getUpdateHandle(commitTime, fileLoc, recordItr);
if (upsertHandle.getOldFilePath() == null) { if (upsertHandle.getOldFilePath() == null) {
throw new HoodieUpsertException("Error in finding the old file path at commit " + throw new HoodieUpsertException("Error in finding the old file path at commit " +
commitTime +" at fileLoc: " + fileLoc); commitTime + " at fileLoc: " + fileLoc);
} else { } else {
Configuration conf = FSUtils.getFs().getConf(); Configuration conf = FSUtils.getFs().getConf();
AvroReadSupport.setAvroReadSchema(conf, upsertHandle.getSchema()); AvroReadSupport.setAvroReadSchema(conf, upsertHandle.getSchema());
@@ -448,14 +440,17 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
logger.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath() logger.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath()
+ ", " + upsertHandle.getWriteStatus()); + ", " + upsertHandle.getWriteStatus());
} }
return Collections.singletonList(Collections.singletonList(upsertHandle.getWriteStatus())).iterator(); return Collections.singletonList(Collections.singletonList(upsertHandle.getWriteStatus()))
.iterator();
} }
protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileLoc, Iterator<HoodieRecord<T>> recordItr) { protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileLoc,
Iterator<HoodieRecord<T>> recordItr) {
return new HoodieMergeHandle<>(config, commitTime, this, recordItr, fileLoc); return new HoodieMergeHandle<>(config, commitTime, this, recordItr, fileLoc);
} }
public Iterator<List<WriteStatus>> handleInsert(String commitTime, Iterator<HoodieRecord<T>> recordItr) throws Exception { public Iterator<List<WriteStatus>> handleInsert(String commitTime,
Iterator<HoodieRecord<T>> recordItr) throws Exception {
return new LazyInsertIterable<>(recordItr, config, commitTime, this); return new LazyInsertIterable<>(recordItr, config, commitTime, this);
} }
@@ -473,7 +468,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
} else if (btype.equals(BucketType.UPDATE)) { } else if (btype.equals(BucketType.UPDATE)) {
return handleUpdate(commitTime, binfo.fileLoc, recordItr); return handleUpdate(commitTime, binfo.fileLoc, recordItr);
} else { } else {
throw new HoodieUpsertException("Unknown bucketType " + btype + " for partition :" + partition); throw new HoodieUpsertException(
"Unknown bucketType " + btype + " for partition :" + partition);
} }
} catch (Throwable t) { } catch (Throwable t) {
String msg = "Error upserting bucketType " + btype + " for partition :" + partition; String msg = "Error upserting bucketType " + btype + " for partition :" + partition;
@@ -496,9 +492,9 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
} }
/** /**
* Performs cleaning of partition paths according to cleaning policy and returns the number * Performs cleaning of partition paths according to cleaning policy and returns the number of
* of files cleaned. Handles skews in partitions to clean by making files to clean as the * files cleaned. Handles skews in partitions to clean by making files to clean as the unit of
* unit of task distribution. * task distribution.
* *
* @throws IllegalArgumentException if unknown cleaning policy is provided * @throws IllegalArgumentException if unknown cleaning policy is provided
*/ */
@@ -506,7 +502,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
public List<HoodieCleanStat> clean(JavaSparkContext jsc) { public List<HoodieCleanStat> clean(JavaSparkContext jsc) {
try { try {
List<String> partitionsToClean = List<String> partitionsToClean =
FSUtils.getAllPartitionPaths(getFs(), getMetaClient().getBasePath(), config.shouldAssumeDatePartitioning()); FSUtils.getAllPartitionPaths(getFs(), getMetaClient().getBasePath(),
config.shouldAssumeDatePartitioning());
logger.info("Partitions to clean up : " + partitionsToClean + ", with policy " + config logger.info("Partitions to clean up : " + partitionsToClean + ", with policy " + config
.getCleanerPolicy()); .getCleanerPolicy());
if (partitionsToClean.isEmpty()) { if (partitionsToClean.isEmpty()) {
@@ -520,19 +517,16 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
} }
/** /**
* * Common method used for cleaning out parquet files under a partition path during rollback of a
* Common method used for cleaning out parquet files under a partition path during rollback of a set of commits * set of commits
* @param partitionPath
* @param commits
* @return
* @throws IOException
*/ */
protected Map<FileStatus, Boolean> deleteCleanedFiles(String partitionPath, List<String> commits) throws IOException { protected Map<FileStatus, Boolean> deleteCleanedFiles(String partitionPath, List<String> commits)
throws IOException {
logger.info("Cleaning path " + partitionPath); logger.info("Cleaning path " + partitionPath);
FileSystem fs = FSUtils.getFs(); FileSystem fs = FSUtils.getFs();
FileStatus[] toBeDeleted = FileStatus[] toBeDeleted =
fs.listStatus(new Path(config.getBasePath(), partitionPath), path -> { fs.listStatus(new Path(config.getBasePath(), partitionPath), path -> {
if(!path.toString().contains(".parquet")) { if (!path.toString().contains(".parquet")) {
return false; return false;
} }
String fileCommitTime = FSUtils.getCommitTime(path.getName()); String fileCommitTime = FSUtils.getCommitTime(path.getName());
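Note: the listStatus filter is cut off by the hunk; a sketch of its apparent intent, keeping only parquet files whose embedded commit time belongs to the commits being cleaned, where the membership check on commits is an assumption about the elided continuation:

import com.uber.hoodie.common.util.FSUtils;
import java.util.List;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

// Sketch only: keep parquet files written by the commits being cleaned up.
final class CleanedFileFilterSketch {
  static PathFilter parquetFilesOfCommits(List<String> commits) {
    return (Path path) -> {
      if (!path.toString().contains(".parquet")) {
        return false;
      }
      String fileCommitTime = FSUtils.getCommitTime(path.getName());
      return commits.contains(fileCommitTime);
    };
  }
}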
@@ -548,10 +542,12 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
} }
@Override @Override
public List<HoodieRollbackStat> rollback(JavaSparkContext jsc, List<String> commits) throws IOException { public List<HoodieRollbackStat> rollback(JavaSparkContext jsc, List<String> commits)
throws IOException {
String actionType = this.getCompactedCommitActionType(); String actionType = this.getCompactedCommitActionType();
HoodieActiveTimeline activeTimeline = this.getActiveTimeline(); HoodieActiveTimeline activeTimeline = this.getActiveTimeline();
List<String> inflights = this.getInflightCommitTimeline().getInstants().map(HoodieInstant::getTimestamp) List<String> inflights = this.getInflightCommitTimeline().getInstants()
.map(HoodieInstant::getTimestamp)
.collect(Collectors.toList()); .collect(Collectors.toList());
// Atomically unpublish all the commits // Atomically unpublish all the commits
@@ -563,7 +559,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
// delete all the data files for all these commits // delete all the data files for all these commits
logger.info("Clean out all parquet files generated for commits: " + commits); logger.info("Clean out all parquet files generated for commits: " + commits);
List<HoodieRollbackStat> stats = jsc.parallelize( List<HoodieRollbackStat> stats = jsc.parallelize(
FSUtils.getAllPartitionPaths(FSUtils.getFs(), this.getMetaClient().getBasePath(), config.shouldAssumeDatePartitioning())) FSUtils.getAllPartitionPaths(FSUtils.getFs(), this.getMetaClient().getBasePath(),
config.shouldAssumeDatePartitioning()))
.map((Function<String, HoodieRollbackStat>) partitionPath -> { .map((Function<String, HoodieRollbackStat>) partitionPath -> {
// Scan all partitions files with this commit time // Scan all partitions files with this commit time
Map<FileStatus, Boolean> results = deleteCleanedFiles(partitionPath, commits); Map<FileStatus, Boolean> results = deleteCleanedFiles(partitionPath, commits);
@@ -579,6 +576,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
} }
private static class PartitionCleanStat implements Serializable { private static class PartitionCleanStat implements Serializable {
private final String partitionPath; private final String partitionPath;
private final List<String> deletePathPatterns = new ArrayList<>(); private final List<String> deletePathPatterns = new ArrayList<>();
private final List<String> successDeleteFiles = new ArrayList<>(); private final List<String> successDeleteFiles = new ArrayList<>();
@@ -613,7 +611,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
} }
} }
private List<HoodieCleanStat> cleanPartitionPaths(List<String> partitionsToClean, JavaSparkContext jsc) { private List<HoodieCleanStat> cleanPartitionPaths(List<String> partitionsToClean,
JavaSparkContext jsc) {
int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism()); int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism());
logger.info("Using cleanerParallelism: " + cleanerParallelism); logger.info("Using cleanerParallelism: " + cleanerParallelism);
List<Tuple2<String, PartitionCleanStat>> partitionCleanStats = jsc List<Tuple2<String, PartitionCleanStat>> partitionCleanStats = jsc
@@ -621,7 +620,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
.flatMapToPair(getFilesToDeleteFunc(this, config)) .flatMapToPair(getFilesToDeleteFunc(this, config))
.repartition(cleanerParallelism) // repartition to remove skews .repartition(cleanerParallelism) // repartition to remove skews
.mapPartitionsToPair(deleteFilesFunc(this, config)) .mapPartitionsToPair(deleteFilesFunc(this, config))
.reduceByKey( // merge partition level clean stats below .reduceByKey(
// merge partition level clean stats below
(Function2<PartitionCleanStat, PartitionCleanStat, PartitionCleanStat>) (e1, e2) -> e1 (Function2<PartitionCleanStat, PartitionCleanStat, PartitionCleanStat>) (e1, e2) -> e1
.merge(e2)) .merge(e2))
.collect(); .collect();
View File
@@ -39,13 +39,6 @@ import com.uber.hoodie.exception.HoodieCompactionException;
import com.uber.hoodie.exception.HoodieRollbackException; import com.uber.hoodie.exception.HoodieRollbackException;
import com.uber.hoodie.io.HoodieAppendHandle; import com.uber.hoodie.io.HoodieAppendHandle;
import com.uber.hoodie.io.compact.HoodieRealtimeTableCompactor; import com.uber.hoodie.io.compact.HoodieRealtimeTableCompactor;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import java.io.IOException; import java.io.IOException;
import java.io.UncheckedIOException; import java.io.UncheckedIOException;
import java.util.Arrays; import java.util.Arrays;
@@ -56,6 +49,12 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Optional; import java.util.Optional;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
/** /**
@@ -64,13 +63,15 @@ import java.util.stream.Collectors;
* INSERTS - Same as HoodieCopyOnWriteTable - Produce new files, block aligned to desired size (or) * INSERTS - Same as HoodieCopyOnWriteTable - Produce new files, block aligned to desired size (or)
* Merge with the smallest existing file, to expand it * Merge with the smallest existing file, to expand it
* *
* UPDATES - Appends the changes to a rolling log file maintained per file Id. * UPDATES - Appends the changes to a rolling log file maintained per file Id. Compaction merges the
* Compaction merges the log file into the base file. * log file into the base file.
* *
* WARNING - MOR table type does not support nested rollbacks, every rollback * WARNING - MOR table type does not support nested rollbacks, every rollback must be followed by an
* must be followed by an attempted commit action * attempted commit action
*/ */
public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends HoodieCopyOnWriteTable<T> { public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
HoodieCopyOnWriteTable<T> {
private static Logger logger = LogManager.getLogger(HoodieMergeOnReadTable.class); private static Logger logger = LogManager.getLogger(HoodieMergeOnReadTable.class);
public HoodieMergeOnReadTable(HoodieWriteConfig config, public HoodieMergeOnReadTable(HoodieWriteConfig config,
@@ -119,15 +120,17 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends Hoodi
} }
@Override @Override
public List<HoodieRollbackStat> rollback(JavaSparkContext jsc, List<String> commits) throws IOException { public List<HoodieRollbackStat> rollback(JavaSparkContext jsc, List<String> commits)
throws IOException {
//At the moment, MOR table type does not support nested rollbacks //At the moment, MOR table type does not support nested rollbacks
if(commits.size() > 1) { if (commits.size() > 1) {
throw new UnsupportedOperationException("Nested Rollbacks are not supported"); throw new UnsupportedOperationException("Nested Rollbacks are not supported");
} }
Map<String, HoodieInstant> commitsAndCompactions = Map<String, HoodieInstant> commitsAndCompactions =
this.getActiveTimeline() this.getActiveTimeline()
.getTimelineOfActions(Sets.newHashSet(HoodieActiveTimeline.COMMIT_ACTION, HoodieActiveTimeline.COMPACTION_ACTION, HoodieActiveTimeline.DELTA_COMMIT_ACTION)) .getTimelineOfActions(Sets.newHashSet(HoodieActiveTimeline.COMMIT_ACTION,
HoodieActiveTimeline.COMPACTION_ACTION, HoodieActiveTimeline.DELTA_COMMIT_ACTION))
.getInstants() .getInstants()
.filter(i -> commits.contains(i.getTimestamp())) .filter(i -> commits.contains(i.getTimestamp()))
.collect(Collectors.toMap(i -> i.getTimestamp(), i -> i)); .collect(Collectors.toMap(i -> i.getTimestamp(), i -> i));
@@ -149,11 +152,14 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends Hoodi
try { try {
logger.info("Starting to rollback Commit/Compaction " + instant); logger.info("Starting to rollback Commit/Compaction " + instant);
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
.fromBytes(this.getCommitTimeline().getInstantDetails(new HoodieInstant(true, instant.getAction(), instant.getTimestamp())).get()); .fromBytes(this.getCommitTimeline().getInstantDetails(
new HoodieInstant(true, instant.getAction(), instant.getTimestamp())).get());
stats = jsc.parallelize(commitMetadata.getPartitionToWriteStats().keySet().stream().collect(Collectors.toList())) stats = jsc.parallelize(commitMetadata.getPartitionToWriteStats().keySet().stream()
.collect(Collectors.toList()))
.map((Function<String, HoodieRollbackStat>) partitionPath -> { .map((Function<String, HoodieRollbackStat>) partitionPath -> {
Map<FileStatus, Boolean> results = super.deleteCleanedFiles(partitionPath, Arrays.asList(commit)); Map<FileStatus, Boolean> results = super
.deleteCleanedFiles(partitionPath, Arrays.asList(commit));
return HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath) return HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
.withDeletedFileResults(results).build(); .withDeletedFileResults(results).build();
}).collect(); }).collect();
@@ -167,40 +173,55 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends Hoodi
logger.info("Starting to rollback delta commit " + instant); logger.info("Starting to rollback delta commit " + instant);
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
.fromBytes(this.getCommitTimeline().getInstantDetails(new HoodieInstant(true, instant.getAction(), instant.getTimestamp())).get()); .fromBytes(this.getCommitTimeline().getInstantDetails(
new HoodieInstant(true, instant.getAction(), instant.getTimestamp())).get());
stats = jsc.parallelize(commitMetadata.getPartitionToWriteStats().keySet().stream().collect(Collectors.toList())) stats = jsc.parallelize(commitMetadata.getPartitionToWriteStats().keySet().stream()
.collect(Collectors.toList()))
.map((Function<String, HoodieRollbackStat>) partitionPath -> { .map((Function<String, HoodieRollbackStat>) partitionPath -> {
// read commit file and (either append delete blocks or delete file) // read commit file and (either append delete blocks or delete file)
Map<FileStatus, Boolean> filesToDeletedStatus = new HashMap<>(); Map<FileStatus, Boolean> filesToDeletedStatus = new HashMap<>();
Map<FileStatus, Long> filesToNumBlocksRollback = new HashMap<>(); Map<FileStatus, Long> filesToNumBlocksRollback = new HashMap<>();
// we do not know fileIds for inserts (first inserts are parquet files), delete all parquet files for the corresponding failed commit, if present (same as COW) // we do not know fileIds for inserts (first inserts are parquet files), delete all parquet files for the corresponding failed commit, if present (same as COW)
filesToDeletedStatus = super.deleteCleanedFiles(partitionPath, Arrays.asList(commit)); filesToDeletedStatus = super
.deleteCleanedFiles(partitionPath, Arrays.asList(commit));
// append rollback blocks for updates // append rollback blocks for updates
commitMetadata.getPartitionToWriteStats().get(partitionPath).stream().filter(wStat -> wStat.getPrevCommit() != HoodieWriteStat.NULL_COMMIT).forEach(wStat -> { commitMetadata.getPartitionToWriteStats().get(partitionPath).stream()
.filter(wStat -> wStat.getPrevCommit() != HoodieWriteStat.NULL_COMMIT)
.forEach(wStat -> {
HoodieLogFormat.Writer writer = null; HoodieLogFormat.Writer writer = null;
try { try {
writer = HoodieLogFormat.newWriterBuilder() writer = HoodieLogFormat.newWriterBuilder()
.onParentPath(new Path(this.getMetaClient().getBasePath(), partitionPath)) .onParentPath(
new Path(this.getMetaClient().getBasePath(), partitionPath))
.withFileId(wStat.getFileId()).overBaseCommit(wStat.getPrevCommit()) .withFileId(wStat.getFileId()).overBaseCommit(wStat.getPrevCommit())
.withFs(FSUtils.getFs()).withFileExtension(HoodieLogFile.DELTA_EXTENSION).build(); .withFs(FSUtils.getFs())
.withFileExtension(HoodieLogFile.DELTA_EXTENSION).build();
Long numRollbackBlocks = 0L; Long numRollbackBlocks = 0L;
// generate metadata // generate metadata
Map<HoodieLogBlock.LogMetadataType, String> metadata = Maps.newHashMap(); Map<HoodieLogBlock.LogMetadataType, String> metadata = Maps.newHashMap();
metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME, metaClient.getActiveTimeline().lastInstant().get().getTimestamp()); metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME,
metaClient.getActiveTimeline().lastInstant().get().getTimestamp());
metadata.put(HoodieLogBlock.LogMetadataType.TARGET_INSTANT_TIME, commit); metadata.put(HoodieLogBlock.LogMetadataType.TARGET_INSTANT_TIME, commit);
// if update belongs to an existing log file // if update belongs to an existing log file
writer.appendBlock(new HoodieCommandBlock(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK, metadata)); writer.appendBlock(new HoodieCommandBlock(
HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK,
metadata));
numRollbackBlocks++; numRollbackBlocks++;
if(wStat.getNumDeletes() > 0) { if (wStat.getNumDeletes() > 0) {
writer.appendBlock(new HoodieCommandBlock(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK, metadata)); writer.appendBlock(new HoodieCommandBlock(
HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK,
metadata));
numRollbackBlocks++; numRollbackBlocks++;
} }
filesToNumBlocksRollback.put(FSUtils.getFs().getFileStatus(writer.getLogFile().getPath()), numRollbackBlocks); filesToNumBlocksRollback
.put(FSUtils.getFs().getFileStatus(writer.getLogFile().getPath()),
numRollbackBlocks);
} catch (IOException | InterruptedException io) { } catch (IOException | InterruptedException io) {
throw new HoodieRollbackException("Failed to rollback for commit " + commit, io); throw new HoodieRollbackException(
"Failed to rollback for commit " + commit, io);
} finally { } finally {
try { try {
writer.close(); writer.close();
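Note: the per-file-id rollback append above, condensed into one sketch; the builder and block calls are the ones visible in the hunk, while the method shape, the omitted imports of the hoodie log classes, and the trimmed error handling are simplifications:

// Sketch only: append ROLLBACK_PREVIOUS_BLOCK command blocks for one write stat.
// Imports of the hoodie log classes match the file being diffed and are omitted;
// the try/finally around close() is trimmed.
final class RollbackBlockSketch {
  static long appendRollbackBlocks(String basePath, String partitionPath, String fileId,
      String baseCommit, String currentInstant, String rolledBackCommit, long numDeletes)
      throws Exception {
    HoodieLogFormat.Writer writer = HoodieLogFormat.newWriterBuilder()
        .onParentPath(new Path(basePath, partitionPath))
        .withFileId(fileId).overBaseCommit(baseCommit)
        .withFs(FSUtils.getFs())
        .withFileExtension(HoodieLogFile.DELTA_EXTENSION).build();
    Map<HoodieLogBlock.LogMetadataType, String> metadata = Maps.newHashMap();
    metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME, currentInstant);
    metadata.put(HoodieLogBlock.LogMetadataType.TARGET_INSTANT_TIME, rolledBackCommit);
    long numRollbackBlocks = 0L;
    writer.appendBlock(new HoodieCommandBlock(
        HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK, metadata));
    numRollbackBlocks++;
    if (numDeletes > 0) {
      // Presumably deletes were logged as a separate block, so one more block is rolled back.
      writer.appendBlock(new HoodieCommandBlock(
          HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK, metadata));
      numRollbackBlocks++;
    }
    writer.close();
    return numRollbackBlocks;
  }
}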
@@ -223,10 +244,12 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends Hoodi
}).flatMap(x -> x.stream()).collect(Collectors.toList()); }).flatMap(x -> x.stream()).collect(Collectors.toList());
commitsAndCompactions.entrySet().stream() commitsAndCompactions.entrySet().stream()
.map(entry -> new HoodieInstant(true, entry.getValue().getAction(), entry.getValue().getTimestamp())) .map(entry -> new HoodieInstant(true, entry.getValue().getAction(),
entry.getValue().getTimestamp()))
.forEach(this.getActiveTimeline()::deleteInflight); .forEach(this.getActiveTimeline()::deleteInflight);
logger.debug("Time(in ms) taken to finish rollback " + (System.currentTimeMillis() - startTime)); logger
.debug("Time(in ms) taken to finish rollback " + (System.currentTimeMillis() - startTime));
return allRollbackStats; return allRollbackStats;
} }
View File
@@ -34,7 +34,6 @@ import com.uber.hoodie.common.util.AvroUtils;
import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieCommitException; import com.uber.hoodie.exception.HoodieCommitException;
import com.uber.hoodie.exception.HoodieException; import com.uber.hoodie.exception.HoodieException;
import com.uber.hoodie.exception.HoodieRollbackException;
import com.uber.hoodie.exception.HoodieSavepointException; import com.uber.hoodie.exception.HoodieSavepointException;
import java.io.IOException; import java.io.IOException;
import java.io.Serializable; import java.io.Serializable;
@@ -43,8 +42,6 @@ import java.util.List;
import java.util.Optional; import java.util.Optional;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.LogManager; import org.apache.log4j.LogManager;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
@@ -55,6 +52,7 @@ import org.apache.spark.api.java.JavaSparkContext;
* Abstract implementation of a HoodieTable * Abstract implementation of a HoodieTable
*/ */
public abstract class HoodieTable<T extends HoodieRecordPayload> implements Serializable { public abstract class HoodieTable<T extends HoodieRecordPayload> implements Serializable {
protected final HoodieWriteConfig config; protected final HoodieWriteConfig config;
protected final HoodieTableMetaClient metaClient; protected final HoodieTableMetaClient metaClient;
private static Logger logger = LogManager.getLogger(HoodieTable.class); private static Logger logger = LogManager.getLogger(HoodieTable.class);
@@ -65,27 +63,19 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
} }
/** /**
* Provides a partitioner to perform the upsert operation, based on the * Provides a partitioner to perform the upsert operation, based on the workload profile
* workload profile
*
* @return
*/ */
public abstract Partitioner getUpsertPartitioner(WorkloadProfile profile); public abstract Partitioner getUpsertPartitioner(WorkloadProfile profile);
/** /**
* Provides a partitioner to perform the insert operation, based on the workload profile * Provides a partitioner to perform the insert operation, based on the workload profile
*
* @return
*/ */
public abstract Partitioner getInsertPartitioner(WorkloadProfile profile); public abstract Partitioner getInsertPartitioner(WorkloadProfile profile);
/** /**
* Return whether this HoodieTable implementation can benefit from workload * Return whether this HoodieTable implementation can benefit from workload profiling
* profiling
*
* @return
*/ */
public abstract boolean isWorkloadProfileNeeded(); public abstract boolean isWorkloadProfileNeeded();
@@ -103,8 +93,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
/** /**
* Get the view of the file system for this table * Get the view of the file system for this table
*
* @return
*/ */
public TableFileSystemView getFileSystemView() { public TableFileSystemView getFileSystemView() {
return new HoodieTableFileSystemView(metaClient, getCompletedCommitTimeline()); return new HoodieTableFileSystemView(metaClient, getCompletedCommitTimeline());
@@ -112,8 +100,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
/** /**
* Get the read optimized view of the file system for this table * Get the read optimized view of the file system for this table
*
* @return
*/ */
public TableFileSystemView.ReadOptimizedView getROFileSystemView() { public TableFileSystemView.ReadOptimizedView getROFileSystemView() {
return new HoodieTableFileSystemView(metaClient, getCompletedCommitTimeline()); return new HoodieTableFileSystemView(metaClient, getCompletedCommitTimeline());
@@ -121,8 +107,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
/** /**
* Get the real time view of the file system for this table * Get the real time view of the file system for this table
*
* @return
*/ */
public TableFileSystemView.RealtimeView getRTFileSystemView() { public TableFileSystemView.RealtimeView getRTFileSystemView() {
return new HoodieTableFileSystemView(metaClient, getCompletedCommitTimeline()); return new HoodieTableFileSystemView(metaClient, getCompletedCommitTimeline());
@@ -130,8 +114,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
/** /**
* Get the completed (commit + compaction) view of the file system for this table * Get the completed (commit + compaction) view of the file system for this table
*
* @return
*/ */
public TableFileSystemView getCompletedFileSystemView() { public TableFileSystemView getCompletedFileSystemView() {
return new HoodieTableFileSystemView(metaClient, getCommitTimeline()); return new HoodieTableFileSystemView(metaClient, getCommitTimeline());
@@ -139,7 +121,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
/** /**
* Get only the completed (no-inflights) commit timeline * Get only the completed (no-inflights) commit timeline
* @return
*/ */
public HoodieTimeline getCompletedCommitTimeline() { public HoodieTimeline getCompletedCommitTimeline() {
return getCommitTimeline().filterCompletedInstants(); return getCommitTimeline().filterCompletedInstants();
@@ -147,7 +128,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
/** /**
* Get only the inflights (no-completed) commit timeline * Get only the inflights (no-completed) commit timeline
* @return
*/ */
public HoodieTimeline getInflightCommitTimeline() { public HoodieTimeline getInflightCommitTimeline() {
return getCommitTimeline().filterInflights(); return getCommitTimeline().filterInflights();
@@ -156,7 +136,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
/** /**
* Get only the completed (no-inflights) clean timeline * Get only the completed (no-inflights) clean timeline
* @return
*/ */
public HoodieTimeline getCompletedCleanTimeline() { public HoodieTimeline getCompletedCleanTimeline() {
return getActiveTimeline().getCleanerTimeline().filterCompletedInstants(); return getActiveTimeline().getCleanerTimeline().filterCompletedInstants();
@@ -164,7 +143,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
/** /**
* Get only the completed (no-inflights) savepoint timeline * Get only the completed (no-inflights) savepoint timeline
* @return
*/ */
public HoodieTimeline getCompletedSavepointTimeline() { public HoodieTimeline getCompletedSavepointTimeline() {
return getActiveTimeline().getSavePointTimeline().filterCompletedInstants(); return getActiveTimeline().getSavePointTimeline().filterCompletedInstants();
@@ -172,7 +150,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
/** /**
* Get the list of savepoints in this table * Get the list of savepoints in this table
* @return
*/ */
public List<String> getSavepoints() { public List<String> getSavepoints() {
return getCompletedSavepointTimeline().getInstants().map(HoodieInstant::getTimestamp) return getCompletedSavepointTimeline().getInstants().map(HoodieInstant::getTimestamp)
@@ -181,10 +158,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
/** /**
* Get the list of data file names savepointed * Get the list of data file names savepointed
*
* @param savepointTime
* @return
* @throws IOException
*/ */
public Stream<String> getSavepointedDataFiles(String savepointTime) { public Stream<String> getSavepointedDataFiles(String savepointTime) {
if (!getSavepoints().contains(savepointTime)) { if (!getSavepoints().contains(savepointTime)) {
@@ -211,8 +184,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
/** /**
* Get the commit timeline visible for this table * Get the commit timeline visible for this table
*
* @return
*/ */
public HoodieTimeline getCommitTimeline() { public HoodieTimeline getCommitTimeline() {
switch (metaClient.getTableType()) { switch (metaClient.getTableType()) {
@@ -223,13 +194,12 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
// Include commit action to be able to start doing a MOR over a COW dataset - no migration required // Include commit action to be able to start doing a MOR over a COW dataset - no migration required
return getActiveTimeline().getCommitsAndCompactionsTimeline(); return getActiveTimeline().getCommitsAndCompactionsTimeline();
default: default:
throw new HoodieException("Unsupported table type :"+ metaClient.getTableType()); throw new HoodieException("Unsupported table type :" + metaClient.getTableType());
} }
} }
/** /**
* Get only the completed (no-inflights) compaction commit timeline * Get only the completed (no-inflights) compaction commit timeline
* @return
*/ */
public HoodieTimeline getCompletedCompactionCommitTimeline() { public HoodieTimeline getCompletedCompactionCommitTimeline() {
return getCompactionCommitTimeline().filterCompletedInstants(); return getCompactionCommitTimeline().filterCompletedInstants();
@@ -238,8 +208,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
/** /**
* Get the compacted commit timeline visible for this table * Get the compacted commit timeline visible for this table
*
* @return
*/ */
public HoodieTimeline getCompactionCommitTimeline() { public HoodieTimeline getCompactionCommitTimeline() {
switch (metaClient.getTableType()) { switch (metaClient.getTableType()) {
@@ -250,13 +218,12 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
return getActiveTimeline().getTimelineOfActions( return getActiveTimeline().getTimelineOfActions(
Sets.newHashSet(HoodieActiveTimeline.COMPACTION_ACTION)); Sets.newHashSet(HoodieActiveTimeline.COMPACTION_ACTION));
default: default:
throw new HoodieException("Unsupported table type :"+ metaClient.getTableType()); throw new HoodieException("Unsupported table type :" + metaClient.getTableType());
} }
} }
/** /**
* Gets the commit action type * Gets the commit action type
* @return
*/ */
public String getCommitActionType() { public String getCommitActionType() {
switch (metaClient.getTableType()) { switch (metaClient.getTableType()) {
@@ -271,7 +238,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
/** /**
* Gets the action type for a compaction commit * Gets the action type for a compaction commit
* @return
*/ */
public String getCompactedCommitActionType() { public String getCompactedCommitActionType() {
switch (metaClient.getTableType()) { switch (metaClient.getTableType()) {
@@ -280,27 +246,18 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
case MERGE_ON_READ: case MERGE_ON_READ:
return HoodieTimeline.COMPACTION_ACTION; return HoodieTimeline.COMPACTION_ACTION;
} }
throw new HoodieException("Unsupported table type :"+ metaClient.getTableType()); throw new HoodieException("Unsupported table type :" + metaClient.getTableType());
} }
/** /**
* Perform the ultimate IO for a given upserted (RDD) partition * Perform the ultimate IO for a given upserted (RDD) partition
*
* @param partition
* @param recordIterator
* @param partitioner
*/ */
public abstract Iterator<List<WriteStatus>> handleUpsertPartition(String commitTime, public abstract Iterator<List<WriteStatus>> handleUpsertPartition(String commitTime,
Integer partition, Iterator<HoodieRecord<T>> recordIterator, Partitioner partitioner); Integer partition, Iterator<HoodieRecord<T>> recordIterator, Partitioner partitioner);
/** /**
* Perform the ultimate IO for a given inserted (RDD) partition * Perform the ultimate IO for a given inserted (RDD) partition
*
* @param partition
* @param recordIterator
* @param partitioner
*/ */
public abstract Iterator<List<WriteStatus>> handleInsertPartition(String commitTime, public abstract Iterator<List<WriteStatus>> handleInsertPartition(String commitTime,
Integer partition, Iterator<HoodieRecord<T>> recordIterator, Partitioner partitioner); Integer partition, Iterator<HoodieRecord<T>> recordIterator, Partitioner partitioner);
@@ -319,27 +276,21 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
} }
/** /**
* Run Compaction on the table. * Run Compaction on the table. Compaction arranges the data so that it is optimized for data
* Compaction arranges the data so that it is optimized for data access * access
*/ */
public abstract Optional<HoodieCompactionMetadata> compact(JavaSparkContext jsc); public abstract Optional<HoodieCompactionMetadata> compact(JavaSparkContext jsc);
/** /**
* Clean partition paths according to cleaning policy and returns the number * Clean partition paths according to cleaning policy and returns the number of files cleaned.
* of files cleaned.
*/ */
public abstract List<HoodieCleanStat> clean(JavaSparkContext jsc); public abstract List<HoodieCleanStat> clean(JavaSparkContext jsc);
/** /**
* Rollback the (inflight/committed) record changes with the given commit time. * Rollback the (inflight/committed) record changes with the given commit time. Four steps: (1)
* Four steps: * Atomically unpublish this commit (2) clean indexing data (3) clean new generated parquet files
* (1) Atomically unpublish this commit * / log blocks (4) Finally, delete .<action>.commit or .<action>.inflight file
* (2) clean indexing data
* (3) clean new generated parquet files / log blocks
* (4) Finally, delete .<action>.commit or .<action>.inflight file
* @param commits
* @return
* @throws HoodieRollbackException
*/ */
public abstract List<HoodieRollbackStat> rollback(JavaSparkContext jsc, List<String> commits) throws IOException; public abstract List<HoodieRollbackStat> rollback(JavaSparkContext jsc, List<String> commits)
throws IOException;
} }
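Note: a skeleton of the four-step rollback contract described in the javadoc above; unpublish, cleanIndex, deleteDataFiles and deleteMetaFiles are illustrative names, not methods of HoodieTable:

// Sketch only: the four rollback steps as a skeleton, with imports omitted.
abstract class RollbackFlowSketch {
  List<HoodieRollbackStat> rollbackSketch(JavaSparkContext jsc, List<String> commits)
      throws IOException {
    unpublish(commits);                 // (1) atomically move the commits back to inflight
    cleanIndex(commits);                // (2) drop index entries these commits created
    List<HoodieRollbackStat> stats =
        deleteDataFiles(jsc, commits);  // (3) delete their parquet files / log blocks
    deleteMetaFiles(commits);           // (4) remove the .<action>.commit / .inflight files
    return stats;
  }

  abstract void unpublish(List<String> commits);
  abstract void cleanIndex(List<String> commits);
  abstract List<HoodieRollbackStat> deleteDataFiles(JavaSparkContext jsc, List<String> commits);
  abstract void deleteMetaFiles(List<String> commits);
}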
View File
@@ -20,13 +20,13 @@ import com.uber.hoodie.common.model.HoodieRecordPayload;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
/** /**
* Repartition input records into at least expected number of output spark partitions. It should give * Repartition input records into at least expected number of output spark partitions. It should
* below guarantees * give below guarantees - Output spark partition will have records from only one hoodie partition.
* - Output spark partition will have records from only one hoodie partition. * - Average records per output spark partitions should be almost equal to (#inputRecords /
* - Average records per output spark partitions should be almost equal to (#inputRecords / #outputSparkPartitions) * #outputSparkPartitions) to avoid possible skews.
* to avoid possible skews.
*/ */
public interface UserDefinedBulkInsertPartitioner<T extends HoodieRecordPayload> { public interface UserDefinedBulkInsertPartitioner<T extends HoodieRecordPayload> {
JavaRDD<HoodieRecord<T>> repartitionRecords(JavaRDD<HoodieRecord<T>> records, int outputSparkPartitions); JavaRDD<HoodieRecord<T>> repartitionRecords(JavaRDD<HoodieRecord<T>> records,
int outputSparkPartitions);
} }
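Note: one hedged way to satisfy the contract above is to range-sort by hoodie partition path plus record key into the requested number of spark partitions; this is an illustration rather than a shipped implementation, it assumes HoodieRecord exposes getRecordKey(), and it only approximates the one-hoodie-partition-per-spark-partition guarantee for small partitions:

import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordPayload;
import org.apache.spark.api.java.JavaRDD;

// Sketch only: an illustrative implementation of the interface above.
public class SortedBulkInsertPartitioner<T extends HoodieRecordPayload>
    implements UserDefinedBulkInsertPartitioner<T> {

  @Override
  public JavaRDD<HoodieRecord<T>> repartitionRecords(JavaRDD<HoodieRecord<T>> records,
      int outputSparkPartitions) {
    // Range-partitioning on (partitionPath, recordKey) keeps records of one
    // hoodie partition contiguous and spreads them evenly across the output
    // spark partitions; tiny hoodie partitions may still share a split.
    return records.sortBy(
        record -> record.getPartitionPath() + "+" + record.getRecordKey(),
        true, outputSparkPartitions);
  }
}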
View File
@@ -20,15 +20,11 @@ package com.uber.hoodie.table;
import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordLocation; import com.uber.hoodie.common.model.HoodieRecordLocation;
import com.uber.hoodie.common.model.HoodieRecordPayload; import com.uber.hoodie.common.model.HoodieRecordPayload;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.PairFunction;
import java.io.Serializable; import java.io.Serializable;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import org.apache.spark.api.java.JavaRDD;
import scala.Option; import scala.Option;
import scala.Tuple2; import scala.Tuple2;
@@ -65,15 +61,18 @@ public class WorkloadProfile<T extends HoodieRecordPayload> implements Serializa
Map<Tuple2<String, Option<HoodieRecordLocation>>, Long> partitionLocationCounts = taggedRecords Map<Tuple2<String, Option<HoodieRecordLocation>>, Long> partitionLocationCounts = taggedRecords
.mapToPair(record -> .mapToPair(record ->
new Tuple2<>(new Tuple2<>(record.getPartitionPath(), Option.apply(record.getCurrentLocation())), record)) new Tuple2<>(
new Tuple2<>(record.getPartitionPath(), Option.apply(record.getCurrentLocation())),
record))
.countByKey(); .countByKey();
for (Map.Entry<Tuple2<String, Option<HoodieRecordLocation>>, Long> e: partitionLocationCounts.entrySet()) { for (Map.Entry<Tuple2<String, Option<HoodieRecordLocation>>, Long> e : partitionLocationCounts
.entrySet()) {
String partitionPath = e.getKey()._1(); String partitionPath = e.getKey()._1();
Long count = e.getValue(); Long count = e.getValue();
Option<HoodieRecordLocation> locOption = e.getKey()._2(); Option<HoodieRecordLocation> locOption = e.getKey()._2();
if (!partitionPathStatMap.containsKey(partitionPath)){ if (!partitionPathStatMap.containsKey(partitionPath)) {
partitionPathStatMap.put(partitionPath, new WorkloadStat()); partitionPathStatMap.put(partitionPath, new WorkloadStat());
} }
@@ -97,7 +96,7 @@ public class WorkloadProfile<T extends HoodieRecordPayload> implements Serializa
return partitionPathStatMap.keySet(); return partitionPathStatMap.keySet();
} }
public WorkloadStat getWorkloadStat(String partitionPath){ public WorkloadStat getWorkloadStat(String partitionPath) {
return partitionPathStatMap.get(partitionPath); return partitionPathStatMap.get(partitionPath);
} }
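Note: a sketch of how the per-(partitionPath, location) counts produced above would fold into per-partition stats; addInserts and addUpdates are illustrative WorkloadStat mutator names, and the imports match the file being diffed:

// Sketch only: a known location means the record updates an existing file,
// an absent location means it is a fresh insert.
final class WorkloadStatFoldSketch {
  static Map<String, WorkloadStat> buildStats(
      Map<Tuple2<String, Option<HoodieRecordLocation>>, Long> partitionLocationCounts) {
    Map<String, WorkloadStat> statMap = new HashMap<>();
    for (Map.Entry<Tuple2<String, Option<HoodieRecordLocation>>, Long> e
        : partitionLocationCounts.entrySet()) {
      String partitionPath = e.getKey()._1();
      Option<HoodieRecordLocation> locOption = e.getKey()._2();
      long count = e.getValue();
      WorkloadStat stat = statMap.computeIfAbsent(partitionPath, p -> new WorkloadStat());
      if (locOption.isDefined()) {
        stat.addUpdates(locOption.get(), count);  // illustrative mutator name
      } else {
        stat.addInserts(count);                   // illustrative mutator name
      }
    }
    return statMap;
  }
}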
View File
@@ -17,7 +17,6 @@
package com.uber.hoodie.table; package com.uber.hoodie.table;
import com.uber.hoodie.common.model.HoodieRecordLocation; import com.uber.hoodie.common.model.HoodieRecordLocation;
import java.io.Serializable; import java.io.Serializable;
import java.util.HashMap; import java.util.HashMap;
@@ -25,6 +24,7 @@ import java.util.HashMap;
* Wraps stats about a single partition path. * Wraps stats about a single partition path.
*/ */
public class WorkloadStat implements Serializable { public class WorkloadStat implements Serializable {
private long numInserts = 0L; private long numInserts = 0L;
private long numUpdates = 0L; private long numUpdates = 0L;
View File
@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# #
# Set root logger level to DEBUG and its only appender to A1. # Set root logger level to DEBUG and its only appender to A1.
log4j.rootLogger=INFO, A1 log4j.rootLogger=INFO, A1
# A1 is set to be a ConsoleAppender. # A1 is set to be a ConsoleAppender.
View File
@@ -22,13 +22,12 @@ import com.uber.hoodie.common.HoodieTestDataGenerator;
import com.uber.hoodie.common.model.HoodieAvroPayload; import com.uber.hoodie.common.model.HoodieAvroPayload;
import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieTableType; import com.uber.hoodie.common.model.HoodieTableType;
import com.uber.hoodie.common.table.HoodieTableConfig;
import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.config.HoodieIndexConfig; import com.uber.hoodie.config.HoodieIndexConfig;
import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.index.HoodieIndex; import com.uber.hoodie.index.HoodieIndex;
import java.util.List;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager; import org.apache.log4j.LogManager;
@@ -36,7 +35,6 @@ import org.apache.log4j.Logger;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import java.util.List;
/** /**
* Driver program that uses the Hoodie client with synthetic workload, and performs basic * Driver program that uses the Hoodie client with synthetic workload, and performs basic
@@ -44,13 +42,13 @@ import java.util.List;
*/ */
public class HoodieClientExample { public class HoodieClientExample {
@Parameter(names={"--table-path", "-p"}, description = "path for Hoodie sample table") @Parameter(names = {"--table-path", "-p"}, description = "path for Hoodie sample table")
private String tablePath = "file:///tmp/hoodie/sample-table"; private String tablePath = "file:///tmp/hoodie/sample-table";
@Parameter(names={"--table-name", "-n"}, description = "table name for Hoodie sample table") @Parameter(names = {"--table-name", "-n"}, description = "table name for Hoodie sample table")
private String tableName = "hoodie_rt"; private String tableName = "hoodie_rt";
@Parameter(names={"--table-type", "-t"}, description = "One of COPY_ON_WRITE or MERGE_ON_READ") @Parameter(names = {"--table-type", "-t"}, description = "One of COPY_ON_WRITE or MERGE_ON_READ")
private String tableType = HoodieTableType.COPY_ON_WRITE.name(); private String tableType = HoodieTableType.COPY_ON_WRITE.name();
@Parameter(names = {"--help", "-h"}, help = true) @Parameter(names = {"--help", "-h"}, help = true)
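Note: the flags above are plain JCommander parameters; a hedged sketch of how the driver would typically be launched, where everything except the parameter names is illustrative:

import com.beust.jcommander.JCommander;

// Sketch only: parse the flags declared above into a HoodieClientExample instance;
// the example's actual main method may differ.
final class HoodieClientExampleLaunchSketch {
  public static void main(String[] args) {
    HoodieClientExample example = new HoodieClientExample();
    new JCommander(example).parse(args);
    // e.g. --table-path file:///tmp/hoodie/sample-table --table-type MERGE_ON_READ --table-name hoodie_rt
    // the example's run logic (init table if missing, issue sample commits) would follow here
  }
}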
@@ -85,7 +83,9 @@ public class HoodieClientExample {
Path path = new Path(tablePath); Path path = new Path(tablePath);
FileSystem fs = FSUtils.getFs(); FileSystem fs = FSUtils.getFs();
if (!fs.exists(path)) { if (!fs.exists(path)) {
HoodieTableMetaClient.initTableType(fs, tablePath, HoodieTableType.valueOf(tableType), tableName, HoodieAvroPayload.class.getName()); HoodieTableMetaClient
.initTableType(fs, tablePath, HoodieTableType.valueOf(tableType), tableName,
HoodieAvroPayload.class.getName());
} }
// Create the write client to write some records in // Create the write client to write some records in
View File
@@ -16,8 +16,12 @@
package com.uber.hoodie; package com.uber.hoodie;
import com.google.common.collect.Iterables; import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import com.google.common.collect.Iterables;
import com.uber.hoodie.common.HoodieCleanStat; import com.uber.hoodie.common.HoodieCleanStat;
import com.uber.hoodie.common.HoodieClientTestUtils; import com.uber.hoodie.common.HoodieClientTestUtils;
import com.uber.hoodie.common.HoodieTestDataGenerator; import com.uber.hoodie.common.HoodieTestDataGenerator;
@@ -45,22 +49,6 @@ import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieRollbackException; import com.uber.hoodie.exception.HoodieRollbackException;
import com.uber.hoodie.index.HoodieIndex; import com.uber.hoodie.index.HoodieIndex;
import com.uber.hoodie.table.HoodieTable; import com.uber.hoodie.table.HoodieTable;
import org.apache.avro.generic.GenericRecord;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.scheduler.SparkListener;
import org.apache.spark.scheduler.SparkListenerTaskEnd;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.util.AccumulatorV2;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import java.io.File; import java.io.File;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.IOException; import java.io.IOException;
@@ -76,15 +64,24 @@ import java.util.Optional;
import java.util.Set; import java.util.Set;
import java.util.TreeSet; import java.util.TreeSet;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.avro.generic.GenericRecord;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.scheduler.SparkListener;
import org.apache.spark.scheduler.SparkListenerTaskEnd;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.util.AccumulatorV2;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import scala.collection.Iterator; import scala.collection.Iterator;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
private transient JavaSparkContext jsc = null; private transient JavaSparkContext jsc = null;
private transient SQLContext sqlContext; private transient SQLContext sqlContext;
private String basePath = null; private String basePath = null;
@@ -115,7 +112,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
private HoodieWriteConfig.Builder getConfigBuilder() { private HoodieWriteConfig.Builder getConfigBuilder() {
return HoodieWriteConfig.newBuilder().withPath(basePath) return HoodieWriteConfig.newBuilder().withPath(basePath)
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
.withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).build()) .withCompactionConfig(
HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).build())
.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build()) .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build())
.forTable("test-trip-table").withIndexConfig( .forTable("test-trip-table").withIndexConfig(
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()); HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build());
@@ -129,9 +127,11 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
} }
private void assertPartitionMetadata(String[] partitionPaths, FileSystem fs) throws IOException { private void assertPartitionMetadata(String[] partitionPaths, FileSystem fs) throws IOException {
for (String partitionPath: partitionPaths) { for (String partitionPath : partitionPaths) {
assertTrue(HoodiePartitionMetadata.hasPartitionMetadata(fs, new Path(basePath, partitionPath))); assertTrue(
HoodiePartitionMetadata pmeta = new HoodiePartitionMetadata(fs, new Path(basePath, partitionPath)); HoodiePartitionMetadata.hasPartitionMetadata(fs, new Path(basePath, partitionPath)));
HoodiePartitionMetadata pmeta = new HoodiePartitionMetadata(fs,
new Path(basePath, partitionPath));
pmeta.readFromFS(); pmeta.readFromFS();
assertEquals(3, pmeta.getPartitionDepth()); assertEquals(3, pmeta.getPartitionDepth());
} }
@@ -140,13 +140,13 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
private void checkTaggedRecords(List<HoodieRecord> taggedRecords, String commitTime) { private void checkTaggedRecords(List<HoodieRecord> taggedRecords, String commitTime) {
for (HoodieRecord rec : taggedRecords) { for (HoodieRecord rec : taggedRecords) {
assertTrue("Record " + rec + " found with no location.", rec.isCurrentLocationKnown()); assertTrue("Record " + rec + " found with no location.", rec.isCurrentLocationKnown());
assertEquals("All records should have commit time "+ commitTime+", since updates were made", assertEquals(
"All records should have commit time " + commitTime + ", since updates were made",
rec.getCurrentLocation().getCommitTime(), commitTime); rec.getCurrentLocation().getCommitTime(), commitTime);
} }
} }
@Test @Test
public void testFilterExist() throws Exception { public void testFilterExist() throws Exception {
HoodieWriteConfig config = getConfig(); HoodieWriteConfig config = getConfig();
@@ -231,17 +231,21 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// verify that there is a commit // verify that there is a commit
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline(); HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath())
.getCommitTimeline();
assertEquals("Expecting a single commit.", 1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants()); assertEquals("Expecting a single commit.", 1,
assertEquals("Latest commit should be 001", newCommitTime, timeline.lastInstant().get().getTimestamp()); timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
assertEquals("Latest commit should be 001", newCommitTime,
timeline.lastInstant().get().getTimestamp());
assertEquals("Must contain 200 records", assertEquals("Must contain 200 records",
records.size(), records.size(),
HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count()); HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count());
// Should have 100 records in table (check using Index), all in locations marked at commit // Should have 100 records in table (check using Index), all in locations marked at commit
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig()); HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
List<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table).collect(); List<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table)
.collect();
checkTaggedRecords(taggedRecords, "001"); checkTaggedRecords(taggedRecords, "001");
/** /**
@@ -265,8 +269,10 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// verify there are now 2 commits // verify there are now 2 commits
timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline(); timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline();
assertEquals("Expecting two commits.", timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), 2); assertEquals("Expecting two commits.",
assertEquals("Latest commit should be 004", timeline.lastInstant().get().getTimestamp(), newCommitTime); timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), 2);
assertEquals("Latest commit should be 004", timeline.lastInstant().get().getTimestamp(),
newCommitTime);
metaClient = new HoodieTableMetaClient(fs, basePath); metaClient = new HoodieTableMetaClient(fs, basePath);
table = HoodieTable.getHoodieTable(metaClient, getConfig()); table = HoodieTable.getHoodieTable(metaClient, getConfig());
@@ -277,21 +283,20 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// Check the entire dataset has 100 records still // Check the entire dataset has 100 records still
String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length]; String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length];
for (int i=0; i < fullPartitionPaths.length; i++) { for (int i = 0; i < fullPartitionPaths.length; i++) {
fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]); fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]);
} }
assertEquals("Must contain 200 records", assertEquals("Must contain 200 records",
200, 200,
HoodieClientTestUtils.read(basePath, sqlContext, fs, fullPartitionPaths).count()); HoodieClientTestUtils.read(basePath, sqlContext, fs, fullPartitionPaths).count());
// Check that the incremental consumption from time 000 // Check that the incremental consumption from time 000
assertEquals("Incremental consumption from time 002, should give all records in commit 004", assertEquals("Incremental consumption from time 002, should give all records in commit 004",
HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(), HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
HoodieClientTestUtils.readSince(basePath, sqlContext, timeline, "002").count()); HoodieClientTestUtils.readSince(basePath, sqlContext, timeline, "002").count());
assertEquals("Incremental consumption from time 001, should give all records in commit 004", assertEquals("Incremental consumption from time 001, should give all records in commit 004",
HoodieClientTestUtils.readCommit(basePath, sqlContext,timeline, newCommitTime).count(), HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
HoodieClientTestUtils.readSince(basePath, sqlContext,timeline, "001").count()); HoodieClientTestUtils.readSince(basePath, sqlContext, timeline, "001").count());
} }
@Test @Test
@@ -322,15 +327,19 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// verify that there is a commit // verify that there is a commit
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline(); HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath())
assertEquals("Expecting a single commit.", 1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants()); .getCommitTimeline();
assertEquals("Latest commit should be 001", newCommitTime, timeline.lastInstant().get().getTimestamp()); assertEquals("Expecting a single commit.", 1,
timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
assertEquals("Latest commit should be 001", newCommitTime,
timeline.lastInstant().get().getTimestamp());
assertEquals("Must contain 200 records", fewRecordsForInsert.size(), assertEquals("Must contain 200 records", fewRecordsForInsert.size(),
HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count()); HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count());
// Should have 100 records in table (check using Index), all in locations marked at commit // Should have 100 records in table (check using Index), all in locations marked at commit
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig()); HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
List<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(fewRecordsForInsert, 1), table).collect(); List<HoodieRecord> taggedRecords = index
.tagLocation(jsc.parallelize(fewRecordsForInsert, 1), table).collect();
checkTaggedRecords(taggedRecords, "001"); checkTaggedRecords(taggedRecords, "001");
/** /**
@@ -339,8 +348,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
newCommitTime = "004"; newCommitTime = "004";
client.startCommitWithTime(newCommitTime); client.startCommitWithTime(newCommitTime);
fewRecordsForDelete = records.subList(0,50); fewRecordsForDelete = records.subList(0, 50);
List<HoodieRecord> fewRecordsForUpdate = records.subList(50,100); List<HoodieRecord> fewRecordsForUpdate = records.subList(50, 100);
records = dataGen.generateDeletesFromExistingRecords(fewRecordsForDelete); records = dataGen.generateDeletesFromExistingRecords(fewRecordsForDelete);
records.addAll(fewRecordsForUpdate); records.addAll(fewRecordsForUpdate);
@@ -351,18 +360,19 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// verify there are now 2 commits // verify there are now 2 commits
timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline(); timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline();
assertEquals("Expecting two commits.", timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), 2); assertEquals("Expecting two commits.",
assertEquals("Latest commit should be 004", timeline.lastInstant().get().getTimestamp(), newCommitTime); timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), 2);
assertEquals("Latest commit should be 004", timeline.lastInstant().get().getTimestamp(),
newCommitTime);
// Check the entire dataset has 150 records(200-50) still // Check the entire dataset has 150 records(200-50) still
String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length]; String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length];
for (int i=0; i < fullPartitionPaths.length; i++) { for (int i = 0; i < fullPartitionPaths.length; i++) {
fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]); fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]);
} }
assertEquals("Must contain 150 records", 150, assertEquals("Must contain 150 records", 150,
HoodieClientTestUtils.read(basePath, sqlContext, fs, fullPartitionPaths).count()); HoodieClientTestUtils.read(basePath, sqlContext, fs, fullPartitionPaths).count());
// Check that the incremental consumption from time 000 // Check that the incremental consumption from time 000
assertEquals("Incremental consumption from latest commit, should give 50 updated records", assertEquals("Incremental consumption from latest commit, should give 50 updated records",
50, 50,
@@ -384,7 +394,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
.build()).build(); .build()).build();
HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
FileSystem fs = FSUtils.getFs(); FileSystem fs = FSUtils.getFs();
HoodieTestDataGenerator.writePartitionMetadata(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, basePath); HoodieTestDataGenerator
.writePartitionMetadata(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, basePath);
/** /**
* Write 1 (only inserts) * Write 1 (only inserts)
@@ -393,7 +404,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
client.startCommitWithTime(newCommitTime); client.startCommitWithTime(newCommitTime);
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 200); List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 200);
List<WriteStatus> statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); List<WriteStatus> statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime)
.collect();
assertNoWriteErrors(statuses); assertNoWriteErrors(statuses);
/** /**
@@ -437,7 +449,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// Verify there are no errors // Verify there are no errors
assertNoWriteErrors(statuses); assertNoWriteErrors(statuses);
List<String> partitionPaths = FSUtils.getAllPartitionPaths(fs, cfg.getBasePath(), getConfig().shouldAssumeDatePartitioning()); List<String> partitionPaths = FSUtils
.getAllPartitionPaths(fs, cfg.getBasePath(), getConfig().shouldAssumeDatePartitioning());
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig()); HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
final TableFileSystemView.ReadOptimizedView view = table.getROFileSystemView(); final TableFileSystemView.ReadOptimizedView view = table.getROFileSystemView();
@@ -478,7 +491,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
.build()).build(); .build()).build();
HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
FileSystem fs = FSUtils.getFs(); FileSystem fs = FSUtils.getFs();
HoodieTestDataGenerator.writePartitionMetadata(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, basePath); HoodieTestDataGenerator
.writePartitionMetadata(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, basePath);
/** /**
* Write 1 (only inserts) * Write 1 (only inserts)
@@ -515,7 +529,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
// Verify there are no errors // Verify there are no errors
assertNoWriteErrors(statuses); assertNoWriteErrors(statuses);
List<String> partitionPaths = FSUtils.getAllPartitionPaths(fs, cfg.getBasePath(), getConfig().shouldAssumeDatePartitioning()); List<String> partitionPaths = FSUtils
.getAllPartitionPaths(fs, cfg.getBasePath(), getConfig().shouldAssumeDatePartitioning());
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig()); HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
final TableFileSystemView.ReadOptimizedView view1 = table.getROFileSystemView(); final TableFileSystemView.ReadOptimizedView view1 = table.getROFileSystemView();
@@ -525,7 +540,6 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
}).collect(Collectors.toList()); }).collect(Collectors.toList());
assertEquals("The data files for commit 003 should be present", 3, dataFiles.size()); assertEquals("The data files for commit 003 should be present", 3, dataFiles.size());
/** /**
* Write 4 (updates) * Write 4 (updates)
*/ */
@@ -546,7 +560,6 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
}).collect(Collectors.toList()); }).collect(Collectors.toList());
assertEquals("The data files for commit 004 should be present", 3, dataFiles.size()); assertEquals("The data files for commit 004 should be present", 3, dataFiles.size());
// rolling back to a non existent savepoint must not succeed // rolling back to a non existent savepoint must not succeed
try { try {
client.rollbackToSavepoint("001"); client.rollbackToSavepoint("001");
@@ -606,8 +619,10 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// verify that there is a commit // verify that there is a commit
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline(); HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath())
assertEquals("Expecting a single commit.", 1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants()); .getCommitTimeline();
assertEquals("Expecting a single commit.", 1,
timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
// Should have 100 records in table (check using Index), all in locations marked at commit // Should have 100 records in table (check using Index), all in locations marked at commit
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig()); HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
assertFalse(table.getCompletedCommitTimeline().empty()); assertFalse(table.getCompletedCommitTimeline().empty());
@@ -617,7 +632,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
assertEquals("The clean instant should be the same as the commit instant", commitTime, assertEquals("The clean instant should be the same as the commit instant", commitTime,
table.getCompletedCleanTimeline().getInstants().findFirst().get().getTimestamp()); table.getCompletedCleanTimeline().getInstants().findFirst().get().getTimestamp());
List<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table).collect(); List<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table)
.collect();
checkTaggedRecords(taggedRecords, newCommitTime); checkTaggedRecords(taggedRecords, newCommitTime);
// Keep doing some writes and clean inline. Make sure we have expected number of files remaining. // Keep doing some writes and clean inline. Make sure we have expected number of files remaining.
@@ -641,18 +657,20 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// compute all the versions of all files, from time 0 // compute all the versions of all files, from time 0
HashMap<String, TreeSet<String>> fileIdToVersions = new HashMap<>(); HashMap<String, TreeSet<String>> fileIdToVersions = new HashMap<>();
for (HoodieInstant entry : timeline.getInstants().collect(Collectors.toList())) { for (HoodieInstant entry : timeline.getInstants().collect(Collectors.toList())) {
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(entry).get()); HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
.fromBytes(timeline.getInstantDetails(entry).get());
for (HoodieWriteStat wstat : commitMetadata.getWriteStats(partitionPath)) { for (HoodieWriteStat wstat : commitMetadata.getWriteStats(partitionPath)) {
if (!fileIdToVersions.containsKey(wstat.getFileId())) { if (!fileIdToVersions.containsKey(wstat.getFileId())) {
fileIdToVersions.put(wstat.getFileId(), new TreeSet<>()); fileIdToVersions.put(wstat.getFileId(), new TreeSet<>());
} }
fileIdToVersions.get(wstat.getFileId()).add(FSUtils.getCommitTime(new Path(wstat.getPath()).getName())); fileIdToVersions.get(wstat.getFileId())
.add(FSUtils.getCommitTime(new Path(wstat.getPath()).getName()));
} }
} }
List<HoodieFileGroup> fileGroups = fsView.getAllFileGroups(partitionPath)
List<HoodieFileGroup> fileGroups = fsView.getAllFileGroups(partitionPath).collect(Collectors.toList()); .collect(Collectors.toList());
for (HoodieFileGroup fileGroup : fileGroups) { for (HoodieFileGroup fileGroup : fileGroups) {
// No file has no more than max versions // No file has no more than max versions
@@ -665,7 +683,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// Each file, has the latest N versions (i.e cleaning gets rid of older versions) // Each file, has the latest N versions (i.e cleaning gets rid of older versions)
List<String> commitedVersions = new ArrayList<>(fileIdToVersions.get(fileId)); List<String> commitedVersions = new ArrayList<>(fileIdToVersions.get(fileId));
for (int i = 0; i < dataFiles.size(); i++) { for (int i = 0; i < dataFiles.size(); i++) {
assertEquals("File " + fileId + " does not have latest versions on commits" + commitedVersions, assertEquals(
"File " + fileId + " does not have latest versions on commits" + commitedVersions,
Iterables.get(dataFiles, i).getCommitTime(), Iterables.get(dataFiles, i).getCommitTime(),
commitedVersions.get(commitedVersions.size() - 1 - i)); commitedVersions.get(commitedVersions.size() - 1 - i));
} }
@@ -700,8 +719,10 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// verify that there is a commit // verify that there is a commit
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline(); HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath())
assertEquals("Expecting a single commit.", 1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants()); .getCommitTimeline();
assertEquals("Expecting a single commit.", 1,
timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
// Should have 100 records in table (check using Index), all in locations marked at commit // Should have 100 records in table (check using Index), all in locations marked at commit
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig()); HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
@@ -712,7 +733,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
assertEquals("The clean instant should be the same as the commit instant", commitTime, assertEquals("The clean instant should be the same as the commit instant", commitTime,
table.getCompletedCleanTimeline().getInstants().findFirst().get().getTimestamp()); table.getCompletedCleanTimeline().getInstants().findFirst().get().getTimestamp());
List<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table).collect(); List<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table)
.collect();
checkTaggedRecords(taggedRecords, newCommitTime); checkTaggedRecords(taggedRecords, newCommitTime);
// Keep doing some writes and clean inline. Make sure we have expected number of files remaining. // Keep doing some writes and clean inline. Make sure we have expected number of files remaining.
@@ -734,7 +756,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
activeTimeline.getInstants().collect(Collectors.toSet()); activeTimeline.getInstants().collect(Collectors.toSet());
if (earliestRetainedCommit.isPresent()) { if (earliestRetainedCommit.isPresent()) {
acceptableCommits.removeAll( acceptableCommits.removeAll(
activeTimeline.findInstantsInRange("000", earliestRetainedCommit.get().getTimestamp()).getInstants() activeTimeline.findInstantsInRange("000", earliestRetainedCommit.get().getTimestamp())
.getInstants()
.collect(Collectors.toSet())); .collect(Collectors.toSet()));
acceptableCommits.add(earliestRetainedCommit.get()); acceptableCommits.add(earliestRetainedCommit.get());
} }
@@ -742,7 +765,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
TableFileSystemView fsView = table1.getFileSystemView(); TableFileSystemView fsView = table1.getFileSystemView();
// Need to ensure the following // Need to ensure the following
for (String partitionPath : dataGen.getPartitionPaths()) { for (String partitionPath : dataGen.getPartitionPaths()) {
List<HoodieFileGroup> fileGroups = fsView.getAllFileGroups(partitionPath).collect(Collectors.toList()); List<HoodieFileGroup> fileGroups = fsView.getAllFileGroups(partitionPath)
.collect(Collectors.toList());
for (HoodieFileGroup fileGroup : fileGroups) { for (HoodieFileGroup fileGroup : fileGroups) {
Set<String> commitTimes = new HashSet<>(); Set<String> commitTimes = new HashSet<>();
fileGroup.getAllDataFiles().forEach(value -> { fileGroup.getAllDataFiles().forEach(value -> {
@@ -765,10 +789,9 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
String commitTime3 = "20160506030611"; String commitTime3 = "20160506030611";
new File(basePath + "/.hoodie").mkdirs(); new File(basePath + "/.hoodie").mkdirs();
HoodieTestDataGenerator.writePartitionMetadata(FSUtils.getFs(), HoodieTestDataGenerator.writePartitionMetadata(FSUtils.getFs(),
new String[] {"2016/05/01", "2016/05/02", "2016/05/06"}, new String[]{"2016/05/01", "2016/05/02", "2016/05/06"},
basePath); basePath);
// Only first two have commit files // Only first two have commit files
HoodieTestUtils.createCommitFiles(basePath, commitTime1, commitTime2); HoodieTestUtils.createCommitFiles(basePath, commitTime1, commitTime2);
// Third one has a .inflight intermediate commit file // Third one has a .inflight intermediate commit file
@@ -816,7 +839,6 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
client.rollback(commitTime3); client.rollback(commitTime3);
assertFalse(HoodieTestUtils.doesInflightExist(basePath, commitTime3)); assertFalse(HoodieTestUtils.doesInflightExist(basePath, commitTime3));
// Rollback commit2 // Rollback commit2
client.rollback(commitTime2); client.rollback(commitTime2);
assertFalse(HoodieTestUtils.doesCommitExist(basePath, commitTime2)); assertFalse(HoodieTestUtils.doesCommitExist(basePath, commitTime2));
@@ -839,7 +861,6 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime2, file22) || HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime2, file22) ||
HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime2, file23)); HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime2, file23));
// Let's rollback commit1, Check results // Let's rollback commit1, Check results
client.rollback(commitTime1); client.rollback(commitTime1);
assertFalse(HoodieTestUtils.doesCommitExist(basePath, commitTime1)); assertFalse(HoodieTestUtils.doesCommitExist(basePath, commitTime1));
@@ -858,7 +879,7 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
String commitTime3 = "20160506030611"; String commitTime3 = "20160506030611";
new File(basePath + "/.hoodie").mkdirs(); new File(basePath + "/.hoodie").mkdirs();
HoodieTestDataGenerator.writePartitionMetadata(FSUtils.getFs(), HoodieTestDataGenerator.writePartitionMetadata(FSUtils.getFs(),
new String[] {"2016/05/01", "2016/05/02", "2016/05/06"}, new String[]{"2016/05/01", "2016/05/02", "2016/05/06"},
basePath); basePath);
// One good commit // One good commit
@@ -940,26 +961,29 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
final String TEST_PARTITION_PATH = "2016/09/26"; final String TEST_PARTITION_PATH = "2016/09/26";
final int INSERT_SPLIT_LIMIT = 100; final int INSERT_SPLIT_LIMIT = 100;
// setup the small file handling params // setup the small file handling params
HoodieWriteConfig config = getSmallInsertWriteConfig(INSERT_SPLIT_LIMIT); // hold upto 200 records max HoodieWriteConfig config = getSmallInsertWriteConfig(
dataGen = new HoodieTestDataGenerator(new String[] {TEST_PARTITION_PATH}); INSERT_SPLIT_LIMIT); // hold upto 200 records max
dataGen = new HoodieTestDataGenerator(new String[]{TEST_PARTITION_PATH});
HoodieWriteClient client = new HoodieWriteClient(jsc, config); HoodieWriteClient client = new HoodieWriteClient(jsc, config);
// Inserts => will write file1 // Inserts => will write file1
String commitTime1 = "001"; String commitTime1 = "001";
client.startCommitWithTime(commitTime1); client.startCommitWithTime(commitTime1);
List<HoodieRecord> inserts1 = dataGen.generateInserts(commitTime1, INSERT_SPLIT_LIMIT); // this writes ~500kb List<HoodieRecord> inserts1 = dataGen
.generateInserts(commitTime1, INSERT_SPLIT_LIMIT); // this writes ~500kb
Set<String> keys1 = HoodieClientTestUtils.getRecordKeys(inserts1); Set<String> keys1 = HoodieClientTestUtils.getRecordKeys(inserts1);
JavaRDD<HoodieRecord> insertRecordsRDD1 = jsc.parallelize(inserts1, 1); JavaRDD<HoodieRecord> insertRecordsRDD1 = jsc.parallelize(inserts1, 1);
List<WriteStatus> statuses= client.upsert(insertRecordsRDD1, commitTime1).collect(); List<WriteStatus> statuses = client.upsert(insertRecordsRDD1, commitTime1).collect();
assertNoWriteErrors(statuses); assertNoWriteErrors(statuses);
assertEquals("Just 1 file needs to be added.", 1, statuses.size()); assertEquals("Just 1 file needs to be added.", 1, statuses.size());
String file1 = statuses.get(0).getFileId(); String file1 = statuses.get(0).getFileId();
assertEquals("file should contain 100 records", assertEquals("file should contain 100 records",
ParquetUtils.readRowKeysFromParquet(new Path(basePath, TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime1, 0, file1))).size(), ParquetUtils.readRowKeysFromParquet(new Path(basePath,
TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime1, 0, file1))).size(),
100); 100);
// Update + Inserts such that they just expand file1 // Update + Inserts such that they just expand file1
@@ -977,15 +1001,20 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
assertEquals("Just 1 file needs to be updated.", 1, statuses.size()); assertEquals("Just 1 file needs to be updated.", 1, statuses.size());
assertEquals("Existing file should be expanded", file1, statuses.get(0).getFileId()); assertEquals("Existing file should be expanded", file1, statuses.get(0).getFileId());
assertEquals("Existing file should be expanded", commitTime1, statuses.get(0).getStat().getPrevCommit()); assertEquals("Existing file should be expanded", commitTime1,
Path newFile = new Path(basePath, TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime2, 0, file1)); statuses.get(0).getStat().getPrevCommit());
assertEquals("file should contain 140 records", ParquetUtils.readRowKeysFromParquet(newFile).size(), 140); Path newFile = new Path(basePath,
TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime2, 0, file1));
assertEquals("file should contain 140 records",
ParquetUtils.readRowKeysFromParquet(newFile).size(), 140);
List<GenericRecord> records = ParquetUtils.readAvroRecords(newFile); List<GenericRecord> records = ParquetUtils.readAvroRecords(newFile);
for (GenericRecord record: records) { for (GenericRecord record : records) {
String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
assertEquals("only expect commit2", commitTime2, record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString()); assertEquals("only expect commit2", commitTime2,
assertTrue("key expected to be part of commit2", keys2.contains(recordKey) || keys1.contains(recordKey)); record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString());
assertTrue("key expected to be part of commit2",
keys2.contains(recordKey) || keys1.contains(recordKey));
} }
// update + inserts such that file1 is updated and expanded, a new file2 is created. // update + inserts such that file1 is updated and expanded, a new file2 is created.
@@ -1004,14 +1033,15 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
HoodieTable table = HoodieTable.getHoodieTable(metadata, config); HoodieTable table = HoodieTable.getHoodieTable(metadata, config);
TableFileSystemView.ReadOptimizedView fileSystemView = table.getROFileSystemView(); TableFileSystemView.ReadOptimizedView fileSystemView = table.getROFileSystemView();
List<HoodieDataFile> files = fileSystemView.getLatestDataFilesBeforeOrOn(TEST_PARTITION_PATH, commitTime3).collect( List<HoodieDataFile> files = fileSystemView
.getLatestDataFilesBeforeOrOn(TEST_PARTITION_PATH, commitTime3).collect(
Collectors.toList()); Collectors.toList());
int numTotalInsertsInCommit3 = 0; int numTotalInsertsInCommit3 = 0;
for (HoodieDataFile file: files) { for (HoodieDataFile file : files) {
if (file.getFileName().contains(file1)) { if (file.getFileName().contains(file1)) {
assertEquals("Existing file should be expanded", commitTime3, file.getCommitTime()); assertEquals("Existing file should be expanded", commitTime3, file.getCommitTime());
records = ParquetUtils.readAvroRecords(new Path(file.getPath())); records = ParquetUtils.readAvroRecords(new Path(file.getPath()));
for (GenericRecord record: records) { for (GenericRecord record : records) {
String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
String recordCommitTime = record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(); String recordCommitTime = record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString();
if (recordCommitTime.equals(commitTime3)) { if (recordCommitTime.equals(commitTime3)) {
@@ -1023,13 +1053,15 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
} }
} }
} }
assertEquals("All keys added in commit 2 must be updated in commit3 correctly", 0, keys2.size()); assertEquals("All keys added in commit 2 must be updated in commit3 correctly", 0,
keys2.size());
} else { } else {
assertEquals("New file must be written for commit 3", commitTime3, file.getCommitTime()); assertEquals("New file must be written for commit 3", commitTime3, file.getCommitTime());
records = ParquetUtils.readAvroRecords(new Path(file.getPath())); records = ParquetUtils.readAvroRecords(new Path(file.getPath()));
for (GenericRecord record: records) { for (GenericRecord record : records) {
String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
assertEquals("only expect commit3", commitTime3, record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString()); assertEquals("only expect commit3", commitTime3,
record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString());
assertTrue("key expected to be part of commit3", keys3.contains(recordKey)); assertTrue("key expected to be part of commit3", keys3.contains(recordKey));
} }
numTotalInsertsInCommit3 += records.size(); numTotalInsertsInCommit3 += records.size();
@@ -1044,17 +1076,19 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
final String TEST_PARTITION_PATH = "2016/09/26"; final String TEST_PARTITION_PATH = "2016/09/26";
final int INSERT_SPLIT_LIMIT = 100; final int INSERT_SPLIT_LIMIT = 100;
// setup the small file handling params // setup the small file handling params
HoodieWriteConfig config = getSmallInsertWriteConfig(INSERT_SPLIT_LIMIT); // hold upto 200 records max HoodieWriteConfig config = getSmallInsertWriteConfig(
dataGen = new HoodieTestDataGenerator(new String[] {TEST_PARTITION_PATH}); INSERT_SPLIT_LIMIT); // hold upto 200 records max
dataGen = new HoodieTestDataGenerator(new String[]{TEST_PARTITION_PATH});
HoodieWriteClient client = new HoodieWriteClient(jsc, config); HoodieWriteClient client = new HoodieWriteClient(jsc, config);
// Inserts => will write file1 // Inserts => will write file1
String commitTime1 = "001"; String commitTime1 = "001";
client.startCommitWithTime(commitTime1); client.startCommitWithTime(commitTime1);
List<HoodieRecord> inserts1 = dataGen.generateInserts(commitTime1, INSERT_SPLIT_LIMIT); // this writes ~500kb List<HoodieRecord> inserts1 = dataGen
.generateInserts(commitTime1, INSERT_SPLIT_LIMIT); // this writes ~500kb
Set<String> keys1 = HoodieClientTestUtils.getRecordKeys(inserts1); Set<String> keys1 = HoodieClientTestUtils.getRecordKeys(inserts1);
JavaRDD<HoodieRecord> insertRecordsRDD1 = jsc.parallelize(inserts1, 1); JavaRDD<HoodieRecord> insertRecordsRDD1 = jsc.parallelize(inserts1, 1);
List<WriteStatus> statuses= client.insert(insertRecordsRDD1, commitTime1).collect(); List<WriteStatus> statuses = client.insert(insertRecordsRDD1, commitTime1).collect();
assertNoWriteErrors(statuses); assertNoWriteErrors(statuses);
assertPartitionMetadata(new String[]{TEST_PARTITION_PATH}, FSUtils.getFs()); assertPartitionMetadata(new String[]{TEST_PARTITION_PATH}, FSUtils.getFs());
@@ -1062,7 +1096,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
assertEquals("Just 1 file needs to be added.", 1, statuses.size()); assertEquals("Just 1 file needs to be added.", 1, statuses.size());
String file1 = statuses.get(0).getFileId(); String file1 = statuses.get(0).getFileId();
assertEquals("file should contain 100 records", assertEquals("file should contain 100 records",
ParquetUtils.readRowKeysFromParquet(new Path(basePath, TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime1, 0, file1))).size(), ParquetUtils.readRowKeysFromParquet(new Path(basePath,
TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime1, 0, file1))).size(),
100); 100);
// Second, set of Inserts should just expand file1 // Second, set of Inserts should just expand file1
@@ -1076,16 +1111,21 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
assertEquals("Just 1 file needs to be updated.", 1, statuses.size()); assertEquals("Just 1 file needs to be updated.", 1, statuses.size());
assertEquals("Existing file should be expanded", file1, statuses.get(0).getFileId()); assertEquals("Existing file should be expanded", file1, statuses.get(0).getFileId());
assertEquals("Existing file should be expanded", commitTime1, statuses.get(0).getStat().getPrevCommit()); assertEquals("Existing file should be expanded", commitTime1,
Path newFile = new Path(basePath, TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime2, 0, file1)); statuses.get(0).getStat().getPrevCommit());
assertEquals("file should contain 140 records", ParquetUtils.readRowKeysFromParquet(newFile).size(), 140); Path newFile = new Path(basePath,
TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime2, 0, file1));
assertEquals("file should contain 140 records",
ParquetUtils.readRowKeysFromParquet(newFile).size(), 140);
List<GenericRecord> records = ParquetUtils.readAvroRecords(newFile); List<GenericRecord> records = ParquetUtils.readAvroRecords(newFile);
for (GenericRecord record: records) { for (GenericRecord record : records) {
String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
String recCommitTime = record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(); String recCommitTime = record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString();
assertTrue("Record expected to be part of commit 1 or commit2", commitTime1.equals(recCommitTime) || commitTime2.equals(recCommitTime)); assertTrue("Record expected to be part of commit 1 or commit2",
assertTrue("key expected to be part of commit 1 or commit2", keys2.contains(recordKey) || keys1.contains(recordKey)); commitTime1.equals(recCommitTime) || commitTime2.equals(recCommitTime));
assertTrue("key expected to be part of commit 1 or commit2",
keys2.contains(recordKey) || keys1.contains(recordKey));
} }
// Lots of inserts such that file1 is updated and expanded, a new file2 is created. // Lots of inserts such that file1 is updated and expanded, a new file2 is created.
@@ -1097,7 +1137,6 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
assertNoWriteErrors(statuses); assertNoWriteErrors(statuses);
assertEquals("2 files needs to be committed.", 2, statuses.size()); assertEquals("2 files needs to be committed.", 2, statuses.size());
FileSystem fs = FSUtils.getFs(); FileSystem fs = FSUtils.getFs();
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config); HoodieTable table = HoodieTable.getHoodieTable(metaClient, config);
@@ -1106,14 +1145,14 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
.collect(Collectors.toList()); .collect(Collectors.toList());
assertEquals("Total of 2 valid data files", 2, files.size()); assertEquals("Total of 2 valid data files", 2, files.size());
int totalInserts = 0; int totalInserts = 0;
for (HoodieDataFile file: files) { for (HoodieDataFile file : files) {
assertEquals("All files must be at commit 3", commitTime3, file.getCommitTime()); assertEquals("All files must be at commit 3", commitTime3, file.getCommitTime());
records = ParquetUtils.readAvroRecords(new Path(file.getPath())); records = ParquetUtils.readAvroRecords(new Path(file.getPath()));
totalInserts += records.size(); totalInserts += records.size();
} }
assertEquals("Total number of records must add up", totalInserts, inserts1.size() + inserts2.size() + insert3.size()); assertEquals("Total number of records must add up", totalInserts,
inserts1.size() + inserts2.size() + insert3.size());
} }
@Test @Test
@@ -1130,27 +1169,35 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
String file1P0C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "000"); String file1P0C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "000");
String file1P1C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "000"); String file1P1C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "000");
HoodieTable table = HoodieTable HoodieTable table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config); .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
config);
List<HoodieCleanStat> hoodieCleanStatsOne = table.clean(jsc); List<HoodieCleanStat> hoodieCleanStatsOne = table.clean(jsc);
assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsOne, partitionPaths[0]).getSuccessDeleteFiles().size()); assertEquals("Must not clean any files", 0,
assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsOne, partitionPaths[1]).getSuccessDeleteFiles().size()); getCleanStat(hoodieCleanStatsOne, partitionPaths[0]).getSuccessDeleteFiles().size());
assertEquals("Must not clean any files", 0,
getCleanStat(hoodieCleanStatsOne, partitionPaths[1]).getSuccessDeleteFiles().size());
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0)); assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "000", file1P1C0)); assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "000", file1P1C0));
// make next commit, with 1 insert & 1 update per partition // make next commit, with 1 insert & 1 update per partition
HoodieTestUtils.createCommitFiles(basePath, "001"); HoodieTestUtils.createCommitFiles(basePath, "001");
table = HoodieTable table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config); .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
config);
String file2P0C1 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "001"); // insert String file2P0C1 = HoodieTestUtils
String file2P1C1 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "001"); // insert .createNewDataFile(basePath, partitionPaths[0], "001"); // insert
String file2P1C1 = HoodieTestUtils
.createNewDataFile(basePath, partitionPaths[1], "001"); // insert
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "001", file1P0C0); // update HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "001", file1P0C0); // update
HoodieTestUtils.createDataFile(basePath, partitionPaths[1], "001", file1P1C0); // update HoodieTestUtils.createDataFile(basePath, partitionPaths[1], "001", file1P1C0); // update
List<HoodieCleanStat> hoodieCleanStatsTwo = table.clean(jsc); List<HoodieCleanStat> hoodieCleanStatsTwo = table.clean(jsc);
assertEquals("Must clean 1 file" , 1, getCleanStat(hoodieCleanStatsTwo, partitionPaths[0]).getSuccessDeleteFiles().size()); assertEquals("Must clean 1 file", 1,
assertEquals("Must clean 1 file" , 1, getCleanStat(hoodieCleanStatsTwo, partitionPaths[1]).getSuccessDeleteFiles().size()); getCleanStat(hoodieCleanStatsTwo, partitionPaths[0]).getSuccessDeleteFiles().size());
assertEquals("Must clean 1 file", 1,
getCleanStat(hoodieCleanStatsTwo, partitionPaths[1]).getSuccessDeleteFiles().size());
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1)); assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "001", file2P1C1)); assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "001", file2P1C1));
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0)); assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0));
@@ -1159,14 +1206,16 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// make next commit, with 2 updates to existing files, and 1 insert // make next commit, with 2 updates to existing files, and 1 insert
HoodieTestUtils.createCommitFiles(basePath, "002"); HoodieTestUtils.createCommitFiles(basePath, "002");
table = HoodieTable table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config); .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
config);
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file1P0C0); // update HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file1P0C0); // update
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file2P0C1); // update HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file2P0C1); // update
String file3P0C2 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "002"); String file3P0C2 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "002");
List<HoodieCleanStat> hoodieCleanStatsThree = table.clean(jsc); List<HoodieCleanStat> hoodieCleanStatsThree = table.clean(jsc);
assertEquals("Must clean two files" , 2, getCleanStat(hoodieCleanStatsThree, partitionPaths[0]).getSuccessDeleteFiles().size()); assertEquals("Must clean two files", 2,
getCleanStat(hoodieCleanStatsThree, partitionPaths[0]).getSuccessDeleteFiles().size());
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file1P0C0)); assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file1P0C0));
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1)); assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "002", file3P0C2)); assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "002", file3P0C2));
@@ -1174,7 +1223,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// No cleaning on partially written file, with no commit. // No cleaning on partially written file, with no commit.
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "003", file3P0C2); // update HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "003", file3P0C2); // update
List<HoodieCleanStat> hoodieCleanStatsFour = table.clean(jsc); List<HoodieCleanStat> hoodieCleanStatsFour = table.clean(jsc);
assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsFour, partitionPaths[0]).getSuccessDeleteFiles().size()); assertEquals("Must not clean any files", 0,
getCleanStat(hoodieCleanStatsFour, partitionPaths[0]).getSuccessDeleteFiles().size());
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "002", file3P0C2)); assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "002", file3P0C2));
} }
@@ -1187,31 +1237,39 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
.withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS) .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS)
.retainFileVersions(1).build()).build(); .retainFileVersions(1).build()).build();
HoodieTableMetaClient metaClient = HoodieTestUtils
HoodieTableMetaClient metaClient = HoodieTestUtils.initTableType(basePath, HoodieTableType.MERGE_ON_READ); .initTableType(basePath, HoodieTableType.MERGE_ON_READ);
// Make 3 files, one base file and 2 log files associated with base file // Make 3 files, one base file and 2 log files associated with base file
String file1P0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "000"); String file1P0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "000");
String file2P0L0 = HoodieTestUtils.createNewLogFile(basePath, partitionPaths[0], "000", file1P0, Optional.empty()); String file2P0L0 = HoodieTestUtils
String file2P0L1 = HoodieTestUtils.createNewLogFile(basePath, partitionPaths[0], "000", file1P0, Optional.of(2)); .createNewLogFile(basePath, partitionPaths[0], "000", file1P0, Optional.empty());
String file2P0L1 = HoodieTestUtils
.createNewLogFile(basePath, partitionPaths[0], "000", file1P0, Optional.of(2));
// make 1 compaction commit // make 1 compaction commit
HoodieTestUtils.createCompactionCommitFiles(basePath, "000"); HoodieTestUtils.createCompactionCommitFiles(basePath, "000");
// Make 4 files, one base file and 3 log files associated with base file // Make 4 files, one base file and 3 log files associated with base file
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "001", file1P0); HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "001", file1P0);
file2P0L0 = HoodieTestUtils.createNewLogFile(basePath, partitionPaths[0], "001", file1P0, Optional.empty()); file2P0L0 = HoodieTestUtils
file2P0L0 = HoodieTestUtils.createNewLogFile(basePath, partitionPaths[0], "001", file1P0, Optional.of(2)); .createNewLogFile(basePath, partitionPaths[0], "001", file1P0, Optional.empty());
file2P0L0 = HoodieTestUtils.createNewLogFile(basePath, partitionPaths[0], "001", file1P0, Optional.of(3)); file2P0L0 = HoodieTestUtils
.createNewLogFile(basePath, partitionPaths[0], "001", file1P0, Optional.of(2));
file2P0L0 = HoodieTestUtils
.createNewLogFile(basePath, partitionPaths[0], "001", file1P0, Optional.of(3));
// make 1 compaction commit // make 1 compaction commit
HoodieTestUtils.createCompactionCommitFiles(basePath, "001"); HoodieTestUtils.createCompactionCommitFiles(basePath, "001");
HoodieTable table = HoodieTable HoodieTable table = HoodieTable
.getHoodieTable(metaClient, config); .getHoodieTable(metaClient, config);
List<HoodieCleanStat> hoodieCleanStats = table.clean(jsc); List<HoodieCleanStat> hoodieCleanStats = table.clean(jsc);
assertEquals("Must clean three files, one parquet and 2 log files" , 3, getCleanStat(hoodieCleanStats, partitionPaths[0]).getSuccessDeleteFiles().size()); assertEquals("Must clean three files, one parquet and 2 log files", 3,
getCleanStat(hoodieCleanStats, partitionPaths[0]).getSuccessDeleteFiles().size());
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0)); assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0));
assertFalse(HoodieTestUtils.doesLogFileExist(basePath, partitionPaths[0], "000", file2P0L0, Optional.empty())); assertFalse(HoodieTestUtils
assertFalse(HoodieTestUtils.doesLogFileExist(basePath, partitionPaths[0], "000", file2P0L0, Optional.of(2))); .doesLogFileExist(basePath, partitionPaths[0], "000", file2P0L0, Optional.empty()));
assertFalse(HoodieTestUtils
.doesLogFileExist(basePath, partitionPaths[0], "000", file2P0L0, Optional.of(2)));
} }
@Test @Test
@@ -1229,27 +1287,35 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
String file1P1C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "000"); String file1P1C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "000");
HoodieTable table = HoodieTable HoodieTable table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config); .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
config);
List<HoodieCleanStat> hoodieCleanStatsOne = table.clean(jsc); List<HoodieCleanStat> hoodieCleanStatsOne = table.clean(jsc);
assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsOne, partitionPaths[0]).getSuccessDeleteFiles().size()); assertEquals("Must not clean any files", 0,
assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsOne, partitionPaths[1]).getSuccessDeleteFiles().size()); getCleanStat(hoodieCleanStatsOne, partitionPaths[0]).getSuccessDeleteFiles().size());
assertEquals("Must not clean any files", 0,
getCleanStat(hoodieCleanStatsOne, partitionPaths[1]).getSuccessDeleteFiles().size());
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0)); assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "000", file1P1C0)); assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "000", file1P1C0));
// make next commit, with 1 insert & 1 update per partition // make next commit, with 1 insert & 1 update per partition
HoodieTestUtils.createCommitFiles(basePath, "001"); HoodieTestUtils.createCommitFiles(basePath, "001");
table = HoodieTable table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config); .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
config);
String file2P0C1 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "001"); // insert String file2P0C1 = HoodieTestUtils
String file2P1C1 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "001"); // insert .createNewDataFile(basePath, partitionPaths[0], "001"); // insert
String file2P1C1 = HoodieTestUtils
.createNewDataFile(basePath, partitionPaths[1], "001"); // insert
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "001", file1P0C0); // update HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "001", file1P0C0); // update
HoodieTestUtils.createDataFile(basePath, partitionPaths[1], "001", file1P1C0); // update HoodieTestUtils.createDataFile(basePath, partitionPaths[1], "001", file1P1C0); // update
List<HoodieCleanStat> hoodieCleanStatsTwo = table.clean(jsc); List<HoodieCleanStat> hoodieCleanStatsTwo = table.clean(jsc);
assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsTwo, partitionPaths[0]).getSuccessDeleteFiles().size()); assertEquals("Must not clean any files", 0,
assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsTwo, partitionPaths[1]).getSuccessDeleteFiles().size()); getCleanStat(hoodieCleanStatsTwo, partitionPaths[0]).getSuccessDeleteFiles().size());
assertEquals("Must not clean any files", 0,
getCleanStat(hoodieCleanStatsTwo, partitionPaths[1]).getSuccessDeleteFiles().size());
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1)); assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "001", file2P1C1)); assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "001", file2P1C1));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0)); assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0));
@@ -1258,7 +1324,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// make next commit, with 2 updates to existing files, and 1 insert // make next commit, with 2 updates to existing files, and 1 insert
HoodieTestUtils.createCommitFiles(basePath, "002"); HoodieTestUtils.createCommitFiles(basePath, "002");
table = HoodieTable table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config); .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
config);
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file1P0C0); // update HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file1P0C0); // update
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file2P0C1); // update HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file2P0C1); // update
@@ -1274,7 +1341,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// make next commit, with 2 updates to existing files, and 1 insert // make next commit, with 2 updates to existing files, and 1 insert
HoodieTestUtils.createCommitFiles(basePath, "003"); HoodieTestUtils.createCommitFiles(basePath, "003");
table = HoodieTable table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config); .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
config);
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "003", file1P0C0); // update HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "003", file1P0C0); // update
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "003", file2P0C1); // update HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "003", file2P0C1); // update
@@ -1282,7 +1350,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
List<HoodieCleanStat> hoodieCleanStatsFour = table.clean(jsc); List<HoodieCleanStat> hoodieCleanStatsFour = table.clean(jsc);
assertEquals( assertEquals(
"Must not clean one old file", 1, getCleanStat(hoodieCleanStatsFour, partitionPaths[0]).getSuccessDeleteFiles().size()); "Must not clean one old file", 1,
getCleanStat(hoodieCleanStatsFour, partitionPaths[0]).getSuccessDeleteFiles().size());
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0)); assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file1P0C0)); assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file1P0C0));
@@ -1295,7 +1364,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// No cleaning on partially written file, with no commit. // No cleaning on partially written file, with no commit.
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "004", file3P0C2); // update HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "004", file3P0C2); // update
List<HoodieCleanStat> hoodieCleanStatsFive = table.clean(jsc); List<HoodieCleanStat> hoodieCleanStatsFive = table.clean(jsc);
assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsFive, partitionPaths[0]).getSuccessDeleteFiles().size()); assertEquals("Must not clean any files", 0,
getCleanStat(hoodieCleanStatsFive, partitionPaths[0]).getSuccessDeleteFiles().size());
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file1P0C0)); assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file1P0C0));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1)); assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1));
} }
@@ -1344,13 +1414,14 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
Iterator<AccumulatorV2<?, ?>> iterator = taskEnd.taskMetrics().accumulators() Iterator<AccumulatorV2<?, ?>> iterator = taskEnd.taskMetrics().accumulators()
.iterator(); .iterator();
while(iterator.hasNext()) { while (iterator.hasNext()) {
AccumulatorV2 accumulator = iterator.next(); AccumulatorV2 accumulator = iterator.next();
if (taskEnd.stageId() == 1 && if (taskEnd.stageId() == 1 &&
accumulator.isRegistered() && accumulator.isRegistered() &&
accumulator.name().isDefined() && accumulator.name().isDefined() &&
accumulator.name().get().equals("internal.metrics.shuffle.read.recordsRead")) { accumulator.name().get().equals("internal.metrics.shuffle.read.recordsRead")) {
stageOneShuffleReadTaskRecordsCountMap.put(taskEnd.taskInfo().taskId(), (Long) accumulator.value()); stageOneShuffleReadTaskRecordsCountMap
.put(taskEnd.taskInfo().taskId(), (Long) accumulator.value());
} }
} }
} }
@@ -1378,22 +1449,27 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
updateAllFilesInPartition(filesP2C0, partitionPaths[2], "003"); updateAllFilesInPartition(filesP2C0, partitionPaths[2], "003");
HoodieTable table = HoodieTable HoodieTable table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config); .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
config);
List<HoodieCleanStat> hoodieCleanStats = table.clean(jsc); List<HoodieCleanStat> hoodieCleanStats = table.clean(jsc);
assertEquals(100, getCleanStat(hoodieCleanStats, partitionPaths[0]).getSuccessDeleteFiles().size()); assertEquals(100,
assertEquals(10, getCleanStat(hoodieCleanStats, partitionPaths[1]).getSuccessDeleteFiles().size()); getCleanStat(hoodieCleanStats, partitionPaths[0]).getSuccessDeleteFiles().size());
assertEquals(10, getCleanStat(hoodieCleanStats, partitionPaths[2]).getSuccessDeleteFiles().size()); assertEquals(10,
getCleanStat(hoodieCleanStats, partitionPaths[1]).getSuccessDeleteFiles().size());
assertEquals(10,
getCleanStat(hoodieCleanStats, partitionPaths[2]).getSuccessDeleteFiles().size());
// 3 tasks are expected since the number of partitions is 3 // 3 tasks are expected since the number of partitions is 3
assertEquals(3, stageOneShuffleReadTaskRecordsCountMap.keySet().size()); assertEquals(3, stageOneShuffleReadTaskRecordsCountMap.keySet().size());
// Sum of all records processed = total number of files to clean // Sum of all records processed = total number of files to clean
assertEquals(120, stageOneShuffleReadTaskRecordsCountMap assertEquals(120, stageOneShuffleReadTaskRecordsCountMap
.values().stream().reduce((a,b) -> a + b).get().intValue()); .values().stream().reduce((a, b) -> a + b).get().intValue());
assertTrue("The skew in handling files to clean is not removed. " assertTrue("The skew in handling files to clean is not removed. "
+ "Each task should handle more records than the partitionPath with least files " + "Each task should handle more records than the partitionPath with least files "
+ "and less records than the partitionPath with most files.", + "and less records than the partitionPath with most files.",
stageOneShuffleReadTaskRecordsCountMap.values().stream().filter(a -> a > 10 && a < 100).count() == 3); stageOneShuffleReadTaskRecordsCountMap.values().stream().filter(a -> a > 10 && a < 100)
.count() == 3);
} }
public void testCommitWritesRelativePaths() throws Exception { public void testCommitWritesRelativePaths() throws Exception {
@@ -1454,7 +1530,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
} }
} }
private List<String> createFilesInPartition(String partitionPath, String commitTime, int numFiles) throws IOException { private List<String> createFilesInPartition(String partitionPath, String commitTime, int numFiles)
throws IOException {
List<String> files = new ArrayList<>(); List<String> files = new ArrayList<>();
for (int i = 0; i < numFiles; i++) { for (int i = 0; i < numFiles; i++) {
files.add(HoodieTestUtils.createNewDataFile(basePath, partitionPath, commitTime)); files.add(HoodieTestUtils.createNewDataFile(basePath, partitionPath, commitTime));

View File

@@ -29,15 +29,6 @@ import com.uber.hoodie.common.table.view.HoodieTableFileSystemView;
import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.exception.HoodieException; import com.uber.hoodie.exception.HoodieException;
import com.uber.hoodie.table.HoodieTable; import com.uber.hoodie.table.HoodieTable;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.SparkConf;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.RandomAccessFile; import java.io.RandomAccessFile;
@@ -49,6 +40,12 @@ import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.Set; import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
/** /**
* Utility methods to aid testing inside the HoodieClient module. * Utility methods to aid testing inside the HoodieClient module.
@@ -66,14 +63,15 @@ public class HoodieClientTestUtils {
public static Set<String> getRecordKeys(List<HoodieRecord> hoodieRecords) { public static Set<String> getRecordKeys(List<HoodieRecord> hoodieRecords) {
Set<String> keys = new HashSet<>(); Set<String> keys = new HashSet<>();
for (HoodieRecord rec: hoodieRecords) { for (HoodieRecord rec : hoodieRecords) {
keys.add(rec.getRecordKey()); keys.add(rec.getRecordKey());
} }
return keys; return keys;
} }
private static void fakeMetaFile(String basePath, String commitTime, String suffix) throws IOException { private static void fakeMetaFile(String basePath, String commitTime, String suffix)
String parentPath = basePath + "/"+ HoodieTableMetaClient.METAFOLDER_NAME; throws IOException {
String parentPath = basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME;
new File(parentPath).mkdirs(); new File(parentPath).mkdirs();
new File(parentPath + "/" + commitTime + suffix).createNewFile(); new File(parentPath + "/" + commitTime + suffix).createNewFile();
} }
@@ -87,14 +85,17 @@ public class HoodieClientTestUtils {
fakeMetaFile(basePath, commitTime, HoodieTimeline.INFLIGHT_EXTENSION); fakeMetaFile(basePath, commitTime, HoodieTimeline.INFLIGHT_EXTENSION);
} }
public static void fakeDataFile(String basePath, String partitionPath, String commitTime, String fileId) throws Exception { public static void fakeDataFile(String basePath, String partitionPath, String commitTime,
String fileId) throws Exception {
fakeDataFile(basePath, partitionPath, commitTime, fileId, 0); fakeDataFile(basePath, partitionPath, commitTime, fileId, 0);
} }
public static void fakeDataFile(String basePath, String partitionPath, String commitTime, String fileId, long length) throws Exception { public static void fakeDataFile(String basePath, String partitionPath, String commitTime,
String fileId, long length) throws Exception {
String parentPath = String.format("%s/%s", basePath, partitionPath); String parentPath = String.format("%s/%s", basePath, partitionPath);
new File(parentPath).mkdirs(); new File(parentPath).mkdirs();
String path = String.format("%s/%s", parentPath, FSUtils.makeDataFileName(commitTime, 0, fileId)); String path = String
.format("%s/%s", parentPath, FSUtils.makeDataFileName(commitTime, 0, fileId));
new File(path).createNewFile(); new File(path).createNewFile();
new RandomAccessFile(path, "rw").setLength(length); new RandomAccessFile(path, "rw").setLength(length);
} }
@@ -129,7 +130,8 @@ public class HoodieClientTestUtils {
new HoodieException("No commit exists at " + commitTime); new HoodieException("No commit exists at " + commitTime);
} }
try { try {
HashMap<String, String> paths = getLatestFileIDsToFullPath(basePath, commitTimeline, Arrays.asList(commitInstant)); HashMap<String, String> paths = getLatestFileIDsToFullPath(basePath, commitTimeline,
Arrays.asList(commitInstant));
return sqlContext.read() return sqlContext.read()
.parquet(paths.values().toArray(new String[paths.size()])) .parquet(paths.values().toArray(new String[paths.size()]))
.filter(String.format("%s ='%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitTime)); .filter(String.format("%s ='%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitTime));
@@ -150,12 +152,15 @@ public class HoodieClientTestUtils {
.getInstants().collect(Collectors.toList()); .getInstants().collect(Collectors.toList());
try { try {
// Go over the commit metadata, and obtain the new files that need to be read. // Go over the commit metadata, and obtain the new files that need to be read.
HashMap<String, String> fileIdToFullPath = getLatestFileIDsToFullPath(basePath, commitTimeline, commitsToReturn); HashMap<String, String> fileIdToFullPath = getLatestFileIDsToFullPath(basePath,
commitTimeline, commitsToReturn);
return sqlContext.read() return sqlContext.read()
.parquet(fileIdToFullPath.values().toArray(new String[fileIdToFullPath.size()])) .parquet(fileIdToFullPath.values().toArray(new String[fileIdToFullPath.size()]))
.filter(String.format("%s >'%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTime)); .filter(
String.format("%s >'%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTime));
} catch (IOException e) { } catch (IOException e) {
throw new HoodieException("Error pulling data incrementally from commitTimestamp :" + lastCommitTime, e); throw new HoodieException(
"Error pulling data incrementally from commitTimestamp :" + lastCommitTime, e);
} }
} }
@@ -171,7 +176,8 @@ public class HoodieClientTestUtils {
HoodieTable hoodieTable = HoodieTable HoodieTable hoodieTable = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(fs, basePath, true), null); .getHoodieTable(new HoodieTableMetaClient(fs, basePath, true), null);
for (String path : paths) { for (String path : paths) {
TableFileSystemView.ReadOptimizedView fileSystemView = new HoodieTableFileSystemView(hoodieTable.getMetaClient(), TableFileSystemView.ReadOptimizedView fileSystemView = new HoodieTableFileSystemView(
hoodieTable.getMetaClient(),
hoodieTable.getCompletedCommitTimeline(), fs.globStatus(new Path(path))); hoodieTable.getCompletedCommitTimeline(), fs.globStatus(new Path(path)));
List<HoodieDataFile> latestFiles = fileSystemView.getLatestDataFiles().collect( List<HoodieDataFile> latestFiles = fileSystemView.getLatestDataFiles().collect(
Collectors.toList()); Collectors.toList());
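
The read helpers in this file share one pattern: resolve the latest data file per file id, load just those parquet files, and filter on the commit-time metadata column. A minimal sketch of that pattern, assuming the caller has already resolved the file paths (the method and parameter names here are illustrative):

import com.uber.hoodie.common.model.HoodieRecord;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;

class ReadCommitSketch {

  // Rows written by exactly the given commit, read out of the latest file slices.
  static Dataset<Row> readCommit(SQLContext sqlContext, String[] latestFilePaths, String commitTime) {
    return sqlContext.read()
        .parquet(latestFilePaths)
        .filter(String.format("%s = '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitTime));
  }
}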

View File

@@ -16,9 +16,16 @@
package com.uber.hoodie.common; package com.uber.hoodie.common;
import static com.uber.hoodie.common.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA;
import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.HoodieAvroUtils; import com.uber.hoodie.common.util.HoodieAvroUtils;
import com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat; import com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.avro.Schema; import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.GenericRecordBuilder; import org.apache.avro.generic.GenericRecordBuilder;
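
Most hunks in this commit only reorder imports. Google Java Style keeps one block of static imports on top, then a single alphabetically sorted block of regular imports, with no wildcards; as an illustration assembled from the names already visible in this file, the reordered header reads:

package com.uber.hoodie.common;

import static com.uber.hoodie.common.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA;

import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.HoodieAvroUtils;
import com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.GenericRecordBuilder;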
@@ -30,20 +37,13 @@ import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader; import org.apache.hadoop.mapred.RecordReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import static com.uber.hoodie.common.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA;
/** /**
* Utility methods to aid in testing MergeOnRead (workaround for HoodieReadClient for MOR) * Utility methods to aid in testing MergeOnRead (workaround for HoodieReadClient for MOR)
*/ */
public class HoodieMergeOnReadTestUtils { public class HoodieMergeOnReadTestUtils {
public static List<GenericRecord> getRecordsUsingInputFormat(List<String> inputPaths) throws IOException { public static List<GenericRecord> getRecordsUsingInputFormat(List<String> inputPaths)
throws IOException {
JobConf jobConf = new JobConf(); JobConf jobConf = new JobConf();
Schema schema = HoodieAvroUtils.addMetadataFields(Schema.parse(TRIP_EXAMPLE_SCHEMA)); Schema schema = HoodieAvroUtils.addMetadataFields(Schema.parse(TRIP_EXAMPLE_SCHEMA));
HoodieRealtimeInputFormat inputFormat = new HoodieRealtimeInputFormat(); HoodieRealtimeInputFormat inputFormat = new HoodieRealtimeInputFormat();
@@ -75,10 +75,12 @@ public class HoodieMergeOnReadTestUtils {
}).get(); }).get();
} }
private static void setPropsForInputFormat(HoodieRealtimeInputFormat inputFormat, JobConf jobConf, Schema schema) { private static void setPropsForInputFormat(HoodieRealtimeInputFormat inputFormat, JobConf jobConf,
Schema schema) {
List<Schema.Field> fields = schema.getFields(); List<Schema.Field> fields = schema.getFields();
String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(",")); String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
String postions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(",")); String postions = fields.stream().map(f -> String.valueOf(f.pos()))
.collect(Collectors.joining(","));
Configuration conf = FSUtils.getFs().getConf(); Configuration conf = FSUtils.getFs().getConf();
jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names); jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions); jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);
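
The helper above configures Hive-style column projection: the record reader materializes only the columns named in two comma-separated JobConf entries, one listing field names and one listing their positions. A small hedged sketch with an assumed three-field schema (the field names come from the test payloads, everything else is illustrative):

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.mapred.JobConf;

class ColumnProjectionSketch {

  static void project(JobConf jobConf) {
    // A real caller derives these from the Avro schema, in schema order.
    List<String> fields = Arrays.asList("_row_key", "time", "number");
    String names = String.join(",", fields);              // "_row_key,time,number"
    String positions = IntStream.range(0, fields.size())
        .mapToObj(String::valueOf)
        .collect(Collectors.joining(","));                // "0,1,2"
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, positions);
  }
}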

View File

@@ -16,17 +16,21 @@
package com.uber.hoodie.common; package com.uber.hoodie.common;
import com.uber.hoodie.avro.model.HoodieCleanMetadata;
import com.uber.hoodie.common.model.HoodieCleaningPolicy;
import com.uber.hoodie.common.model.HoodieCommitMetadata; import com.uber.hoodie.common.model.HoodieCommitMetadata;
import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodiePartitionMetadata; import com.uber.hoodie.common.model.HoodiePartitionMetadata;
import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.util.AvroUtils;
import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.HoodieAvroUtils; import com.uber.hoodie.common.util.HoodieAvroUtils;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.Random;
import java.util.UUID;
import org.apache.avro.Schema; import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.GenericRecord;
@@ -34,15 +38,6 @@ import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import java.util.Random;
import java.util.UUID;
/** /**
* Class to be used in tests to keep generating test inserts and updates against a corpus. * Class to be used in tests to keep generating test inserts and updates against a corpus.
* *
@@ -51,6 +46,7 @@ import java.util.UUID;
public class HoodieTestDataGenerator { public class HoodieTestDataGenerator {
static class KeyPartition { static class KeyPartition {
HoodieKey key; HoodieKey key;
String partitionPath; String partitionPath;
} }
@@ -74,14 +70,17 @@ public class HoodieTestDataGenerator {
public static final String[] DEFAULT_PARTITION_PATHS = {"2016/03/15", "2015/03/16", "2015/03/17"}; public static final String[] DEFAULT_PARTITION_PATHS = {"2016/03/15", "2015/03/16", "2015/03/17"};
public static void writePartitionMetadata(FileSystem fs, String[] partitionPaths, String basePath) { public static void writePartitionMetadata(FileSystem fs, String[] partitionPaths,
for (String partitionPath: partitionPaths) { String basePath) {
new HoodiePartitionMetadata(fs, "000", new Path(basePath), new Path(basePath, partitionPath)).trySave(0); for (String partitionPath : partitionPaths) {
new HoodiePartitionMetadata(fs, "000", new Path(basePath), new Path(basePath, partitionPath))
.trySave(0);
} }
} }
private List<KeyPartition> existingKeysList = new ArrayList<>(); private List<KeyPartition> existingKeysList = new ArrayList<>();
public static Schema avroSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA)); public static Schema avroSchema = HoodieAvroUtils
.addMetadataFields(new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA));
private static Random rand = new Random(46474747); private static Random rand = new Random(46474747);
private String[] partitionPaths = DEFAULT_PARTITION_PATHS; private String[] partitionPaths = DEFAULT_PARTITION_PATHS;
@@ -95,8 +94,8 @@ public class HoodieTestDataGenerator {
/** /**
* Generates new inserts, uniformly across the partition paths above. It also updates the list * Generates new inserts, uniformly across the partition paths above. It also updates the list of
* of existing keys. * existing keys.
*/ */
public List<HoodieRecord> generateInserts(String commitTime, int n) throws IOException { public List<HoodieRecord> generateInserts(String commitTime, int n) throws IOException {
List<HoodieRecord> inserts = new ArrayList<>(); List<HoodieRecord> inserts = new ArrayList<>();
@@ -119,9 +118,10 @@ public class HoodieTestDataGenerator {
return generateDeletesFromExistingRecords(inserts); return generateDeletesFromExistingRecords(inserts);
} }
public List<HoodieRecord> generateDeletesFromExistingRecords(List<HoodieRecord> existingRecords) throws IOException { public List<HoodieRecord> generateDeletesFromExistingRecords(List<HoodieRecord> existingRecords)
throws IOException {
List<HoodieRecord> deletes = new ArrayList<>(); List<HoodieRecord> deletes = new ArrayList<>();
for (HoodieRecord existingRecord: existingRecords) { for (HoodieRecord existingRecord : existingRecords) {
HoodieRecord record = generateDeleteRecord(existingRecord); HoodieRecord record = generateDeleteRecord(existingRecord);
deletes.add(record); deletes.add(record);
@@ -131,14 +131,17 @@ public class HoodieTestDataGenerator {
public HoodieRecord generateDeleteRecord(HoodieRecord existingRecord) throws IOException { public HoodieRecord generateDeleteRecord(HoodieRecord existingRecord) throws IOException {
HoodieKey key = existingRecord.getKey(); HoodieKey key = existingRecord.getKey();
TestRawTripPayload payload = new TestRawTripPayload(Optional.empty(), key.getRecordKey(), key.getPartitionPath(), null, true); TestRawTripPayload payload = new TestRawTripPayload(Optional.empty(), key.getRecordKey(),
key.getPartitionPath(), null, true);
return new HoodieRecord(key, payload); return new HoodieRecord(key, payload);
} }
public List<HoodieRecord> generateUpdates(String commitTime, List<HoodieRecord> baseRecords) throws IOException { public List<HoodieRecord> generateUpdates(String commitTime, List<HoodieRecord> baseRecords)
throws IOException {
List<HoodieRecord> updates = new ArrayList<>(); List<HoodieRecord> updates = new ArrayList<>();
for (HoodieRecord baseRecord: baseRecords) { for (HoodieRecord baseRecord : baseRecords) {
HoodieRecord record = new HoodieRecord(baseRecord.getKey(), generateRandomValue(baseRecord.getKey(), commitTime)); HoodieRecord record = new HoodieRecord(baseRecord.getKey(),
generateRandomValue(baseRecord.getKey(), commitTime));
updates.add(record); updates.add(record);
} }
return updates; return updates;
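
Taken together, the generator methods in this file support a simple test loop: insert a batch of records, update the same keys under a later commit time, then turn them into deletes. A hedged usage sketch (commit times and the count are arbitrary, and a no-argument constructor is assumed):

import com.uber.hoodie.common.HoodieTestDataGenerator;
import com.uber.hoodie.common.model.HoodieRecord;
import java.io.IOException;
import java.util.List;

class DataGeneratorSketch {

  static void roundTrip() throws IOException {
    HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
    // 100 new records, spread uniformly across the default partition paths.
    List<HoodieRecord> inserts = dataGen.generateInserts("001", 100);
    // Fresh payloads for the same keys, stamped with a later commit time.
    List<HoodieRecord> updates = dataGen.generateUpdates("002", inserts);
    // Tombstone payloads for the same keys.
    List<HoodieRecord> deletes = dataGen.generateDeletesFromExistingRecords(inserts);
  }
}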
@@ -162,11 +165,13 @@ public class HoodieTestDataGenerator {
* Generates a new avro record of the above schema format, retaining the key if optionally * Generates a new avro record of the above schema format, retaining the key if optionally
* provided. * provided.
*/ */
public static TestRawTripPayload generateRandomValue(HoodieKey key, String commitTime) throws IOException { public static TestRawTripPayload generateRandomValue(HoodieKey key, String commitTime)
throws IOException {
GenericRecord rec = generateGenericRecord(key.getRecordKey(), "rider-" + commitTime, GenericRecord rec = generateGenericRecord(key.getRecordKey(), "rider-" + commitTime,
"driver-" + commitTime, 0.0); "driver-" + commitTime, 0.0);
HoodieAvroUtils.addCommitMetadataToRecord(rec, commitTime, "-1"); HoodieAvroUtils.addCommitMetadataToRecord(rec, commitTime, "-1");
return new TestRawTripPayload(rec.toString(), key.getRecordKey(), key.getPartitionPath(), TRIP_EXAMPLE_SCHEMA); return new TestRawTripPayload(rec.toString(), key.getRecordKey(), key.getPartitionPath(),
TRIP_EXAMPLE_SCHEMA);
} }
public static GenericRecord generateGenericRecord(String rowKey, String riderName, public static GenericRecord generateGenericRecord(String rowKey, String riderName,
@@ -186,7 +191,8 @@ public class HoodieTestDataGenerator {
public static void createCommitFile(String basePath, String commitTime) throws IOException { public static void createCommitFile(String basePath, String commitTime) throws IOException {
Path commitFile = Path commitFile =
new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline.makeCommitFileName(commitTime)); new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline
.makeCommitFileName(commitTime));
FileSystem fs = FSUtils.getFs(); FileSystem fs = FSUtils.getFs();
FSDataOutputStream os = fs.create(commitFile, true); FSDataOutputStream os = fs.create(commitFile, true);
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();

View File

@@ -17,31 +17,32 @@
package com.uber.hoodie.common; package com.uber.hoodie.common;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.uber.hoodie.WriteStatus; import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.avro.MercifulJsonConverter; import com.uber.hoodie.avro.MercifulJsonConverter;
import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordPayload; import com.uber.hoodie.common.model.HoodieRecordPayload;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.StringWriter;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map.Entry;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.commons.io.IOUtils;
import java.io.*;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry;
import java.util.Optional; import java.util.Optional;
import java.util.zip.Deflater; import java.util.zip.Deflater;
import java.util.zip.DeflaterOutputStream; import java.util.zip.DeflaterOutputStream;
import java.util.zip.InflaterInputStream; import java.util.zip.InflaterInputStream;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.commons.io.IOUtils;
/** /**
* Example row change event based on some example data used by testcases. The data avro schema is * Example row change event based on some example data used by testcases. The data avro schema is
* src/test/resources/schema1. * src/test/resources/schema1.
*/ */
public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayload> { public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayload> {
private transient static final ObjectMapper mapper = new ObjectMapper(); private transient static final ObjectMapper mapper = new ObjectMapper();
private String partitionPath; private String partitionPath;
private String rowKey; private String rowKey;
@@ -51,7 +52,7 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
public TestRawTripPayload(Optional<String> jsonData, String rowKey, String partitionPath, public TestRawTripPayload(Optional<String> jsonData, String rowKey, String partitionPath,
String schemaStr, Boolean isDeleted) throws IOException { String schemaStr, Boolean isDeleted) throws IOException {
if(jsonData.isPresent()) { if (jsonData.isPresent()) {
this.jsonDataCompressed = compressData(jsonData.get()); this.jsonDataCompressed = compressData(jsonData.get());
this.dataSize = jsonData.get().length(); this.dataSize = jsonData.get().length();
} }
@@ -61,7 +62,7 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
} }
public TestRawTripPayload(String jsonData, String rowKey, String partitionPath, public TestRawTripPayload(String jsonData, String rowKey, String partitionPath,
String schemaStr)throws IOException { String schemaStr) throws IOException {
this(Optional.of(jsonData), rowKey, partitionPath, schemaStr, false); this(Optional.of(jsonData), rowKey, partitionPath, schemaStr, false);
} }
@@ -79,16 +80,20 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
} }
@Override public TestRawTripPayload preCombine(TestRawTripPayload another) { @Override
public TestRawTripPayload preCombine(TestRawTripPayload another) {
return another; return another;
} }
@Override public Optional<IndexedRecord> combineAndGetUpdateValue(IndexedRecord oldRec, Schema schema) throws IOException { @Override
public Optional<IndexedRecord> combineAndGetUpdateValue(IndexedRecord oldRec, Schema schema)
throws IOException {
return this.getInsertValue(schema); return this.getInsertValue(schema);
} }
@Override public Optional<IndexedRecord> getInsertValue(Schema schema) throws IOException { @Override
if(isDeleted){ public Optional<IndexedRecord> getInsertValue(Schema schema) throws IOException {
if (isDeleted) {
return Optional.empty(); return Optional.empty();
} else { } else {
MercifulJsonConverter jsonConverter = new MercifulJsonConverter(schema); MercifulJsonConverter jsonConverter = new MercifulJsonConverter(schema);
@@ -135,16 +140,17 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
} }
/** /**
* A custom {@link WriteStatus} that merges passed metadata key value map * A custom {@link WriteStatus} that merges passed metadata key value map to {@code
* to {@code WriteStatus.markSuccess()} and {@code WriteStatus.markFailure()}. * WriteStatus.markSuccess()} and {@code WriteStatus.markFailure()}.
*/ */
public static class MetadataMergeWriteStatus extends WriteStatus { public static class MetadataMergeWriteStatus extends WriteStatus {
private Map<String, String> mergedMetadataMap = new HashMap<>(); private Map<String, String> mergedMetadataMap = new HashMap<>();
@Override @Override
public void markSuccess(HoodieRecord record, Optional<Map<String, String>> recordMetadata) { public void markSuccess(HoodieRecord record, Optional<Map<String, String>> recordMetadata) {
super.markSuccess(record, recordMetadata); super.markSuccess(record, recordMetadata);
if(recordMetadata.isPresent()) { if (recordMetadata.isPresent()) {
mergeMetadataMaps(recordMetadata.get(), mergedMetadataMap); mergeMetadataMaps(recordMetadata.get(), mergedMetadataMap);
} }
} }
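
Together with the mergeMetadataForWriteStatuses helper a little further down, the custom status above lets a test pass record-level metadata through markSuccess and then fold the per-status maps into one. A hedged usage sketch (the metadata key and value are placeholders):

import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.TestRawTripPayload.MetadataMergeWriteStatus;
import com.uber.hoodie.common.model.HoodieRecord;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;

class MergeStatusSketch {

  static Map<String, String> collect(List<WriteStatus> statuses, HoodieRecord record) {
    Map<String, String> recordMetadata = new HashMap<>();
    recordMetadata.put("InsertRecordCount", "1"); // placeholder metadata
    for (WriteStatus status : statuses) {
      // Each status accumulates the metadata it is handed.
      status.markSuccess(record, Optional.of(recordMetadata));
    }
    // Fold the per-status maps into a single map for assertions.
    return MetadataMergeWriteStatus.mergeMetadataForWriteStatuses(statuses);
  }
}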
@@ -153,25 +159,27 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
public void markFailure(HoodieRecord record, Throwable t, public void markFailure(HoodieRecord record, Throwable t,
Optional<Map<String, String>> recordMetadata) { Optional<Map<String, String>> recordMetadata) {
super.markFailure(record, t, recordMetadata); super.markFailure(record, t, recordMetadata);
if(recordMetadata.isPresent()) { if (recordMetadata.isPresent()) {
mergeMetadataMaps(recordMetadata.get(), mergedMetadataMap); mergeMetadataMaps(recordMetadata.get(), mergedMetadataMap);
} }
} }
public static Map<String, String> mergeMetadataForWriteStatuses(List<WriteStatus> writeStatuses) { public static Map<String, String> mergeMetadataForWriteStatuses(
List<WriteStatus> writeStatuses) {
Map<String, String> allWriteStatusMergedMetadataMap = new HashMap<>(); Map<String, String> allWriteStatusMergedMetadataMap = new HashMap<>();
for (WriteStatus writeStatus : writeStatuses) { for (WriteStatus writeStatus : writeStatuses) {
MetadataMergeWriteStatus.mergeMetadataMaps( MetadataMergeWriteStatus.mergeMetadataMaps(
((MetadataMergeWriteStatus)writeStatus).getMergedMetadataMap(), ((MetadataMergeWriteStatus) writeStatus).getMergedMetadataMap(),
allWriteStatusMergedMetadataMap); allWriteStatusMergedMetadataMap);
} }
return allWriteStatusMergedMetadataMap; return allWriteStatusMergedMetadataMap;
} }
private static void mergeMetadataMaps(Map<String, String> mergeFromMap, Map<String, String> mergeToMap) { private static void mergeMetadataMaps(Map<String, String> mergeFromMap,
Map<String, String> mergeToMap) {
for (Entry<String, String> entry : mergeFromMap.entrySet()) { for (Entry<String, String> entry : mergeFromMap.entrySet()) {
String key = entry.getKey(); String key = entry.getKey();
if(!mergeToMap.containsKey(key)) { if (!mergeToMap.containsKey(key)) {
mergeToMap.put(key, "0"); mergeToMap.put(key, "0");
} }
mergeToMap mergeToMap

View File

@@ -16,7 +16,7 @@
package com.uber.hoodie.config; package com.uber.hoodie.config;
import static org.junit.Assert.*; import static org.junit.Assert.assertEquals;
import com.google.common.collect.Maps; import com.google.common.collect.Maps;
import com.uber.hoodie.config.HoodieWriteConfig.Builder; import com.uber.hoodie.config.HoodieWriteConfig.Builder;
@@ -29,6 +29,7 @@ import java.util.Properties;
import org.junit.Test; import org.junit.Test;
public class HoodieWriteConfigTest { public class HoodieWriteConfigTest {
@Test @Test
public void testPropertyLoading() throws IOException { public void testPropertyLoading() throws IOException {
Builder builder = HoodieWriteConfig.newBuilder().withPath("/tmp"); Builder builder = HoodieWriteConfig.newBuilder().withPath("/tmp");
@@ -46,9 +47,10 @@ public class HoodieWriteConfigTest {
HoodieWriteConfig config = builder.build(); HoodieWriteConfig config = builder.build();
assertEquals(config.getMaxCommitsToKeep(), 5); assertEquals(config.getMaxCommitsToKeep(), 5);
assertEquals(config.getMinCommitsToKeep(), 2); assertEquals(config.getMinCommitsToKeep(), 2);
} }
private ByteArrayOutputStream saveParamsIntoOutputStream(Map<String, String> params) throws IOException { private ByteArrayOutputStream saveParamsIntoOutputStream(Map<String, String> params)
throws IOException {
Properties properties = new Properties(); Properties properties = new Properties();
properties.putAll(params); properties.putAll(params);
ByteArrayOutputStream outStream = new ByteArrayOutputStream(); ByteArrayOutputStream outStream = new ByteArrayOutputStream();
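
The hunk cuts off before the helper finishes; a minimal sketch of the remaining idea, assuming the parameters are written out with java.util.Properties.store so they can be read back through an InputStream:

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.Map;
import java.util.Properties;

class PropsSketch {

  static ByteArrayOutputStream saveParams(Map<String, String> params) throws IOException {
    Properties properties = new Properties();
    properties.putAll(params);
    ByteArrayOutputStream outStream = new ByteArrayOutputStream();
    // Serialize the key/value pairs in standard properties format.
    properties.store(outStream, "");
    return outStream;
  }
}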

View File

@@ -16,31 +16,30 @@
package com.uber.hoodie.func; package com.uber.hoodie.func;
import com.uber.hoodie.common.table.HoodieTableMetaClient; import static org.junit.Assert.fail;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.WriteStatus; import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.TestRawTripPayload; import com.uber.hoodie.common.TestRawTripPayload;
import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordLocation; import com.uber.hoodie.common.model.HoodieRecordLocation;
import com.uber.hoodie.common.model.HoodieTestUtils; import com.uber.hoodie.common.model.HoodieTestUtils;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.table.HoodieCopyOnWriteTable; import com.uber.hoodie.table.HoodieCopyOnWriteTable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.junit.Before; import org.junit.Before;
import org.junit.Test; import org.junit.Test;
import org.junit.rules.TemporaryFolder; import org.junit.rules.TemporaryFolder;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import static org.junit.Assert.fail;
public class TestUpdateMapFunction { public class TestUpdateMapFunction {
private String basePath = null; private String basePath = null;
@Before @Before
@@ -90,7 +89,6 @@ public class TestUpdateMapFunction {
String fileId = insertResult.next().get(0).getFileId(); String fileId = insertResult.next().get(0).getFileId();
System.out.println(fileId); System.out.println(fileId);
table = new HoodieCopyOnWriteTable(config, metadata); table = new HoodieCopyOnWriteTable(config, metadata);
// New content with values for the newly added field // New content with values for the newly added field
recordStr1 = recordStr1 =

View File

@@ -16,17 +16,16 @@
package com.uber.hoodie.index; package com.uber.hoodie.index;
import com.uber.hoodie.config.HoodieWriteConfig; import static org.junit.Assert.assertTrue;
import com.uber.hoodie.config.HoodieIndexConfig; import com.uber.hoodie.config.HoodieIndexConfig;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.index.bloom.HoodieBloomIndex; import com.uber.hoodie.index.bloom.HoodieBloomIndex;
import com.uber.hoodie.index.hbase.HBaseIndex; import com.uber.hoodie.index.hbase.HBaseIndex;
import org.junit.Test; import org.junit.Test;
import static org.junit.Assert.*;
public class TestHoodieIndex { public class TestHoodieIndex {
@Test @Test
public void testCreateIndex() throws Exception { public void testCreateIndex() throws Exception {
HoodieWriteConfig.Builder clientConfigBuilder = HoodieWriteConfig.newBuilder(); HoodieWriteConfig.Builder clientConfigBuilder = HoodieWriteConfig.newBuilder();

View File

@@ -18,28 +18,39 @@
package com.uber.hoodie.index.bloom; package com.uber.hoodie.index.bloom;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import com.google.common.base.Optional; import com.google.common.base.Optional;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.uber.hoodie.common.HoodieClientTestUtils;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.config.HoodieIndexConfig;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.avro.HoodieAvroWriteSupport; import com.uber.hoodie.avro.HoodieAvroWriteSupport;
import com.uber.hoodie.common.BloomFilter; import com.uber.hoodie.common.BloomFilter;
import com.uber.hoodie.common.HoodieClientTestUtils;
import com.uber.hoodie.common.TestRawTripPayload; import com.uber.hoodie.common.TestRawTripPayload;
import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieTestUtils; import com.uber.hoodie.common.model.HoodieTestUtils;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.HoodieAvroUtils; import com.uber.hoodie.common.util.HoodieAvroUtils;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.index.bloom.BloomIndexFileInfo;
import com.uber.hoodie.index.bloom.HoodieBloomIndex;
import com.uber.hoodie.index.bloom.HoodieBloomIndexCheckFunction;
import com.uber.hoodie.io.storage.HoodieParquetConfig; import com.uber.hoodie.io.storage.HoodieParquetConfig;
import com.uber.hoodie.io.storage.HoodieParquetWriter; import com.uber.hoodie.io.storage.HoodieParquetWriter;
import com.uber.hoodie.table.HoodieTable; import com.uber.hoodie.table.HoodieTable;
import java.io.File;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.stream.Collectors;
import org.apache.avro.Schema; import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.GenericRecord;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
@@ -47,11 +58,8 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroSchemaConverter; import org.apache.parquet.avro.AvroSchemaConverter;
import org.apache.parquet.avro.AvroWriteSupport;
import org.apache.parquet.hadoop.ParquetWriter; import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.api.WriteSupport;
import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
@@ -59,20 +67,10 @@ import org.junit.After;
import org.junit.Before; import org.junit.Before;
import org.junit.Test; import org.junit.Test;
import org.junit.rules.TemporaryFolder; import org.junit.rules.TemporaryFolder;
import org.mockito.Mockito;
import scala.Tuple2; import scala.Tuple2;
import java.io.File;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import static org.junit.Assert.*;
public class TestHoodieBloomIndex { public class TestHoodieBloomIndex {
private JavaSparkContext jsc = null; private JavaSparkContext jsc = null;
private String basePath = null; private String basePath = null;
private transient final FileSystem fs; private transient final FileSystem fs;
@@ -106,15 +104,20 @@ public class TestHoodieBloomIndex {
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}"; String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); HoodieRecord record1 = new HoodieRecord(
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); HoodieRecord record2 = new HoodieRecord(
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3); TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); HoodieRecord record3 = new HoodieRecord(
new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4); TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); HoodieRecord record4 = new HoodieRecord(
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4)); JavaRDD<HoodieRecord> recordRDD = jsc
.parallelize(Arrays.asList(record1, record2, record3, record4));
// Load to memory // Load to memory
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
@@ -144,20 +147,31 @@ public class TestHoodieBloomIndex {
new File(basePath + "/2016/04/01").mkdirs(); new File(basePath + "/2016/04/01").mkdirs();
new File(basePath + "/2015/03/12").mkdirs(); new File(basePath + "/2015/03/12").mkdirs();
TestRawTripPayload rowChange1 = new TestRawTripPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); TestRawTripPayload rowChange1 = new TestRawTripPayload(
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); "{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
TestRawTripPayload rowChange2 = new TestRawTripPayload("{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord record1 = new HoodieRecord(
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
TestRawTripPayload rowChange3 = new TestRawTripPayload("{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); TestRawTripPayload rowChange2 = new TestRawTripPayload(
HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); "{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
TestRawTripPayload rowChange4 = new TestRawTripPayload("{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord record2 = new HoodieRecord(
HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
TestRawTripPayload rowChange3 = new TestRawTripPayload(
"{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
HoodieRecord record3 = new HoodieRecord(
new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
TestRawTripPayload rowChange4 = new TestRawTripPayload(
"{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
HoodieRecord record4 = new HoodieRecord(
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
writeParquetFile("2016/04/01", "2_0_20160401010101.parquet", Lists.newArrayList(), schema, null,
writeParquetFile("2016/04/01","2_0_20160401010101.parquet", Lists.newArrayList(), schema, null, false); false);
writeParquetFile("2015/03/12","1_0_20150312101010.parquet", Lists.newArrayList(), schema, null, false); writeParquetFile("2015/03/12", "1_0_20150312101010.parquet", Lists.newArrayList(), schema, null,
writeParquetFile("2015/03/12","3_0_20150312101010.parquet", Arrays.asList(record1), schema, null, false); false);
writeParquetFile("2015/03/12","4_0_20150312101010.parquet", Arrays.asList(record2, record3, record4), schema, null, false); writeParquetFile("2015/03/12", "3_0_20150312101010.parquet", Arrays.asList(record1), schema,
null, false);
writeParquetFile("2015/03/12", "4_0_20150312101010.parquet",
Arrays.asList(record2, record3, record4), schema, null, false);
List<String> partitions = Arrays.asList("2016/01/21", "2016/04/01", "2015/03/12"); List<String> partitions = Arrays.asList("2016/01/21", "2016/04/01", "2015/03/12");
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
@@ -185,9 +199,11 @@ public class TestHoodieBloomIndex {
List<Tuple2<String, BloomIndexFileInfo>> expected = Arrays.asList( List<Tuple2<String, BloomIndexFileInfo>> expected = Arrays.asList(
new Tuple2<>("2016/04/01", new BloomIndexFileInfo("2_0_20160401010101.parquet")), new Tuple2<>("2016/04/01", new BloomIndexFileInfo("2_0_20160401010101.parquet")),
new Tuple2<>("2015/03/12",new BloomIndexFileInfo("1_0_20150312101010.parquet")), new Tuple2<>("2015/03/12", new BloomIndexFileInfo("1_0_20150312101010.parquet")),
new Tuple2<>("2015/03/12",new BloomIndexFileInfo("3_0_20150312101010.parquet", "000", "000")), new Tuple2<>("2015/03/12",
new Tuple2<>("2015/03/12",new BloomIndexFileInfo("4_0_20150312101010.parquet", "001", "003")) new BloomIndexFileInfo("3_0_20150312101010.parquet", "000", "000")),
new Tuple2<>("2015/03/12",
new BloomIndexFileInfo("4_0_20150312101010.parquet", "001", "003"))
); );
assertEquals(expected, filesList); assertEquals(expected, filesList);
} }
@@ -200,7 +216,6 @@ public class TestHoodieBloomIndex {
.build(); .build();
HoodieBloomIndex index = new HoodieBloomIndex(config, jsc); HoodieBloomIndex index = new HoodieBloomIndex(config, jsc);
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo = new HashMap<>(); final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo = new HashMap<>();
partitionToFileIndexInfo.put("2017/10/22", Arrays.asList( partitionToFileIndexInfo.put("2017/10/22", Arrays.asList(
new BloomIndexFileInfo("f1"), new BloomIndexFileInfo("f1"),
@@ -212,14 +227,13 @@ public class TestHoodieBloomIndex {
JavaPairRDD<String, String> partitionRecordKeyPairRDD = jsc JavaPairRDD<String, String> partitionRecordKeyPairRDD = jsc
.parallelize(Arrays.asList( .parallelize(Arrays.asList(
new Tuple2<>("2017/10/22","003"), new Tuple2<>("2017/10/22", "003"),
new Tuple2<>("2017/10/22","002"), new Tuple2<>("2017/10/22", "002"),
new Tuple2<>("2017/10/22","005"), new Tuple2<>("2017/10/22", "005"),
new Tuple2<>("2017/10/22","004") new Tuple2<>("2017/10/22", "004")
)) ))
.mapToPair(t -> t); .mapToPair(t -> t);
List<Tuple2<String, Tuple2<String, HoodieKey>>> comparisonKeyList = index List<Tuple2<String, Tuple2<String, HoodieKey>>> comparisonKeyList = index
.explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD) .explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD)
.collect(); .collect();
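
Conceptually, the exploded RDD above holds one comparison per (candidate file, record key) pair within a partition; each pair later becomes a bloom-filter lookup. A plain-Java sketch of that expansion, assuming every file is a candidate for every key (the real index also prunes candidates by each file's min/max record-key range, and file names beyond f1 are placeholders):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

class ExplodeSketch {

  static List<String[]> explode(List<String> candidateFiles, List<String> recordKeys) {
    List<String[]> comparisons = new ArrayList<>();
    for (String key : recordKeys) {
      for (String file : candidateFiles) {
        // One (file, key) pair per bloom-filter check to schedule.
        comparisons.add(new String[]{file, key});
      }
    }
    return comparisons;
  }

  public static void main(String[] args) {
    // 4 record keys x 3 candidate files = 12 comparisons.
    List<String[]> pairs = explode(Arrays.asList("f1", "f2", "f3"),
        Arrays.asList("003", "002", "005", "004"));
    System.out.println(pairs.size()); // 12
  }
}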
@@ -240,7 +254,8 @@ public class TestHoodieBloomIndex {
} }
@Test @Test
public void testCheckUUIDsAgainstOneFile() throws IOException, InterruptedException, ClassNotFoundException { public void testCheckUUIDsAgainstOneFile()
throws IOException, InterruptedException, ClassNotFoundException {
// Create some records to use // Create some records to use
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
@@ -248,19 +263,23 @@ public class TestHoodieBloomIndex {
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":32}"; String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":32}";
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); HoodieRecord record1 = new HoodieRecord(
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); HoodieRecord record2 = new HoodieRecord(
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3); TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); HoodieRecord record3 = new HoodieRecord(
new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4); TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); HoodieRecord record4 = new HoodieRecord(
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
// We write record1, record2 to a parquet file, but the bloom filter contains (record1, record2, record3). // We write record1, record2 to a parquet file, but the bloom filter contains (record1, record2, record3).
BloomFilter filter = new BloomFilter(10000, 0.0000001); BloomFilter filter = new BloomFilter(10000, 0.0000001);
filter.add(record3.getRecordKey()); filter.add(record3.getRecordKey());
String filename = writeParquetFile("2016/01/31", Arrays.asList(record1, record2), schema, filter, true); String filename = writeParquetFile("2016/01/31", Arrays.asList(record1, record2), schema,
filter, true);
// The bloom filter contains 3 records // The bloom filter contains 3 records
assertTrue(filter.mightContain(record1.getRecordKey())); assertTrue(filter.mightContain(record1.getRecordKey()));
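
The assertions that follow rely on the usual bloom-filter contract: a key that was added is never reported absent, while a key that was not added may still be reported present with a small probability fixed at construction. A short hedged sketch using the same constructor as the test (the key strings are placeholders):

import com.uber.hoodie.common.BloomFilter;

class BloomFilterSketch {

  public static void main(String[] args) {
    // Sized for ~10000 entries with a one-in-ten-million target false-positive rate.
    BloomFilter filter = new BloomFilter(10000, 0.0000001);
    filter.add("key-added");
    System.out.println(filter.mightContain("key-added"));     // always true, added keys are never missed
    System.out.println(filter.mightContain("key-not-added")); // usually false, occasionally a false positive
  }
}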
@@ -299,7 +318,8 @@ public class TestHoodieBloomIndex {
try { try {
bloomIndex.tagLocation(recordRDD, table); bloomIndex.tagLocation(recordRDD, table);
} catch (IllegalArgumentException e) { } catch (IllegalArgumentException e) {
fail("EmptyRDD should not result in IllegalArgumentException: Positive number of slices required"); fail(
"EmptyRDD should not result in IllegalArgumentException: Positive number of slices required");
} }
} }
@@ -313,14 +333,19 @@ public class TestHoodieBloomIndex {
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}"; String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); HoodieRecord record1 = new HoodieRecord(
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); HoodieRecord record2 = new HoodieRecord(
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3); TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); HoodieRecord record3 = new HoodieRecord(
new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4); TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); HoodieRecord record4 = new HoodieRecord(
JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4)); new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
JavaRDD<HoodieRecord> recordRDD = jsc
.parallelize(Arrays.asList(record1, record2, record3, record4));
// Also create the metadata and config // Also create the metadata and config
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
@@ -390,7 +415,8 @@ public class TestHoodieBloomIndex {
// Let's tag // Let's tag
HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc); HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc);
JavaPairRDD<HoodieKey, Optional<String>> taggedRecordRDD = bloomIndex.fetchRecordLocation(keysRDD, table); JavaPairRDD<HoodieKey, Optional<String>> taggedRecordRDD = bloomIndex
.fetchRecordLocation(keysRDD, table);
// Should not find any files // Should not find any files
for (Tuple2<HoodieKey, Optional<String>> record : taggedRecordRDD.collect()) { for (Tuple2<HoodieKey, Optional<String>> record : taggedRecordRDD.collect()) {
@@ -436,9 +462,11 @@ public class TestHoodieBloomIndex {
// We write record1 to a parquet file, using a bloom filter having both records // We write record1 to a parquet file, using a bloom filter having both records
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); HoodieRecord record1 = new HoodieRecord(
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); HoodieRecord record2 = new HoodieRecord(
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
BloomFilter filter = new BloomFilter(10000, 0.0000001); BloomFilter filter = new BloomFilter(10000, 0.0000001);
filter.add(record2.getRecordKey()); filter.add(record2.getRecordKey());
@@ -472,21 +500,22 @@ public class TestHoodieBloomIndex {
String fileId = UUID.randomUUID().toString(); String fileId = UUID.randomUUID().toString();
String filename = FSUtils.makeDataFileName(commitTime, 1, fileId); String filename = FSUtils.makeDataFileName(commitTime, 1, fileId);
return writeParquetFile(partitionPath, filename, records, schema, filter, createCommitTime); return writeParquetFile(partitionPath, filename, records, schema, filter, createCommitTime);
} }
private String writeParquetFile(String partitionPath, String filename, List<HoodieRecord> records, Schema schema, private String writeParquetFile(String partitionPath, String filename, List<HoodieRecord> records,
Schema schema,
BloomFilter filter, boolean createCommitTime) throws IOException { BloomFilter filter, boolean createCommitTime) throws IOException {
if (filter == null) { if (filter == null) {
filter = new BloomFilter(10000, 0.0000001); filter = new BloomFilter(10000, 0.0000001);
} }
HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter); HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(
new AvroSchemaConverter().convert(schema), schema, filter);
String commitTime = FSUtils.getCommitTime(filename); String commitTime = FSUtils.getCommitTime(filename);
HoodieParquetConfig config = new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP, HoodieParquetConfig config = new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP,
ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024, new Configuration()); ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024,
new Configuration());
HoodieParquetWriter writer = new HoodieParquetWriter( HoodieParquetWriter writer = new HoodieParquetWriter(
commitTime, commitTime,
new Path(basePath + "/" + partitionPath + "/" + filename), new Path(basePath + "/" + partitionPath + "/" + filename),
@@ -496,7 +525,9 @@ public class TestHoodieBloomIndex {
for (HoodieRecord record : records) { for (HoodieRecord record : records) {
GenericRecord avroRecord = (GenericRecord) record.getData().getInsertValue(schema).get(); GenericRecord avroRecord = (GenericRecord) record.getData().getInsertValue(schema).get();
HoodieAvroUtils.addCommitMetadataToRecord(avroRecord, commitTime, "" + seqId++); HoodieAvroUtils.addCommitMetadataToRecord(avroRecord, commitTime, "" + seqId++);
HoodieAvroUtils.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(), filename); HoodieAvroUtils
.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(),
filename);
writer.writeAvro(record.getRecordKey(), avroRecord); writer.writeAvro(record.getRecordKey(), avroRecord);
filter.add(record.getRecordKey()); filter.add(record.getRecordKey());
} }
@@ -505,7 +536,9 @@ public class TestHoodieBloomIndex {
if (createCommitTime) { if (createCommitTime) {
// Also make sure the commit is valid // Also make sure the commit is valid
new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME).mkdirs(); new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME).mkdirs();
new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitTime + ".commit").createNewFile(); new File(
basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitTime + ".commit")
.createNewFile();
} }
return filename; return filename;
} }

View File

@@ -16,9 +16,11 @@
package com.uber.hoodie.io;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
import com.uber.hoodie.avro.model.HoodieArchivedMetaEntry;
import com.uber.hoodie.common.HoodieTestDataGenerator;
-import com.uber.hoodie.common.model.HoodieArchivedLogFile;
import com.uber.hoodie.common.model.HoodieLogFile;
import com.uber.hoodie.common.model.HoodieTestUtils;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
@@ -29,6 +31,11 @@ import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.config.HoodieCompactionConfig;
import com.uber.hoodie.config.HoodieWriteConfig;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.stream.Collectors;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.FileSystem;
@@ -37,16 +44,8 @@ import org.junit.Before;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.stream.Collectors;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
public class TestHoodieCommitArchiveLog {
private String basePath;
private FileSystem fs;
@@ -97,7 +96,8 @@ public class TestHoodieCommitArchiveLog {
HoodieTestUtils.createCleanFiles(basePath, "105");
//reload the timeline and get all the commmits before archive
-timeline = metadata.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants();
+timeline = metadata.getActiveTimeline().reload().getAllCommitsTimeline()
+    .filterCompletedInstants();
List<HoodieInstant> originalCommits = timeline.getInstants().collect(Collectors.toList());
assertEquals("Loaded 6 commits and the count should match", 12, timeline.countInstants());
@@ -107,27 +107,30 @@ public class TestHoodieCommitArchiveLog {
assertTrue(archiveLog.archiveIfRequired());
//reload the timeline and remove the remaining commits
-timeline = metadata.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants();
+timeline = metadata.getActiveTimeline().reload().getAllCommitsTimeline()
+    .filterCompletedInstants();
originalCommits.removeAll(timeline.getInstants().collect(Collectors.toList()));
//read the file
HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(),
-    new HoodieLogFile(new Path(basePath + "/.hoodie/.commits_.archive.1")), HoodieArchivedMetaEntry.getClassSchema(), false);
+    new HoodieLogFile(new Path(basePath + "/.hoodie/.commits_.archive.1")),
+    HoodieArchivedMetaEntry.getClassSchema(), false);
int archivedRecordsCount = 0;
List<IndexedRecord> readRecords = new ArrayList<>();
//read the avro blocks and validate the number of records written in each avro block
-while(reader.hasNext()) {
+while (reader.hasNext()) {
HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next();
List<IndexedRecord> records = blk.getRecords();
readRecords.addAll(records);
assertEquals("Archived and read records for each block are same", 8, records.size());
archivedRecordsCount += records.size();
}
-assertEquals("Total archived records and total read records are the same count", 8, archivedRecordsCount);
+assertEquals("Total archived records and total read records are the same count", 8,
+    archivedRecordsCount);
//make sure the archived commits are the same as the (originalcommits - commitsleft)
-List<String> readCommits = readRecords.stream().map(r -> (GenericRecord)r).map(r -> {
+List<String> readCommits = readRecords.stream().map(r -> (GenericRecord) r).map(r -> {
return r.get("commitTime").toString();
}).collect(Collectors.toList());
Collections.sort(readCommits);
@@ -158,7 +161,8 @@ public class TestHoodieCommitArchiveLog {
boolean result = archiveLog.archiveIfRequired();
assertTrue(result);
timeline =
-    metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline().filterCompletedInstants();
+    metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline()
+        .filterCompletedInstants();
assertEquals("Should not archive commits when maxCommitsToKeep is 5", 4,
timeline.countInstants());
}
@@ -184,7 +188,8 @@ public class TestHoodieCommitArchiveLog {
boolean result = archiveLog.archiveIfRequired();
assertTrue(result);
timeline =
-    metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline().filterCompletedInstants();
+    metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline()
+        .filterCompletedInstants();
assertTrue("Archived commits should always be safe",
timeline.containsOrBeforeTimelineStarts("100"));
assertTrue("Archived commits should always be safe",
@@ -217,7 +222,8 @@ public class TestHoodieCommitArchiveLog {
boolean result = archiveLog.archiveIfRequired();
assertTrue(result);
timeline =
-    metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline().filterCompletedInstants();
+    metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline()
+        .filterCompletedInstants();
assertEquals(
"Since we have a savepoint at 101, we should never archive any commit after 101 (we only archive 100)",
5, timeline.countInstants());
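
The import hunks in this file and in the files below all apply the same Google Java Style layout: static imports grouped at the top, a single ASCII-sorted block of regular imports after them, wildcard static imports such as "import static org.junit.Assert.*;" expanded into explicit names, and imports that are no longer referenced dropped. A minimal sketch of that layout, using a hypothetical test class that is not part of this commit:

package com.example.style;

// Static imports come first, each named explicitly.
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

// Regular imports follow in one ASCII-sorted block (com..., java..., org...).
import java.util.ArrayList;
import java.util.List;
import org.junit.Test;

public class ImportOrderExample {

  @Test
  public void listStartsEmptyAndGrows() {
    List<String> names = new ArrayList<>();
    assertTrue(names.isEmpty());
    names.add("hoodie");
    assertEquals(1, names.size());
  }
}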

View File

@@ -16,7 +16,9 @@
package com.uber.hoodie.io;
-import com.uber.hoodie.HoodieReadClient;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
import com.uber.hoodie.HoodieWriteClient;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.HoodieClientTestUtils;
@@ -34,13 +36,16 @@ import com.uber.hoodie.config.HoodieCompactionConfig;
import com.uber.hoodie.config.HoodieIndexConfig;
import com.uber.hoodie.config.HoodieStorageConfig;
import com.uber.hoodie.config.HoodieWriteConfig;
-import com.uber.hoodie.index.bloom.HoodieBloomIndex;
import com.uber.hoodie.index.HoodieIndex;
+import com.uber.hoodie.index.bloom.HoodieBloomIndex;
import com.uber.hoodie.io.compact.HoodieCompactor;
import com.uber.hoodie.io.compact.HoodieRealtimeTableCompactor;
import com.uber.hoodie.table.HoodieTable;
+import java.io.File;
+import java.io.IOException;
+import java.util.List;
+import java.util.stream.Collectors;
import org.apache.hadoop.fs.FileSystem;
-import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.junit.After;
@@ -48,15 +53,8 @@ import org.junit.Before;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
-import java.io.File;
-import java.io.IOException;
-import java.util.List;
-import java.util.stream.Collectors;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
public class TestHoodieCompactor {
private transient JavaSparkContext jsc = null;
private String basePath = null;
private HoodieCompactor compactor;
@@ -194,7 +192,7 @@ public class TestHoodieCompactor {
List<FileSlice> groupedLogFiles = table.getRTFileSystemView()
.getLatestFileSlices(partitionPath)
.collect(Collectors.toList());
-for (FileSlice slice: groupedLogFiles) {
+for (FileSlice slice : groupedLogFiles) {
assertTrue(
"After compaction there should be no log files visiable on a Realtime view",
slice.getLogFiles().collect(Collectors.toList()).isEmpty());
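
A second recurring change, visible in the loop just above and again in the hunks further down, is whitespace only: the colon in an enhanced for loop gets a space on both sides, and binary operators such as division are surrounded by single spaces (for example, 200.0 / 2400). A small illustrative snippet, not taken from this commit:

package com.example.style;

import java.util.Arrays;
import java.util.List;

public class SpacingExample {

  // Enhanced for loops are written "for (String word : words)" with spaces around the colon.
  static double averageLength(List<String> words) {
    int total = 0;
    for (String word : words) {
      total += word.length();
    }
    // Binary operators like "/" get a single space on each side.
    return (double) total / words.size();
  }

  public static void main(String[] args) {
    System.out.println(averageLength(Arrays.asList("hoodie", "parquet", "avro")));
  }
}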

View File

@@ -17,12 +17,10 @@
package com.uber.hoodie.io.strategy;
import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;
import com.beust.jcommander.internal.Lists;
import com.google.common.collect.Maps;
import com.uber.hoodie.config.HoodieCompactionConfig;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.io.compact.CompactionOperation;

View File

@@ -17,9 +17,7 @@
package com.uber.hoodie.io.strategy;
import com.uber.hoodie.common.model.HoodieDataFile;
-import com.uber.hoodie.common.util.FSUtils;
import java.util.UUID;
-import org.apache.hadoop.fs.FileStatus;
public class TestHoodieDataFile extends HoodieDataFile {

View File

@@ -18,7 +18,6 @@ package com.uber.hoodie.io.strategy;
import com.uber.hoodie.common.model.HoodieLogFile;
import java.util.Optional;
import org.apache.hadoop.fs.Path;
public class TestHoodieLogFile extends HoodieLogFile {

View File

@@ -16,17 +16,17 @@
package com.uber.hoodie.metrics;
-import com.uber.hoodie.config.HoodieWriteConfig;
-import org.apache.commons.configuration.ConfigurationException;
-import org.junit.Before;
-import org.junit.Test;
import static org.junit.Assert.assertTrue;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
+import com.uber.hoodie.config.HoodieWriteConfig;
+import org.apache.commons.configuration.ConfigurationException;
+import org.junit.Before;
+import org.junit.Test;
public class TestHoodieMetrics {
private HoodieMetrics metrics = null;
@Before
@@ -40,6 +40,7 @@ public class TestHoodieMetrics {
@Test
public void testRegisterGauge() {
metrics.registerGauge("metric1", 123L);
-assertTrue(Metrics.getInstance().getRegistry().getGauges().get("metric1").getValue().toString().equals("123"));
+assertTrue(Metrics.getInstance().getRegistry().getGauges().get("metric1").getValue().toString()
+    .equals("123"));
}
}

View File

@@ -16,26 +16,37 @@
package com.uber.hoodie.table;
-import com.uber.hoodie.common.TestRawTripPayload.MetadataMergeWriteStatus;
-import com.uber.hoodie.common.table.HoodieTableMetaClient;
-import com.uber.hoodie.common.table.HoodieTimeline;
-import com.uber.hoodie.config.HoodieWriteConfig;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.BloomFilter;
import com.uber.hoodie.common.HoodieClientTestUtils;
import com.uber.hoodie.common.HoodieTestDataGenerator;
import com.uber.hoodie.common.TestRawTripPayload;
+import com.uber.hoodie.common.TestRawTripPayload.MetadataMergeWriteStatus;
import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordLocation;
import com.uber.hoodie.common.model.HoodieTestUtils;
+import com.uber.hoodie.common.table.HoodieTableMetaClient;
+import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.ParquetUtils;
import com.uber.hoodie.config.HoodieCompactionConfig;
-import com.uber.hoodie.io.HoodieCreateHandle;
import com.uber.hoodie.config.HoodieStorageConfig;
+import com.uber.hoodie.config.HoodieWriteConfig;
+import com.uber.hoodie.io.HoodieCreateHandle;
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
import java.util.Map;
+import java.util.UUID;
import org.apache.avro.generic.GenericRecord;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FileSystem;
@@ -47,22 +58,11 @@ import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
-import java.io.File;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Iterator;
-import java.util.List;
-import java.util.UUID;
import scala.Option;
import scala.Tuple2;
-import static org.junit.Assert.*;
-import static org.mockito.Mockito.mock;
-import static org.mockito.Mockito.when;
public class TestCopyOnWriteTable {
private String basePath = null;
private transient JavaSparkContext jsc = null;
@@ -104,7 +104,8 @@ public class TestCopyOnWriteTable {
private HoodieWriteConfig.Builder makeHoodieClientConfigBuilder() throws Exception {
// Prepare the AvroParquetIO
-String schemaStr = IOUtils.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8");
+String schemaStr = IOUtils
+    .toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8");
return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(schemaStr);
}
@@ -127,11 +128,17 @@ public class TestCopyOnWriteTable {
List<HoodieRecord> records = new ArrayList<>();
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
-records.add(new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1));
+records.add(
+    new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
+        rowChange1));
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
-records.add(new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2));
+records.add(
+    new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
+        rowChange2));
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
-records.add(new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));
+records.add(
+    new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
+        rowChange3));
// Insert new records
HoodieClientTestUtils.collectStatuses(table.handleInsert(firstCommitTime, records.iterator()));
@@ -159,7 +166,7 @@ public class TestCopyOnWriteTable {
List<GenericRecord> fileRecords = ParquetUtils.readAvroRecords(parquetFilePath);
GenericRecord newRecord;
int index = 0;
-for (GenericRecord record: fileRecords) {
+for (GenericRecord record : fileRecords) {
assertTrue(record.get("_row_key").toString().equals(records.get(index).getRecordKey()));
index++;
}
@@ -167,11 +174,15 @@ public class TestCopyOnWriteTable {
// We update the 1st record & add a new record
String updateRecordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
TestRawTripPayload updateRowChanges1 = new TestRawTripPayload(updateRecordStr1);
-HoodieRecord updatedRecord1 = new HoodieRecord(new HoodieKey(updateRowChanges1.getRowKey(), updateRowChanges1.getPartitionPath()), updateRowChanges1);
-updatedRecord1.setCurrentLocation(new HoodieRecordLocation(null, FSUtils.getFileId(parquetFile.getName())));
+HoodieRecord updatedRecord1 = new HoodieRecord(
+    new HoodieKey(updateRowChanges1.getRowKey(), updateRowChanges1.getPartitionPath()),
+    updateRowChanges1);
+updatedRecord1.setCurrentLocation(
+    new HoodieRecordLocation(null, FSUtils.getFileId(parquetFile.getName())));
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
-HoodieRecord insertedRecord1 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
+HoodieRecord insertedRecord1 = new HoodieRecord(
+    new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
List<HoodieRecord> updatedRecords = Arrays.asList(updatedRecord1, insertedRecord1);
@@ -179,7 +190,9 @@ public class TestCopyOnWriteTable {
String newCommitTime = HoodieTestUtils.makeNewCommitTime();
metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
table = new HoodieCopyOnWriteTable(config, metadata);
-Iterator<List<WriteStatus>> iter = table.handleUpdate(newCommitTime, updatedRecord1.getCurrentLocation().getFileId(), updatedRecords.iterator());
+Iterator<List<WriteStatus>> iter = table
+    .handleUpdate(newCommitTime, updatedRecord1.getCurrentLocation().getFileId(),
+        updatedRecords.iterator());
// Check the updated file
File updatedParquetFile = null;
@@ -197,7 +210,8 @@ public class TestCopyOnWriteTable {
assertTrue(updatedParquetFile != null);
// Check whether the record has been updated
Path updatedParquetFilePath = new Path(updatedParquetFile.getAbsolutePath());
-BloomFilter updatedFilter = ParquetUtils.readBloomFilterFromParquetMetadata(updatedParquetFilePath);
+BloomFilter updatedFilter = ParquetUtils
+    .readBloomFilterFromParquetMetadata(updatedParquetFilePath);
for (HoodieRecord record : records) {
// No change to the _row_key
assertTrue(updatedFilter.mightContain(record.getRecordKey()));
@@ -206,7 +220,8 @@ public class TestCopyOnWriteTable {
assertTrue(updatedFilter.mightContain(insertedRecord1.getRecordKey()));
records.add(insertedRecord1);// add this so it can further check below
-ParquetReader updatedReader = ParquetReader.builder(new AvroReadSupport<>(), updatedParquetFilePath).build();
+ParquetReader updatedReader = ParquetReader
+    .builder(new AvroReadSupport<>(), updatedParquetFilePath).build();
index = 0;
while ((newRecord = (GenericRecord) updatedReader.read()) != null) {
assertTrue(newRecord.get("_row_key").toString().equals(records.get(index).getRecordKey()));
@@ -243,7 +258,8 @@ public class TestCopyOnWriteTable {
@Test
public void testMetadataAggregateFromWriteStatus() throws Exception {
// Prepare the AvroParquetIO
-HoodieWriteConfig config = makeHoodieClientConfigBuilder().withWriteStatusClass(MetadataMergeWriteStatus.class).build();
+HoodieWriteConfig config = makeHoodieClientConfigBuilder()
+    .withWriteStatusClass(MetadataMergeWriteStatus.class).build();
String firstCommitTime = HoodieTestUtils.makeNewCommitTime();
HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
@@ -256,11 +272,17 @@ public class TestCopyOnWriteTable {
List<HoodieRecord> records = new ArrayList<>();
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
-records.add(new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1));
+records.add(
+    new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
+        rowChange1));
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
-records.add(new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2));
+records.add(
+    new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
+        rowChange2));
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
-records.add(new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));
+records.add(
+    new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
+        rowChange3));
// Insert new records
List<WriteStatus> writeStatuses = HoodieClientTestUtils
@@ -286,7 +308,8 @@ public class TestCopyOnWriteTable {
records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z"));
// Simulate crash after first file
-List<WriteStatus> statuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator()));
+List<WriteStatus> statuses = HoodieClientTestUtils
+    .collectStatuses(table.handleInsert(commitTime, records.iterator()));
WriteStatus status = statuses.get(0);
Path partialFile = new Path(String.format("%s/%s/%s",
basePath,
@@ -299,7 +322,8 @@ public class TestCopyOnWriteTable {
records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z");
records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z"));
-statuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator()));
+statuses = HoodieClientTestUtils
+    .collectStatuses(table.handleInsert(commitTime, records.iterator()));
status = statuses.get(0);
Path retriedFIle = new Path(String.format("%s/%s/%s",
@@ -312,7 +336,8 @@ public class TestCopyOnWriteTable {
}
-@Test public void testInsertRecords() throws Exception {
+@Test
+public void testInsertRecords() throws Exception {
HoodieWriteConfig config = makeHoodieClientConfig();
String commitTime = HoodieTestUtils.makeNewCommitTime();
HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
@@ -324,8 +349,8 @@ public class TestCopyOnWriteTable {
records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z"));
// Insert new records
-List<WriteStatus> returnedStatuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator()));
+List<WriteStatus> returnedStatuses = HoodieClientTestUtils
+    .collectStatuses(table.handleInsert(commitTime, records.iterator()));
// TODO: check the actual files and make sure 11 records, total were written.
assertEquals(2, returnedStatuses.size());
@@ -343,7 +368,8 @@ public class TestCopyOnWriteTable {
records.addAll(newHoodieRecords(1, "2016-02-02T03:16:41.415Z"));
// Insert new records
-returnedStatuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator()));
+returnedStatuses = HoodieClientTestUtils
+    .collectStatuses(table.handleInsert(commitTime, records.iterator()));
assertEquals(3, returnedStatuses.size());
assertEquals("2016/01/31", returnedStatuses.get(0).getPartitionPath());
@@ -357,7 +383,8 @@ public class TestCopyOnWriteTable {
}
-@Test public void testFileSizeUpsertRecords() throws Exception {
+@Test
+public void testFileSizeUpsertRecords() throws Exception {
HoodieWriteConfig config = makeHoodieClientConfigBuilder().withStorageConfig(
HoodieStorageConfig.newBuilder().limitFileSize(64 * 1024).parquetBlockSize(64 * 1024)
.parquetPageSize(64 * 1024).build()).build();
@@ -368,9 +395,11 @@ public class TestCopyOnWriteTable {
List<HoodieRecord> records = new ArrayList<>();
// Approx 1150 records are written for block size of 64KB
for (int i = 0; i < 2000; i++) {
-String recordStr = "{\"_row_key\":\"" + UUID.randomUUID().toString() + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":" + i + "}";
+String recordStr = "{\"_row_key\":\"" + UUID.randomUUID().toString()
+    + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":" + i + "}";
TestRawTripPayload rowChange = new TestRawTripPayload(recordStr);
-records.add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()),
+records
+    .add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()),
rowChange));
}
@@ -380,7 +409,8 @@ public class TestCopyOnWriteTable {
// Check the updated file
int counts = 0;
for (File file : new File(basePath + "/2016/01/31").listFiles()) {
-if (file.getName().endsWith(".parquet") && FSUtils.getCommitTime(file.getName()).equals(commitTime)) {
+if (file.getName().endsWith(".parquet") && FSUtils.getCommitTime(file.getName())
+    .equals(commitTime)) {
System.out.println(file.getName() + "-" + file.length());
counts++;
}
@@ -391,7 +421,6 @@ public class TestCopyOnWriteTable {
}
private List<HoodieCopyOnWriteTable.InsertBucket> testUpsertPartitioner(int smallFileSize,
int numInserts,
int numUpdates,
@@ -400,8 +429,10 @@ public class TestCopyOnWriteTable {
final String TEST_PARTITION_PATH = "2016/09/26";
HoodieWriteConfig config = makeHoodieClientConfigBuilder()
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
-.compactionSmallFileSize(smallFileSize).insertSplitSize(100).autoTuneInsertSplits(autoSplitInserts).build())
-.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1000 * 1024).build()).build();
+.compactionSmallFileSize(smallFileSize).insertSplitSize(100)
+.autoTuneInsertSplits(autoSplitInserts).build())
+.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1000 * 1024).build())
+.build();
HoodieClientTestUtils.fakeCommitFile(basePath, "001");
HoodieClientTestUtils.fakeDataFile(basePath, TEST_PARTITION_PATH, "001", "file1", fileSize);
@@ -409,10 +440,11 @@ public class TestCopyOnWriteTable {
HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
-HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[]{TEST_PARTITION_PATH});
+HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(
+    new String[]{TEST_PARTITION_PATH});
List<HoodieRecord> insertRecords = dataGenerator.generateInserts("001", numInserts);
List<HoodieRecord> updateRecords = dataGenerator.generateUpdates("001", numUpdates);
-for (HoodieRecord updateRec: updateRecords) {
+for (HoodieRecord updateRec : updateRecords) {
updateRec.setCurrentLocation(new HoodieRecordLocation("001", "file1"));
}
List<HoodieRecord> records = new ArrayList<>();
@@ -430,7 +462,8 @@ public class TestCopyOnWriteTable {
assertEquals("Bucket 2 is INSERT", HoodieCopyOnWriteTable.BucketType.INSERT,
partitioner.getBucketInfo(2).bucketType);
assertEquals("Update record should have gone to the 1 update partiton", 0,
-partitioner.getPartition(new Tuple2<>(updateRecords.get(0).getKey(), Option.apply(updateRecords.get(0).getCurrentLocation()))));
+partitioner.getPartition(new Tuple2<>(updateRecords.get(0).getKey(),
+    Option.apply(updateRecords.get(0).getCurrentLocation()))));
return partitioner.getInsertBuckets(TEST_PARTITION_PATH);
}
@@ -438,7 +471,8 @@ public class TestCopyOnWriteTable {
@Test
public void testUpsertPartitioner() throws Exception {
// Inserts + Updates... Check all updates go together & inserts subsplit
-List<HoodieCopyOnWriteTable.InsertBucket> insertBuckets = testUpsertPartitioner(0, 200, 100, 1024, false);
+List<HoodieCopyOnWriteTable.InsertBucket> insertBuckets = testUpsertPartitioner(0, 200, 100,
+    1024, false);
assertEquals("Total of 2 insert buckets", 2, insertBuckets.size());
}
@@ -446,16 +480,21 @@ public class TestCopyOnWriteTable {
@Test
public void testUpsertPartitionerWithSmallInsertHandling() throws Exception {
// Inserts + Updates .. Check updates go together & inserts subsplit, after expanding smallest file
-List<HoodieCopyOnWriteTable.InsertBucket> insertBuckets = testUpsertPartitioner(1000 * 1024, 400, 100, 800 * 1024, false);
+List<HoodieCopyOnWriteTable.InsertBucket> insertBuckets = testUpsertPartitioner(1000 * 1024,
+    400, 100, 800 * 1024, false);
assertEquals("Total of 3 insert buckets", 3, insertBuckets.size());
-assertEquals("First insert bucket must be same as update bucket", 0, insertBuckets.get(0).bucketNumber);
-assertEquals("First insert bucket should have weight 0.5", 0.5, insertBuckets.get(0).weight, 0.01);
+assertEquals("First insert bucket must be same as update bucket", 0,
+    insertBuckets.get(0).bucketNumber);
+assertEquals("First insert bucket should have weight 0.5", 0.5, insertBuckets.get(0).weight,
+    0.01);
// Now with insert split size auto tuned
insertBuckets = testUpsertPartitioner(1000 * 1024, 2400, 100, 800 * 1024, true);
assertEquals("Total of 3 insert buckets", 3, insertBuckets.size());
-assertEquals("First insert bucket must be same as update bucket", 0, insertBuckets.get(0).bucketNumber);
-assertEquals("First insert bucket should have weight 0.5", 200.0/2400, insertBuckets.get(0).weight, 0.01);
+assertEquals("First insert bucket must be same as update bucket", 0,
+    insertBuckets.get(0).bucketNumber);
+assertEquals("First insert bucket should have weight 0.5", 200.0 / 2400,
+    insertBuckets.get(0).weight, 0.01);
}
@After

Some files were not shown because too many files have changed in this diff.