Reformatting code per Google Code Style all over

Vinoth Chandar
2017-11-12 22:54:56 -08:00
committed by vinoth chandar
parent 5a62480a92
commit e45679f5e2
254 changed files with 21580 additions and 21108 deletions
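
In broad strokes, the commit applies Google Java Style conventions throughout the codebase: imports collapsed into a single ASCII-sorted block (so java.* entries no longer trail the org.* ones), a space after keywords such as if, for and switch, array initializers written as new String[]{...} without the intervening space, and long statements wrapped toward the style's 100-column limit with 2-space indentation. A small hypothetical Java snippet, not taken from this commit, laid out under these conventions:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

public class StyleExample {

  private static FileSystem fs;

  // Keyword followed by a space before the parenthesis; 2-space indentation.
  public static void initFs(Configuration conf, boolean force) throws IOException {
    if (fs == null || force) {
      fs = FileSystem.get(conf);
    }
  }

  // Array initializer brace immediately follows the type, with no space in between.
  public static String[] header() {
    return new String[]{"CommitTime", "Total Written (B)", "Total Errors"};
  }
}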

View File

@@ -15,7 +15,9 @@
~ limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>hoodie</artifactId>
<groupId>com.uber.hoodie</groupId>

View File

@@ -17,12 +17,12 @@
package com.uber.hoodie.cli;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import java.io.IOException;
public class HoodieCLI {
public static Configuration conf;
public static FileSystem fs;
public static CLIState state = CLIState.INIT;
@@ -43,7 +43,7 @@ public class HoodieCLI {
}
public static void initFS(boolean force) throws IOException {
if(fs == null || force) {
if (fs == null || force) {
fs = FileSystem.get(conf);
}
}

View File

@@ -17,7 +17,6 @@
package com.uber.hoodie.cli;
import dnl.utils.text.table.TextTable;
import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
import java.nio.charset.Charset;

View File

@@ -16,7 +16,6 @@
package com.uber.hoodie.cli;
import com.uber.hoodie.common.table.HoodieTableConfig;
import org.springframework.core.Ordered;
import org.springframework.core.annotation.Order;
import org.springframework.shell.plugin.support.DefaultPromptProvider;

View File

@@ -22,9 +22,13 @@ import org.springframework.shell.plugin.support.DefaultBannerProvider;
import org.springframework.shell.support.util.OsUtils;
import org.springframework.stereotype.Component;
@Component @Order(Ordered.HIGHEST_PRECEDENCE) public class HoodieSplashScreen
@Component
@Order(Ordered.HIGHEST_PRECEDENCE)
public class HoodieSplashScreen
extends DefaultBannerProvider {
private static String screen = "============================================" + OsUtils.LINE_SEPARATOR +
private static String screen =
"============================================" + OsUtils.LINE_SEPARATOR +
"* *" + OsUtils.LINE_SEPARATOR +
"* _ _ _ _ *" + OsUtils.LINE_SEPARATOR +
"* | | | | | (_) *" + OsUtils.LINE_SEPARATOR +
@@ -49,7 +53,8 @@ import org.springframework.stereotype.Component;
return "Welcome to Hoodie CLI. Please type help if you are looking for help. ";
}
@Override public String getProviderName() {
@Override
public String getProviderName() {
return "Hoodie Banner";
}
}

View File

@@ -16,16 +16,14 @@
package com.uber.hoodie.cli;
import java.io.IOException;
import org.springframework.shell.Bootstrap;
import java.io.IOException;
public class Main {
/**
* Main class that delegates to Spring Shell's Bootstrap class in order to simplify debugging inside an IDE
*
* @param args
* @throws IOException
* Main class that delegates to Spring Shell's Bootstrap class in order to simplify debugging
* inside an IDE
*/
public static void main(String[] args) throws IOException {
Bootstrap.main(args);

View File

@@ -24,6 +24,10 @@ import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.log.HoodieLogFormat;
import com.uber.hoodie.common.table.log.block.HoodieAvroDataBlock;
import com.uber.hoodie.common.util.FSUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.FileStatus;
@@ -34,11 +38,6 @@ import org.springframework.shell.core.annotation.CliCommand;
import org.springframework.shell.core.annotation.CliOption;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
@Component
public class ArchivedCommitsCommand implements CommandMarker {
@@ -49,13 +48,16 @@ public class ArchivedCommitsCommand implements CommandMarker {
@CliCommand(value = "show archived commits", help = "Read commits from archived files and show details")
public String showCommits(
@CliOption(key = {"limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10")
@CliOption(key = {
"limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10")
final Integer limit) throws IOException {
System.out.println("===============> Showing only " + limit + " archived commits <===============");
FileStatus [] fsStatuses = FSUtils.getFs().globStatus(new Path(HoodieCLI.tableMetadata.getBasePath() + "/.hoodie/.commits_.archive*"));
System.out
.println("===============> Showing only " + limit + " archived commits <===============");
FileStatus[] fsStatuses = FSUtils.getFs().globStatus(
new Path(HoodieCLI.tableMetadata.getBasePath() + "/.hoodie/.commits_.archive*"));
List<String[]> allCommits = new ArrayList<>();
for(FileStatus fs : fsStatuses) {
for (FileStatus fs : fsStatuses) {
//read the archived file
HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(),
new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema(), false);
@@ -67,11 +69,13 @@ public class ArchivedCommitsCommand implements CommandMarker {
List<IndexedRecord> records = blk.getRecords();
readRecords.addAll(records);
}
List<String[]> readCommits = readRecords.stream().map(r -> (GenericRecord)r).map(r -> readCommit(r)).limit(limit).collect(Collectors.toList());
List<String[]> readCommits = readRecords.stream().map(r -> (GenericRecord) r)
.map(r -> readCommit(r)).limit(limit).collect(Collectors.toList());
allCommits.addAll(readCommits);
}
return HoodiePrintHelper.print(
new String[] {"CommitTime", "CommitType", "CommitDetails"}, allCommits.toArray(new String[allCommits.size()][]));
new String[]{"CommitTime", "CommitType", "CommitDetails"},
allCommits.toArray(new String[allCommits.size()][]));
}
private String[] readCommit(GenericRecord record) {

View File

@@ -24,21 +24,21 @@ import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.util.AvroUtils;
import org.springframework.shell.core.CommandMarker;
import org.springframework.shell.core.annotation.CliAvailabilityIndicator;
import org.springframework.shell.core.annotation.CliCommand;
import org.springframework.shell.core.annotation.CliOption;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.springframework.shell.core.CommandMarker;
import org.springframework.shell.core.annotation.CliAvailabilityIndicator;
import org.springframework.shell.core.annotation.CliCommand;
import org.springframework.shell.core.annotation.CliOption;
import org.springframework.stereotype.Component;
@Component
public class CleansCommand implements CommandMarker {
@CliAvailabilityIndicator({"cleans show"})
public boolean isShowAvailable() {
return HoodieCLI.tableMetadata != null;
@@ -65,12 +65,12 @@ public class CleansCommand implements CommandMarker {
HoodieInstant clean = cleans.get(i);
HoodieCleanMetadata cleanMetadata =
AvroUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(clean).get());
rows[i] = new String[] {clean.getTimestamp(), cleanMetadata.getEarliestCommitToRetain(),
rows[i] = new String[]{clean.getTimestamp(), cleanMetadata.getEarliestCommitToRetain(),
String.valueOf(cleanMetadata.getTotalFilesDeleted()),
String.valueOf(cleanMetadata.getTimeTakenInMillis())};
}
return HoodiePrintHelper.print(
new String[] {"CleanTime", "EarliestCommandRetained", "Total Files Deleted",
new String[]{"CleanTime", "EarliestCommandRetained", "Total Files Deleted",
"Total Time Taken"}, rows);
}
@@ -97,16 +97,17 @@ public class CleansCommand implements CommandMarker {
HoodieCleanMetadata cleanMetadata =
AvroUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(cleanInstant).get());
List<String[]> rows = new ArrayList<>();
for (Map.Entry<String, HoodieCleanPartitionMetadata> entry : cleanMetadata.getPartitionMetadata().entrySet()) {
for (Map.Entry<String, HoodieCleanPartitionMetadata> entry : cleanMetadata
.getPartitionMetadata().entrySet()) {
String path = entry.getKey();
HoodieCleanPartitionMetadata stats = entry.getValue();
String policy = stats.getPolicy();
String totalSuccessDeletedFiles = String.valueOf(stats.getSuccessDeleteFiles().size());
String totalFailedDeletedFiles = String.valueOf(stats.getFailedDeleteFiles().size());
rows.add(new String[] {path, policy, totalSuccessDeletedFiles, totalFailedDeletedFiles});
rows.add(new String[]{path, policy, totalSuccessDeletedFiles, totalFailedDeletedFiles});
}
return HoodiePrintHelper.print(
new String[] {"Partition Path", "Cleaning policy", "Total Files Successfully Deleted",
new String[]{"Partition Path", "Cleaning policy", "Total Files Successfully Deleted",
"Total Failed Deletions"}, rows.toArray(new String[rows.size()][]));
}
}

View File

@@ -27,7 +27,12 @@ import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.util.NumericUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.spark.launcher.SparkLauncher;
import org.springframework.shell.core.CommandMarker;
import org.springframework.shell.core.annotation.CliAvailabilityIndicator;
@@ -35,15 +40,9 @@ import org.springframework.shell.core.annotation.CliCommand;
import org.springframework.shell.core.annotation.CliOption;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
@Component
public class CommitsCommand implements CommandMarker {
@CliAvailabilityIndicator({"commits show"})
public boolean isShowAvailable() {
return HoodieCLI.tableMetadata != null;
@@ -70,7 +69,8 @@ public class CommitsCommand implements CommandMarker {
"limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10")
final Integer limit) throws IOException {
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline().filterCompletedInstants();
HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline()
.filterCompletedInstants();
List<HoodieInstant> commits = timeline.getInstants().collect(Collectors.toList());
String[][] rows = new String[commits.size()][];
Collections.reverse(commits);
@@ -78,7 +78,7 @@ public class CommitsCommand implements CommandMarker {
HoodieInstant commit = commits.get(i);
HoodieCommitMetadata commitMetadata =
HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(commit).get());
rows[i] = new String[] {commit.getTimestamp(),
rows[i] = new String[]{commit.getTimestamp(),
NumericUtils.humanReadableByteCount(commitMetadata.fetchTotalBytesWritten()),
String.valueOf(commitMetadata.fetchTotalFilesInsert()),
String.valueOf(commitMetadata.fetchTotalFilesUpdated()),
@@ -88,7 +88,7 @@ public class CommitsCommand implements CommandMarker {
String.valueOf(commitMetadata.fetchTotalWriteErrors())};
}
return HoodiePrintHelper.print(
new String[] {"CommitTime", "Total Written (B)", "Total Files Added",
new String[]{"CommitTime", "Total Written (B)", "Total Files Added",
"Total Files Updated", "Total Partitions Written", "Total Records Written",
"Total Update Records Written", "Total Errors"}, rows);
}
@@ -108,8 +108,10 @@ public class CommitsCommand implements CommandMarker {
@CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path")
final String sparkPropertiesPath) throws Exception {
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline().filterCompletedInstants();
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline()
.filterCompletedInstants();
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION,
commitTime);
if (!timeline.containsInstant(commitInstant)) {
return "Commit " + commitTime + " not found in Commits " + timeline;
@@ -135,8 +137,10 @@ public class CommitsCommand implements CommandMarker {
@CliOption(key = {"commit"}, help = "Commit to show")
final String commitTime) throws Exception {
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline().filterCompletedInstants();
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline()
.filterCompletedInstants();
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION,
commitTime);
if (!timeline.containsInstant(commitInstant)) {
return "Commit " + commitTime + " not found in Commits " + timeline;
@@ -165,7 +169,7 @@ public class CommitsCommand implements CommandMarker {
totalBytesWritten += stat.getTotalWriteBytes();
totalWriteErrors += stat.getTotalWriteErrors();
}
rows.add(new String[] {path, String.valueOf(totalFilesAdded),
rows.add(new String[]{path, String.valueOf(totalFilesAdded),
String.valueOf(totalFilesUpdated), String.valueOf(totalRecordsInserted),
String.valueOf(totalRecordsUpdated),
NumericUtils.humanReadableByteCount(totalBytesWritten),
@@ -173,7 +177,7 @@ public class CommitsCommand implements CommandMarker {
}
return HoodiePrintHelper.print(
new String[] {"Partition Path", "Total Files Added", "Total Files Updated",
new String[]{"Partition Path", "Total Files Added", "Total Files Updated",
"Total Records Inserted", "Total Records Updated", "Total Bytes Written",
"Total Errors"}, rows.toArray(new String[rows.size()][]));
}
@@ -183,8 +187,10 @@ public class CommitsCommand implements CommandMarker {
@CliOption(key = {"commit"}, help = "Commit to show")
final String commitTime) throws Exception {
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline().filterCompletedInstants();
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline()
.filterCompletedInstants();
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION,
commitTime);
if (!timeline.containsInstant(commitInstant)) {
return "Commit " + commitTime + " not found in Commits " + timeline;
@@ -197,14 +203,14 @@ public class CommitsCommand implements CommandMarker {
String path = entry.getKey();
List<HoodieWriteStat> stats = entry.getValue();
for (HoodieWriteStat stat : stats) {
rows.add(new String[] {path, stat.getFileId(), stat.getPrevCommit(),
rows.add(new String[]{path, stat.getFileId(), stat.getPrevCommit(),
String.valueOf(stat.getNumUpdateWrites()), String.valueOf(stat.getNumWrites()),
String.valueOf(stat.getTotalWriteBytes()),
String.valueOf(stat.getTotalWriteErrors())});
}
}
return HoodiePrintHelper.print(
new String[] {"Partition Path", "File ID", "Previous Commit", "Total Records Updated",
new String[]{"Partition Path", "File ID", "Previous Commit", "Total Records Updated",
"Total Records Written", "Total Bytes Written", "Total Errors"},
rows.toArray(new String[rows.size()][]));
}
@@ -219,16 +225,23 @@ public class CommitsCommand implements CommandMarker {
@CliOption(key = {"path"}, help = "Path of the dataset to compare to")
final String path) throws Exception {
HoodieTableMetaClient target = new HoodieTableMetaClient(HoodieCLI.fs, path);
HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants();;
HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsAndCompactionsTimeline()
.filterCompletedInstants();
;
HoodieTableMetaClient source = HoodieCLI.tableMetadata;
HoodieTimeline sourceTimeline = source.getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants();;
HoodieTimeline sourceTimeline = source.getActiveTimeline().getCommitsAndCompactionsTimeline()
.filterCompletedInstants();
;
String targetLatestCommit =
targetTimeline.getInstants().iterator().hasNext() ? "0" : targetTimeline.lastInstant().get().getTimestamp();
targetTimeline.getInstants().iterator().hasNext() ? "0"
: targetTimeline.lastInstant().get().getTimestamp();
String sourceLatestCommit =
sourceTimeline.getInstants().iterator().hasNext() ? "0" : sourceTimeline.lastInstant().get().getTimestamp();
sourceTimeline.getInstants().iterator().hasNext() ? "0"
: sourceTimeline.lastInstant().get().getTimestamp();
if (sourceLatestCommit != null &&
HoodieTimeline.compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) {
HoodieTimeline
.compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) {
// source is behind the target
List<String> commitsToCatchup =
targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE)

View File

@@ -18,15 +18,15 @@ package com.uber.hoodie.cli.commands;
import com.uber.hoodie.cli.HoodieCLI;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import java.io.IOException;
import org.springframework.shell.core.CommandMarker;
import org.springframework.shell.core.annotation.CliCommand;
import org.springframework.shell.core.annotation.CliOption;
import org.springframework.stereotype.Component;
import java.io.IOException;
@Component
public class DatasetsCommand implements CommandMarker {
@CliCommand(value = "connect", help = "Connect to a hoodie dataset")
public String connect(
@CliOption(key = {"path"}, mandatory = true, help = "Base Path of the dataset")

View File

@@ -68,7 +68,8 @@ public class HDFSParquetImportCommand implements CommandMarker {
boolean initialized = HoodieCLI.initConf();
HoodieCLI.initFS(initialized);
String sparkPropertiesPath = Utils
.getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
.getDefaultPropertiesFile(
scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
sparkLauncher.addAppArgs(SparkCommand.IMPORT.toString(), srcPath, targetPath, tableName,

View File

@@ -16,23 +16,23 @@
package com.uber.hoodie.cli.commands;
import com.uber.hoodie.cli.HoodieCLI;
import com.uber.hoodie.cli.utils.CommitUtil;
import com.uber.hoodie.cli.utils.HiveUtil;
import com.uber.hoodie.cli.HoodieCLI;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import java.util.List;
import java.util.stream.Collectors;
import org.springframework.shell.core.CommandMarker;
import org.springframework.shell.core.annotation.CliAvailabilityIndicator;
import org.springframework.shell.core.annotation.CliCommand;
import org.springframework.shell.core.annotation.CliOption;
import org.springframework.stereotype.Component;
import java.util.List;
import java.util.stream.Collectors;
@Component
public class HoodieSyncCommand implements CommandMarker {
@CliAvailabilityIndicator({"sync validate"})
public boolean isSyncVerificationAvailable() {
return HoodieCLI.tableMetadata != null && HoodieCLI.syncTableMetadata != null;
@@ -70,14 +70,18 @@ public class HoodieSyncCommand implements CommandMarker {
sourceCount = HiveUtil.countRecords(hiveServerUrl, source, srcDb, hiveUser, hivePass);
targetCount = HiveUtil.countRecords(hiveServerUrl, target, tgtDb, hiveUser, hivePass);
} else if ("latestPartitions".equals(mode)) {
sourceCount = HiveUtil.countRecords(hiveServerUrl, source, srcDb, partitionCount, hiveUser, hivePass);
targetCount = HiveUtil.countRecords(hiveServerUrl, target, tgtDb, partitionCount, hiveUser, hivePass);
sourceCount = HiveUtil
.countRecords(hiveServerUrl, source, srcDb, partitionCount, hiveUser, hivePass);
targetCount = HiveUtil
.countRecords(hiveServerUrl, target, tgtDb, partitionCount, hiveUser, hivePass);
}
String targetLatestCommit =
targetTimeline.getInstants().iterator().hasNext() ? "0" : targetTimeline.lastInstant().get().getTimestamp();
targetTimeline.getInstants().iterator().hasNext() ? "0"
: targetTimeline.lastInstant().get().getTimestamp();
String sourceLatestCommit =
sourceTimeline.getInstants().iterator().hasNext() ? "0" : sourceTimeline.lastInstant().get().getTimestamp();
sourceTimeline.getInstants().iterator().hasNext() ? "0"
: sourceTimeline.lastInstant().get().getTimestamp();
if (sourceLatestCommit != null && HoodieTimeline
.compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) {

View File

@@ -22,7 +22,8 @@ import com.uber.hoodie.cli.utils.InputStreamConsumer;
import com.uber.hoodie.cli.utils.SparkUtil;
import com.uber.hoodie.common.model.HoodiePartitionMetadata;
import com.uber.hoodie.common.util.FSUtils;
import java.io.IOException;
import java.util.List;
import org.apache.hadoop.fs.Path;
import org.apache.spark.launcher.SparkLauncher;
import org.springframework.shell.core.CommandMarker;
@@ -31,9 +32,6 @@ import org.springframework.shell.core.annotation.CliCommand;
import org.springframework.shell.core.annotation.CliOption;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.util.List;
@Component
public class RepairsCommand implements CommandMarker {
@@ -52,7 +50,8 @@ public class RepairsCommand implements CommandMarker {
@CliOption(key = {
"duplicatedPartitionPath"}, help = "Partition Path containing the duplicates", mandatory = true)
final String duplicatedPartitionPath,
@CliOption(key = {"repairedOutputPath"}, help = "Location to place the repaired files", mandatory = true)
@CliOption(key = {
"repairedOutputPath"}, help = "Location to place the repaired files", mandatory = true)
final String repairedOutputPath,
@CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path", mandatory = true)
final String sparkPropertiesPath) throws Exception {
@@ -71,7 +70,6 @@ public class RepairsCommand implements CommandMarker {
}
@CliCommand(value = "repair addpartitionmeta", help = "Add partition metadata to a dataset, if not present")
public String addPartitionMeta(
@CliOption(key = {"dryrun"},
@@ -79,17 +77,20 @@ public class RepairsCommand implements CommandMarker {
unspecifiedDefaultValue = "true")
final boolean dryRun) throws IOException {
String latestCommit = HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline().lastInstant().get().getTimestamp();
String latestCommit = HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline()
.lastInstant().get().getTimestamp();
List<String> partitionPaths = FSUtils.getAllFoldersThreeLevelsDown(HoodieCLI.fs,
HoodieCLI.tableMetadata.getBasePath());
Path basePath = new Path(HoodieCLI.tableMetadata.getBasePath());
String[][] rows = new String[partitionPaths.size() + 1][];
int ind = 0;
for (String partition: partitionPaths) {
for (String partition : partitionPaths) {
Path partitionPath = new Path(basePath, partition);
String[] row = new String[3];
row[0] = partition; row[1] = "Yes"; row[2] = "None";
row[0] = partition;
row[1] = "Yes";
row[2] = "None";
if (!HoodiePartitionMetadata.hasPartitionMetadata(HoodieCLI.fs, partitionPath)) {
row[1] = "No";
if (!dryRun) {
@@ -105,6 +106,6 @@ public class RepairsCommand implements CommandMarker {
}
return HoodiePrintHelper.print(
new String[] {"Partition Path", "Metadata Present?", "Action"}, rows);
new String[]{"Partition Path", "Metadata Present?", "Action"}, rows);
}
}

View File

@@ -27,6 +27,10 @@ import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.config.HoodieIndexConfig;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.index.HoodieIndex;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.launcher.SparkLauncher;
import org.springframework.shell.core.CommandMarker;
@@ -35,13 +39,9 @@ import org.springframework.shell.core.annotation.CliCommand;
import org.springframework.shell.core.annotation.CliOption;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
@Component
public class SavepointsCommand implements CommandMarker {
@CliAvailabilityIndicator({"savepoints show"})
public boolean isShowAvailable() {
return HoodieCLI.tableMetadata != null;
@@ -60,7 +60,8 @@ public class SavepointsCommand implements CommandMarker {
@CliAvailabilityIndicator({"savepoint rollback"})
public boolean isRollbackToSavepointAvailable() {
return HoodieCLI.tableMetadata != null && !HoodieCLI.tableMetadata.getActiveTimeline().getSavePointTimeline().filterCompletedInstants().empty();
return HoodieCLI.tableMetadata != null && !HoodieCLI.tableMetadata.getActiveTimeline()
.getSavePointTimeline().filterCompletedInstants().empty();
}
@CliCommand(value = "savepoints show", help = "Show the savepoints")
@@ -72,9 +73,9 @@ public class SavepointsCommand implements CommandMarker {
Collections.reverse(commits);
for (int i = 0; i < commits.size(); i++) {
HoodieInstant commit = commits.get(i);
rows[i] = new String[] {commit.getTimestamp()};
rows[i] = new String[]{commit.getTimestamp()};
}
return HoodiePrintHelper.print(new String[] {"SavepointTime"}, rows);
return HoodiePrintHelper.print(new String[]{"SavepointTime"}, rows);
}
@CliCommand(value = "savepoint create", help = "Savepoint a commit")
@@ -152,5 +153,4 @@ public class SavepointsCommand implements CommandMarker {
}
}

View File

@@ -52,7 +52,7 @@ public class SparkMain {
JavaSparkContext jsc = SparkUtil.initJavaSparkConf("hoodie-cli-" + command);
int returnCode = 0;
switch(cmd) {
switch (cmd) {
case ROLLBACK:
assert (args.length == 3);
returnCode = rollback(jsc, args[1], args[2]);
@@ -98,7 +98,7 @@ public class SparkMain {
String basePath)
throws Exception {
DedupeSparkJob job = new DedupeSparkJob(basePath,
duplicatedPartitionPath,repairedOutputPath,new SQLContext(jsc), FSUtils.getFs());
duplicatedPartitionPath, repairedOutputPath, new SQLContext(jsc), FSUtils.getFs());
job.fixDuplicates(true);
return 0;
}
@@ -115,7 +115,8 @@ public class SparkMain {
}
}
private static int rollbackToSavepoint(JavaSparkContext jsc, String savepointTime, String basePath)
private static int rollbackToSavepoint(JavaSparkContext jsc, String savepointTime,
String basePath)
throws Exception {
HoodieWriteClient client = createHoodieClient(jsc, basePath);
if (client.rollbackToSavepoint(savepointTime)) {

View File

@@ -28,7 +28,10 @@ import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.NumericUtils;
import java.io.IOException;
import java.text.DecimalFormat;
import java.util.HashMap;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
@@ -38,13 +41,9 @@ import org.springframework.shell.core.annotation.CliCommand;
import org.springframework.shell.core.annotation.CliOption;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.text.DecimalFormat;
import java.util.HashMap;
import java.util.stream.Collectors;
@Component
public class StatsCommand implements CommandMarker {
@CliAvailabilityIndicator({"stats wa"})
public boolean isWriteAmpAvailable() {
return HoodieCLI.tableMetadata != null;
@@ -64,13 +63,14 @@ public class StatsCommand implements CommandMarker {
for (HoodieInstant commitTime : timeline.getInstants().collect(
Collectors.toList())) {
String waf = "0";
HoodieCommitMetadata commit = HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(commitTime).get());
HoodieCommitMetadata commit = HoodieCommitMetadata
.fromBytes(activeTimeline.getInstantDetails(commitTime).get());
if (commit.fetchTotalUpdateRecordsWritten() > 0) {
waf = df.format(
(float) commit.fetchTotalRecordsWritten() / commit
.fetchTotalUpdateRecordsWritten());
}
rows[i++] = new String[] {commitTime.getTimestamp(),
rows[i++] = new String[]{commitTime.getTimestamp(),
String.valueOf(commit.fetchTotalUpdateRecordsWritten()),
String.valueOf(commit.fetchTotalRecordsWritten()), waf};
totalRecordsUpserted += commit.fetchTotalUpdateRecordsWritten();
@@ -80,10 +80,10 @@ public class StatsCommand implements CommandMarker {
if (totalRecordsUpserted > 0) {
waf = df.format((float) totalRecordsWritten / totalRecordsUpserted);
}
rows[i] = new String[] {"Total", String.valueOf(totalRecordsUpserted),
rows[i] = new String[]{"Total", String.valueOf(totalRecordsUpserted),
String.valueOf(totalRecordsWritten), waf};
return HoodiePrintHelper.print(
new String[] {"CommitTime", "Total Upserted", "Total Written",
new String[]{"CommitTime", "Total Upserted", "Total Written",
"Write Amplifiation Factor"}, rows);
}
@@ -105,7 +105,8 @@ public class StatsCommand implements CommandMarker {
@CliCommand(value = "stats filesizes", help = "File Sizes. Display summary stats on sizes of files")
public String fileSizeStats(
@CliOption(key = {"partitionPath"}, help = "regex to select files, eg: 2016/08/02", unspecifiedDefaultValue = "*/*/*")
@CliOption(key = {
"partitionPath"}, help = "regex to select files, eg: 2016/08/02", unspecifiedDefaultValue = "*/*/*")
final String globRegex) throws IOException {
FileSystem fs = HoodieCLI.fs;
@@ -118,7 +119,7 @@ public class StatsCommand implements CommandMarker {
final int MAX_FILES = 1000000;
Histogram globalHistogram = new Histogram(new UniformReservoir(MAX_FILES));
HashMap<String, Histogram> commitHistoMap = new HashMap<String, Histogram>();
for (FileStatus fileStatus: statuses) {
for (FileStatus fileStatus : statuses) {
String commitTime = FSUtils.getCommitTime(fileStatus.getPath().getName());
long sz = fileStatus.getLen();
if (!commitHistoMap.containsKey(commitTime)) {
@@ -130,7 +131,7 @@ public class StatsCommand implements CommandMarker {
String[][] rows = new String[commitHistoMap.size() + 1][];
int ind = 0;
for (String commitTime: commitHistoMap.keySet()) {
for (String commitTime : commitHistoMap.keySet()) {
Snapshot s = commitHistoMap.get(commitTime).getSnapshot();
rows[ind++] = printFileSizeHistogram(commitTime, s);
}
@@ -138,6 +139,7 @@ public class StatsCommand implements CommandMarker {
rows[ind++] = printFileSizeHistogram("ALL", s);
return HoodiePrintHelper.print(
new String[] {"CommitTime", "Min", "10th", "50th", "avg", "95th", "Max", "NumFiles", "StdDev"}, rows);
new String[]{"CommitTime", "Min", "10th", "50th", "avg", "95th", "Max", "NumFiles",
"StdDev"}, rows);
}
}

View File

@@ -23,9 +23,10 @@ import org.springframework.stereotype.Component;
@Component
public class UtilsCommand implements CommandMarker {
@CliCommand(value = "utils loadClass", help = "Load a class" )
@CliCommand(value = "utils loadClass", help = "Load a class")
public String loadClass(
@CliOption(key = {"class"}, help = "Check mode" ) final String clazz
@CliOption(key = {"class"}, help = "Check mode") final String clazz
) throws Exception {
Class klass = Class.forName(clazz);
return klass.getProtectionDomain().getCodeSource().getLocation().toExternalForm();

View File

@@ -20,16 +20,17 @@ import com.uber.hoodie.common.model.HoodieCommitMetadata;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import java.io.IOException;
import java.util.List;
public class CommitUtil {
public static long countNewRecords(HoodieTableMetaClient target, List<String> commitsToCatchup)
throws IOException {
long totalNew = 0;
HoodieTimeline timeline = target.getActiveTimeline().reload().getCommitTimeline().filterCompletedInstants();
for(String commit:commitsToCatchup) {
HoodieTimeline timeline = target.getActiveTimeline().reload().getCommitTimeline()
.filterCompletedInstants();
for (String commit : commitsToCatchup) {
HoodieCommitMetadata c = HoodieCommitMetadata.fromBytes(timeline
.getInstantDetails(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commit))
.get());

View File

@@ -17,16 +17,16 @@
package com.uber.hoodie.cli.utils;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import org.apache.commons.dbcp.BasicDataSource;
import org.joda.time.DateTime;
import javax.sql.DataSource;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import javax.sql.DataSource;
import org.apache.commons.dbcp.BasicDataSource;
import org.joda.time.DateTime;
public class HiveUtil {
private static String driverName = "org.apache.hive.jdbc.HiveDriver";
static {
@@ -39,7 +39,8 @@ public class HiveUtil {
private static Connection connection;
private static Connection getConnection(String jdbcUrl, String user, String pass) throws SQLException {
private static Connection getConnection(String jdbcUrl, String user, String pass)
throws SQLException {
DataSource ds = getDatasource(jdbcUrl, user, pass);
return ds.getConnection();
}
@@ -53,22 +54,25 @@ public class HiveUtil {
return ds;
}
public static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String dbName, String user, String pass) throws SQLException {
public static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String dbName,
String user, String pass) throws SQLException {
Connection conn = HiveUtil.getConnection(jdbcUrl, user, pass);
ResultSet rs = null;
Statement stmt = conn.createStatement();
try {
//stmt.execute("set mapred.job.queue.name=<queue_name>");
stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat" );
stmt.execute("set hive.stats.autogather=false" );
stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat");
stmt.execute("set hive.stats.autogather=false");
rs = stmt.executeQuery(
"select count(`_hoodie_commit_time`) as cnt from " + dbName + "." + source.getTableConfig()
"select count(`_hoodie_commit_time`) as cnt from " + dbName + "." + source
.getTableConfig()
.getTableName());
long count = -1;
if(rs.next()) {
if (rs.next()) {
count = rs.getLong("cnt");
}
System.out.println("Total records in " + source.getTableConfig().getTableName() + " is " + count);
System.out
.println("Total records in " + source.getTableConfig().getTableName() + " is " + count);
return count;
} finally {
if (rs != null) {
@@ -94,7 +98,8 @@ public class HiveUtil {
return countRecords(jdbcUrl, source, srcDb, startDateStr, endDateStr, user, pass);
}
private static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String srcDb, String startDateStr,
private static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String srcDb,
String startDateStr,
String endDateStr, String user, String pass) throws SQLException {
Connection conn = HiveUtil.getConnection(jdbcUrl, user, pass);
ResultSet rs = null;
@@ -107,7 +112,7 @@ public class HiveUtil {
"select count(`_hoodie_commit_time`) as cnt from " + srcDb + "." + source.getTableConfig()
.getTableName() + " where datestr>'" + startDateStr + "' and datestr<='"
+ endDateStr + "'");
if(rs.next()) {
if (rs.next()) {
return rs.getLong("cnt");
}
return -1;

View File

@@ -23,8 +23,10 @@ import java.io.InputStreamReader;
import java.util.logging.Logger;
public class InputStreamConsumer extends Thread {
protected final static Logger LOG = Logger.getLogger(InputStreamConsumer.class.getName());
private InputStream is;
public InputStreamConsumer(InputStream is) {
this.is = is;
}
@@ -35,8 +37,9 @@ public class InputStreamConsumer extends Thread {
InputStreamReader isr = new InputStreamReader(is);
BufferedReader br = new BufferedReader(isr);
String line;
while ( (line = br.readLine()) != null)
while ((line = br.readLine()) != null) {
LOG.info(line);
}
} catch (IOException ioe) {
LOG.severe(ioe.toString());
ioe.printStackTrace();

View File

@@ -18,26 +18,20 @@ package com.uber.hoodie.cli.utils;
import com.uber.hoodie.HoodieWriteClient;
import com.uber.hoodie.cli.commands.SparkMain;
import java.io.File;
import java.net.URISyntaxException;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.launcher.SparkLauncher;
import java.io.File;
import java.net.URISyntaxException;
public class SparkUtil {
public static Logger logger = Logger.getLogger(SparkUtil.class);
public static final String DEFUALT_SPARK_MASTER = "yarn-client";
/**
*
* TODO: Need to fix a bunch of hardcoded stuff here eg: history server, spark distro
*
* @return
* @throws URISyntaxException
*/
public static SparkLauncher initLauncher(String propertiesFile) throws URISyntaxException {
String currentJar = new File(
@@ -65,7 +59,8 @@ public class SparkUtil {
// Configure hadoop conf
sparkConf.set("spark.hadoop.mapred.output.compress", "true");
sparkConf.set("spark.hadoop.mapred.output.compression.codec", "true");
sparkConf.set("spark.hadoop.mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
sparkConf.set("spark.hadoop.mapred.output.compression.codec",
"org.apache.hadoop.io.compress.GzipCodec");
sparkConf.set("spark.hadoop.mapred.output.compression.type", "BLOCK");
sparkConf = HoodieWriteClient.registerClasses(sparkConf);

View File

@@ -21,6 +21,6 @@
xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans.xsd
http://www.springframework.org/schema/context http://www.springframework.org/schema/context/spring-context.xsd">
<context:component-scan base-package="com.uber.hoodie.cli" />
<context:component-scan base-package="com.uber.hoodie.cli"/>
</beans>

View File

@@ -34,7 +34,7 @@ import scala.collection.mutable._
/**
* Spark job to de-duplicate data present in a partition path
*/
class DedupeSparkJob (basePath: String,
class DedupeSparkJob(basePath: String,
duplicatedPartitionPath: String,
repairOutputPath: String,
sqlContext: SQLContext,
@@ -50,8 +50,9 @@ class DedupeSparkJob (basePath: String,
* @param tblName
* @return
*/
def getDupeKeyDF(tblName: String) : DataFrame = {
val dupeSql = s"""
def getDupeKeyDF(tblName: String): DataFrame = {
val dupeSql =
s"""
select `${HoodieRecord.RECORD_KEY_METADATA_FIELD}` as dupe_key,
count(*) as dupe_cnt
from ${tblName}
@@ -69,7 +70,7 @@ class DedupeSparkJob (basePath: String,
*
* @return
*/
private def planDuplicateFix() : HashMap[String, HashSet[String]] = {
private def planDuplicateFix(): HashMap[String, HashSet[String]] = {
val tmpTableName = s"htbl_${System.currentTimeMillis()}"
val dedupeTblName = s"${tmpTableName}_dupeKeys"
@@ -78,17 +79,18 @@ class DedupeSparkJob (basePath: String,
val allFiles = fs.listStatus(new org.apache.hadoop.fs.Path(s"${basePath}/${duplicatedPartitionPath}"))
val fsView = new HoodieTableFileSystemView(metadata, metadata.getActiveTimeline.getCommitTimeline.filterCompletedInstants(), allFiles)
val latestFiles:java.util.List[HoodieDataFile] = fsView.getLatestDataFiles().collect(Collectors.toList[HoodieDataFile]())
val latestFiles: java.util.List[HoodieDataFile] = fsView.getLatestDataFiles().collect(Collectors.toList[HoodieDataFile]())
val filteredStatuses = latestFiles.map(f => f.getPath)
LOG.info(s" List of files under partition: ${} => ${filteredStatuses.mkString(" ")}")
val df = sqlContext.parquetFile(filteredStatuses:_*)
val df = sqlContext.parquetFile(filteredStatuses: _*)
df.registerTempTable(tmpTableName)
val dupeKeyDF = getDupeKeyDF(tmpTableName)
dupeKeyDF.registerTempTable(dedupeTblName)
// Obtain necessary satellite information for duplicate rows
val dupeDataSql = s"""
val dupeDataSql =
s"""
SELECT `_hoodie_record_key`, `_hoodie_partition_path`, `_hoodie_file_name`, `_hoodie_commit_time`
FROM ${tmpTableName} h
JOIN ${dedupeTblName} d
@@ -111,9 +113,9 @@ class DedupeSparkJob (basePath: String,
rows.foreach(r => {
val c = r(3).asInstanceOf[String].toLong
if (c != maxCommit){
if (c != maxCommit) {
val f = r(2).asInstanceOf[String].split("_")(0)
if (!fileToDeleteKeyMap.contains(f)){
if (!fileToDeleteKeyMap.contains(f)) {
fileToDeleteKeyMap(f) = HashSet[String]()
}
fileToDeleteKeyMap(f).add(key)
@@ -130,28 +132,30 @@ class DedupeSparkJob (basePath: String,
val allFiles = fs.listStatus(new Path(s"${basePath}/${duplicatedPartitionPath}"))
val fsView = new HoodieTableFileSystemView(metadata, metadata.getActiveTimeline.getCommitTimeline.filterCompletedInstants(), allFiles)
val latestFiles:java.util.List[HoodieDataFile] = fsView.getLatestDataFiles().collect(Collectors.toList[HoodieDataFile]())
val latestFiles: java.util.List[HoodieDataFile] = fsView.getLatestDataFiles().collect(Collectors.toList[HoodieDataFile]())
val fileNameToPathMap = latestFiles.map(f => (f.getFileId, new Path(f.getPath))).toMap
val dupeFixPlan = planDuplicateFix()
// 1. Copy all latest files into the temp fix path
fileNameToPathMap.foreach{ case(fileName, filePath) => {
fileNameToPathMap.foreach { case (fileName, filePath) => {
val badSuffix = if (dupeFixPlan.contains(fileName)) ".bad" else ""
val dstPath = new Path(s"${repairOutputPath}/${filePath.getName}${badSuffix}")
LOG.info(s"Copying from ${filePath} to ${dstPath}")
FileUtil.copy(fs, filePath, fs, dstPath, false, true, fs.getConf)
}}
}
}
// 2. Remove duplicates from the bad files
dupeFixPlan.foreach{case(fileName, keysToSkip) => {
dupeFixPlan.foreach { case (fileName, keysToSkip) => {
val commitTime = FSUtils.getCommitTime(fileNameToPathMap(fileName).getName)
val badFilePath = new Path(s"${repairOutputPath}/${fileNameToPathMap(fileName).getName}.bad")
val newFilePath = new Path(s"${repairOutputPath}/${fileNameToPathMap(fileName).getName}")
LOG.info(" Skipping and writing new file for : " + fileName)
SparkHelpers.skipKeysAndWriteNewFile(commitTime, fs, badFilePath, newFilePath, dupeFixPlan(fileName))
fs.delete(badFilePath, false)
}}
}
}
// 3. Check that there are no duplicates anymore.
val df = sqlContext.read.parquet(s"${repairOutputPath}/*.parquet")
@@ -186,6 +190,7 @@ class DedupeSparkJob (basePath: String,
LOG.info(s"[FOR REAL!!!] Copying from ${srcPath} to ${dstPath}")
FileUtil.copy(fs, srcPath, fs, dstPath, false, true, fs.getConf)
}
}}
}
}
}
}

View File

@@ -17,9 +17,9 @@
package com.uber.hoodie.cli
import com.uber.hoodie.avro.HoodieAvroWriteSupport
import com.uber.hoodie.common.{BloomFilter, HoodieJsonPayload}
import com.uber.hoodie.common.model.HoodieRecord
import com.uber.hoodie.common.util.ParquetUtils
import com.uber.hoodie.common.{BloomFilter, HoodieJsonPayload}
import com.uber.hoodie.config.{HoodieIndexConfig, HoodieStorageConfig}
import com.uber.hoodie.io.storage.{HoodieParquetConfig, HoodieParquetWriter}
import org.apache.avro.Schema
@@ -107,7 +107,7 @@ class SparkHelper(sqlContext: SQLContext, fs: FileSystem) {
* @param file
* @param sqlContext
*/
def getKeyCount(file: String, sqlContext: org.apache.spark.sql.SQLContext) ={
def getKeyCount(file: String, sqlContext: org.apache.spark.sql.SQLContext) = {
println(getRowKeyDF(file).collect().size)
}
@@ -122,7 +122,7 @@ class SparkHelper(sqlContext: SQLContext, fs: FileSystem) {
* @param file
* @return
*/
def fileKeysAgainstBF(conf: Configuration, sqlContext: SQLContext, file: String) : Boolean = {
def fileKeysAgainstBF(conf: Configuration, sqlContext: SQLContext, file: String): Boolean = {
val bfStr = SparkHelpers.getBloomFilter(file, conf)
val bf = new com.uber.hoodie.common.BloomFilter(bfStr)
val foundCount = sqlContext.parquetFile(file)
@@ -134,7 +134,7 @@ class SparkHelper(sqlContext: SQLContext, fs: FileSystem) {
totalCount == foundCount
}
def getDistinctKeyDF(paths: List[String]) : DataFrame = {
sqlContext.read.parquet(paths:_*).select(s"`${HoodieRecord.RECORD_KEY_METADATA_FIELD}`").distinct()
def getDistinctKeyDF(paths: List[String]): DataFrame = {
sqlContext.read.parquet(paths: _*).select(s"`${HoodieRecord.RECORD_KEY_METADATA_FIELD}`").distinct()
}
}

View File

@@ -15,7 +15,9 @@
~ limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>hoodie</artifactId>
<groupId>com.uber.hoodie</groupId>

View File

@@ -17,25 +17,19 @@
package com.uber.hoodie;
import com.google.common.base.Optional;
import com.uber.hoodie.common.model.HoodieCommitMetadata;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.TableFileSystemView;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.table.view.HoodieTableFileSystemView;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieException;
import com.uber.hoodie.index.bloom.HoodieBloomIndex;
import com.uber.hoodie.table.HoodieTable;
import java.io.Serializable;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
@@ -46,21 +40,10 @@ import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.StructType;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import scala.Tuple2;
/**
* Provides an RDD based API for accessing/filtering Hoodie tables, based on keys.
*
*/
public class HoodieReadClient implements Serializable {
@@ -70,8 +53,8 @@ public class HoodieReadClient implements Serializable {
private transient final FileSystem fs;
/**
* TODO: We need to persist the index type into hoodie.properties and be able to access the
* index just with a simple basepath pointing to the dataset. Until, then just always assume a
* TODO: We need to persist the index type into hoodie.properties and be able to access the index
* just with a simple basepath pointing to the dataset. Until, then just always assume a
* BloomIndex
*/
private transient final HoodieBloomIndex index;
@@ -117,7 +100,8 @@ public class HoodieReadClient implements Serializable {
private void assertSqlContext() {
if (!sqlContextOpt.isPresent()) {
throw new IllegalStateException("SQLContext must be set, when performing dataframe operations");
throw new IllegalStateException(
"SQLContext must be set, when performing dataframe operations");
}
}
@@ -158,10 +142,10 @@ public class HoodieReadClient implements Serializable {
}
/**
* Checks if the given [Keys] exists in the hoodie table and returns [Key,
* Optional[FullFilePath]] If the optional FullFilePath value is not present, then the key is
* not found. If the FullFilePath value is present, it is the path component (without scheme) of
* the URI underlying file
* Checks if the given [Keys] exists in the hoodie table and returns [Key, Optional[FullFilePath]]
* If the optional FullFilePath value is not present, then the key is not found. If the
* FullFilePath value is present, it is the path component (without scheme) of the URI underlying
* file
*/
public JavaPairRDD<HoodieKey, Optional<String>> checkExists(JavaRDD<HoodieKey> hoodieKeys) {
return index.fetchRecordLocation(hoodieKeys, hoodieTable);

View File

@@ -50,10 +50,21 @@ import com.uber.hoodie.func.BulkInsertMapFunction;
import com.uber.hoodie.index.HoodieIndex;
import com.uber.hoodie.io.HoodieCommitArchiveLog;
import com.uber.hoodie.metrics.HoodieMetrics;
import com.uber.hoodie.table.UserDefinedBulkInsertPartitioner;
import com.uber.hoodie.table.HoodieTable;
import com.uber.hoodie.table.UserDefinedBulkInsertPartitioner;
import com.uber.hoodie.table.WorkloadProfile;
import com.uber.hoodie.table.WorkloadStat;
import java.io.IOException;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
@@ -66,25 +77,12 @@ import org.apache.spark.storage.StorageLevel;
import scala.Option;
import scala.Tuple2;
import java.io.IOException;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
/**
* Hoodie Write Client helps you build datasets on HDFS [insert()] and then
* perform efficient mutations on a HDFS dataset [upsert()]
*
* Note that, at any given time, there can only be one Spark job performing
* these operatons on a Hoodie dataset.
* Hoodie Write Client helps you build datasets on HDFS [insert()] and then perform efficient
* mutations on a HDFS dataset [upsert()]
*
* Note that, at any given time, there can only be one Spark job performing these operatons on a
* Hoodie dataset.
*/
public class HoodieWriteClient<T extends HoodieRecordPayload> implements Serializable {
@@ -102,7 +100,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
* @param clientConfig
* @throws Exception
*/
public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig) throws Exception {
public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig)
throws Exception {
this(jsc, clientConfig, false);
}
@@ -111,7 +110,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
* @param clientConfig
* @param rollbackInFlight
*/
public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig, boolean rollbackInFlight) {
public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig,
boolean rollbackInFlight) {
this.fs = FSUtils.getFs();
this.jsc = jsc;
this.config = clientConfig;
@@ -170,8 +170,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
* Inserts the given HoodieRecords, into the table. This API is intended to be used for normal
* writes.
*
* This implementation skips the index check and is able to leverage benefits such as
* small file handling/blocking alignment, as with upsert(), by profiling the workload
* This implementation skips the index check and is able to leverage benefits such as small file
* handling/blocking alignment, as with upsert(), by profiling the workload
*
* @param records HoodieRecords to insert
* @param commitTime Commit Time handle
@@ -210,7 +210,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
* @param commitTime Commit Time handle
* @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts
*/
public JavaRDD<WriteStatus> bulkInsert(JavaRDD<HoodieRecord<T>> records, final String commitTime) {
public JavaRDD<WriteStatus> bulkInsert(JavaRDD<HoodieRecord<T>> records,
final String commitTime) {
return bulkInsert(records, commitTime, Option.empty());
}
@@ -221,16 +222,18 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
*
* This implementation uses sortBy (which does range partitioning based on reservoir sampling) and
* attempts to control the numbers of files with less memory compared to the {@link
* HoodieWriteClient#insert(JavaRDD, String)}. Optionally it allows users to specify their own partitioner. If
* specified then it will be used for repartitioning records. See {@link UserDefinedBulkInsertPartitioner}.
* HoodieWriteClient#insert(JavaRDD, String)}. Optionally it allows users to specify their own
* partitioner. If specified then it will be used for repartitioning records. See {@link
* UserDefinedBulkInsertPartitioner}.
*
* @param records HoodieRecords to insert
* @param commitTime Commit Time handle
* @param bulkInsertPartitioner If specified then it will be used to partition input records before they are
* inserted into hoodie.
* @param bulkInsertPartitioner If specified then it will be used to partition input records
* before they are inserted into hoodie.
* @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts
*/
public JavaRDD<WriteStatus> bulkInsert(JavaRDD<HoodieRecord<T>> records, final String commitTime,
public JavaRDD<WriteStatus> bulkInsert(JavaRDD<HoodieRecord<T>> records,
final String commitTime,
Option<UserDefinedBulkInsertPartitioner> bulkInsertPartitioner) {
writeContext = metrics.getCommitCtx();
// Create a Hoodie table which encapsulated the commits and files visible
@@ -240,7 +243,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
try {
// De-dupe/merge if needed
JavaRDD<HoodieRecord<T>> dedupedRecords =
combineOnCondition(config.shouldCombineBeforeInsert(), records, config.getInsertShuffleParallelism());
combineOnCondition(config.shouldCombineBeforeInsert(), records,
config.getInsertShuffleParallelism());
final JavaRDD<HoodieRecord<T>> repartitionedRecords;
if (bulkInsertPartitioner.isDefined()) {
@@ -259,7 +263,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
}, true, config.getBulkInsertShuffleParallelism());
}
JavaRDD<WriteStatus> writeStatusRDD = repartitionedRecords
.mapPartitionsWithIndex(new BulkInsertMapFunction<T>(commitTime, config, table), true)
.mapPartitionsWithIndex(new BulkInsertMapFunction<T>(commitTime, config, table),
true)
.flatMap(writeStatuses -> writeStatuses.iterator());
return updateIndexAndCommitIfNeeded(writeStatusRDD, table, commitTime);
@@ -267,12 +272,13 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
if (e instanceof HoodieInsertException) {
throw e;
}
throw new HoodieInsertException("Failed to bulk insert for commit time " + commitTime, e);
throw new HoodieInsertException("Failed to bulk insert for commit time " + commitTime,
e);
}
}
private void commitOnAutoCommit(String commitTime, JavaRDD<WriteStatus> resultRDD) {
if(config.shouldAutoCommit()) {
if (config.shouldAutoCommit()) {
logger.info("Auto commit enabled: Committing " + commitTime);
boolean commitResult = commit(commitTime, resultRDD);
if (!commitResult) {
@@ -286,24 +292,22 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
private JavaRDD<HoodieRecord<T>> combineOnCondition(boolean condition,
JavaRDD<HoodieRecord<T>> records,
int parallelism) {
if(condition) {
if (condition) {
return deduplicateRecords(records, parallelism);
}
return records;
}
/**
*
* Save the workload profile in an intermediate file (here re-using commit files)
* This is useful when performing rollback for MOR datasets. Only updates are recorded
* in the workload profile metadata since updates to log blocks are unknown across batches
* Inserts (which are new parquet files) are rolled back based on commit time.
* // TODO : Create a new WorkloadProfile metadata file instead of using HoodieCommitMetadata
* @param profile
* @param commitTime
* @throws HoodieCommitException
* Save the workload profile in an intermediate file (here re-using commit files) This is useful
* when performing rollback for MOR datasets. Only updates are recorded in the workload profile
* metadata since updates to log blocks are unknown across batches Inserts (which are new parquet
* files) are rolled back based on commit time. // TODO : Create a new WorkloadProfile metadata
* file instead of using HoodieCommitMetadata
*/
private void saveWorkloadProfileMetadataToInflight(WorkloadProfile profile, HoodieTable<T> table, String commitTime) throws HoodieCommitException {
private void saveWorkloadProfileMetadataToInflight(WorkloadProfile profile,
HoodieTable<T> table,
String commitTime) throws HoodieCommitException {
try {
HoodieCommitMetadata metadata = new HoodieCommitMetadata();
profile.getPartitionPaths().stream().forEach(path -> {
@@ -320,8 +324,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
Optional<HoodieInstant> instant = activeTimeline.filterInflights().lastInstant();
activeTimeline.saveToInflight(instant.get(),
Optional.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
} catch(IOException io) {
throw new HoodieCommitException("Failed to commit " + commitTime + " unable to save inflight metadata ", io);
} catch (IOException io) {
throw new HoodieCommitException(
"Failed to commit " + commitTime + " unable to save inflight metadata ", io);
}
}
@@ -358,7 +363,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
return updateIndexAndCommitIfNeeded(writeStatusRDD, hoodieTable, commitTime);
}
private Partitioner getPartitioner(HoodieTable table, boolean isUpsert, WorkloadProfile profile) {
private Partitioner getPartitioner(HoodieTable table, boolean isUpsert,
WorkloadProfile profile) {
if (isUpsert) {
return table.getUpsertPartitioner(profile);
} else {
@@ -366,7 +372,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
}
}
private JavaRDD<WriteStatus> updateIndexAndCommitIfNeeded(JavaRDD<WriteStatus> writeStatusRDD, HoodieTable<T> table, String commitTime) {
private JavaRDD<WriteStatus> updateIndexAndCommitIfNeeded(JavaRDD<WriteStatus> writeStatusRDD,
HoodieTable<T> table, String commitTime) {
// Update the index back
JavaRDD<WriteStatus> statuses = index.updateLocation(writeStatusRDD, table);
// Trigger the insert and collect statuses
@@ -375,10 +382,13 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
return statuses;
}
private JavaRDD<HoodieRecord<T>> partition(JavaRDD<HoodieRecord<T>> dedupedRecords, Partitioner partitioner) {
private JavaRDD<HoodieRecord<T>> partition(JavaRDD<HoodieRecord<T>> dedupedRecords,
Partitioner partitioner) {
return dedupedRecords
.mapToPair(record ->
new Tuple2<>(new Tuple2<>(record.getKey(), Option.apply(record.getCurrentLocation())), record))
new Tuple2<>(
new Tuple2<>(record.getKey(), Option.apply(record.getCurrentLocation())),
record))
.partitionBy(partitioner)
.map(tuple -> tuple._2());
}
@@ -438,7 +448,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
// We cannot have unbounded commit files. Archive commits if we have to archive
archiveLog.archiveIfRequired();
if(config.isAutoClean()) {
if (config.isAutoClean()) {
// Call clean to cleanup if there is anything to cleanup after the commit,
logger.info("Auto cleaning is enabled. Running cleaner now");
clean(commitTime);
@@ -465,12 +475,12 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
}
/**
* Savepoint a specific commit. Latest version of data files as of the passed in commitTime
* will be referenced in the savepoint and will never be cleaned. The savepointed commit
* will never be rolledback or archived.
* Savepoint a specific commit. Latest version of data files as of the passed in commitTime will
* be referenced in the savepoint and will never be cleaned. The savepointed commit will never be
* rolledback or archived.
*
* This gives an option to rollback the state to the savepoint anytime.
* Savepoint needs to be manually created and deleted.
* This gives an option to rollback the state to the savepoint anytime. Savepoint needs to be
* manually created and deleted.
*
* Savepoint should be on a commit that could not have been cleaned.
*
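As a usage note, the savepoint lifecycle described above might be driven as in the sketch below. This is hypothetical caller code: the method names and the (commitTime, user, comment) overload are inferred from the javadoc headings in this file, not verified.

// Sketch of the savepoint lifecycle (names and signatures are assumptions).
String commitTime = "20171112225456";                               // an existing, uncleaned commit
writeClient.savepoint(commitTime, "ops", "before risky backfill");  // assumed (commitTime, user, comment) overload
// ... later, if subsequent commits need to be undone:
boolean ok = writeClient.rollbackToSavepoint(commitTime);
// once the savepoint is no longer needed, release it so the cleaner/archiver can proceed:
writeClient.deleteSavepoint(commitTime);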
@@ -491,12 +501,12 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
}
/**
* Savepoint a specific commit. Latest version of data files as of the passed in commitTime
* will be referenced in the savepoint and will never be cleaned. The savepointed commit
* will never be rolledback or archived.
* Savepoint a specific commit. Latest version of data files as of the passed in commitTime will
* be referenced in the savepoint and will never be cleaned. The savepointed commit will never be
* rolledback or archived.
*
* This gives an option to rollback the state to the savepoint anytime.
* Savepoint needs to be manually created and deleted.
* This gives an option to rollback the state to the savepoint anytime. Savepoint needs to be
* manually created and deleted.
*
* Savepoint should be on a commit that could not have been cleaned.
*
@@ -510,9 +520,11 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
Optional<HoodieInstant> cleanInstant = table.getCompletedCleanTimeline().lastInstant();
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
if(!table.getCompletedCommitTimeline().containsInstant(commitInstant)) {
throw new HoodieSavepointException("Could not savepoint non-existing commit " + commitInstant);
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION,
commitTime);
if (!table.getCompletedCommitTimeline().containsInstant(commitInstant)) {
throw new HoodieSavepointException(
"Could not savepoint non-existing commit " + commitInstant);
}
try {
@@ -534,7 +546,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
+ lastCommitRetained);
Map<String, List<String>> latestFilesMap = jsc.parallelize(
FSUtils.getAllPartitionPaths(fs, table.getMetaClient().getBasePath(), config.shouldAssumeDatePartitioning()))
FSUtils.getAllPartitionPaths(fs, table.getMetaClient().getBasePath(),
config.shouldAssumeDatePartitioning()))
.mapToPair((PairFunction<String, String, List<String>>) partitionPath -> {
// Scan all partitions files with this commit time
logger.info("Collecting latest files in partition path " + partitionPath);
@@ -559,8 +572,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
}
/**
* Delete a savepoint that was created. Once the savepoint is deleted, the commit can be rolledback
* and cleaner may clean up data files.
* Delete a savepoint that was created. Once the savepoint is deleted, the commit can be
* rolled back and the cleaner may clean up data files.
*
* @param savepointTime - delete the savepoint
* @return true if the savepoint was deleted successfully
@@ -586,9 +599,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
}
/**
* Rollback the state to the savepoint.
* WARNING: This rollsback recent commits and deleted data files. Queries accessing the files
* will mostly fail. This should be done during a downtime.
* Rollback the state to the savepoint. WARNING: This rolls back recent commits and deletes data
* files. Queries accessing the files will mostly fail. This should be done during a downtime.
*
* @param savepointTime - savepoint time to rollback to
* @return true if the savepoint was rolled back to successfully
@@ -616,7 +628,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
// Make sure the rollback was successful
Optional<HoodieInstant> lastInstant =
activeTimeline.reload().getCommitsAndCompactionsTimeline().filterCompletedInstants().lastInstant();
activeTimeline.reload().getCommitsAndCompactionsTimeline().filterCompletedInstants()
.lastInstant();
Preconditions.checkArgument(lastInstant.isPresent());
Preconditions.checkArgument(lastInstant.get().getTimestamp().equals(savepointTime),
savepointTime + "is not the last commit after rolling back " + commitsToRollback
@@ -625,12 +638,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
}
/**
* Rollback the (inflight/committed) record changes with the given commit time.
* Three steps:
* (1) Atomically unpublish this commit
* (2) clean indexing data,
* (3) clean new generated parquet files.
* (4) Finally delete .commit or .inflight file,
* Rollback the (inflight/committed) record changes with the given commit time. Four steps: (1)
* atomically unpublish this commit, (2) clean indexing data, (3) clean newly generated parquet
* files, (4) finally delete the .commit or .inflight file.
*/
public boolean rollback(final String commitTime) throws HoodieRollbackException {
rollback(Lists.newArrayList(commitTime));
@@ -638,7 +648,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
}
private void rollback(List<String> commits) {
if(commits.isEmpty()) {
if (commits.isEmpty()) {
logger.info("List of commits to rollback is empty");
return;
}
@@ -702,7 +712,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
Optional<Long> durationInMs = Optional.empty();
if (context != null) {
durationInMs = Optional.of(metrics.getDurationInMs(context.stop()));
Long numFilesDeleted = stats.stream().mapToLong(stat -> stat.getSuccessDeleteFiles().size()).sum();
Long numFilesDeleted = stats.stream()
.mapToLong(stat -> stat.getSuccessDeleteFiles().size())
.sum();
metrics.updateRollbackMetrics(durationInMs.get(), numFilesDeleted);
}
HoodieRollbackMetadata rollbackMetadata =
@@ -733,9 +745,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
}
/**
* Clean up any stale/old files/data lying around (either on file storage or index storage)
* based on the configurations and CleaningPolicy used. (typically files that no longer can be used
* by a running query can be cleaned)
* Clean up any stale/old files/data lying around (either on file storage or index storage) based
* on the configurations and CleaningPolicy used. (typically files that no longer can be used by a
* running query can be cleaned)
*/
public void clean() throws HoodieIOException {
String startCleanTime = HoodieActiveTimeline.createNewCommitTime();
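For completeness, the caller code below is hypothetical: when auto-clean is disabled, the same cleanup can be invoked explicitly through the public method above.

// Hypothetical explicit-clean call; isAutoClean() appears earlier in this diff.
if (!config.isAutoClean()) {
  writeClient.clean();   // applies the configured HoodieCleaningPolicy and removes stale file versions
}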
@@ -743,9 +755,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
}
/**
* Clean up any stale/old files/data lying around (either on file storage or index storage)
* based on the configurations and CleaningPolicy used. (typically files that no longer can be used
* by a running query can be cleaned)
* Clean up any stale/old files/data lying around (either on file storage or index storage) based
* on the configurations and CleaningPolicy used. (typically files that no longer can be used by a
* running query can be cleaned)
*/
private void clean(String startCleanTime) throws HoodieIOException {
try {
@@ -811,14 +823,16 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
}
public static SparkConf registerClasses(SparkConf conf) {
conf.registerKryoClasses(new Class[]{HoodieWriteConfig.class, HoodieRecord.class, HoodieKey.class});
conf.registerKryoClasses(
new Class[]{HoodieWriteConfig.class, HoodieRecord.class, HoodieKey.class});
return conf;
}
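A typical wiring of this helper (standard Spark API; only registerClasses comes from this class) would look like:

// Register Hoodie classes with Kryo before creating the Spark context.
SparkConf sparkConf = new SparkConf()
    .setAppName("hoodie-writer")
    .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
sparkConf = HoodieWriteClient.registerClasses(sparkConf);
JavaSparkContext jsc = new JavaSparkContext(sparkConf);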
/**
* Deduplicate Hoodie records, using the given deduplication function.
*/
private JavaRDD<HoodieRecord<T>> deduplicateRecords(JavaRDD<HoodieRecord<T>> records, int parallelism) {
private JavaRDD<HoodieRecord<T>> deduplicateRecords(JavaRDD<HoodieRecord<T>> records,
int parallelism) {
return records
.mapToPair(record -> new Tuple2<>(record.getKey(), record))
.reduceByKey((rec1, rec2) -> {
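The reducer body is cut off by the hunk above; conceptually it keeps one record per key by delegating to the payload, roughly as in this paraphrased sketch (not the verbatim implementation):

.reduceByKey((rec1, rec2) -> {
  @SuppressWarnings("unchecked")
  T merged = (T) rec2.getData().preCombine(rec1.getData());  // payload decides which value survives
  return new HoodieRecord<T>(rec1.getKey(), merged);         // same key, merged payload
}, parallelism)
.map(Tuple2::_2);                                            // drop the key, back to a record RDD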
@@ -833,8 +847,6 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
/**
* Cleanup all inflight commits
*
* @throws IOException
*/
private void rollbackInflightCommits() {
HoodieTable<T> table = HoodieTable

View File

@@ -19,7 +19,6 @@ package com.uber.hoodie;
import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieWriteStat;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
@@ -50,12 +49,14 @@ public class WriteStatus implements Serializable {
private long totalErrorRecords = 0;
/**
* Mark write as success, optionally using given parameters for the purpose of calculating
* some aggregate metrics. This method is not meant to cache passed arguments, since WriteStatus
* Mark write as success, optionally using given parameters for the purpose of calculating some
* aggregate metrics. This method is not meant to cache passed arguments, since WriteStatus
* objects are collected in Spark Driver.
*
* @param record deflated {@code HoodieRecord} containing information that uniquely identifies it.
* @param optionalRecordMetadata optional metadata related to data contained in {@link HoodieRecord} before deflation.
* @param record deflated {@code HoodieRecord} containing information that uniquely identifies
* it.
* @param optionalRecordMetadata optional metadata related to data contained in {@link
* HoodieRecord} before deflation.
*/
public void markSuccess(HoodieRecord record,
Optional<Map<String, String>> optionalRecordMetadata) {
@@ -64,12 +65,14 @@ public class WriteStatus implements Serializable {
}
/**
* Mark write as failed, optionally using given parameters for the purpose of calculating
* some aggregate metrics. This method is not meant to cache passed arguments, since WriteStatus
* Mark write as failed, optionally using given parameters for the purpose of calculating some
* aggregate metrics. This method is not meant to cache passed arguments, since WriteStatus
* objects are collected in Spark Driver.
*
* @param record deflated {@code HoodieRecord} containing information that uniquely identifies it.
* @param optionalRecordMetadata optional metadata related to data contained in {@link HoodieRecord} before deflation.
* @param record deflated {@code HoodieRecord} containing information that uniquely identifies
* it.
* @param optionalRecordMetadata optional metadata related to data contained in {@link
* HoodieRecord} before deflation.
*/
public void markFailure(HoodieRecord record, Throwable t,
Optional<Map<String, String>> optionalRecordMetadata) {
@@ -139,7 +142,9 @@ public class WriteStatus implements Serializable {
return totalRecords;
}
public long getTotalErrorRecords() { return totalErrorRecords; }
public long getTotalErrorRecords() {
return totalErrorRecords;
}
@Override
public String toString() {

View File

@@ -17,14 +17,15 @@
package com.uber.hoodie.config;
import java.io.Serializable;
import java.util.Map;
import java.util.Properties;
/**
* Default Way to load Hoodie config through a java.util.Properties
*/
public class DefaultHoodieConfig implements Serializable {
protected final Properties props;
public DefaultHoodieConfig(Properties props) {
this.props = props;
}
@@ -40,7 +41,8 @@ public class DefaultHoodieConfig implements Serializable {
}
}
public static void setDefaultOnCondition(Properties props, boolean condition, DefaultHoodieConfig config) {
public static void setDefaultOnCondition(Properties props, boolean condition,
DefaultHoodieConfig config) {
if (condition) {
props.putAll(config.getProps());
}
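A small sketch of the pattern (property keys here are made up; only the overload shown above is used):

// Pull in an entire block of defaults only when the caller supplied none of it.
Properties userProps = new Properties();

Properties indexDefaults = new Properties();
indexDefaults.setProperty("hypothetical.index.key", "default-value");

boolean userConfiguredIndex = false;   // e.g. tracked by a builder flag
DefaultHoodieConfig.setDefaultOnCondition(userProps, !userConfiguredIndex,
    new DefaultHoodieConfig(indexDefaults));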

View File

@@ -19,21 +19,20 @@ package com.uber.hoodie.config;
import com.google.common.base.Preconditions;
import com.uber.hoodie.common.model.HoodieAvroPayload;
import com.uber.hoodie.common.model.HoodieCleaningPolicy;
import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.io.compact.strategy.CompactionStrategy;
import com.uber.hoodie.io.compact.strategy.LogFileSizeBasedCompactionStrategy;
import javax.annotation.concurrent.Immutable;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Properties;
import javax.annotation.concurrent.Immutable;
/**
* Compaction related config
*/
@Immutable
public class HoodieCompactionConfig extends DefaultHoodieConfig {
public static final String CLEANER_POLICY_PROP = "hoodie.cleaner.policy";
private static final String DEFAULT_CLEANER_POLICY =
HoodieCleaningPolicy.KEEP_LATEST_COMMITS.name();
@@ -66,7 +65,9 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
public static final String DEFAULT_PARQUET_SMALL_FILE_LIMIT_BYTES = String.valueOf(0);
/** Configs related to specific table types **/
/**
* Configs related to specific table types
**/
// Number of inserts, that will be put each partition/bucket for writing
public static final String COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE = "hoodie.copyonwrite.insert.split.size";
// The rationale to pick the insert parallelism is the following. Writing out 100MB files,
@@ -82,7 +83,8 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
// This value is used as a guesstimate for the record size, if we can't determine this from previous commits
public static final String COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = "hoodie.copyonwrite.record.size.estimate";
// Used to determine how much more can be packed into a small file, before it exceeds the size limit.
public static final String DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = String.valueOf(1024);
public static final String DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = String
.valueOf(1024);
public static final String CLEANER_PARALLELISM = "hoodie.cleaner.parallelism";
public static final String DEFAULT_CLEANER_PARALLELISM = String.valueOf(200);
@@ -93,7 +95,8 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
public static final String COMPACTION_STRATEGY_PROP = "hoodie.compaction.strategy";
// 200GB of target IO per compaction
public static final String DEFAULT_COMPACTION_STRATEGY = LogFileSizeBasedCompactionStrategy.class.getName();
public static final String DEFAULT_COMPACTION_STRATEGY = LogFileSizeBasedCompactionStrategy.class
.getName();
// used to merge records written to log file
public static final String DEFAULT_PAYLOAD_CLASS = HoodieAvroPayload.class.getName();
@@ -108,6 +111,7 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
}
public static class Builder {
private final Properties props = new Properties();
public Builder fromFile(File propertiesFile) throws IOException {
@@ -174,12 +178,14 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
}
public Builder autoTuneInsertSplits(boolean autoTuneInsertSplits) {
props.setProperty(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS, String.valueOf(autoTuneInsertSplits));
props.setProperty(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS,
String.valueOf(autoTuneInsertSplits));
return this;
}
public Builder approxRecordSize(int recordSizeEstimate) {
props.setProperty(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE, String.valueOf(recordSizeEstimate));
props.setProperty(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE,
String.valueOf(recordSizeEstimate));
return this;
}
@@ -199,7 +205,8 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
}
public Builder withTargetIOPerCompactionInMB(long targetIOPerCompactionInMB) {
props.setProperty(TARGET_IO_PER_COMPACTION_IN_MB_PROP, String.valueOf(targetIOPerCompactionInMB));
props.setProperty(TARGET_IO_PER_COMPACTION_IN_MB_PROP,
String.valueOf(targetIOPerCompactionInMB));
return this;
}
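For reference, a hypothetical builder call chain using only the setters visible in the surrounding hunks (newBuilder() is assumed as the entry point):

HoodieCompactionConfig compactionConfig = HoodieCompactionConfig.newBuilder()  // assumed factory method
    .autoTuneInsertSplits(true)             // size insert splits from previous commit stats
    .approxRecordSize(1024)                 // fallback record-size estimate when no history exists
    .withTargetIOPerCompactionInMB(500)     // cap the IO budget of a single compaction run
    .build();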
@@ -228,7 +235,8 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS),
COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS, DEFAULT_COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS);
setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE),
COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE, DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE);
COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE,
DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE);
setDefaultOnCondition(props, !props.containsKey(CLEANER_PARALLELISM),
CLEANER_PARALLELISM, DEFAULT_CLEANER_PARALLELISM);
setDefaultOnCondition(props, !props.containsKey(COMPACTION_STRATEGY_PROP),

View File

@@ -16,14 +16,12 @@
package com.uber.hoodie.config;
import com.google.common.base.Preconditions;
import com.uber.hoodie.index.HoodieIndex;
import javax.annotation.concurrent.Immutable;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Properties;
import javax.annotation.concurrent.Immutable;
/**
* Indexing related config
@@ -64,6 +62,7 @@ public class HoodieIndexConfig extends DefaultHoodieConfig {
}
public static class Builder {
private final Properties props = new Properties();
public Builder fromFile(File propertiesFile) throws IOException {

View File

@@ -17,12 +17,11 @@
package com.uber.hoodie.config;
import com.uber.hoodie.metrics.MetricsReporterType;
import javax.annotation.concurrent.Immutable;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Properties;
import javax.annotation.concurrent.Immutable;
/**
* Fetch the configurations used by the Metrics system.
@@ -56,6 +55,7 @@ public class HoodieMetricsConfig extends DefaultHoodieConfig {
}
public static class Builder {
private final Properties props = new Properties();
public Builder fromFile(File propertiesFile) throws IOException {

View File

@@ -16,17 +16,18 @@
package com.uber.hoodie.config;
import javax.annotation.concurrent.Immutable;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Properties;
import javax.annotation.concurrent.Immutable;
/**
* Storage related config
*/
@Immutable
public class HoodieStorageConfig extends DefaultHoodieConfig {
public static final String PARQUET_FILE_MAX_BYTES = "hoodie.parquet.max.file.size";
public static final String DEFAULT_PARQUET_FILE_MAX_BYTES = String.valueOf(120 * 1024 * 1024);
public static final String PARQUET_BLOCK_SIZE_BYTES = "hoodie.parquet.block.size";
@@ -43,6 +44,7 @@ public class HoodieStorageConfig extends DefaultHoodieConfig {
}
public static class Builder {
private final Properties props = new Properties();
public Builder fromFile(File propertiesFile) throws IOException {

View File

@@ -24,21 +24,21 @@ import com.uber.hoodie.common.util.ReflectionUtils;
import com.uber.hoodie.index.HoodieIndex;
import com.uber.hoodie.io.compact.strategy.CompactionStrategy;
import com.uber.hoodie.metrics.MetricsReporterType;
import org.apache.spark.storage.StorageLevel;
import javax.annotation.concurrent.Immutable;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.Map;
import java.util.Properties;
import javax.annotation.concurrent.Immutable;
import org.apache.spark.storage.StorageLevel;
/**
* Class storing configs for the {@link com.uber.hoodie.HoodieWriteClient}
*/
@Immutable
public class HoodieWriteConfig extends DefaultHoodieConfig {
private static final String BASE_PATH_PROP = "hoodie.base.path";
private static final String AVRO_SCHEMA = "hoodie.avro.schema";
public static final String TABLE_NAME = "hoodie.table.name";
@@ -141,7 +141,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
}
public int getParquetSmallFileLimit() {
return Integer.parseInt(props.getProperty(HoodieCompactionConfig.PARQUET_SMALL_FILE_LIMIT_BYTES));
return Integer
.parseInt(props.getProperty(HoodieCompactionConfig.PARQUET_SMALL_FILE_LIMIT_BYTES));
}
public int getCopyOnWriteInsertSplitSize() {
@@ -177,11 +178,13 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
}
public CompactionStrategy getCompactionStrategy() {
return ReflectionUtils.loadClass(props.getProperty(HoodieCompactionConfig.COMPACTION_STRATEGY_PROP));
return ReflectionUtils
.loadClass(props.getProperty(HoodieCompactionConfig.COMPACTION_STRATEGY_PROP));
}
public Long getTargetIOPerCompactionInMB() {
return Long.parseLong(props.getProperty(HoodieCompactionConfig.TARGET_IO_PER_COMPACTION_IN_MB_PROP));
return Long
.parseLong(props.getProperty(HoodieCompactionConfig.TARGET_IO_PER_COMPACTION_IN_MB_PROP));
}
/**
@@ -216,7 +219,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
}
public boolean getBloomIndexPruneByRanges() {
return Boolean.parseBoolean(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_PRUNE_BY_RANGES_PROP));
return Boolean
.parseBoolean(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_PRUNE_BY_RANGES_PROP));
}
public boolean getBloomIndexUseCaching() {
@@ -271,8 +275,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
}
public static class Builder {
private final Properties props = new Properties();
private boolean isIndexConfigSet = false;
private boolean isStorageConfigSet = false;
@@ -371,7 +375,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
}
public Builder withAssumeDatePartitioning(boolean assumeDatePartitioning) {
props.setProperty(HOODIE_ASSUME_DATE_PARTITIONING_PROP, String.valueOf(assumeDatePartitioning));
props.setProperty(HOODIE_ASSUME_DATE_PARTITIONING_PROP,
String.valueOf(assumeDatePartitioning));
return this;
}
@@ -386,7 +391,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
Preconditions.checkArgument(config.getBasePath() != null);
setDefaultOnCondition(props, !props.containsKey(INSERT_PARALLELISM), INSERT_PARALLELISM,
DEFAULT_PARALLELISM);
setDefaultOnCondition(props, !props.containsKey(BULKINSERT_PARALLELISM), BULKINSERT_PARALLELISM,
setDefaultOnCondition(props, !props.containsKey(BULKINSERT_PARALLELISM),
BULKINSERT_PARALLELISM,
DEFAULT_PARALLELISM);
setDefaultOnCondition(props, !props.containsKey(UPSERT_PARALLELISM), UPSERT_PARALLELISM,
DEFAULT_PARALLELISM);

View File

@@ -17,11 +17,11 @@
package com.uber.hoodie.exception;
/**
* <p>
* Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a delta commit
* </p>
* <p> Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a delta
* commit </p>
*/
public class HoodieAppendException extends HoodieException {
public HoodieAppendException(String msg, Throwable e) {
super(msg, e);
}

View File

@@ -17,11 +17,11 @@
package com.uber.hoodie.exception;
/**
* <p>
* Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a Commit
* <p> Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a Commit
* </p>
*/
public class HoodieCommitException extends HoodieException {
public HoodieCommitException(String msg) {
super(msg);
}

View File

@@ -17,6 +17,7 @@
package com.uber.hoodie.exception;
public class HoodieCompactionException extends HoodieException {
public HoodieCompactionException(String msg) {
super(msg);
}

View File

@@ -18,11 +18,10 @@ package com.uber.hoodie.exception;
/**
* <p>
* Exception thrown when dependent system is not available
* </p>
* <p> Exception thrown when dependent system is not available </p>
*/
public class HoodieDependentSystemUnavailableException extends HoodieException {
public static final String HBASE = "HBASE";
public HoodieDependentSystemUnavailableException(String system, String connectURL) {

View File

@@ -16,14 +16,12 @@
package com.uber.hoodie.exception;
import java.io.IOException;
/**
* <p>
* Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a bulk insert
* </p>
* <p> Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a bulk
* insert </p>
*/
public class HoodieInsertException extends HoodieException {
public HoodieInsertException(String msg, Throwable e) {
super(msg, e);
}

View File

@@ -17,11 +17,11 @@
package com.uber.hoodie.exception;
/**
* <p>
* Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a incremental upsert
* </p>
* <p> Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a
* incremental upsert </p>
*/
public class HoodieUpsertException extends HoodieException {
public HoodieUpsertException(String msg, Throwable e) {
super(msg, e);
}

View File

@@ -16,16 +16,14 @@
package com.uber.hoodie.func;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.table.HoodieTable;
import org.apache.spark.api.java.function.Function2;
import java.util.Iterator;
import java.util.List;
import org.apache.spark.api.java.function.Function2;
/**
@@ -46,7 +44,8 @@ public class BulkInsertMapFunction<T extends HoodieRecordPayload>
}
@Override
public Iterator<List<WriteStatus>> call(Integer partition, Iterator<HoodieRecord<T>> sortedRecordItr)
public Iterator<List<WriteStatus>> call(Integer partition,
Iterator<HoodieRecord<T>> sortedRecordItr)
throws Exception {
return new LazyInsertIterable<>(sortedRecordItr, config, commitTime, hoodieTable);
}

View File

@@ -16,27 +16,26 @@
package com.uber.hoodie.func;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.io.HoodieIOHandle;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.io.HoodieCreateHandle;
import com.uber.hoodie.io.HoodieIOHandle;
import com.uber.hoodie.table.HoodieTable;
import org.apache.spark.TaskContext;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import org.apache.spark.TaskContext;
/**
* Lazy Iterable, that writes a stream of HoodieRecords sorted by the partitionPath,
* into new files.
* Lazy Iterable, that writes a stream of HoodieRecords sorted by the partitionPath, into new
* files.
*/
public class LazyInsertIterable<T extends HoodieRecordPayload> extends LazyIterableIterator<HoodieRecord<T>, List<WriteStatus>> {
public class LazyInsertIterable<T extends HoodieRecordPayload> extends
LazyIterableIterator<HoodieRecord<T>, List<WriteStatus>> {
private final HoodieWriteConfig hoodieConfig;
private final String commitTime;
@@ -53,11 +52,13 @@ public class LazyInsertIterable<T extends HoodieRecordPayload> extends LazyItera
this.hoodieTable = hoodieTable;
}
@Override protected void start() {
@Override
protected void start() {
}
@Override protected List<WriteStatus> computeNext() {
@Override
protected List<WriteStatus> computeNext() {
List<WriteStatus> statuses = new ArrayList<>();
while (inputItr.hasNext()) {
@@ -108,7 +109,8 @@ public class LazyInsertIterable<T extends HoodieRecordPayload> extends LazyItera
return statuses;
}
@Override protected void end() {
@Override
protected void end() {
}
}

View File

@@ -31,6 +31,7 @@ import java.util.Iterator;
* responsible for calling inputIterator.next() and doing the processing in computeNext()
*/
public abstract class LazyIterableIterator<I, O> implements Iterable<O>, Iterator<O> {
protected Iterator<I> inputItr = null;
private boolean consumed = false;
private boolean startCalled = false;
@@ -56,7 +57,6 @@ public abstract class LazyIterableIterator<I, O> implements Iterable<O>, Iterato
*/
protected abstract void end();
//////////////////
// iterable implementation
@@ -87,8 +87,9 @@ public abstract class LazyIterableIterator<I, O> implements Iterable<O>, Iterato
@Override
public Iterator<O> iterator() {
//check for consumed inputItr
if (consumed)
if (consumed) {
throw new RuntimeException("Invalid repeated inputItr consumption.");
}
//hand out self as inputItr exactly once (note: do not hand out the input
//inputItr since it is consumed by the self inputItr implementation)
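To illustrate the contract, a subclass supplies start/computeNext/end and consumes the protected inputItr. The toy subclass below is not part of the commit, and termination handling at input exhaustion is elided.

// Toy subclass: turns every three input integers into one space-joined string.
class BatchingIterator extends LazyIterableIterator<Integer, String> {

  BatchingIterator(Iterator<Integer> input) {
    super(input);
  }

  @Override
  protected void start() {
    // open any per-partition resources here
  }

  @Override
  protected String computeNext() {
    StringBuilder batch = new StringBuilder();
    for (int i = 0; i < 3 && inputItr.hasNext(); i++) {
      batch.append(inputItr.next()).append(' ');
    }
    return batch.toString().trim();
  }

  @Override
  protected void end() {
    // release resources here
  }
}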

View File

@@ -17,29 +17,26 @@
package com.uber.hoodie.index;
import com.google.common.base.Optional;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieIndexException;
import com.uber.hoodie.index.bloom.HoodieBloomIndex;
import com.uber.hoodie.index.bucketed.BucketedIndex;
import com.uber.hoodie.index.hbase.HBaseIndex;
import com.uber.hoodie.table.HoodieTable;
import java.io.Serializable;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import java.io.Serializable;
/**
* Base class for different types of indexes to determine the mapping from uuid
*
*/
public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Serializable {
protected transient JavaSparkContext jsc = null;
public enum IndexType {
@@ -58,12 +55,9 @@ public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Seri
/**
* Checks if the given [Keys] exists in the hoodie table and returns [Key, Optional[FullFilePath]]
* If the optional FullFilePath value is not present, then the key is not found. If the FullFilePath
* value is present, it is the path component (without scheme) of the URI underlying file
*
* @param hoodieKeys
* @param table
* @return
* If the optional FullFilePath value is not present, then the key is not found. If the
* FullFilePath value is present, it is the path component (without scheme) of the URI underlying
* file
*/
public abstract JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(
JavaRDD<HoodieKey> hoodieKeys, final HoodieTable<T> table);
@@ -89,17 +83,17 @@ public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Seri
public abstract boolean rollbackCommit(String commitTime);
/**
* An index is `global` if {@link HoodieKey} to fileID mapping, does not depend on the `partitionPath`.
* Such an implementation is able to obtain the same mapping, for two hoodie keys with same `recordKey`
* but different `partitionPath`
* An index is `global` if {@link HoodieKey} to fileID mapping, does not depend on the
* `partitionPath`. Such an implementation is able to obtain the same mapping, for two hoodie keys
* with same `recordKey` but different `partitionPath`
*
* @return whether or not, the index implementation is global in nature
*/
public abstract boolean isGlobal();
/**
* This is used by storage to determine, if its safe to send inserts, straight to the log,
* i.e having a {@link com.uber.hoodie.common.model.FileSlice}, with no data file.
* This is used by storage to determine, if its safe to send inserts, straight to the log, i.e
* having a {@link com.uber.hoodie.common.model.FileSlice}, with no data file.
*
* @return Returns true/false depending on whether the impl has this capability
*/
@@ -107,12 +101,8 @@ public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Seri
/**
*
* An index is "implicit" with respect to storage, if just writing new data to a file slice,
* updates the index as well. This is used by storage, to save memory footprint in
* certain cases.
*
* @return
* updates the index as well. This is used by storage, to save memory footprint in certain cases.
*/
public abstract boolean isImplicitWithStorage();
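Purely as an illustration of how a caller might branch on these capability flags (the method below is hypothetical):

void planWrite(HoodieIndex<?> index) {
  if (index.canIndexLogFiles()) {
    // inserts can be routed straight to log files (file slice with no data file)
  }
  if (!index.isImplicitWithStorage()) {
    // writing data alone does not refresh the index; an explicit updateLocation(...) pass is needed
  }
  if (index.isGlobal()) {
    // lookups by recordKey alone; the same key resolves identically across partitionPaths
  }
}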

View File

@@ -17,32 +17,27 @@
package com.uber.hoodie.index;
import com.google.common.base.Optional;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordLocation;
import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.table.HoodieTable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
/**
* Hoodie Index implementation backed by an in-memory Hash map.
* <p>
* ONLY USE FOR LOCAL TESTING
* Hoodie Index implementation backed by an in-memory Hash map. <p> ONLY USE FOR LOCAL TESTING
*/
public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
@@ -64,6 +59,7 @@ public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieInde
*/
class LocationTagFunction
implements Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<HoodieRecord<T>>> {
@Override
public Iterator<HoodieRecord<T>> call(Integer partitionNum,
Iterator<HoodieRecord<T>> hoodieRecordIterator) {
@@ -115,8 +111,6 @@ public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieInde
/**
* Only looks up by recordKey
*
* @return
*/
@Override
public boolean isGlobal() {
@@ -125,8 +119,6 @@ public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieInde
/**
* Mapping is available in HBase already.
*
* @return
*/
@Override
public boolean canIndexLogFiles() {
@@ -135,8 +127,6 @@ public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieInde
/**
* Index needs to be explicitly updated after storage write.
*
* @return
*/
@Override
public boolean isImplicitWithStorage() {

View File

@@ -19,7 +19,6 @@
package com.uber.hoodie.index.bloom;
import com.google.common.base.Objects;
import java.io.Serializable;
/**
@@ -63,8 +62,6 @@ public class BloomIndexFileInfo implements Serializable {
/**
* Does the given key fall within the range (inclusive)
* @param recordKey
* @return
*/
public boolean isKeyInRange(String recordKey) {
return minRecordKey.compareTo(recordKey) <= 0 &&
@@ -73,8 +70,12 @@ public class BloomIndexFileInfo implements Serializable {
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
BloomIndexFileInfo that = (BloomIndexFileInfo) o;
return Objects.equal(that.fileName, fileName) &&

View File

@@ -18,9 +18,12 @@
package com.uber.hoodie.index.bloom;
import static java.util.stream.Collectors.groupingBy;
import static java.util.stream.Collectors.mapping;
import static java.util.stream.Collectors.toList;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Optional;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodieKey;
@@ -34,7 +37,10 @@ import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.MetadataNotFoundException;
import com.uber.hoodie.index.HoodieIndex;
import com.uber.hoodie.table.HoodieTable;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
@@ -42,16 +48,8 @@ import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.storage.StorageLevel;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import static java.util.stream.Collectors.*;
/**
* Indexing mechanism based on bloom filter. Each parquet file includes its row_key bloom filter in
* its metadata.
@@ -64,14 +62,16 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
private static final int SPARK_MAXIMUM_BYTES_PER_PARTITION = 1500 * 1024 * 1024;
// this is how much a triplet of (partitionPath, fileId, recordKey) costs.
private static final int BYTES_PER_PARTITION_FILE_KEY_TRIPLET = 300;
private static int MAX_ITEMS_PER_SHUFFLE_PARTITION = SPARK_MAXIMUM_BYTES_PER_PARTITION / BYTES_PER_PARTITION_FILE_KEY_TRIPLET;
private static int MAX_ITEMS_PER_SHUFFLE_PARTITION =
SPARK_MAXIMUM_BYTES_PER_PARTITION / BYTES_PER_PARTITION_FILE_KEY_TRIPLET;
public HoodieBloomIndex(HoodieWriteConfig config, JavaSparkContext jsc) {
super(config, jsc);
}
@Override
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, final HoodieTable<T> hoodieTable) {
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD,
final HoodieTable<T> hoodieTable) {
// Step 0: cache the input record RDD
if (config.getBloomIndexUseCaching()) {
@@ -83,7 +83,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
.mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey()));
// Lookup indexes for all the partition/recordkey pair
JavaPairRDD<String, String> rowKeyFilenamePairRDD = lookupIndex(partitionRecordKeyPairRDD, hoodieTable);
JavaPairRDD<String, String> rowKeyFilenamePairRDD = lookupIndex(partitionRecordKeyPairRDD,
hoodieTable);
// Cache the result, for subsequent stages.
if (config.getBloomIndexUseCaching()) {
@@ -96,7 +97,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
// Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys
// Cost: 4 sec.
JavaRDD<HoodieRecord<T>> taggedRecordRDD = tagLocationBacktoRecords(rowKeyFilenamePairRDD, recordRDD);
JavaRDD<HoodieRecord<T>> taggedRecordRDD = tagLocationBacktoRecords(rowKeyFilenamePairRDD,
recordRDD);
if (config.getBloomIndexUseCaching()) {
recordRDD.unpersist(); // unpersist the input Record RDD
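Downstream of tagging, a caller typically splits the RDD into updates and inserts; a sketch (variable names are placeholders):

JavaRDD<HoodieRecord<T>> tagged = index.tagLocation(recordRDD, hoodieTable);
long updates = tagged.filter(r -> r.getCurrentLocation() != null).count();  // key found in an existing file
long inserts = tagged.count() - updates;                                    // untagged records become new files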
@@ -135,8 +137,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
}
/**
* Lookup the location for each record key and return the pair<record_key,location> for all
* record keys already present and drop the record keys if not present
* Lookup the location for each record key and return the pair<record_key,location> for all record
* keys already present and drop the record keys if not present
*/
private JavaPairRDD<String, String> lookupIndex(
JavaPairRDD<String, String> partitionRecordKeyPairRDD, final HoodieTable<T> hoodieTable) {
@@ -145,25 +147,27 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
List<String> affectedPartitionPathList = new ArrayList<>(recordsPerPartition.keySet());
// Step 2: Load all involved files as <Partition, filename> pairs
List<Tuple2<String, BloomIndexFileInfo>> fileInfoList = loadInvolvedFiles(affectedPartitionPathList, hoodieTable);
List<Tuple2<String, BloomIndexFileInfo>> fileInfoList = loadInvolvedFiles(
affectedPartitionPathList, hoodieTable);
final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo = fileInfoList.stream()
.collect(groupingBy(Tuple2::_1, mapping(Tuple2::_2, toList())));
// Step 3: Obtain a RDD, for each incoming record, that already exists, with the file id, that contains it.
int parallelism = autoComputeParallelism(recordsPerPartition, partitionToFileInfo, partitionRecordKeyPairRDD);
return findMatchingFilesForRecordKeys(partitionToFileInfo, partitionRecordKeyPairRDD, parallelism);
int parallelism = autoComputeParallelism(recordsPerPartition, partitionToFileInfo,
partitionRecordKeyPairRDD);
return findMatchingFilesForRecordKeys(partitionToFileInfo, partitionRecordKeyPairRDD,
parallelism);
}
/**
* The index lookup can be skewed in three dimensions : #files, #partitions, #records
*
* To be able to smoothly handle skews, we need to compute how to split each partitions into
* subpartitions. We do it here, in a way that keeps the amount of each Spark join partition to
* < 2GB.
*
* If {@link com.uber.hoodie.config.HoodieIndexConfig#BLOOM_INDEX_PARALLELISM_PROP} is specified as a NON-zero number,
* then that is used explicitly.
* subpartitions. We do it here, in a way that keeps the amount of each Spark join partition to <
* 2GB.
*
* If {@link com.uber.hoodie.config.HoodieIndexConfig#BLOOM_INDEX_PARALLELISM_PROP} is specified
* as a NON-zero number, then that is used explicitly.
*/
private int autoComputeParallelism(final Map<String, Long> recordsPerPartition,
final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo,
@@ -172,7 +176,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
long totalComparisons = 0;
if (config.getBloomIndexPruneByRanges()) {
// we will just try exploding the input and then count to determine comparisons
totalComparisons = explodeRecordRDDWithFileComparisons(partitionToFileInfo, partitionRecordKeyPairRDD).count();
totalComparisons = explodeRecordRDDWithFileComparisons(partitionToFileInfo,
partitionRecordKeyPairRDD).count();
} else {
// if not pruning by ranges, then each file in a partition needs to compared against all
// records for a partition.
@@ -181,30 +186,36 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
long totalFiles = 0, totalRecords = 0;
for (String partitionPath : recordsPerPartition.keySet()) {
long numRecords = recordsPerPartition.get(partitionPath);
long numFiles = filesPerPartition.containsKey(partitionPath) ? filesPerPartition.get(partitionPath) : 1L;
long numFiles =
filesPerPartition.containsKey(partitionPath) ? filesPerPartition.get(partitionPath)
: 1L;
totalComparisons += numFiles * numRecords;
totalFiles += filesPerPartition.containsKey(partitionPath) ? filesPerPartition.get(partitionPath) : 0L;
totalFiles +=
filesPerPartition.containsKey(partitionPath) ? filesPerPartition.get(partitionPath)
: 0L;
totalRecords += numRecords;
}
logger.info("TotalRecords: " + totalRecords + ", TotalFiles: " + totalFiles + ", TotalAffectedPartitions:" + recordsPerPartition.size());
logger.info("TotalRecords: " + totalRecords + ", TotalFiles: " + totalFiles
+ ", TotalAffectedPartitions:" + recordsPerPartition.size());
}
// each partition will have an item per comparison.
int parallelism = (int) (totalComparisons/ MAX_ITEMS_PER_SHUFFLE_PARTITION + 1);
logger.info("Auto computed parallelism :" + parallelism + ", totalComparisons: " + totalComparisons);
int parallelism = (int) (totalComparisons / MAX_ITEMS_PER_SHUFFLE_PARTITION + 1);
logger.info(
"Auto computed parallelism :" + parallelism + ", totalComparisons: " + totalComparisons);
return parallelism;
}
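To make the sizing concrete, here is a small worked example of the formula above (the constants come from this file; the comparison count is made up):

long totalComparisons = 50_000_000L;                      // records x candidate files after range pruning
int maxItemsPerPartition = (1500 * 1024 * 1024) / 300;    // SPARK_MAXIMUM_BYTES_PER_PARTITION / BYTES_PER_PARTITION_FILE_KEY_TRIPLET = 5,242,880
int parallelism = (int) (totalComparisons / maxItemsPerPartition + 1);  // 9 + 1 = 10 join partitions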
/**
* It's crucial to pick the right parallelism.
*
* totalSubPartitions : this is deemed safe limit, to be nice with Spark.
* inputParallelism : typically number of input file splits
* totalSubPartitions : this is deemed safe limit, to be nice with Spark. inputParallelism :
* typically number of input file splits
*
* We pick the max such that, we are always safe, but go higher if say a there are a lot of
* input files. (otherwise, we will fallback to number of partitions in input and end up with
* slow performance)
* We pick the max such that we are always safe, but go higher if, say, there are a lot of input
* files (otherwise, we will fall back to the number of partitions in the input and end up with slow
* performance).
*/
private int determineParallelism(int inputParallelism, int totalSubPartitions) {
// If bloom index parallelism is set, use it to check against the input parallelism and take the max
@@ -221,9 +232,11 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
* Load all involved files as <Partition, filename> pair RDD.
*/
@VisibleForTesting
List<Tuple2<String, BloomIndexFileInfo>> loadInvolvedFiles(List<String> partitions, final HoodieTable<T> hoodieTable) {
List<Tuple2<String, BloomIndexFileInfo>> loadInvolvedFiles(List<String> partitions,
final HoodieTable<T> hoodieTable) {
// Obtain the latest data files from all the partitions.
List<Tuple2<String, HoodieDataFile>> dataFilesList = jsc.parallelize(partitions, Math.max(partitions.size(), 1))
List<Tuple2<String, HoodieDataFile>> dataFilesList = jsc
.parallelize(partitions, Math.max(partitions.size(), 1))
.flatMapToPair(partitionPath -> {
java.util.Optional<HoodieInstant> latestCommitTime =
hoodieTable.getCommitTimeline().filterCompletedInstants().lastInstant();
@@ -243,8 +256,10 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
return jsc.parallelize(dataFilesList, Math.max(dataFilesList.size(), 1))
.mapToPair(ft -> {
try {
String[] minMaxKeys = ParquetUtils.readMinMaxRecordKeys(ft._2().getFileStatus().getPath());
return new Tuple2<>(ft._1(), new BloomIndexFileInfo(ft._2().getFileName(), minMaxKeys[0], minMaxKeys[1]));
String[] minMaxKeys = ParquetUtils
.readMinMaxRecordKeys(ft._2().getFileStatus().getPath());
return new Tuple2<>(ft._1(),
new BloomIndexFileInfo(ft._2().getFileName(), minMaxKeys[0], minMaxKeys[1]));
} catch (MetadataNotFoundException me) {
logger.warn("Unable to find range metadata in file :" + ft._2());
return new Tuple2<>(ft._1(), new BloomIndexFileInfo(ft._2().getFileName()));
@@ -266,8 +281,6 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
/**
* This is not global, since we depend on the partitionPath to do the lookup
*
* @return
*/
@Override
public boolean isGlobal() {
@@ -276,8 +289,6 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
/**
* No indexes into log files yet.
*
* @return
*/
@Override
public boolean canIndexLogFiles() {
@@ -286,8 +297,6 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
/**
* Bloom filters are stored, into the same data files.
*
* @return
*/
@Override
public boolean isImplicitWithStorage() {
@@ -295,12 +304,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
}
/**
* if we dont have key ranges, then also we need to compare against the file. no other choice
* if we do, then only compare the file if the record key falls in range.
* @param indexInfo
* @param recordKey
* @return
* If we don't have key ranges, we have no choice but to compare against the file. If we do,
* then only compare the file if the record key falls in range.
*/
private boolean shouldCompareWithFile(BloomIndexFileInfo indexInfo, String recordKey) {
return !indexInfo.hasKeyRanges() || indexInfo.isKeyInRange(recordKey);
@@ -308,19 +313,16 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
/**
* For each incoming record, produce N output records, 1 each for each file against which the record's key
* needs to be checked. For datasets, where the keys have a definite insert order (e.g: timestamp as prefix),
* the number of files to be compared gets cut down a lot from range pruning.
*
*
* @param partitionToFileIndexInfo
* @param partitionRecordKeyPairRDD
* @return
* For each incoming record, produce N output records, 1 each for each file against which the
* record's key needs to be checked. For datasets, where the keys have a definite insert order
* (e.g: timestamp as prefix), the number of files to be compared gets cut down a lot from range
* pruning.
*/
// sub-partition to ensure the records can be looked up against files & also prune file<=>record comparisons based on recordKey
// ranges in the index info.
@VisibleForTesting
JavaPairRDD<String, Tuple2<String, HoodieKey>> explodeRecordRDDWithFileComparisons(final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
JavaPairRDD<String, Tuple2<String, HoodieKey>> explodeRecordRDDWithFileComparisons(
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
JavaPairRDD<String, String> partitionRecordKeyPairRDD) {
return partitionRecordKeyPairRDD
.map(partitionRecordKeyPair -> {
@@ -329,13 +331,15 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
List<BloomIndexFileInfo> indexInfos = partitionToFileIndexInfo.get(partitionPath);
List<Tuple2<String, Tuple2<String, HoodieKey>>> recordComparisons = new ArrayList<>();
if (indexInfos != null) { // could be null, if there are no files in a given partition yet.
if (indexInfos
!= null) { // could be null, if there are no files in a given partition yet.
// for each candidate file in partition, that needs to be compared.
for (BloomIndexFileInfo indexInfo : indexInfos) {
if (shouldCompareWithFile(indexInfo, recordKey)) {
recordComparisons.add(
new Tuple2<>(String.format("%s#%s", indexInfo.getFileName(), recordKey),
new Tuple2<>(indexInfo.getFileName(), new HoodieKey(recordKey, partitionPath))));
new Tuple2<>(indexInfo.getFileName(),
new HoodieKey(recordKey, partitionPath))));
}
}
}
@@ -347,22 +351,23 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
/**
* Find out <RowKey, filename> pair. All workload grouped by file-level.
*
* Join PairRDD(PartitionPath, RecordKey) and PairRDD(PartitionPath, File) & then repartition
* such that each RDD partition is a file, then for each file, we do
* (1) load bloom filter,
* (2) load rowKeys,
* (3) Tag rowKey
* Join PairRDD(PartitionPath, RecordKey) and PairRDD(PartitionPath, File) & then repartition such
* that each RDD partition is a file, then for each file, we do (1) load bloom filter, (2) load
* rowKeys, (3) Tag rowKey
*
* Make sure the parallelism is at least the group-by parallelism for tagging location
*/
@VisibleForTesting
JavaPairRDD<String, String> findMatchingFilesForRecordKeys(final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
JavaPairRDD<String, String> findMatchingFilesForRecordKeys(
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
JavaPairRDD<String, String> partitionRecordKeyPairRDD,
int totalSubpartitions) {
int joinParallelism = determineParallelism(partitionRecordKeyPairRDD.partitions().size(), totalSubpartitions);
int joinParallelism = determineParallelism(partitionRecordKeyPairRDD.partitions().size(),
totalSubpartitions);
JavaPairRDD<String, Tuple2<String, HoodieKey>> fileSortedTripletRDD = explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD)
JavaPairRDD<String, Tuple2<String, HoodieKey>> fileSortedTripletRDD = explodeRecordRDDWithFileComparisons(
partitionToFileIndexInfo, partitionRecordKeyPairRDD)
// sort further based on filename, such that all checking for the file can happen within a single partition, on-the-fly
.sortByKey(true, joinParallelism);
@@ -382,7 +387,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
/**
* Tag the <rowKey, filename> back to the original HoodieRecord RDD.
*/
private JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(JavaPairRDD<String, String> rowKeyFilenamePairRDD,
private JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(
JavaPairRDD<String, String> rowKeyFilenamePairRDD,
JavaRDD<HoodieRecord<T>> recordRDD) {
JavaPairRDD<String, HoodieRecord<T>> rowKeyRecordPairRDD = recordRDD
.mapToPair(record -> new Tuple2<>(record.getRecordKey(), record));
@@ -404,7 +410,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
}
@Override
public JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD, HoodieTable<T> hoodieTable) {
public JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD,
HoodieTable<T> hoodieTable) {
return writeStatusRDD;
}
}

View File

@@ -24,24 +24,22 @@ import com.uber.hoodie.common.util.ParquetUtils;
import com.uber.hoodie.exception.HoodieException;
import com.uber.hoodie.exception.HoodieIndexException;
import com.uber.hoodie.func.LazyIterableIterator;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.function.Function2;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.function.Function2;
import scala.Tuple2;
/**
* Function performing actual checking of RDD partition containing (fileId, hoodieKeys) against the
* actual files
*/
public class HoodieBloomIndexCheckFunction implements Function2<Integer, Iterator<Tuple2<String, Tuple2<String, HoodieKey>>>, Iterator<List<IndexLookupResult>>> {
public class HoodieBloomIndexCheckFunction implements
Function2<Integer, Iterator<Tuple2<String, Tuple2<String, HoodieKey>>>, Iterator<List<IndexLookupResult>>> {
private static Logger logger = LogManager.getLogger(HoodieBloomIndexCheckFunction.class);
@@ -54,7 +52,8 @@ public class HoodieBloomIndexCheckFunction implements Function2<Integer, Iterato
/**
* Given a list of row keys and one file, return only row keys existing in that file.
*/
public static List<String> checkCandidatesAgainstFile(List<String> candidateRecordKeys, Path filePath) throws HoodieIndexException {
public static List<String> checkCandidatesAgainstFile(List<String> candidateRecordKeys,
Path filePath) throws HoodieIndexException {
List<String> foundRecordKeys = new ArrayList<>();
try {
// Load all rowKeys from the file, to double-confirm
@@ -69,18 +68,20 @@ public class HoodieBloomIndexCheckFunction implements Function2<Integer, Iterato
foundRecordKeys.add(rowKey);
}
}
logger.info("After checking with row keys, we have " + foundRecordKeys.size() + " results, for file " + filePath + " => " + foundRecordKeys);
logger.info("After checking with row keys, we have " + foundRecordKeys.size()
+ " results, for file " + filePath + " => " + foundRecordKeys);
if (logger.isDebugEnabled()) {
logger.debug("Keys matching for file " + filePath + " => " + foundRecordKeys);
}
}
} catch (Exception e){
} catch (Exception e) {
throw new HoodieIndexException("Error checking candidate keys against file.", e);
}
return foundRecordKeys;
}
class LazyKeyCheckIterator extends LazyIterableIterator<Tuple2<String, Tuple2<String, HoodieKey>>, List<IndexLookupResult>> {
class LazyKeyCheckIterator extends
LazyIterableIterator<Tuple2<String, Tuple2<String, HoodieKey>>, List<IndexLookupResult>> {
private List<String> candidateRecordKeys;
@@ -90,7 +91,8 @@ public class HoodieBloomIndexCheckFunction implements Function2<Integer, Iterato
private String currentParitionPath;
LazyKeyCheckIterator(Iterator<Tuple2<String, Tuple2<String, HoodieKey>>> fileParitionRecordKeyTripletItr) {
LazyKeyCheckIterator(
Iterator<Tuple2<String, Tuple2<String, HoodieKey>>> fileParitionRecordKeyTripletItr) {
super(fileParitionRecordKeyTripletItr);
currentFile = null;
candidateRecordKeys = new ArrayList<>();
@@ -144,11 +146,15 @@ public class HoodieBloomIndexCheckFunction implements Function2<Integer, Iterato
} else {
// do the actual checking of file & break out
Path filePath = new Path(basePath + "/" + currentParitionPath + "/" + currentFile);
logger.info("#1 After bloom filter, the candidate row keys is reduced to " + candidateRecordKeys.size() + " for " + filePath);
logger.info(
"#1 After bloom filter, the candidate row keys is reduced to " + candidateRecordKeys
.size() + " for " + filePath);
if (logger.isDebugEnabled()) {
logger.debug("#The candidate row keys for " + filePath + " => " + candidateRecordKeys);
logger
.debug("#The candidate row keys for " + filePath + " => " + candidateRecordKeys);
}
ret.add(new IndexLookupResult(currentFile, checkCandidatesAgainstFile(candidateRecordKeys, filePath)));
ret.add(new IndexLookupResult(currentFile,
checkCandidatesAgainstFile(candidateRecordKeys, filePath)));
initState(fileName, partitionPath);
if (bloomFilter.mightContain(recordKey)) {
@@ -164,11 +170,14 @@ public class HoodieBloomIndexCheckFunction implements Function2<Integer, Iterato
// handle case, where we ran out of input, finish pending work, update return val
if (!inputItr.hasNext()) {
Path filePath = new Path(basePath + "/" + currentParitionPath + "/" + currentFile);
logger.info("#2 After bloom filter, the candidate row keys is reduced to " + candidateRecordKeys.size() + " for " + filePath);
logger.info(
"#2 After bloom filter, the candidate row keys is reduced to " + candidateRecordKeys
.size() + " for " + filePath);
if (logger.isDebugEnabled()) {
logger.debug("#The candidate row keys for " + filePath + " => " + candidateRecordKeys);
}
ret.add(new IndexLookupResult(currentFile, checkCandidatesAgainstFile(candidateRecordKeys, filePath)));
ret.add(new IndexLookupResult(currentFile,
checkCandidatesAgainstFile(candidateRecordKeys, filePath)));
}
} catch (Throwable e) {
@@ -189,7 +198,8 @@ public class HoodieBloomIndexCheckFunction implements Function2<Integer, Iterato
@Override
public Iterator<List<IndexLookupResult>> call(Integer partition,
Iterator<Tuple2<String, Tuple2<String, HoodieKey>>> fileParitionRecordKeyTripletItr) throws Exception {
Iterator<Tuple2<String, Tuple2<String, HoodieKey>>> fileParitionRecordKeyTripletItr)
throws Exception {
return new LazyKeyCheckIterator(fileParitionRecordKeyTripletItr);
}
}
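
The hunks above amount to a two-phase membership test: a bloom filter first prunes the incoming record keys down to candidates for a given file, and only those candidates are verified against the keys actually stored in that file. A minimal, self-contained sketch of that pattern follows; the predicate and the key set are stand-ins for the Hoodie bloom filter and the keys read out of the data file, not the actual APIs.

import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.function.Predicate;

// Sketch only: cheap, possibly false-positive test first, exact check second.
class TwoPhaseKeyCheck {

  static List<String> check(List<String> incomingKeys,
      Predicate<String> mightContain,   // phase 1: bloom-filter style test (may admit false positives)
      Set<String> keysInFile) {         // phase 2: exact keys read from the file
    List<String> candidates = new ArrayList<>();
    for (String key : incomingKeys) {
      if (mightContain.test(key)) {
        candidates.add(key);
      }
    }
    List<String> found = new ArrayList<>();
    for (String key : candidates) {
      if (keysInFile.contains(key)) {   // removes the false positives
        found.add(key);
      }
    }
    return found;
  }
}
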

View File

@@ -19,7 +19,6 @@
package com.uber.hoodie.index.bucketed;
import com.google.common.base.Optional;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecord;
@@ -29,29 +28,22 @@ import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieIndexException;
import com.uber.hoodie.index.HoodieIndex;
import com.uber.hoodie.table.HoodieTable;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;
/**
* An `stateless` index implementation that will using a deterministic mapping function to
* determine the fileID for a given record.
*
* Pros:
* - Fast
*
* Cons :
* - Need to tune the number of buckets per partition path manually (FIXME: Need to autotune this)
* - Could increase write amplification on copy-on-write storage since inserts always rewrite files
* - Not global.
*
 * A `stateless` index implementation that uses a deterministic mapping function to determine
 * the fileID for a given record.
*
* Pros: - Fast
*
* Cons : - Need to tune the number of buckets per partition path manually (FIXME: Need to autotune
* this) - Could increase write amplification on copy-on-write storage since inserts always rewrite
* files - Not global.
*/
public class BucketedIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
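
The class comment above describes a purely deterministic mapping from record key to bucket. The actual getBucket implementation is not shown in this diff, so the following is only a plausible sketch of such a mapping, with numBuckets standing in for whatever per-partition bucket count the index would be configured with.

// Illustrative only: one plausible deterministic key-to-bucket mapping.
class DeterministicBucketMapper {

  private final int numBuckets;   // hypothetical configured bucket count per partition path

  DeterministicBucketMapper(int numBuckets) {
    this.numBuckets = numBuckets;
  }

  String getBucket(String recordKey) {
    // Math.floorMod keeps the result non-negative even for negative hash codes,
    // so the same key always lands in the same bucket.
    int bucket = Math.floorMod(recordKey.hashCode(), numBuckets);
    return String.valueOf(bucket);
  }
}

Because the mapping depends only on the key and the bucket count, the same record always resolves to the same bucket, which is what lets this index avoid keeping any stored state.
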
@@ -66,12 +58,14 @@ public class BucketedIndex<T extends HoodieRecordPayload> extends HoodieIndex<T>
}
@Override
public JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(JavaRDD<HoodieKey> hoodieKeys, HoodieTable<T> table) {
public JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(JavaRDD<HoodieKey> hoodieKeys,
HoodieTable<T> table) {
return hoodieKeys.mapToPair(hk -> new Tuple2<>(hk, Optional.of(getBucket(hk.getRecordKey()))));
}
@Override
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, HoodieTable<T> hoodieTable) throws HoodieIndexException {
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD,
HoodieTable<T> hoodieTable) throws HoodieIndexException {
return recordRDD.map(record -> {
String bucket = getBucket(record.getRecordKey());
//HACK(vc) a non-existent commit is provided here.
@@ -81,7 +75,8 @@ public class BucketedIndex<T extends HoodieRecordPayload> extends HoodieIndex<T>
}
@Override
public JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD, HoodieTable<T> hoodieTable) throws HoodieIndexException {
public JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD,
HoodieTable<T> hoodieTable) throws HoodieIndexException {
return writeStatusRDD;
}
@@ -93,8 +88,6 @@ public class BucketedIndex<T extends HoodieRecordPayload> extends HoodieIndex<T>
/**
* Bucketing is still done within each partition.
*
* @return
*/
@Override
public boolean isGlobal() {
@@ -102,10 +95,8 @@ public class BucketedIndex<T extends HoodieRecordPayload> extends HoodieIndex<T>
}
/**
* Since indexing is just a deterministic hash, we can identify file group correctly even without an index
* on the actual log file.
*
* @return
* Since indexing is just a deterministic hash, we can identify file group correctly even without
* an index on the actual log file.
*/
@Override
public boolean canIndexLogFiles() {
@@ -114,8 +105,6 @@ public class BucketedIndex<T extends HoodieRecordPayload> extends HoodieIndex<T>
/**
* Indexing is just a hash function.
*
* @return
*/
@Override
public boolean isImplicitWithStorage() {

View File

@@ -19,24 +19,33 @@
package com.uber.hoodie.index.hbase;
import com.google.common.base.Optional;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordLocation;
import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.config.HoodieIndexConfig;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieDependentSystemUnavailableException;
import com.uber.hoodie.exception.HoodieIndexException;
import com.uber.hoodie.index.HoodieIndex;
import com.uber.hoodie.table.HoodieTable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
@@ -45,15 +54,11 @@ import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
* Hoodie Index implementation backed by HBase
*/
public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
private final static byte[] SYSTEM_COLUMN_FAMILY = Bytes.toBytes("_s");
private final static byte[] COMMIT_TS_COLUMN = Bytes.toBytes("commit_ts");
private final static byte[] FILE_NAME_COLUMN = Bytes.toBytes("file_name");
@@ -144,9 +149,7 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
} catch (IOException e) {
throw new HoodieIndexException(
"Failed to Tag indexed locations because of exception with HBase Client", e);
}
finally {
} finally {
if (hTable != null) {
try {
hTable.close();
@@ -161,11 +164,14 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
}
@Override
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, HoodieTable<T> hoodieTable) {
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD,
HoodieTable<T> hoodieTable) {
return recordRDD.mapPartitionsWithIndex(this.new LocationTagFunction(hoodieTable), true);
}
class UpdateLocationTask implements Function2<Integer, Iterator<WriteStatus>, Iterator<WriteStatus>> {
class UpdateLocationTask implements
Function2<Integer, Iterator<WriteStatus>, Iterator<WriteStatus>> {
@Override
public Iterator<WriteStatus> call(Integer partition, Iterator<WriteStatus> statusIterator) {
@@ -187,7 +193,7 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
for (HoodieRecord rec : writeStatus.getWrittenRecords()) {
if (!writeStatus.isErrored(rec.getKey())) {
java.util.Optional<HoodieRecordLocation> loc = rec.getNewLocation();
if(loc.isPresent()) {
if (loc.isPresent()) {
Put put = new Put(Bytes.toBytes(rec.getRecordKey()));
put.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN,
Bytes.toBytes(loc.get().getCommitTime()));
@@ -244,8 +250,6 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
/**
* Only looks up by recordKey
*
* @return
*/
@Override
public boolean isGlobal() {
@@ -254,8 +258,6 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
/**
* Mapping is available in HBase already.
*
* @return
*/
@Override
public boolean canIndexLogFiles() {
@@ -264,8 +266,6 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
/**
* Index needs to be explicitly updated after storage write.
*
* @return
*/
@Override
public boolean isImplicitWithStorage() {

View File

@@ -36,13 +36,6 @@ import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieAppendException;
import com.uber.hoodie.exception.HoodieUpsertException;
import com.uber.hoodie.table.HoodieTable;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.TaskContext;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
@@ -50,13 +43,18 @@ import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.TaskContext;
/**
* IO Operation to append data onto an existing file.
*
* @param <T>
*/
public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOHandle<T> {
private static Logger logger = LogManager.getLogger(HoodieMergeHandle.class);
private static AtomicLong recordIndex = new AtomicLong(1);
@@ -133,7 +131,7 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOH
try {
Optional<IndexedRecord> avroRecord = hoodieRecord.getData().getInsertValue(schema);
if(avroRecord.isPresent()) {
if (avroRecord.isPresent()) {
String seqId = HoodieRecord.generateSequenceId(commitTime, TaskContext.getPartitionId(),
recordIndex.getAndIncrement());
HoodieAvroUtils
@@ -164,18 +162,19 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOH
metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME, commitTime);
records.stream().forEach(record -> {
Optional<IndexedRecord> indexedRecord = getIndexedRecord(record);
if(indexedRecord.isPresent()) {
if (indexedRecord.isPresent()) {
recordList.add(indexedRecord.get());
} else {
keysToDelete.add(record.getRecordKey());
}
});
try {
if(recordList.size() > 0) {
if (recordList.size() > 0) {
writer = writer.appendBlock(new HoodieAvroDataBlock(recordList, schema, metadata));
}
if(keysToDelete.size() > 0) {
writer = writer.appendBlock(new HoodieDeleteBlock(keysToDelete.stream().toArray(String[]::new), metadata));
if (keysToDelete.size() > 0) {
writer = writer.appendBlock(
new HoodieDeleteBlock(keysToDelete.stream().toArray(String[]::new), metadata));
}
} catch (Exception e) {
throw new HoodieAppendException(

View File

@@ -27,27 +27,24 @@ import com.uber.hoodie.common.table.TableFileSystemView;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.table.HoodieTable;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
/**
* Cleaner is responsible for garbage collecting older files in a given partition path, such that
* <p>
* 1) It provides sufficient time for existing queries running on older versions, to finish
* <p>
* 2) It bounds the growth of the files in the file system
* <p>
* TODO: Should all cleaning be done based on {@link com.uber.hoodie.common.model.HoodieCommitMetadata}
* <p> 1) It provides sufficient time for existing queries running on older versions, to finish <p>
* 2) It bounds the growth of the files in the file system <p> TODO: Should all cleaning be done
* based on {@link com.uber.hoodie.common.model.HoodieCommitMetadata}
*/
public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
private static Logger logger = LogManager.getLogger(HoodieCleanHelper.class);
private final TableFileSystemView fileSystemView;
@@ -66,13 +63,9 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
/**
* Selects the older versions of files for cleaning, such that it bounds the number of versions of each file.
* This policy is useful, if you are simply interested in querying the table, and you don't want too many
* versions for a single file (i.e run it with versionsRetained = 1)
*
* @param partitionPath
* @return
* @throws IOException
 * Selects the older versions of files for cleaning, such that it bounds the number of versions of
 * each file. This policy is useful if you are simply interested in querying the table, and you
 * don't want too many versions for a single file (i.e. run it with versionsRetained = 1)
*/
private List<String> getFilesToCleanKeepingLatestVersions(String partitionPath)
throws IOException {
@@ -93,7 +86,7 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
// Skip this most recent version
FileSlice nextSlice = fileSliceIterator.next();
HoodieDataFile dataFile = nextSlice.getDataFile().get();
if(savepointedFiles.contains(dataFile.getFileName())) {
if (savepointedFiles.contains(dataFile.getFileName())) {
// do not clean up a savepoint data file
continue;
}
@@ -118,22 +111,15 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
/**
* Selects the versions for file for cleaning, such that it
* <p>
* - Leaves the latest version of the file untouched
* - For older versions,
* - It leaves all the commits untouched which has occured in last <code>config.getCleanerCommitsRetained()</code> commits
* - It leaves ONE commit before this window. We assume that the max(query execution time) == commit_batch_time * config.getCleanerCommitsRetained(). This is 12 hours by default.
* This is essential to leave the file used by the query thats running for the max time.
* <p>
* This provides the effect of having lookback into all changes that happened in the last X
* commits. (eg: if you retain 24 commits, and commit batch time is 30 mins, then you have 12 hrs of lookback)
* <p>
* This policy is the default.
*
* @param partitionPath
* @return
* @throws IOException
 * Selects the versions of files for cleaning, such that it <p> - Leaves the latest version of the
 * file untouched - For older versions, - It leaves all the commits untouched which have occurred in
* last <code>config.getCleanerCommitsRetained()</code> commits - It leaves ONE commit before this
* window. We assume that the max(query execution time) == commit_batch_time *
* config.getCleanerCommitsRetained(). This is 12 hours by default. This is essential to leave the
 * file used by the query that's running for the max time. <p> This provides the effect of having
* lookback into all changes that happened in the last X commits. (eg: if you retain 24 commits,
* and commit batch time is 30 mins, then you have 12 hrs of lookback) <p> This policy is the
* default.
*/
private List<String> getFilesToCleanKeepingLatestCommits(String partitionPath)
throws IOException {
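
As a quick check of the numbers in the javadoc above, the lookback window is just the retained commit count multiplied by the commit batch interval. Only the retained-commit count maps to a real config value; the 30-minute cadence below is the illustrative figure from the comment, not a Hoodie setting.

// Worked example of the lookback guarantee described above.
class CleanerLookbackExample {
  static int lookbackHours(int cleanerCommitsRetained, int commitBatchMinutes) {
    // e.g. 24 retained commits * 30-minute batches = 720 minutes = 12 hours of lookback
    return (cleanerCommitsRetained * commitBatchMinutes) / 60;
  }
}
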
@@ -164,7 +150,7 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
for (FileSlice aSlice : fileSliceList) {
HoodieDataFile aFile = aSlice.getDataFile().get();
String fileCommitTime = aFile.getCommitTime();
if(savepointedFiles.contains(aFile.getFileName())) {
if (savepointedFiles.contains(aFile.getFileName())) {
// do not clean up a savepoint data file
continue;
}

View File

@@ -39,6 +39,12 @@ import com.uber.hoodie.exception.HoodieCommitException;
import com.uber.hoodie.exception.HoodieException;
import com.uber.hoodie.exception.HoodieIOException;
import com.uber.hoodie.table.HoodieTable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.FileSystem;
@@ -46,17 +52,11 @@ import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
* Archiver to bound the growth of <action>.commit files
*/
public class HoodieCommitArchiveLog {
private static Logger log = LogManager.getLogger(HoodieCommitArchiveLog.class);
private final Path archiveFilePath;
@@ -73,7 +73,7 @@ public class HoodieCommitArchiveLog {
private HoodieLogFormat.Writer openWriter() {
try {
if(this.writer == null) {
if (this.writer == null) {
return HoodieLogFormat.newWriterBuilder()
.onParentPath(archiveFilePath.getParent())
.withFileId(archiveFilePath.getName())
@@ -83,17 +83,17 @@ public class HoodieCommitArchiveLog {
} else {
return this.writer;
}
} catch(InterruptedException | IOException e) {
} catch (InterruptedException | IOException e) {
throw new HoodieException("Unable to initialize HoodieLogFormat writer", e);
}
}
private void close() {
try {
if(this.writer != null) {
if (this.writer != null) {
this.writer.close();
}
} catch(IOException e) {
} catch (IOException e) {
throw new HoodieException("Unable to close HoodieLogFormat writer", e);
}
}
@@ -125,10 +125,12 @@ public class HoodieCommitArchiveLog {
int maxCommitsToKeep = config.getMaxCommitsToKeep();
int minCommitsToKeep = config.getMinCommitsToKeep();
HoodieTable table = HoodieTable.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
HoodieTable table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
// GroupBy each action and limit each action timeline to maxCommitsToKeep
HoodieTimeline cleanAndRollbackTimeline = table.getActiveTimeline().getTimelineOfActions(Sets.newHashSet(HoodieTimeline.CLEAN_ACTION,
HoodieTimeline cleanAndRollbackTimeline = table.getActiveTimeline()
.getTimelineOfActions(Sets.newHashSet(HoodieTimeline.CLEAN_ACTION,
HoodieTimeline.ROLLBACK_ACTION));
Stream<HoodieInstant> instants = cleanAndRollbackTimeline.getInstants()
.collect(Collectors.groupingBy(s -> s.getAction()))
@@ -198,7 +200,7 @@ public class HoodieCommitArchiveLog {
}
HoodieAvroDataBlock block = new HoodieAvroDataBlock(records, wrapperSchema);
this.writer = writer.appendBlock(block);
} catch(Exception e) {
} catch (Exception e) {
throw new HoodieCommitException("Failed to archive commits", e);
}
}
@@ -207,40 +209,48 @@ public class HoodieCommitArchiveLog {
return archiveFilePath;
}
private IndexedRecord convertToAvroRecord(HoodieTimeline commitTimeline, HoodieInstant hoodieInstant) throws IOException {
private IndexedRecord convertToAvroRecord(HoodieTimeline commitTimeline,
HoodieInstant hoodieInstant) throws IOException {
HoodieArchivedMetaEntry archivedMetaWrapper = new HoodieArchivedMetaEntry();
archivedMetaWrapper.setCommitTime(hoodieInstant.getTimestamp());
switch(hoodieInstant.getAction()) {
case HoodieTimeline.CLEAN_ACTION:{
archivedMetaWrapper.setHoodieCleanMetadata(AvroUtils.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(), HoodieCleanMetadata.class));
switch (hoodieInstant.getAction()) {
case HoodieTimeline.CLEAN_ACTION: {
archivedMetaWrapper.setHoodieCleanMetadata(AvroUtils
.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(),
HoodieCleanMetadata.class));
archivedMetaWrapper.setActionType(ActionType.clean.name());
break;
}
case HoodieTimeline.COMMIT_ACTION:{
case HoodieTimeline.COMMIT_ACTION: {
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
.fromBytes(commitTimeline.getInstantDetails(hoodieInstant).get());
archivedMetaWrapper.setHoodieCommitMetadata(commitMetadataConverter(commitMetadata));
archivedMetaWrapper.setActionType(ActionType.commit.name());
break;
}
case HoodieTimeline.COMPACTION_ACTION:{
case HoodieTimeline.COMPACTION_ACTION: {
com.uber.hoodie.common.model.HoodieCompactionMetadata compactionMetadata = com.uber.hoodie.common.model.HoodieCompactionMetadata
.fromBytes(commitTimeline.getInstantDetails(hoodieInstant).get());
archivedMetaWrapper.setHoodieCompactionMetadata(compactionMetadataConverter(compactionMetadata));
archivedMetaWrapper
.setHoodieCompactionMetadata(compactionMetadataConverter(compactionMetadata));
archivedMetaWrapper.setActionType(ActionType.compaction.name());
break;
}
case HoodieTimeline.ROLLBACK_ACTION:{
archivedMetaWrapper.setHoodieRollbackMetadata(AvroUtils.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(), HoodieRollbackMetadata.class));
case HoodieTimeline.ROLLBACK_ACTION: {
archivedMetaWrapper.setHoodieRollbackMetadata(AvroUtils
.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(),
HoodieRollbackMetadata.class));
archivedMetaWrapper.setActionType(ActionType.rollback.name());
break;
}
case HoodieTimeline.SAVEPOINT_ACTION:{
archivedMetaWrapper.setHoodieSavePointMetadata(AvroUtils.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(), HoodieSavepointMetadata.class));
case HoodieTimeline.SAVEPOINT_ACTION: {
archivedMetaWrapper.setHoodieSavePointMetadata(AvroUtils
.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(),
HoodieSavepointMetadata.class));
archivedMetaWrapper.setActionType(ActionType.savepoint.name());
break;
}
case HoodieTimeline.DELTA_COMMIT_ACTION:{
case HoodieTimeline.DELTA_COMMIT_ACTION: {
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
.fromBytes(commitTimeline.getInstantDetails(hoodieInstant).get());
archivedMetaWrapper.setHoodieCommitMetadata(commitMetadataConverter(commitMetadata));
@@ -251,19 +261,23 @@ public class HoodieCommitArchiveLog {
return archivedMetaWrapper;
}
private com.uber.hoodie.avro.model.HoodieCommitMetadata commitMetadataConverter(HoodieCommitMetadata hoodieCommitMetadata) {
private com.uber.hoodie.avro.model.HoodieCommitMetadata commitMetadataConverter(
HoodieCommitMetadata hoodieCommitMetadata) {
ObjectMapper mapper = new ObjectMapper();
//Need this to ignore other public get() methods
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
com.uber.hoodie.avro.model.HoodieCommitMetadata avroMetaData =
mapper.convertValue(hoodieCommitMetadata, com.uber.hoodie.avro.model.HoodieCommitMetadata.class);
mapper.convertValue(hoodieCommitMetadata,
com.uber.hoodie.avro.model.HoodieCommitMetadata.class);
return avroMetaData;
}
private com.uber.hoodie.avro.model.HoodieCompactionMetadata compactionMetadataConverter(HoodieCompactionMetadata hoodieCompactionMetadata) {
private com.uber.hoodie.avro.model.HoodieCompactionMetadata compactionMetadataConverter(
HoodieCompactionMetadata hoodieCompactionMetadata) {
ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
com.uber.hoodie.avro.model.HoodieCompactionMetadata avroMetaData = mapper.convertValue(hoodieCompactionMetadata,
com.uber.hoodie.avro.model.HoodieCompactionMetadata avroMetaData = mapper
.convertValue(hoodieCompactionMetadata,
com.uber.hoodie.avro.model.HoodieCompactionMetadata.class);
return avroMetaData;
}

View File

@@ -29,17 +29,17 @@ import com.uber.hoodie.exception.HoodieInsertException;
import com.uber.hoodie.io.storage.HoodieStorageWriter;
import com.uber.hoodie.io.storage.HoodieStorageWriterFactory;
import com.uber.hoodie.table.HoodieTable;
import java.io.IOException;
import java.util.Optional;
import java.util.UUID;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.TaskContext;
import java.io.IOException;
import java.util.Optional;
import java.util.UUID;
public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOHandle<T> {
private static Logger logger = LogManager.getLogger(HoodieCreateHandle.class);
private final WriteStatus status;
@@ -63,7 +63,8 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOH
new Path(config.getBasePath(), partitionPath));
partitionMetadata.trySave(TaskContext.getPartitionId());
this.storageWriter =
HoodieStorageWriterFactory.getStorageWriter(commitTime, path, hoodieTable, config, schema);
HoodieStorageWriterFactory
.getStorageWriter(commitTime, path, hoodieTable, config, schema);
} catch (IOException e) {
throw new HoodieInsertException(
"Failed to initialize HoodieStorageWriter for path " + path, e);
@@ -74,10 +75,8 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOH
/**
* Determines whether we can accept the incoming records, into the current file, depending on
*
* - Whether it belongs to the same partitionPath as existing records
* - Whether the current file written bytes lt max file size
*
* @return
 * - Whether it belongs to the same partitionPath as existing records - Whether the current file's
 * written bytes are less than the max file size
*/
public boolean canWrite(HoodieRecord record) {
return storageWriter.canWrite() && record.getPartitionPath()
@@ -86,15 +85,13 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOH
/**
* Perform the actual writing of the given record into the backing file.
*
* @param record
*/
public void write(HoodieRecord record) {
Optional recordMetadata = record.getData().getMetadata();
try {
Optional<IndexedRecord> avroRecord = record.getData().getInsertValue(schema);
if(avroRecord.isPresent()) {
if (avroRecord.isPresent()) {
storageWriter.writeAvroWithMetadata(avroRecord.get(), record);
// update the new location of record, so we know where to find it next
record.setNewLocation(new HoodieRecordLocation(commitTime, status.getFileId()));
@@ -114,8 +111,6 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOH
/**
 * Performs actions to durably persist the current changes and returns a WriteStatus object
*
* @return
*/
public WriteStatus close() {
logger.info(

View File

@@ -24,6 +24,7 @@ import com.uber.hoodie.common.util.HoodieAvroUtils;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieIOException;
import com.uber.hoodie.table.HoodieTable;
import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
@@ -31,9 +32,8 @@ import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import java.io.IOException;
public abstract class HoodieIOHandle<T extends HoodieRecordPayload> {
private static Logger logger = LogManager.getLogger(HoodieIOHandle.class);
protected final String commitTime;
protected final HoodieWriteConfig config;

View File

@@ -16,19 +16,23 @@
package com.uber.hoodie.io;
import com.uber.hoodie.common.model.HoodiePartitionMetadata;
import com.uber.hoodie.common.util.ReflectionUtils;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.model.HoodiePartitionMetadata;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordLocation;
import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.common.model.HoodieWriteStat;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.ReflectionUtils;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieUpsertException;
import com.uber.hoodie.io.storage.HoodieStorageWriter;
import com.uber.hoodie.io.storage.HoodieStorageWriterFactory;
import com.uber.hoodie.table.HoodieTable;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Optional;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.Path;
@@ -36,13 +40,9 @@ import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.TaskContext;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Optional;
@SuppressWarnings("Duplicates")
public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHandle<T> {
private static Logger logger = LogManager.getLogger(HoodieMergeHandle.class);
private WriteStatus writeStatus;
@@ -94,7 +94,7 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
oldFilePath = new Path(
config.getBasePath() + "/" + record.getPartitionPath() + "/"
+ latestValidFilePath);
String relativePath = new Path( record.getPartitionPath() + "/" + FSUtils
String relativePath = new Path(record.getPartitionPath() + "/" + FSUtils
.makeDataFileName(commitTime, TaskContext.getPartitionId(), fileId)).toString();
newFilePath = new Path(config.getBasePath(), relativePath);
@@ -129,10 +129,11 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
}
private boolean writeUpdateRecord(HoodieRecord<T> hoodieRecord, Optional<IndexedRecord> indexedRecord) {
private boolean writeUpdateRecord(HoodieRecord<T> hoodieRecord,
Optional<IndexedRecord> indexedRecord) {
Optional recordMetadata = hoodieRecord.getData().getMetadata();
try {
if(indexedRecord.isPresent()) {
if (indexedRecord.isPresent()) {
storageWriter.writeAvroWithMetadata(indexedRecord.get(), hoodieRecord);
recordsWritten++;
updatedRecordsWritten++;
@@ -144,14 +145,15 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
writeStatus.markSuccess(hoodieRecord, recordMetadata);
return true;
} catch (Exception e) {
logger.error("Error writing record "+ hoodieRecord, e);
logger.error("Error writing record " + hoodieRecord, e);
writeStatus.markFailure(hoodieRecord, e, recordMetadata);
}
return false;
}
/**
* Go through an old record. Here if we detect a newer version shows up, we write the new one to the file.
* Go through an old record. Here if we detect a newer version shows up, we write the new one to
* the file.
*/
public void write(GenericRecord oldRecord) {
String key = oldRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
@@ -159,7 +161,8 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
boolean copyOldRecord = true;
if (keyToNewRecords.containsKey(key)) {
try {
Optional<IndexedRecord> combinedAvroRecord = hoodieRecord.getData().combineAndGetUpdateValue(oldRecord, schema);
Optional<IndexedRecord> combinedAvroRecord = hoodieRecord.getData()
.combineAndGetUpdateValue(oldRecord, schema);
if (writeUpdateRecord(hoodieRecord, combinedAvroRecord)) {
/* ONLY WHEN
* 1) we have an update for this key AND
@@ -171,7 +174,8 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
}
keyToNewRecords.remove(key);
} catch (Exception e) {
throw new HoodieUpsertException("Failed to combine/merge new record with old value in storage, for new record {"
throw new HoodieUpsertException(
"Failed to combine/merge new record with old value in storage, for new record {"
+ keyToNewRecords.get(key) + "}, old value {" + oldRecord + "}", e);
}
}
@@ -193,7 +197,7 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
+ getOldFilePath() + " to new file " + newFilePath, e);
throw new HoodieUpsertException(errMsg, e);
}
recordsWritten ++;
recordsWritten++;
}
}

View File

@@ -18,7 +18,6 @@ package com.uber.hoodie.io.compact;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodieLogFile;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.io.compact.strategy.CompactionStrategy;
import java.io.Serializable;
@@ -27,8 +26,8 @@ import java.util.Map;
import java.util.stream.Collectors;
/**
* Encapsulates all the needed information about a compaction
* and make a decision whether this compaction is effective or not
 * Encapsulates all the needed information about a compaction and makes a decision on whether this
 * compaction is effective or not
*
* @see CompactionStrategy
*/

View File

@@ -22,18 +22,17 @@ import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.table.HoodieTable;
import org.apache.spark.api.java.JavaSparkContext;
import java.io.Serializable;
import java.util.Date;
import org.apache.spark.api.java.JavaSparkContext;
/**
* A HoodieCompactor runs compaction on a hoodie table
*/
public interface HoodieCompactor extends Serializable {
/**
* Compact the delta files with the data files
* @throws Exception
*/
HoodieCompactionMetadata compact(JavaSparkContext jsc, final HoodieWriteConfig config,
HoodieTable hoodieTable) throws Exception;

View File

@@ -16,14 +16,14 @@
package com.uber.hoodie.io.compact;
import static java.util.stream.Collectors.toList;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.model.CompactionWriteStat;
import com.uber.hoodie.common.model.HoodieAvroPayload;
import com.uber.hoodie.common.model.HoodieCompactionMetadata;
import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.common.model.HoodieTableType;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
@@ -36,7 +36,12 @@ import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieCompactionException;
import com.uber.hoodie.table.HoodieCopyOnWriteTable;
import com.uber.hoodie.table.HoodieTable;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import org.apache.avro.Schema;
@@ -46,18 +51,10 @@ import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import static java.util.stream.Collectors.*;
/**
* HoodieRealtimeTableCompactor compacts a hoodie table with merge on read storage.
* Computes all possible compactions, passes it through a CompactionFilter and executes
* all the compactions and writes a new version of base files and make a normal commit
* HoodieRealtimeTableCompactor compacts a hoodie table with merge on read storage. Computes all
 * possible compactions, passes them through a CompactionFilter, executes all the compactions and
 * writes a new version of the base files and makes a normal commit
*
* @see HoodieCompactor
*/
@@ -80,7 +77,8 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
String compactionCommit = startCompactionCommit(hoodieTable);
log.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommit);
List<String> partitionPaths =
FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(), config.shouldAssumeDatePartitioning());
FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(),
config.shouldAssumeDatePartitioning());
log.info("Compaction looking for files to compact in " + partitionPaths + " partitions");
List<CompactionOperation> operations =
@@ -156,7 +154,8 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
HoodieTimeline.DELTA_COMMIT_ACTION))
.filterCompletedInstants().lastInstant().get().getTimestamp();
HoodieCompactedLogRecordScanner scanner = new HoodieCompactedLogRecordScanner(fs, metaClient.getBasePath(),
HoodieCompactedLogRecordScanner scanner = new HoodieCompactedLogRecordScanner(fs,
metaClient.getBasePath(),
operation.getDeltaFilePaths(), readerSchema, maxInstantTime);
if (!scanner.iterator().hasNext()) {
return Lists.newArrayList();

View File

@@ -28,8 +28,8 @@ import java.util.Map;
import java.util.Optional;
/**
* CompactionStrategy which looks at total IO to be done for the compaction (read + write)
* and limits the list of compactions to be under a configured limit on the IO
* CompactionStrategy which looks at total IO to be done for the compaction (read + write) and
* limits the list of compactions to be under a configured limit on the IO
*
* @see CompactionStrategy
*/
@@ -62,7 +62,8 @@ public class BoundedIOCompactionStrategy implements CompactionStrategy {
}
@Override
public List<CompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig, List<CompactionOperation> operations) {
public List<CompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig,
List<CompactionOperation> operations) {
// Iterate through the operations in order and accept operations as long as we are within the IO limit
// Preserves the original ordering of compactions
List<CompactionOperation> finalOperations = Lists.newArrayList();
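
The comment and list above describe accepting compactions in their original order for as long as a running IO total stays under a configured limit. A standalone sketch of that bounded-accumulation pattern follows; the cost function is a stand-in for whatever read-plus-write estimate the real strategy derives from its captured metrics.

import java.util.ArrayList;
import java.util.List;
import java.util.function.ToLongFunction;

// Sketch of "accept in order until an IO budget is exhausted".
class BoundedBudgetFilter {

  static <T> List<T> acceptWithinBudget(List<T> ordered, ToLongFunction<T> cost, long budget) {
    List<T> accepted = new ArrayList<>();
    long spent = 0;
    for (T op : ordered) {            // walk the operations in their incoming order
      long c = cost.applyAsLong(op);
      if (spent + c > budget) {
        break;                        // the rest stays for a later compaction run
      }
      spent += c;
      accepted.add(op);
    }
    return accepted;
  }
}

Stopping at the first operation that would exceed the budget, rather than reordering or cherry-picking later ones, is what keeps the incoming ordering intact in this sketch.
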

View File

@@ -25,12 +25,12 @@ import java.util.List;
import java.util.Map;
/**
* Strategy for compaction. Pluggable implementation of define how compaction should be done.
* The implementations of this interface can capture the relevant metrics to order and filter
* the final list of compaction operation to run in a single compaction.
 * Strategy for compaction. Pluggable implementation defining how compaction should be done. The
 * implementations of this interface can capture the relevant metrics to order and filter the final
 * list of compaction operations to run in a single compaction.
*
* Implementation of CompactionStrategy cannot hold any state.
* Difference instantiations can be passed in every time
 * Implementations of CompactionStrategy cannot hold any state. Different instantiations can be
 * passed in every time
*
* @see com.uber.hoodie.io.compact.HoodieRealtimeTableCompactor
* @see CompactionOperation
@@ -38,8 +38,8 @@ import java.util.Map;
public interface CompactionStrategy extends Serializable {
/**
* Callback hook when a CompactionOperation is created. Individual strategies can
* capture the metrics they need to decide on the priority.
* Callback hook when a CompactionOperation is created. Individual strategies can capture the
* metrics they need to decide on the priority.
*
* @param dataFile - Base file to compact
* @param partitionPath - Partition path
@@ -50,8 +50,8 @@ public interface CompactionStrategy extends Serializable {
List<HoodieLogFile> logFiles);
/**
* Order and Filter the list of compactions. Use the metrics captured with the
* captureMetrics to order and filter out compactions
* Order and Filter the list of compactions. Use the metrics captured with the captureMetrics to
* order and filter out compactions
*
* @param writeConfig - HoodieWriteConfig - config for this compaction is passed in
* @param operations - list of compactions collected

View File

@@ -27,8 +27,8 @@ import java.util.Optional;
import java.util.stream.Collectors;
/**
* LogFileSizeBasedCompactionStrategy orders the compactions based on the total log files size
* and limits the compactions within a configured IO bound
* LogFileSizeBasedCompactionStrategy orders the compactions based on the total log files size and
* limits the compactions within a configured IO bound
*
* @see BoundedIOCompactionStrategy
* @see CompactionStrategy

View File

@@ -25,9 +25,9 @@ import java.util.List;
import java.util.Map;
/**
* UnBoundedCompactionStrategy will not change ordering or filter any compaction.
* It is a pass-through and will compact all the base files which has a log file.
* This usually means no-intelligence on compaction.
* UnBoundedCompactionStrategy will not change ordering or filter any compaction. It is a
 * pass-through and will compact all the base files which have a log file. This usually means
 * no intelligence on compaction.
*
* @see CompactionStrategy
*/

View File

@@ -17,11 +17,11 @@
package com.uber.hoodie.io.storage;
import com.uber.hoodie.avro.HoodieAvroWriteSupport;
import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
public class HoodieParquetConfig {
private HoodieAvroWriteSupport writeSupport;
private CompressionCodecName compressionCodecName;
private int blockSize;

View File

@@ -20,6 +20,8 @@ import com.uber.hoodie.avro.HoodieAvroWriteSupport;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.common.util.HoodieAvroUtils;
import java.io.IOException;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
@@ -30,17 +32,13 @@ import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.spark.TaskContext;
import java.io.IOException;
import java.util.concurrent.atomic.AtomicLong;
/**
* HoodieParquetWriter extends the ParquetWriter to help limit the size of underlying file.
* Provides a way to check if the current file can take more records with the <code>canWrite()</code>
*
* @param <T>
* HoodieParquetWriter extends the ParquetWriter to help limit the size of underlying file. Provides
* a way to check if the current file can take more records with the <code>canWrite()</code>
*/
public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends IndexedRecord>
extends ParquetWriter<IndexedRecord> implements HoodieStorageWriter<R> {
private static double STREAM_COMPRESSION_RATIO = 0.1;
private static AtomicLong recordIndex = new AtomicLong(1);
@@ -101,7 +99,8 @@ public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends Indexe
return fs.getBytesWritten(file) < maxFileSize;
}
@Override public void writeAvro(String key, IndexedRecord object) throws IOException {
@Override
public void writeAvro(String key, IndexedRecord object) throws IOException {
super.write(object);
writeSupport.add(key);
}

View File

@@ -17,13 +17,16 @@
package com.uber.hoodie.io.storage;
import com.uber.hoodie.common.model.HoodieRecord;
import java.io.IOException;
import org.apache.avro.generic.IndexedRecord;
import java.io.IOException;
public interface HoodieStorageWriter<R extends IndexedRecord> {
void writeAvroWithMetadata(R newRecord, HoodieRecord record) throws IOException;
boolean canWrite();
void close() throws IOException;
void writeAvro(String key, R oldRecord) throws IOException;
}

View File

@@ -16,24 +16,24 @@
package com.uber.hoodie.io.storage;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.avro.HoodieAvroWriteSupport;
import com.uber.hoodie.common.BloomFilter;
import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.table.HoodieTable;
import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroSchemaConverter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import java.io.IOException;
public class HoodieStorageWriterFactory {
public static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieStorageWriter<R> getStorageWriter(
String commitTime, Path path, HoodieTable<T> hoodieTable, HoodieWriteConfig config, Schema schema)
String commitTime, Path path, HoodieTable<T> hoodieTable, HoodieWriteConfig config,
Schema schema)
throws IOException {
//TODO - based on the metadata choose the implementation of HoodieStorageWriter
// Currently only parquet is supported

View File

@@ -16,17 +16,6 @@
package com.uber.hoodie.io.storage;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.fs.permission.AclEntry;
import org.apache.hadoop.fs.permission.AclStatus;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.Progressable;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
@@ -38,13 +27,41 @@ import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.CreateFlag;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.fs.FileChecksum;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FsServerDefaults;
import org.apache.hadoop.fs.FsStatus;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Options;
import org.apache.hadoop.fs.ParentNotDirectoryException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.fs.UnsupportedFileSystemException;
import org.apache.hadoop.fs.XAttrSetFlag;
import org.apache.hadoop.fs.permission.AclEntry;
import org.apache.hadoop.fs.permission.AclStatus;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.Progressable;
/**
* HoodieWrapperFileSystem wraps the default file system.
* It holds state about the open streams in the file system to support getting the
* written size to each of the open streams.
* HoodieWrapperFileSystem wraps the default file system. It holds state about the open streams in
* the file system to support getting the written size to each of the open streams.
*/
public class HoodieWrapperFileSystem extends FileSystem {
private static final Set<String> SUPPORT_SCHEMES;
public static final String HOODIE_SCHEME_PREFIX = "hoodie-";
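
The class comment above says the wrapper's main job is remembering how many bytes have been written to each open stream, so that sizing decisions such as the Parquet writer's canWrite check can be made while a file is still being written. A minimal sketch of that idea with plain java.io types; the real code wraps Hadoop's FSDataOutputStream in a SizeAwareFSDataOutputStream rather than this hypothetical helper.

import java.io.FilterOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicLong;

// Sketch only: every stream handed out is wrapped so that writes bump a
// per-file counter, and the map lets callers ask how much has been written.
class SizeTrackingStreams {

  private final ConcurrentMap<String, AtomicLong> bytesWritten = new ConcurrentHashMap<>();

  OutputStream wrap(String fileName, OutputStream delegate) {
    AtomicLong counter = bytesWritten.computeIfAbsent(fileName, k -> new AtomicLong());
    return new FilterOutputStream(delegate) {
      @Override
      public void write(int b) throws IOException {
        out.write(b);
        counter.incrementAndGet();
      }

      @Override
      public void write(byte[] b, int off, int len) throws IOException {
        out.write(b, off, len);
        counter.addAndGet(len);
      }
    };
  }

  long getBytesWritten(String fileName) {
    AtomicLong counter = bytesWritten.get(fileName);
    return counter == null ? 0L : counter.get();
  }
}
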
@@ -65,7 +82,8 @@ public class HoodieWrapperFileSystem extends FileSystem {
private FileSystem fileSystem;
private URI uri;
@Override public void initialize(URI uri, Configuration conf) throws IOException {
@Override
public void initialize(URI uri, Configuration conf) throws IOException {
// Get the default filesystem to decorate
fileSystem = FileSystem.get(conf);
// Do not need to explicitly initialize the default filesystem, its done already in the above FileSystem.get
@@ -74,15 +92,18 @@ public class HoodieWrapperFileSystem extends FileSystem {
this.uri = uri;
}
@Override public URI getUri() {
@Override
public URI getUri() {
return uri;
}
@Override public FSDataInputStream open(Path f, int bufferSize) throws IOException {
@Override
public FSDataInputStream open(Path f, int bufferSize) throws IOException {
return fileSystem.open(convertToDefaultPath(f), bufferSize);
}
@Override public FSDataOutputStream create(Path f, FsPermission permission, boolean overwrite,
@Override
public FSDataOutputStream create(Path f, FsPermission permission, boolean overwrite,
int bufferSize, short replication, long blockSize, Progressable progress)
throws IOException {
final Path translatedPath = convertToDefaultPath(f);
@@ -99,7 +120,8 @@ public class HoodieWrapperFileSystem extends FileSystem {
SizeAwareFSDataOutputStream os =
new SizeAwareFSDataOutputStream(fsDataOutputStream, new Runnable() {
@Override public void run() {
@Override
public void run() {
openStreams.remove(path.getName());
}
});
@@ -107,33 +129,40 @@ public class HoodieWrapperFileSystem extends FileSystem {
return os;
}
@Override public FSDataOutputStream create(Path f, boolean overwrite) throws IOException {
@Override
public FSDataOutputStream create(Path f, boolean overwrite) throws IOException {
return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f), overwrite));
}
@Override public FSDataOutputStream create(Path f) throws IOException {
@Override
public FSDataOutputStream create(Path f) throws IOException {
return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f)));
}
@Override public FSDataOutputStream create(Path f, Progressable progress) throws IOException {
@Override
public FSDataOutputStream create(Path f, Progressable progress) throws IOException {
return fileSystem.create(convertToDefaultPath(f), progress);
}
@Override public FSDataOutputStream create(Path f, short replication) throws IOException {
@Override
public FSDataOutputStream create(Path f, short replication) throws IOException {
return fileSystem.create(convertToDefaultPath(f), replication);
}
@Override public FSDataOutputStream create(Path f, short replication, Progressable progress)
@Override
public FSDataOutputStream create(Path f, short replication, Progressable progress)
throws IOException {
return fileSystem.create(convertToDefaultPath(f), replication, progress);
}
@Override public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize)
@Override
public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize)
throws IOException {
return fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize);
}
@Override public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize,
@Override
public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize,
Progressable progress) throws IOException {
return fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize, progress);
}
@@ -173,91 +202,112 @@ public class HoodieWrapperFileSystem extends FileSystem {
}
@Override public FSDataOutputStream append(Path f, int bufferSize, Progressable progress)
@Override
public FSDataOutputStream append(Path f, int bufferSize, Progressable progress)
throws IOException {
return fileSystem.append(convertToDefaultPath(f), bufferSize, progress);
}
@Override public boolean rename(Path src, Path dst) throws IOException {
@Override
public boolean rename(Path src, Path dst) throws IOException {
return fileSystem.rename(convertToDefaultPath(src), convertToDefaultPath(dst));
}
@Override public boolean delete(Path f, boolean recursive) throws IOException {
@Override
public boolean delete(Path f, boolean recursive) throws IOException {
return fileSystem.delete(convertToDefaultPath(f), recursive);
}
@Override public FileStatus[] listStatus(Path f) throws FileNotFoundException, IOException {
@Override
public FileStatus[] listStatus(Path f) throws FileNotFoundException, IOException {
return fileSystem.listStatus(convertToDefaultPath(f));
}
@Override public void setWorkingDirectory(Path new_dir) {
@Override
public void setWorkingDirectory(Path new_dir) {
fileSystem.setWorkingDirectory(convertToDefaultPath(new_dir));
}
@Override public Path getWorkingDirectory() {
@Override
public Path getWorkingDirectory() {
return convertToHoodiePath(fileSystem.getWorkingDirectory());
}
@Override public boolean mkdirs(Path f, FsPermission permission) throws IOException {
@Override
public boolean mkdirs(Path f, FsPermission permission) throws IOException {
return fileSystem.mkdirs(convertToDefaultPath(f), permission);
}
@Override public FileStatus getFileStatus(Path f) throws IOException {
@Override
public FileStatus getFileStatus(Path f) throws IOException {
return fileSystem.getFileStatus(convertToDefaultPath(f));
}
@Override public String getScheme() {
@Override
public String getScheme() {
return uri.getScheme();
}
@Override public String getCanonicalServiceName() {
@Override
public String getCanonicalServiceName() {
return fileSystem.getCanonicalServiceName();
}
@Override public String getName() {
@Override
public String getName() {
return fileSystem.getName();
}
@Override public Path makeQualified(Path path) {
@Override
public Path makeQualified(Path path) {
return convertToHoodiePath(fileSystem.makeQualified(convertToDefaultPath(path)));
}
@Override public Token<?> getDelegationToken(String renewer) throws IOException {
@Override
public Token<?> getDelegationToken(String renewer) throws IOException {
return fileSystem.getDelegationToken(renewer);
}
@Override public Token<?>[] addDelegationTokens(String renewer, Credentials credentials)
@Override
public Token<?>[] addDelegationTokens(String renewer, Credentials credentials)
throws IOException {
return fileSystem.addDelegationTokens(renewer, credentials);
}
@Override public FileSystem[] getChildFileSystems() {
@Override
public FileSystem[] getChildFileSystems() {
return fileSystem.getChildFileSystems();
}
@Override public BlockLocation[] getFileBlockLocations(FileStatus file, long start, long len)
@Override
public BlockLocation[] getFileBlockLocations(FileStatus file, long start, long len)
throws IOException {
return fileSystem.getFileBlockLocations(file, start, len);
}
@Override public BlockLocation[] getFileBlockLocations(Path p, long start, long len)
@Override
public BlockLocation[] getFileBlockLocations(Path p, long start, long len)
throws IOException {
return fileSystem.getFileBlockLocations(convertToDefaultPath(p), start, len);
}
@Override public FsServerDefaults getServerDefaults() throws IOException {
@Override
public FsServerDefaults getServerDefaults() throws IOException {
return fileSystem.getServerDefaults();
}
@Override public FsServerDefaults getServerDefaults(Path p) throws IOException {
@Override
public FsServerDefaults getServerDefaults(Path p) throws IOException {
return fileSystem.getServerDefaults(convertToDefaultPath(p));
}
@Override public Path resolvePath(Path p) throws IOException {
@Override
public Path resolvePath(Path p) throws IOException {
return convertToHoodiePath(fileSystem.resolvePath(convertToDefaultPath(p)));
}
@Override public FSDataInputStream open(Path f) throws IOException {
@Override
public FSDataInputStream open(Path f) throws IOException {
return fileSystem.open(convertToDefaultPath(f));
}
@@ -278,7 +328,8 @@ public class HoodieWrapperFileSystem extends FileSystem {
replication, blockSize, progress);
}
@Override public FSDataOutputStream createNonRecursive(Path f, FsPermission permission,
@Override
public FSDataOutputStream createNonRecursive(Path f, FsPermission permission,
EnumSet<CreateFlag> flags, int bufferSize, short replication, long blockSize,
Progressable progress) throws IOException {
return fileSystem
@@ -286,122 +337,150 @@ public class HoodieWrapperFileSystem extends FileSystem {
blockSize, progress);
}
@Override public boolean createNewFile(Path f) throws IOException {
@Override
public boolean createNewFile(Path f) throws IOException {
return fileSystem.createNewFile(convertToDefaultPath(f));
}
@Override public FSDataOutputStream append(Path f) throws IOException {
@Override
public FSDataOutputStream append(Path f) throws IOException {
return fileSystem.append(convertToDefaultPath(f));
}
@Override public FSDataOutputStream append(Path f, int bufferSize) throws IOException {
@Override
public FSDataOutputStream append(Path f, int bufferSize) throws IOException {
return fileSystem.append(convertToDefaultPath(f), bufferSize);
}
@Override public void concat(Path trg, Path[] psrcs) throws IOException {
@Override
public void concat(Path trg, Path[] psrcs) throws IOException {
Path[] psrcsNew = convertDefaults(psrcs);
fileSystem.concat(convertToDefaultPath(trg), psrcsNew);
}
@Override public short getReplication(Path src) throws IOException {
@Override
public short getReplication(Path src) throws IOException {
return fileSystem.getReplication(convertToDefaultPath(src));
}
@Override public boolean setReplication(Path src, short replication) throws IOException {
@Override
public boolean setReplication(Path src, short replication) throws IOException {
return fileSystem.setReplication(convertToDefaultPath(src), replication);
}
@Override public boolean delete(Path f) throws IOException {
@Override
public boolean delete(Path f) throws IOException {
return fileSystem.delete(convertToDefaultPath(f));
}
@Override public boolean deleteOnExit(Path f) throws IOException {
@Override
public boolean deleteOnExit(Path f) throws IOException {
return fileSystem.deleteOnExit(convertToDefaultPath(f));
}
@Override public boolean cancelDeleteOnExit(Path f) {
@Override
public boolean cancelDeleteOnExit(Path f) {
return fileSystem.cancelDeleteOnExit(convertToDefaultPath(f));
}
@Override public boolean exists(Path f) throws IOException {
@Override
public boolean exists(Path f) throws IOException {
return fileSystem.exists(convertToDefaultPath(f));
}
@Override public boolean isDirectory(Path f) throws IOException {
@Override
public boolean isDirectory(Path f) throws IOException {
return fileSystem.isDirectory(convertToDefaultPath(f));
}
@Override public boolean isFile(Path f) throws IOException {
@Override
public boolean isFile(Path f) throws IOException {
return fileSystem.isFile(convertToDefaultPath(f));
}
@Override public long getLength(Path f) throws IOException {
@Override
public long getLength(Path f) throws IOException {
return fileSystem.getLength(convertToDefaultPath(f));
}
@Override public ContentSummary getContentSummary(Path f) throws IOException {
@Override
public ContentSummary getContentSummary(Path f) throws IOException {
return fileSystem.getContentSummary(convertToDefaultPath(f));
}
@Override public RemoteIterator<Path> listCorruptFileBlocks(Path path) throws IOException {
@Override
public RemoteIterator<Path> listCorruptFileBlocks(Path path) throws IOException {
return fileSystem.listCorruptFileBlocks(convertToDefaultPath(path));
}
@Override public FileStatus[] listStatus(Path f, PathFilter filter)
@Override
public FileStatus[] listStatus(Path f, PathFilter filter)
throws FileNotFoundException, IOException {
return fileSystem.listStatus(convertToDefaultPath(f), filter);
}
@Override public FileStatus[] listStatus(Path[] files)
@Override
public FileStatus[] listStatus(Path[] files)
throws FileNotFoundException, IOException {
return fileSystem.listStatus(convertDefaults(files));
}
@Override public FileStatus[] listStatus(Path[] files, PathFilter filter)
@Override
public FileStatus[] listStatus(Path[] files, PathFilter filter)
throws FileNotFoundException, IOException {
return fileSystem.listStatus(convertDefaults(files), filter);
}
@Override public FileStatus[] globStatus(Path pathPattern) throws IOException {
@Override
public FileStatus[] globStatus(Path pathPattern) throws IOException {
return fileSystem.globStatus(convertToDefaultPath(pathPattern));
}
@Override public FileStatus[] globStatus(Path pathPattern, PathFilter filter)
@Override
public FileStatus[] globStatus(Path pathPattern, PathFilter filter)
throws IOException {
return fileSystem.globStatus(convertToDefaultPath(pathPattern), filter);
}
@Override public RemoteIterator<LocatedFileStatus> listLocatedStatus(Path f)
@Override
public RemoteIterator<LocatedFileStatus> listLocatedStatus(Path f)
throws FileNotFoundException, IOException {
return fileSystem.listLocatedStatus(convertToDefaultPath(f));
}
@Override public RemoteIterator<LocatedFileStatus> listFiles(Path f, boolean recursive)
@Override
public RemoteIterator<LocatedFileStatus> listFiles(Path f, boolean recursive)
throws FileNotFoundException, IOException {
return fileSystem.listFiles(convertToDefaultPath(f), recursive);
}
@Override public Path getHomeDirectory() {
@Override
public Path getHomeDirectory() {
return convertToHoodiePath(fileSystem.getHomeDirectory());
}
@Override public boolean mkdirs(Path f) throws IOException {
@Override
public boolean mkdirs(Path f) throws IOException {
return fileSystem.mkdirs(convertToDefaultPath(f));
}
@Override public void copyFromLocalFile(Path src, Path dst) throws IOException {
@Override
public void copyFromLocalFile(Path src, Path dst) throws IOException {
fileSystem.copyFromLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst));
}
@Override public void moveFromLocalFile(Path[] srcs, Path dst) throws IOException {
@Override
public void moveFromLocalFile(Path[] srcs, Path dst) throws IOException {
fileSystem.moveFromLocalFile(convertDefaults(srcs), convertToDefaultPath(dst));
}
@Override public void moveFromLocalFile(Path src, Path dst) throws IOException {
@Override
public void moveFromLocalFile(Path src, Path dst) throws IOException {
fileSystem.moveFromLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst));
}
@Override public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws IOException {
@Override
public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws IOException {
fileSystem.copyFromLocalFile(delSrc, convertToDefaultPath(src), convertToDefaultPath(dst));
}
@@ -412,21 +491,25 @@ public class HoodieWrapperFileSystem extends FileSystem {
.copyFromLocalFile(delSrc, overwrite, convertDefaults(srcs), convertToDefaultPath(dst));
}
@Override public void copyFromLocalFile(boolean delSrc, boolean overwrite, Path src, Path dst)
@Override
public void copyFromLocalFile(boolean delSrc, boolean overwrite, Path src, Path dst)
throws IOException {
fileSystem.copyFromLocalFile(delSrc, overwrite, convertToDefaultPath(src),
convertToDefaultPath(dst));
}
@Override public void copyToLocalFile(Path src, Path dst) throws IOException {
@Override
public void copyToLocalFile(Path src, Path dst) throws IOException {
fileSystem.copyToLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst));
}
@Override public void moveToLocalFile(Path src, Path dst) throws IOException {
@Override
public void moveToLocalFile(Path src, Path dst) throws IOException {
fileSystem.moveToLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst));
}
@Override public void copyToLocalFile(boolean delSrc, Path src, Path dst) throws IOException {
@Override
public void copyToLocalFile(boolean delSrc, Path src, Path dst) throws IOException {
fileSystem.copyToLocalFile(delSrc, convertToDefaultPath(src), convertToDefaultPath(dst));
}
@@ -437,193 +520,237 @@ public class HoodieWrapperFileSystem extends FileSystem {
useRawLocalFileSystem);
}
@Override public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile)
@Override
public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile)
throws IOException {
return convertToHoodiePath(fileSystem.startLocalOutput(convertToDefaultPath(fsOutputFile),
convertToDefaultPath(tmpLocalFile)));
}
@Override public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile)
@Override
public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile)
throws IOException {
fileSystem.completeLocalOutput(convertToDefaultPath(fsOutputFile),
convertToDefaultPath(tmpLocalFile));
}
@Override public void close() throws IOException {
@Override
public void close() throws IOException {
fileSystem.close();
}
@Override public long getUsed() throws IOException {
@Override
public long getUsed() throws IOException {
return fileSystem.getUsed();
}
@Override public long getBlockSize(Path f) throws IOException {
@Override
public long getBlockSize(Path f) throws IOException {
return fileSystem.getBlockSize(convertToDefaultPath(f));
}
@Override public long getDefaultBlockSize() {
@Override
public long getDefaultBlockSize() {
return fileSystem.getDefaultBlockSize();
}
@Override public long getDefaultBlockSize(Path f) {
@Override
public long getDefaultBlockSize(Path f) {
return fileSystem.getDefaultBlockSize(convertToDefaultPath(f));
}
@Override public short getDefaultReplication() {
@Override
public short getDefaultReplication() {
return fileSystem.getDefaultReplication();
}
@Override public short getDefaultReplication(Path path) {
@Override
public short getDefaultReplication(Path path) {
return fileSystem.getDefaultReplication(convertToDefaultPath(path));
}
@Override public void access(Path path, FsAction mode)
@Override
public void access(Path path, FsAction mode)
throws AccessControlException, FileNotFoundException, IOException {
fileSystem.access(convertToDefaultPath(path), mode);
}
@Override public void createSymlink(Path target, Path link, boolean createParent)
@Override
public void createSymlink(Path target, Path link, boolean createParent)
throws AccessControlException, FileAlreadyExistsException, FileNotFoundException,
ParentNotDirectoryException, UnsupportedFileSystemException, IOException {
fileSystem
.createSymlink(convertToDefaultPath(target), convertToDefaultPath(link), createParent);
}
@Override public FileStatus getFileLinkStatus(Path f)
@Override
public FileStatus getFileLinkStatus(Path f)
throws AccessControlException, FileNotFoundException, UnsupportedFileSystemException,
IOException {
return fileSystem.getFileLinkStatus(convertToDefaultPath(f));
}
@Override public boolean supportsSymlinks() {
@Override
public boolean supportsSymlinks() {
return fileSystem.supportsSymlinks();
}
@Override public Path getLinkTarget(Path f) throws IOException {
@Override
public Path getLinkTarget(Path f) throws IOException {
return convertToHoodiePath(fileSystem.getLinkTarget(convertToDefaultPath(f)));
}
@Override public FileChecksum getFileChecksum(Path f) throws IOException {
@Override
public FileChecksum getFileChecksum(Path f) throws IOException {
return fileSystem.getFileChecksum(convertToDefaultPath(f));
}
@Override public FileChecksum getFileChecksum(Path f, long length) throws IOException {
@Override
public FileChecksum getFileChecksum(Path f, long length) throws IOException {
return fileSystem.getFileChecksum(convertToDefaultPath(f), length);
}
@Override public void setVerifyChecksum(boolean verifyChecksum) {
@Override
public void setVerifyChecksum(boolean verifyChecksum) {
fileSystem.setVerifyChecksum(verifyChecksum);
}
@Override public void setWriteChecksum(boolean writeChecksum) {
@Override
public void setWriteChecksum(boolean writeChecksum) {
fileSystem.setWriteChecksum(writeChecksum);
}
@Override public FsStatus getStatus() throws IOException {
@Override
public FsStatus getStatus() throws IOException {
return fileSystem.getStatus();
}
@Override public FsStatus getStatus(Path p) throws IOException {
@Override
public FsStatus getStatus(Path p) throws IOException {
return fileSystem.getStatus(convertToDefaultPath(p));
}
@Override public void setPermission(Path p, FsPermission permission) throws IOException {
@Override
public void setPermission(Path p, FsPermission permission) throws IOException {
fileSystem.setPermission(convertToDefaultPath(p), permission);
}
@Override public void setOwner(Path p, String username, String groupname) throws IOException {
@Override
public void setOwner(Path p, String username, String groupname) throws IOException {
fileSystem.setOwner(convertToDefaultPath(p), username, groupname);
}
@Override public void setTimes(Path p, long mtime, long atime) throws IOException {
@Override
public void setTimes(Path p, long mtime, long atime) throws IOException {
fileSystem.setTimes(convertToDefaultPath(p), mtime, atime);
}
@Override public Path createSnapshot(Path path, String snapshotName) throws IOException {
@Override
public Path createSnapshot(Path path, String snapshotName) throws IOException {
return convertToHoodiePath(
fileSystem.createSnapshot(convertToDefaultPath(path), snapshotName));
}
@Override public void renameSnapshot(Path path, String snapshotOldName, String snapshotNewName)
@Override
public void renameSnapshot(Path path, String snapshotOldName, String snapshotNewName)
throws IOException {
fileSystem.renameSnapshot(convertToDefaultPath(path), snapshotOldName, snapshotNewName);
}
@Override public void deleteSnapshot(Path path, String snapshotName) throws IOException {
@Override
public void deleteSnapshot(Path path, String snapshotName) throws IOException {
fileSystem.deleteSnapshot(convertToDefaultPath(path), snapshotName);
}
@Override public void modifyAclEntries(Path path, List<AclEntry> aclSpec) throws IOException {
@Override
public void modifyAclEntries(Path path, List<AclEntry> aclSpec) throws IOException {
fileSystem.modifyAclEntries(convertToDefaultPath(path), aclSpec);
}
@Override public void removeAclEntries(Path path, List<AclEntry> aclSpec) throws IOException {
@Override
public void removeAclEntries(Path path, List<AclEntry> aclSpec) throws IOException {
fileSystem.removeAclEntries(convertToDefaultPath(path), aclSpec);
}
@Override public void removeDefaultAcl(Path path) throws IOException {
@Override
public void removeDefaultAcl(Path path) throws IOException {
fileSystem.removeDefaultAcl(convertToDefaultPath(path));
}
@Override public void removeAcl(Path path) throws IOException {
@Override
public void removeAcl(Path path) throws IOException {
fileSystem.removeAcl(convertToDefaultPath(path));
}
@Override public void setAcl(Path path, List<AclEntry> aclSpec) throws IOException {
@Override
public void setAcl(Path path, List<AclEntry> aclSpec) throws IOException {
fileSystem.setAcl(convertToDefaultPath(path), aclSpec);
}
@Override public AclStatus getAclStatus(Path path) throws IOException {
@Override
public AclStatus getAclStatus(Path path) throws IOException {
return fileSystem.getAclStatus(convertToDefaultPath(path));
}
@Override public void setXAttr(Path path, String name, byte[] value) throws IOException {
@Override
public void setXAttr(Path path, String name, byte[] value) throws IOException {
fileSystem.setXAttr(convertToDefaultPath(path), name, value);
}
@Override public void setXAttr(Path path, String name, byte[] value, EnumSet<XAttrSetFlag> flag)
@Override
public void setXAttr(Path path, String name, byte[] value, EnumSet<XAttrSetFlag> flag)
throws IOException {
fileSystem.setXAttr(convertToDefaultPath(path), name, value, flag);
}
@Override public byte[] getXAttr(Path path, String name) throws IOException {
@Override
public byte[] getXAttr(Path path, String name) throws IOException {
return fileSystem.getXAttr(convertToDefaultPath(path), name);
}
@Override public Map<String, byte[]> getXAttrs(Path path) throws IOException {
@Override
public Map<String, byte[]> getXAttrs(Path path) throws IOException {
return fileSystem.getXAttrs(convertToDefaultPath(path));
}
@Override public Map<String, byte[]> getXAttrs(Path path, List<String> names)
@Override
public Map<String, byte[]> getXAttrs(Path path, List<String> names)
throws IOException {
return fileSystem.getXAttrs(convertToDefaultPath(path), names);
}
@Override public List<String> listXAttrs(Path path) throws IOException {
@Override
public List<String> listXAttrs(Path path) throws IOException {
return fileSystem.listXAttrs(convertToDefaultPath(path));
}
@Override public void removeXAttr(Path path, String name) throws IOException {
@Override
public void removeXAttr(Path path, String name) throws IOException {
fileSystem.removeXAttr(convertToDefaultPath(path), name);
}
@Override public void setConf(Configuration conf) {
@Override
public void setConf(Configuration conf) {
// ignore this. we will set conf on init
}
@Override public Configuration getConf() {
@Override
public Configuration getConf() {
return fileSystem.getConf();
}
@Override public int hashCode() {
@Override
public int hashCode() {
return fileSystem.hashCode();
}
@Override public boolean equals(Object obj) {
@Override
public boolean equals(Object obj) {
return fileSystem.equals(obj);
}
@Override public String toString() {
@Override
public String toString() {
return fileSystem.toString();
}
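Every override above follows the same delegation pattern: convert the incoming path into the wrapped file system's scheme, delegate, and convert any returned path back. A minimal stand-alone sketch of that pattern, with the convert helpers written as plausible assumptions rather than the actual implementation:

import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class PathConvertingFileSystemSketch {

  private final FileSystem fileSystem; // the wrapped "default" file system
  private final URI uri;               // the wrapper's own scheme and authority

  public PathConvertingFileSystemSketch(FileSystem fileSystem, URI uri) {
    this.fileSystem = fileSystem;
    this.uri = uri;
  }

  // Rewrite the wrapper's scheme to the wrapped file system's scheme before delegating.
  private Path convertToDefaultPath(Path p) {
    return new Path(fileSystem.getUri().getScheme(), p.toUri().getAuthority(),
        p.toUri().getPath());
  }

  // Rewrite a returned path back into the wrapper's scheme.
  private Path convertToHoodiePath(Path p) {
    return new Path(uri.getScheme(), p.toUri().getAuthority(), p.toUri().getPath());
  }

  // Each delegating override is a one-liner of this shape.
  public FileStatus getFileStatus(Path f) throws IOException {
    return fileSystem.getFileStatus(convertToDefaultPath(f));
  }

  public Path getWorkingDirectory() {
    return convertToHoodiePath(fileSystem.getWorkingDirectory());
  }
}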
View File
@@ -16,16 +16,16 @@
package com.uber.hoodie.io.storage;
import org.apache.hadoop.fs.FSDataOutputStream;
import java.io.IOException;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.hadoop.fs.FSDataOutputStream;
/**
* Wrapper over <code>FSDataOutputStream</code> to keep track of the size of the written bytes.
* This gives a cheap way to check on the underlying file size.
* Wrapper over <code>FSDataOutputStream</code> to keep track of the size of the written bytes. This
* gives a cheap way to check on the underlying file size.
*/
public class SizeAwareFSDataOutputStream extends FSDataOutputStream {
// A callback to call when the output stream is closed.
private final Runnable closeCallback;
// Keep track of the bytes written
@@ -37,17 +37,20 @@ public class SizeAwareFSDataOutputStream extends FSDataOutputStream {
this.closeCallback = closeCallback;
}
@Override public synchronized void write(byte[] b, int off, int len) throws IOException {
@Override
public synchronized void write(byte[] b, int off, int len) throws IOException {
bytesWritten.addAndGet(len);
super.write(b, off, len);
}
@Override public void write(byte[] b) throws IOException {
@Override
public void write(byte[] b) throws IOException {
bytesWritten.addAndGet(b.length);
super.write(b);
}
@Override public void close() throws IOException {
@Override
public void close() throws IOException {
super.close();
closeCallback.run();
}
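The write(...) overrides above count bytes as they pass through, which is much cheaper than asking the file system for the current file length. The same idea, sketched independently of Hadoop on a plain FilterOutputStream (getBytesWritten is an illustrative accessor, not part of the class in this commit):

import java.io.FilterOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.concurrent.atomic.AtomicLong;

class CountingOutputStreamSketch extends FilterOutputStream {

  private final AtomicLong bytesWritten = new AtomicLong(0);
  private final Runnable closeCallback;

  CountingOutputStreamSketch(OutputStream out, Runnable closeCallback) {
    super(out);
    this.closeCallback = closeCallback;
  }

  @Override
  public void write(int b) throws IOException {
    bytesWritten.incrementAndGet();
    out.write(b);
  }

  @Override
  public void write(byte[] b, int off, int len) throws IOException {
    bytesWritten.addAndGet(len); // track the size without touching the file system
    out.write(b, off, len);
  }

  @Override
  public void close() throws IOException {
    super.close();
    closeCallback.run(); // e.g. de-register this stream from whoever handed it out
  }

  long getBytesWritten() {
    return bytesWritten.get();
  }
}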
View File
@@ -22,7 +22,6 @@ import com.codahale.metrics.Timer;
import com.google.common.annotations.VisibleForTesting;
import com.uber.hoodie.common.model.HoodieCommitMetadata;
import com.uber.hoodie.config.HoodieWriteConfig;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
@@ -30,6 +29,7 @@ import org.apache.log4j.Logger;
* Wrapper for metrics-related operations.
*/
public class HoodieMetrics {
private HoodieWriteConfig config = null;
private String tableName = null;
private static Logger logger = LogManager.getLogger(HoodieMetrics.class);
@@ -77,7 +77,8 @@ public class HoodieMetrics {
return commitTimer == null ? null : commitTimer.time();
}
public void updateCommitMetrics(long commitEpochTimeInMs, long durationInMs, HoodieCommitMetadata metadata) {
public void updateCommitMetrics(long commitEpochTimeInMs, long durationInMs,
HoodieCommitMetadata metadata) {
if (config.isMetricsOn()) {
long totalPartitionsWritten = metadata.fetchTotalPartitionsWritten();
long totalFilesInsert = metadata.fetchTotalFilesInsert();
@@ -91,8 +92,10 @@ public class HoodieMetrics {
registerGauge(getMetricsName("commit", "totalFilesInsert"), totalFilesInsert);
registerGauge(getMetricsName("commit", "totalFilesUpdate"), totalFilesUpdate);
registerGauge(getMetricsName("commit", "totalRecordsWritten"), totalRecordsWritten);
registerGauge(getMetricsName("commit", "totalUpdateRecordsWritten"), totalUpdateRecordsWritten);
registerGauge(getMetricsName("commit", "totalInsertRecordsWritten"), totalInsertRecordsWritten);
registerGauge(getMetricsName("commit", "totalUpdateRecordsWritten"),
totalUpdateRecordsWritten);
registerGauge(getMetricsName("commit", "totalInsertRecordsWritten"),
totalInsertRecordsWritten);
registerGauge(getMetricsName("commit", "totalBytesWritten"), totalBytesWritten);
registerGauge(getMetricsName("commit", "commitTime"), commitEpochTimeInMs);
}
@@ -139,8 +142,7 @@ public class HoodieMetrics {
}
/**
* By default, the timer context returns the duration in nanoseconds.
* Convert it to milliseconds.
* By default, the timer context returns the duration in nanoseconds. Convert it to milliseconds.
*/
public long getDurationInMs(long ctxDuration) {
return ctxDuration / 1000000;
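getDurationInMs exists because a Dropwizard Timer.Context reports elapsed time in nanoseconds. A minimal sketch of the timing pattern the class supports, using the metrics library directly (the registry and metric name are illustrative):

import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.Timer;
import java.util.concurrent.TimeUnit;

public class CommitTimingSketch {

  public static void main(String[] args) throws InterruptedException {
    MetricRegistry registry = new MetricRegistry();
    Timer commitTimer = registry.timer("commit.duration");

    Timer.Context ctx = commitTimer.time();
    Thread.sleep(50);                  // stand-in for the actual commit work
    long durationInNanos = ctx.stop(); // Timer.Context reports nanoseconds

    // same conversion as getDurationInMs above: nanoseconds -> milliseconds
    long durationInMs = TimeUnit.NANOSECONDS.toMillis(durationInNanos);
    System.out.println("commit took " + durationInMs + " ms");
  }
}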
View File
@@ -22,6 +22,7 @@ import java.io.Closeable;
* Used for testing.
*/
public class InMemoryMetricsReporter extends MetricsReporter {
@Override
public void start() {
}
View File
@@ -19,16 +19,15 @@ package com.uber.hoodie.metrics;
import com.codahale.metrics.MetricRegistry;
import com.google.common.io.Closeables;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.config.HoodieMetricsConfig;
import com.uber.hoodie.exception.HoodieException;
import org.apache.commons.configuration.ConfigurationException;
import java.io.Closeable;
import org.apache.commons.configuration.ConfigurationException;
/**
* This is the main class of the metrics system.
*/
public class Metrics {
private static volatile boolean initialized = false;
private static Metrics metrics = null;
private final MetricRegistry registry;
View File
@@ -21,19 +21,18 @@ import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.graphite.Graphite;
import com.codahale.metrics.graphite.GraphiteReporter;
import com.uber.hoodie.config.HoodieWriteConfig;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import java.io.Closeable;
import java.net.InetSocketAddress;
import java.util.concurrent.TimeUnit;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
/**
* Implementation of Graphite reporter, which connects to the Graphite server
* and sends metrics to that server.
* Implementation of Graphite reporter, which connects to the Graphite server and sends metrics to
* that server.
*/
public class MetricsGraphiteReporter extends MetricsReporter {
private final MetricRegistry registry;
private final GraphiteReporter graphiteReporter;
private final HoodieWriteConfig config;
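For reference, wiring a Dropwizard GraphiteReporter to a registry generally looks like the sketch below; the host, port, prefix and reporting period are placeholder values, not the configuration this class actually reads:

import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.graphite.Graphite;
import com.codahale.metrics.graphite.GraphiteReporter;
import java.net.InetSocketAddress;
import java.util.concurrent.TimeUnit;

public class GraphiteReporterSketch {

  public static void main(String[] args) {
    MetricRegistry registry = new MetricRegistry();

    // connect to the Graphite server ...
    Graphite graphite = new Graphite(new InetSocketAddress("graphite.example.com", 2003));

    // ... and push everything in the registry to it on a fixed schedule
    GraphiteReporter reporter = GraphiteReporter.forRegistry(registry)
        .prefixedWith("hoodie")
        .convertRatesTo(TimeUnit.SECONDS)
        .convertDurationsTo(TimeUnit.MILLISECONDS)
        .build(graphite);
    reporter.start(30, TimeUnit.SECONDS);
  }
}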
View File
@@ -22,6 +22,7 @@ import java.io.Closeable;
* Interface for implementing a Reporter.
*/
public abstract class MetricsReporter {
/**
* Push out metrics at scheduled intervals
*/
View File
@@ -18,7 +18,6 @@ package com.uber.hoodie.metrics;
import com.codahale.metrics.MetricRegistry;
import com.uber.hoodie.config.HoodieWriteConfig;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
@@ -26,6 +25,7 @@ import org.apache.log4j.Logger;
* Factory class for creating MetricsReporter.
*/
public class MetricsReporterFactory {
private static Logger logger = LogManager.getLogger(MetricsReporterFactory.class);
public static MetricsReporter createReporter(HoodieWriteConfig config,
View File
@@ -17,8 +17,8 @@
package com.uber.hoodie.metrics;
/**
* Types of the reporter. Right now we only support Graphite.
* We can include JMX and CSV in the future.
* Types of the reporter. Right now we only support Graphite. We can include JMX and CSV in the
* future.
*/
public enum MetricsReporterType {
GRAPHITE,
View File
@@ -70,28 +70,16 @@ import org.apache.spark.api.java.function.PairFlatMapFunction;
import scala.Option;
import scala.Tuple2;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
/**
* Implementation of a very heavily read-optimized Hoodie Table where
*
* INSERTS - Produce new files, block aligned to desired size (or)
* Merge with the smallest existing file, to expand it
* INSERTS - Produce new files, block aligned to desired size (or) Merge with the smallest existing
* file, to expand it
*
* UPDATES - Produce a new version of the file, just replacing the updated records with new values
*
*/
public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends HoodieTable<T> {
public HoodieCopyOnWriteTable(HoodieWriteConfig config, HoodieTableMetaClient metaClient) {
super(config, metaClient);
}
@@ -107,6 +95,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
* Helper class for a small file's location and its actual size on disk
*/
class SmallFile implements Serializable {
HoodieRecordLocation location;
long sizeBytes;
@@ -121,11 +110,11 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
}
/**
* Helper class for an insert bucket along with the weight [0.0, 1.0]
* that defines the amount of incoming inserts that should be allocated to
* the bucket
* Helper class for an insert bucket along with the weight [0.0, 1.0] that defines the amount of
* incoming inserts that should be allocated to the bucket
*/
class InsertBucket implements Serializable {
int bucketNumber;
// fraction of total inserts, that should go into this bucket
double weight;
@@ -144,6 +133,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
* Helper class for a bucket's type (INSERT and UPDATE) and its file location
*/
class BucketInfo implements Serializable {
BucketType bucketType;
String fileLoc;
@@ -164,8 +154,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
class UpsertPartitioner extends Partitioner {
/**
* Total number of RDD partitions is determined by the total buckets we want to
* pack the incoming workload into
* Total number of RDD partitions is determined by the total buckets we want to pack the incoming
* workload into
*/
private int totalBuckets = 0;
@@ -181,8 +171,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
/**
* Helps us pack inserts into 1 or more buckets depending on number of
* incoming records.
* Helps us pack inserts into 1 or more buckets depending on number of incoming records.
*/
private HashMap<String, List<InsertBucket>> partitionPathToInsertBuckets;
@@ -236,24 +225,28 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
if (pStat.getNumInserts() > 0) {
List<SmallFile> smallFiles = getSmallFiles(partitionPath);
logger.info("For partitionPath : "+ partitionPath + " Small Files => " + smallFiles);
logger.info("For partitionPath : " + partitionPath + " Small Files => " + smallFiles);
long totalUnassignedInserts = pStat.getNumInserts();
List<Integer> bucketNumbers = new ArrayList<>();
List<Long> recordsPerBucket = new ArrayList<>();
// first try packing this into one of the smallFiles
for (SmallFile smallFile: smallFiles) {
long recordsToAppend = Math.min((config.getParquetMaxFileSize() - smallFile.sizeBytes)/ averageRecordSize, totalUnassignedInserts);
if (recordsToAppend > 0 && totalUnassignedInserts > 0){
for (SmallFile smallFile : smallFiles) {
long recordsToAppend = Math
.min((config.getParquetMaxFileSize() - smallFile.sizeBytes) / averageRecordSize,
totalUnassignedInserts);
if (recordsToAppend > 0 && totalUnassignedInserts > 0) {
// create a new bucket or re-use an existing bucket
int bucket;
if (updateLocationToBucket.containsKey(smallFile.location.getFileId())) {
bucket = updateLocationToBucket.get(smallFile.location.getFileId());
logger.info("Assigning " + recordsToAppend + " inserts to existing update bucket "+ bucket);
logger.info("Assigning " + recordsToAppend + " inserts to existing update bucket "
+ bucket);
} else {
bucket = addUpdateBucket(smallFile.location.getFileId());
logger.info("Assigning " + recordsToAppend + " inserts to new update bucket "+ bucket);
logger.info(
"Assigning " + recordsToAppend + " inserts to new update bucket " + bucket);
}
bucketNumbers.add(bucket);
recordsPerBucket.add(recordsToAppend);
@@ -265,16 +258,17 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
if (totalUnassignedInserts > 0) {
long insertRecordsPerBucket = config.getCopyOnWriteInsertSplitSize();
if (config.shouldAutoTuneInsertSplits()) {
insertRecordsPerBucket = config.getParquetMaxFileSize()/averageRecordSize;
insertRecordsPerBucket = config.getParquetMaxFileSize() / averageRecordSize;
}
int insertBuckets = (int) Math.max(totalUnassignedInserts / insertRecordsPerBucket, 1L);
logger.info("After small file assignment: unassignedInserts => " + totalUnassignedInserts
logger
.info("After small file assignment: unassignedInserts => " + totalUnassignedInserts
+ ", totalInsertBuckets => " + insertBuckets
+ ", recordsPerBucket => " + insertRecordsPerBucket);
for (int b = 0; b < insertBuckets; b++) {
bucketNumbers.add(totalBuckets);
recordsPerBucket.add(totalUnassignedInserts/insertBuckets);
recordsPerBucket.add(totalUnassignedInserts / insertBuckets);
BucketInfo bucketInfo = new BucketInfo();
bucketInfo.bucketType = BucketType.INSERT;
bucketInfoMap.put(totalBuckets, bucketInfo);
@@ -287,10 +281,11 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
for (int i = 0; i < bucketNumbers.size(); i++) {
InsertBucket bkt = new InsertBucket();
bkt.bucketNumber = bucketNumbers.get(i);
bkt.weight = (1.0 * recordsPerBucket.get(i))/pStat.getNumInserts();
bkt.weight = (1.0 * recordsPerBucket.get(i)) / pStat.getNumInserts();
insertBuckets.add(bkt);
}
logger.info("Total insert buckets for partition path "+ partitionPath + " => " + insertBuckets);
logger.info(
"Total insert buckets for partition path " + partitionPath + " => " + insertBuckets);
partitionPathToInsertBuckets.put(partitionPath, insertBuckets);
}
}
@@ -299,9 +294,6 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
/**
* Returns a list of small files in the given partition path
*
* @param partitionPath
* @return
*/
private List<SmallFile> getSmallFiles(String partitionPath) {
List<SmallFile> smallFileLocations = new ArrayList<>();
@@ -330,10 +322,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
}
/**
* Obtains the average record size based on records written during last commit.
* Used for estimating how many records pack into one file.
*
* @return
* Obtains the average record size based on records written during last commit. Used for
* estimating how many records pack into one file.
*/
private long averageBytesPerRecord() {
long avgSize = 0L;
@@ -375,13 +365,15 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
HoodieRecordLocation location = keyLocation._2().get();
return updateLocationToBucket.get(location.getFileId());
} else {
List<InsertBucket> targetBuckets = partitionPathToInsertBuckets.get(keyLocation._1().getPartitionPath());
List<InsertBucket> targetBuckets = partitionPathToInsertBuckets
.get(keyLocation._1().getPartitionPath());
// pick the target bucket to use based on the weights.
double totalWeight = 0.0;
final long totalInserts = Math.max(1, globalStat.getNumInserts());
final long hashOfKey = Hashing.md5().hashString(keyLocation._1().getRecordKey(), StandardCharsets.UTF_8).asLong();
final long hashOfKey = Hashing.md5()
.hashString(keyLocation._1().getRecordKey(), StandardCharsets.UTF_8).asLong();
final double r = 1.0 * Math.floorMod(hashOfKey, totalInserts) / totalInserts;
for (InsertBucket insertBucket: targetBuckets) {
for (InsertBucket insertBucket : targetBuckets) {
totalWeight += insertBucket.weight;
if (r <= totalWeight) {
return insertBucket.bucketNumber;
@@ -413,14 +405,14 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
}
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileLoc, Iterator<HoodieRecord<T>> recordItr)
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileLoc,
Iterator<HoodieRecord<T>> recordItr)
throws IOException {
// these are updates
HoodieMergeHandle upsertHandle = getUpdateHandle(commitTime, fileLoc, recordItr);
if (upsertHandle.getOldFilePath() == null) {
throw new HoodieUpsertException("Error in finding the old file path at commit " +
commitTime +" at fileLoc: " + fileLoc);
commitTime + " at fileLoc: " + fileLoc);
} else {
Configuration conf = FSUtils.getFs().getConf();
AvroReadSupport.setAvroReadSchema(conf, upsertHandle.getSchema());
@@ -448,14 +440,17 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
logger.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath()
+ ", " + upsertHandle.getWriteStatus());
}
return Collections.singletonList(Collections.singletonList(upsertHandle.getWriteStatus())).iterator();
return Collections.singletonList(Collections.singletonList(upsertHandle.getWriteStatus()))
.iterator();
}
protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileLoc, Iterator<HoodieRecord<T>> recordItr) {
protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileLoc,
Iterator<HoodieRecord<T>> recordItr) {
return new HoodieMergeHandle<>(config, commitTime, this, recordItr, fileLoc);
}
public Iterator<List<WriteStatus>> handleInsert(String commitTime, Iterator<HoodieRecord<T>> recordItr) throws Exception {
public Iterator<List<WriteStatus>> handleInsert(String commitTime,
Iterator<HoodieRecord<T>> recordItr) throws Exception {
return new LazyInsertIterable<>(recordItr, config, commitTime, this);
}
@@ -473,7 +468,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
} else if (btype.equals(BucketType.UPDATE)) {
return handleUpdate(commitTime, binfo.fileLoc, recordItr);
} else {
throw new HoodieUpsertException("Unknown bucketType " + btype + " for partition :" + partition);
throw new HoodieUpsertException(
"Unknown bucketType " + btype + " for partition :" + partition);
}
} catch (Throwable t) {
String msg = "Error upserting bucketType " + btype + " for partition :" + partition;
@@ -496,9 +492,9 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
}
/**
* Performs cleaning of partition paths according to cleaning policy and returns the number
* of files cleaned. Handles skews in partitions to clean by making files to clean as the
* unit of task distribution.
* Performs cleaning of partition paths according to cleaning policy and returns the number of
* files cleaned. Handles skews in partitions to clean by making files to clean as the unit of
* task distribution.
*
* @throws IllegalArgumentException if unknown cleaning policy is provided
*/
@@ -506,7 +502,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
public List<HoodieCleanStat> clean(JavaSparkContext jsc) {
try {
List<String> partitionsToClean =
FSUtils.getAllPartitionPaths(getFs(), getMetaClient().getBasePath(), config.shouldAssumeDatePartitioning());
FSUtils.getAllPartitionPaths(getFs(), getMetaClient().getBasePath(),
config.shouldAssumeDatePartitioning());
logger.info("Partitions to clean up : " + partitionsToClean + ", with policy " + config
.getCleanerPolicy());
if (partitionsToClean.isEmpty()) {
@@ -520,19 +517,16 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
}
/**
*
* Common method used for cleaning out parquet files under a partition path during rollback of a set of commits
* @param partitionPath
* @param commits
* @return
* @throws IOException
* Common method used for cleaning out parquet files under a partition path during rollback of a
* set of commits
*/
protected Map<FileStatus, Boolean> deleteCleanedFiles(String partitionPath, List<String> commits) throws IOException {
protected Map<FileStatus, Boolean> deleteCleanedFiles(String partitionPath, List<String> commits)
throws IOException {
logger.info("Cleaning path " + partitionPath);
FileSystem fs = FSUtils.getFs();
FileStatus[] toBeDeleted =
fs.listStatus(new Path(config.getBasePath(), partitionPath), path -> {
if(!path.toString().contains(".parquet")) {
if (!path.toString().contains(".parquet")) {
return false;
}
String fileCommitTime = FSUtils.getCommitTime(path.getName());
@@ -548,10 +542,12 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
}
@Override
public List<HoodieRollbackStat> rollback(JavaSparkContext jsc, List<String> commits) throws IOException {
public List<HoodieRollbackStat> rollback(JavaSparkContext jsc, List<String> commits)
throws IOException {
String actionType = this.getCompactedCommitActionType();
HoodieActiveTimeline activeTimeline = this.getActiveTimeline();
List<String> inflights = this.getInflightCommitTimeline().getInstants().map(HoodieInstant::getTimestamp)
List<String> inflights = this.getInflightCommitTimeline().getInstants()
.map(HoodieInstant::getTimestamp)
.collect(Collectors.toList());
// Atomically unpublish all the commits
@@ -563,7 +559,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
// delete all the data files for all these commits
logger.info("Clean out all parquet files generated for commits: " + commits);
List<HoodieRollbackStat> stats = jsc.parallelize(
FSUtils.getAllPartitionPaths(FSUtils.getFs(), this.getMetaClient().getBasePath(), config.shouldAssumeDatePartitioning()))
FSUtils.getAllPartitionPaths(FSUtils.getFs(), this.getMetaClient().getBasePath(),
config.shouldAssumeDatePartitioning()))
.map((Function<String, HoodieRollbackStat>) partitionPath -> {
// Scan all partitions files with this commit time
Map<FileStatus, Boolean> results = deleteCleanedFiles(partitionPath, commits);
@@ -579,6 +576,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
}
private static class PartitionCleanStat implements Serializable {
private final String partitionPath;
private final List<String> deletePathPatterns = new ArrayList<>();
private final List<String> successDeleteFiles = new ArrayList<>();
@@ -613,7 +611,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
}
}
private List<HoodieCleanStat> cleanPartitionPaths(List<String> partitionsToClean, JavaSparkContext jsc) {
private List<HoodieCleanStat> cleanPartitionPaths(List<String> partitionsToClean,
JavaSparkContext jsc) {
int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism());
logger.info("Using cleanerParallelism: " + cleanerParallelism);
List<Tuple2<String, PartitionCleanStat>> partitionCleanStats = jsc
@@ -621,7 +620,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
.flatMapToPair(getFilesToDeleteFunc(this, config))
.repartition(cleanerParallelism) // repartition to remove skews
.mapPartitionsToPair(deleteFilesFunc(this, config))
.reduceByKey( // merge partition level clean stats below
.reduceByKey(
// merge partition level clean stats below
(Function2<PartitionCleanStat, PartitionCleanStat, PartitionCleanStat>) (e1, e2) -> e1
.merge(e2))
.collect();
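The two pieces of packing arithmetic used by UpsertPartitioner above are easier to see in isolation: sizing an append to an existing small file, and picking an insert bucket by cumulative weight from a hashed record key. A self-contained sketch with purely illustrative numbers:

import com.google.common.hash.Hashing;
import java.nio.charset.StandardCharsets;

public class InsertPackingSketch {

  public static void main(String[] args) {
    // Step 1: how many inserts fit into an existing small file?
    long maxFileSizeBytes = 120 * 1024 * 1024;  // target parquet file size
    long smallFileSizeBytes = 90 * 1024 * 1024; // current size of the small file
    long averageRecordSizeBytes = 1024;         // estimated from the last commit
    long unassignedInserts = 50_000;

    long recordsToAppend = Math.min(
        (maxFileSizeBytes - smallFileSizeBytes) / averageRecordSizeBytes,
        unassignedInserts);
    System.out.println("records appended to the small file: " + recordsToAppend);

    // Step 2: route a record key to a bucket using cumulative bucket weights.
    double[] weights = {0.6, 0.3, 0.1}; // per-partition weights, summing to ~1.0
    long totalInserts = Math.max(1, unassignedInserts);
    long hashOfKey = Hashing.md5()
        .hashString("record-key-42", StandardCharsets.UTF_8).asLong();
    double r = 1.0 * Math.floorMod(hashOfKey, totalInserts) / totalInserts;

    double cumulativeWeight = 0.0;
    for (int bucket = 0; bucket < weights.length; bucket++) {
      cumulativeWeight += weights[bucket];
      if (r <= cumulativeWeight) {
        System.out.println("key routed to bucket " + bucket);
        break;
      }
    }
  }
}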
View File
@@ -39,13 +39,6 @@ import com.uber.hoodie.exception.HoodieCompactionException;
import com.uber.hoodie.exception.HoodieRollbackException;
import com.uber.hoodie.io.HoodieAppendHandle;
import com.uber.hoodie.io.compact.HoodieRealtimeTableCompactor;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Arrays;
@@ -56,6 +49,12 @@ import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
/**
@@ -64,13 +63,15 @@ import java.util.stream.Collectors;
* INSERTS - Same as HoodieCopyOnWriteTable - Produce new files, block aligned to desired size (or)
* Merge with the smallest existing file, to expand it
*
* UPDATES - Appends the changes to a rolling log file maintained per file Id.
* Compaction merges the log file into the base file.
* UPDATES - Appends the changes to a rolling log file maintained per file Id. Compaction merges the
* log file into the base file.
*
* WARNING - MOR table type does not support nested rollbacks, every rollback
* must be followed by an attempted commit action
* WARNING - MOR table type does not support nested rollbacks, every rollback must be followed by an
* attempted commit action
*/
public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends HoodieCopyOnWriteTable<T> {
public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
HoodieCopyOnWriteTable<T> {
private static Logger logger = LogManager.getLogger(HoodieMergeOnReadTable.class);
public HoodieMergeOnReadTable(HoodieWriteConfig config,
@@ -119,15 +120,17 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends Hoodi
}
@Override
public List<HoodieRollbackStat> rollback(JavaSparkContext jsc, List<String> commits) throws IOException {
public List<HoodieRollbackStat> rollback(JavaSparkContext jsc, List<String> commits)
throws IOException {
//At the moment, MOR table type does not support nested rollbacks
if(commits.size() > 1) {
if (commits.size() > 1) {
throw new UnsupportedOperationException("Nested Rollbacks are not supported");
}
Map<String, HoodieInstant> commitsAndCompactions =
this.getActiveTimeline()
.getTimelineOfActions(Sets.newHashSet(HoodieActiveTimeline.COMMIT_ACTION, HoodieActiveTimeline.COMPACTION_ACTION, HoodieActiveTimeline.DELTA_COMMIT_ACTION))
.getTimelineOfActions(Sets.newHashSet(HoodieActiveTimeline.COMMIT_ACTION,
HoodieActiveTimeline.COMPACTION_ACTION, HoodieActiveTimeline.DELTA_COMMIT_ACTION))
.getInstants()
.filter(i -> commits.contains(i.getTimestamp()))
.collect(Collectors.toMap(i -> i.getTimestamp(), i -> i));
@@ -149,11 +152,14 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends Hoodi
try {
logger.info("Starting to rollback Commit/Compaction " + instant);
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
.fromBytes(this.getCommitTimeline().getInstantDetails(new HoodieInstant(true, instant.getAction(), instant.getTimestamp())).get());
.fromBytes(this.getCommitTimeline().getInstantDetails(
new HoodieInstant(true, instant.getAction(), instant.getTimestamp())).get());
stats = jsc.parallelize(commitMetadata.getPartitionToWriteStats().keySet().stream().collect(Collectors.toList()))
stats = jsc.parallelize(commitMetadata.getPartitionToWriteStats().keySet().stream()
.collect(Collectors.toList()))
.map((Function<String, HoodieRollbackStat>) partitionPath -> {
Map<FileStatus, Boolean> results = super.deleteCleanedFiles(partitionPath, Arrays.asList(commit));
Map<FileStatus, Boolean> results = super
.deleteCleanedFiles(partitionPath, Arrays.asList(commit));
return HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
.withDeletedFileResults(results).build();
}).collect();
@@ -167,40 +173,55 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends Hoodi
logger.info("Starting to rollback delta commit " + instant);
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
.fromBytes(this.getCommitTimeline().getInstantDetails(new HoodieInstant(true, instant.getAction(), instant.getTimestamp())).get());
.fromBytes(this.getCommitTimeline().getInstantDetails(
new HoodieInstant(true, instant.getAction(), instant.getTimestamp())).get());
stats = jsc.parallelize(commitMetadata.getPartitionToWriteStats().keySet().stream().collect(Collectors.toList()))
stats = jsc.parallelize(commitMetadata.getPartitionToWriteStats().keySet().stream()
.collect(Collectors.toList()))
.map((Function<String, HoodieRollbackStat>) partitionPath -> {
// read commit file and (either append delete blocks or delete file)
Map<FileStatus, Boolean> filesToDeletedStatus = new HashMap<>();
Map<FileStatus, Long> filesToNumBlocksRollback = new HashMap<>();
// we do not know fileIds for inserts (first inserts are parquet files), delete all parquet files for the corresponding failed commit, if present (same as COW)
filesToDeletedStatus = super.deleteCleanedFiles(partitionPath, Arrays.asList(commit));
filesToDeletedStatus = super
.deleteCleanedFiles(partitionPath, Arrays.asList(commit));
// append rollback blocks for updates
commitMetadata.getPartitionToWriteStats().get(partitionPath).stream().filter(wStat -> wStat.getPrevCommit() != HoodieWriteStat.NULL_COMMIT).forEach(wStat -> {
commitMetadata.getPartitionToWriteStats().get(partitionPath).stream()
.filter(wStat -> wStat.getPrevCommit() != HoodieWriteStat.NULL_COMMIT)
.forEach(wStat -> {
HoodieLogFormat.Writer writer = null;
try {
writer = HoodieLogFormat.newWriterBuilder()
.onParentPath(new Path(this.getMetaClient().getBasePath(), partitionPath))
.onParentPath(
new Path(this.getMetaClient().getBasePath(), partitionPath))
.withFileId(wStat.getFileId()).overBaseCommit(wStat.getPrevCommit())
.withFs(FSUtils.getFs()).withFileExtension(HoodieLogFile.DELTA_EXTENSION).build();
.withFs(FSUtils.getFs())
.withFileExtension(HoodieLogFile.DELTA_EXTENSION).build();
Long numRollbackBlocks = 0L;
// generate metadata
Map<HoodieLogBlock.LogMetadataType, String> metadata = Maps.newHashMap();
metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME, metaClient.getActiveTimeline().lastInstant().get().getTimestamp());
metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME,
metaClient.getActiveTimeline().lastInstant().get().getTimestamp());
metadata.put(HoodieLogBlock.LogMetadataType.TARGET_INSTANT_TIME, commit);
// if update belongs to an existing log file
writer.appendBlock(new HoodieCommandBlock(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK, metadata));
writer.appendBlock(new HoodieCommandBlock(
HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK,
metadata));
numRollbackBlocks++;
if(wStat.getNumDeletes() > 0) {
writer.appendBlock(new HoodieCommandBlock(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK, metadata));
if (wStat.getNumDeletes() > 0) {
writer.appendBlock(new HoodieCommandBlock(
HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK,
metadata));
numRollbackBlocks++;
}
filesToNumBlocksRollback.put(FSUtils.getFs().getFileStatus(writer.getLogFile().getPath()), numRollbackBlocks);
filesToNumBlocksRollback
.put(FSUtils.getFs().getFileStatus(writer.getLogFile().getPath()),
numRollbackBlocks);
} catch (IOException | InterruptedException io) {
throw new HoodieRollbackException("Failed to rollback for commit " + commit, io);
throw new HoodieRollbackException(
"Failed to rollback for commit " + commit, io);
} finally {
try {
writer.close();
@@ -223,10 +244,12 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends Hoodi
}).flatMap(x -> x.stream()).collect(Collectors.toList());
commitsAndCompactions.entrySet().stream()
.map(entry -> new HoodieInstant(true, entry.getValue().getAction(), entry.getValue().getTimestamp()))
.map(entry -> new HoodieInstant(true, entry.getValue().getAction(),
entry.getValue().getTimestamp()))
.forEach(this.getActiveTimeline()::deleteInflight);
logger.debug("Time(in ms) taken to finish rollback " + (System.currentTimeMillis() - startTime));
logger
.debug("Time(in ms) taken to finish rollback " + (System.currentTimeMillis() - startTime));
return allRollbackStats;
}
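Condensed from the delta-commit branch above, rolling back updates to a file group reduces to appending one command block to the same log the updates went to. A stripped-down sketch, assuming the same log-format API used above, with error handling, stats collection and the delete-count block omitted:

// partitionPath, fileId and the instant strings are illustrative arguments
private void appendRollbackBlock(String partitionPath, String fileId, String baseCommit,
    String lastInstantTime, String commitToRollback) throws Exception {
  Map<HoodieLogBlock.LogMetadataType, String> metadata = Maps.newHashMap();
  metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME, lastInstantTime);
  metadata.put(HoodieLogBlock.LogMetadataType.TARGET_INSTANT_TIME, commitToRollback);

  HoodieLogFormat.Writer writer = HoodieLogFormat.newWriterBuilder()
      .onParentPath(new Path(partitionPath))
      .withFileId(fileId)
      .overBaseCommit(baseCommit)
      .withFs(FSUtils.getFs())
      .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
      .build();
  try {
    // one command block tells readers to skip the block(s) the failed commit appended
    writer.appendBlock(new HoodieCommandBlock(
        HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK, metadata));
  } finally {
    writer.close();
  }
}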
View File
@@ -34,7 +34,6 @@ import com.uber.hoodie.common.util.AvroUtils;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieCommitException;
import com.uber.hoodie.exception.HoodieException;
import com.uber.hoodie.exception.HoodieRollbackException;
import com.uber.hoodie.exception.HoodieSavepointException;
import java.io.IOException;
import java.io.Serializable;
@@ -43,8 +42,6 @@ import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
@@ -55,6 +52,7 @@ import org.apache.spark.api.java.JavaSparkContext;
* Abstract implementation of a HoodieTable
*/
public abstract class HoodieTable<T extends HoodieRecordPayload> implements Serializable {
protected final HoodieWriteConfig config;
protected final HoodieTableMetaClient metaClient;
private static Logger logger = LogManager.getLogger(HoodieTable.class);
@@ -65,27 +63,19 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
}
/**
* Provides a partitioner to perform the upsert operation, based on the
* workload profile
*
* @return
* Provides a partitioner to perform the upsert operation, based on the workload profile
*/
public abstract Partitioner getUpsertPartitioner(WorkloadProfile profile);
/**
* Provides a partitioner to perform the insert operation, based on the workload profile
*
* @return
*/
public abstract Partitioner getInsertPartitioner(WorkloadProfile profile);
/**
* Return whether this HoodieTable implementation can benefit from workload
* profiling
*
* @return
* Return whether this HoodieTable implementation can benefit from workload profiling
*/
public abstract boolean isWorkloadProfileNeeded();
@@ -103,8 +93,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
/**
* Get the view of the file system for this table
*
* @return
*/
public TableFileSystemView getFileSystemView() {
return new HoodieTableFileSystemView(metaClient, getCompletedCommitTimeline());
@@ -112,8 +100,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
/**
* Get the read optimized view of the file system for this table
*
* @return
*/
public TableFileSystemView.ReadOptimizedView getROFileSystemView() {
return new HoodieTableFileSystemView(metaClient, getCompletedCommitTimeline());
@@ -121,8 +107,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
/**
* Get the real time view of the file system for this table
*
* @return
*/
public TableFileSystemView.RealtimeView getRTFileSystemView() {
return new HoodieTableFileSystemView(metaClient, getCompletedCommitTimeline());
@@ -130,8 +114,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
/**
* Get the completed (commit + compaction) view of the file system for this table
*
* @return
*/
public TableFileSystemView getCompletedFileSystemView() {
return new HoodieTableFileSystemView(metaClient, getCommitTimeline());
@@ -139,7 +121,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
/**
* Get only the completed (no-inflights) commit timeline
* @return
*/
public HoodieTimeline getCompletedCommitTimeline() {
return getCommitTimeline().filterCompletedInstants();
@@ -147,7 +128,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
/**
* Get only the inflights (no-completed) commit timeline
* @return
*/
public HoodieTimeline getInflightCommitTimeline() {
return getCommitTimeline().filterInflights();
@@ -156,7 +136,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
/**
* Get only the completed (no-inflights) clean timeline
* @return
*/
public HoodieTimeline getCompletedCleanTimeline() {
return getActiveTimeline().getCleanerTimeline().filterCompletedInstants();
@@ -164,7 +143,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
/**
* Get only the completed (no-inflights) savepoint timeline
* @return
*/
public HoodieTimeline getCompletedSavepointTimeline() {
return getActiveTimeline().getSavePointTimeline().filterCompletedInstants();
@@ -172,7 +150,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
/**
* Get the list of savepoints in this table
* @return
*/
public List<String> getSavepoints() {
return getCompletedSavepointTimeline().getInstants().map(HoodieInstant::getTimestamp)
@@ -181,10 +158,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
/**
* Get the list of data file names savepointed
*
* @param savepointTime
* @return
* @throws IOException
*/
public Stream<String> getSavepointedDataFiles(String savepointTime) {
if (!getSavepoints().contains(savepointTime)) {
@@ -211,8 +184,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
/**
* Get the commit timeline visible for this table
*
* @return
*/
public HoodieTimeline getCommitTimeline() {
switch (metaClient.getTableType()) {
@@ -223,13 +194,12 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
// Include commit action to be able to start doing a MOR over a COW dataset - no migration required
return getActiveTimeline().getCommitsAndCompactionsTimeline();
default:
throw new HoodieException("Unsupported table type :"+ metaClient.getTableType());
throw new HoodieException("Unsupported table type :" + metaClient.getTableType());
}
}
/**
* Get only the completed (no-inflights) compaction commit timeline
* @return
*/
public HoodieTimeline getCompletedCompactionCommitTimeline() {
return getCompactionCommitTimeline().filterCompletedInstants();
@@ -238,8 +208,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
/**
* Get the compacted commit timeline visible for this table
*
* @return
*/
public HoodieTimeline getCompactionCommitTimeline() {
switch (metaClient.getTableType()) {
@@ -250,13 +218,12 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
return getActiveTimeline().getTimelineOfActions(
Sets.newHashSet(HoodieActiveTimeline.COMPACTION_ACTION));
default:
throw new HoodieException("Unsupported table type :"+ metaClient.getTableType());
throw new HoodieException("Unsupported table type :" + metaClient.getTableType());
}
}
/**
* Gets the commit action type
* @return
*/
public String getCommitActionType() {
switch (metaClient.getTableType()) {
@@ -271,7 +238,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
/**
* Gets the action type for a compaction commit
* @return
*/
public String getCompactedCommitActionType() {
switch (metaClient.getTableType()) {
@@ -280,27 +246,18 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
case MERGE_ON_READ:
return HoodieTimeline.COMPACTION_ACTION;
}
throw new HoodieException("Unsupported table type :"+ metaClient.getTableType());
throw new HoodieException("Unsupported table type :" + metaClient.getTableType());
}
/**
* Perform the ultimate IO for a given upserted (RDD) partition
*
* @param partition
* @param recordIterator
* @param partitioner
*/
public abstract Iterator<List<WriteStatus>> handleUpsertPartition(String commitTime,
Integer partition, Iterator<HoodieRecord<T>> recordIterator, Partitioner partitioner);
/**
* Perform the ultimate IO for a given inserted (RDD) partition
*
* @param partition
* @param recordIterator
* @param partitioner
*/
public abstract Iterator<List<WriteStatus>> handleInsertPartition(String commitTime,
Integer partition, Iterator<HoodieRecord<T>> recordIterator, Partitioner partitioner);
@@ -319,27 +276,21 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
}
/**
* Run Compaction on the table.
* Compaction arranges the data so that it is optimized for data access
* Run Compaction on the table. Compaction arranges the data so that it is optimized for data
* access
*/
public abstract Optional<HoodieCompactionMetadata> compact(JavaSparkContext jsc);
/**
* Clean partition paths according to cleaning policy and returns the number
* of files cleaned.
* Clean partition paths according to cleaning policy and returns the number of files cleaned.
*/
public abstract List<HoodieCleanStat> clean(JavaSparkContext jsc);
/**
* Rollback the (inflight/committed) record changes with the given commit time.
* Four steps:
* (1) Atomically unpublish this commit
* (2) clean indexing data
* (3) clean new generated parquet files / log blocks
* (4) Finally, delete .<action>.commit or .<action>.inflight file
* @param commits
* @return
* @throws HoodieRollbackException
* Rollback the (inflight/committed) record changes with the given commit time. Four steps: (1)
* Atomically unpublish this commit (2) clean indexing data (3) clean new generated parquet files
* / log blocks (4) Finally, delete .<action>.commit or .<action>.inflight file
*/
public abstract List<HoodieRollbackStat> rollback(JavaSparkContext jsc, List<String> commits) throws IOException;
public abstract List<HoodieRollbackStat> rollback(JavaSparkContext jsc, List<String> commits)
throws IOException;
}
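
For orientation, a sketch (illustrative only, not from this commit) of driving the maintenance APIs above from a Spark driver; metaClient, config and jsc are assumed to already exist, the commit id is made up, and the fragment is assumed to run inside a method that may throw IOException:

import com.uber.hoodie.table.HoodieTable;
import java.util.Arrays;

// Resolve the concrete table implementation (copy-on-write or merge-on-read).
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config);
table.compact(jsc);                        // rearrange data for efficient access
table.clean(jsc);                          // apply the configured cleaning policy
table.rollback(jsc, Arrays.asList("004")); // four-step rollback of commit "004"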

View File

@@ -20,13 +20,13 @@ import com.uber.hoodie.common.model.HoodieRecordPayload;
import org.apache.spark.api.java.JavaRDD;
/**
* Repartition input records into at least expected number of output spark partitions. It should give
* below guarantees
* - Output spark partition will have records from only one hoodie partition.
* - Average records per output spark partitions should be almost equal to (#inputRecords / #outputSparkPartitions)
* to avoid possible skews.
* Repartition input records into at least expected number of output spark partitions. It should
* give below guarantees - Output spark partition will have records from only one hoodie partition.
* - Average records per output spark partitions should be almost equal to (#inputRecords /
* #outputSparkPartitions) to avoid possible skews.
*/
public interface UserDefinedBulkInsertPartitioner<T extends HoodieRecordPayload> {
JavaRDD<HoodieRecord<T>> repartitionRecords(JavaRDD<HoodieRecord<T>> records, int outputSparkPartitions);
JavaRDD<HoodieRecord<T>> repartitionRecords(JavaRDD<HoodieRecord<T>> records,
int outputSparkPartitions);
}
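
A hypothetical implementation sketch of this contract (illustrative only, not from this commit): sorting by partition path range-partitions the input, which approximately yields both guarantees above. The class name is made up and it is assumed to live in the same package as the interface.

import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordPayload;
import org.apache.spark.api.java.JavaRDD;

public class SortByPartitionPathPartitioner<T extends HoodieRecordPayload>
    implements UserDefinedBulkInsertPartitioner<T> {

  @Override
  public JavaRDD<HoodieRecord<T>> repartitionRecords(JavaRDD<HoodieRecord<T>> records,
      int outputSparkPartitions) {
    // sortBy range-partitions on the key, so records of a single hoodie partition land in
    // contiguous spark partitions holding roughly (#inputRecords / #outputSparkPartitions) each.
    return records.sortBy(HoodieRecord::getPartitionPath, true, outputSparkPartitions);
  }
}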

View File

@@ -20,15 +20,11 @@ package com.uber.hoodie.table;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordLocation;
import com.uber.hoodie.common.model.HoodieRecordPayload;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.PairFunction;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import org.apache.spark.api.java.JavaRDD;
import scala.Option;
import scala.Tuple2;
@@ -65,15 +61,18 @@ public class WorkloadProfile<T extends HoodieRecordPayload> implements Serializa
Map<Tuple2<String, Option<HoodieRecordLocation>>, Long> partitionLocationCounts = taggedRecords
.mapToPair(record ->
new Tuple2<>(new Tuple2<>(record.getPartitionPath(), Option.apply(record.getCurrentLocation())), record))
new Tuple2<>(
new Tuple2<>(record.getPartitionPath(), Option.apply(record.getCurrentLocation())),
record))
.countByKey();
for (Map.Entry<Tuple2<String, Option<HoodieRecordLocation>>, Long> e: partitionLocationCounts.entrySet()) {
for (Map.Entry<Tuple2<String, Option<HoodieRecordLocation>>, Long> e : partitionLocationCounts
.entrySet()) {
String partitionPath = e.getKey()._1();
Long count = e.getValue();
Option<HoodieRecordLocation> locOption = e.getKey()._2();
if (!partitionPathStatMap.containsKey(partitionPath)){
if (!partitionPathStatMap.containsKey(partitionPath)) {
partitionPathStatMap.put(partitionPath, new WorkloadStat());
}
@@ -97,7 +96,7 @@ public class WorkloadProfile<T extends HoodieRecordPayload> implements Serializa
return partitionPathStatMap.keySet();
}
public WorkloadStat getWorkloadStat(String partitionPath){
public WorkloadStat getWorkloadStat(String partitionPath) {
return partitionPathStatMap.get(partitionPath);
}

View File

@@ -17,7 +17,6 @@
package com.uber.hoodie.table;
import com.uber.hoodie.common.model.HoodieRecordLocation;
import java.io.Serializable;
import java.util.HashMap;
@@ -25,6 +24,7 @@ import java.util.HashMap;
* Wraps stats about a single partition path.
*/
public class WorkloadStat implements Serializable {
private long numInserts = 0L;
private long numUpdates = 0L;

View File

@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Set root logger level to DEBUG and its only appender to A1.
log4j.rootLogger=INFO, A1
# A1 is set to be a ConsoleAppender.

View File

@@ -22,13 +22,12 @@ import com.uber.hoodie.common.HoodieTestDataGenerator;
import com.uber.hoodie.common.model.HoodieAvroPayload;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieTableType;
import com.uber.hoodie.common.table.HoodieTableConfig;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.config.HoodieIndexConfig;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.index.HoodieIndex;
import java.util.List;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
@@ -36,7 +35,6 @@ import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import java.util.List;
/**
* Driver program that uses the Hoodie client with synthetic workload, and performs basic
@@ -44,13 +42,13 @@ import java.util.List;
*/
public class HoodieClientExample {
@Parameter(names={"--table-path", "-p"}, description = "path for Hoodie sample table")
@Parameter(names = {"--table-path", "-p"}, description = "path for Hoodie sample table")
private String tablePath = "file:///tmp/hoodie/sample-table";
@Parameter(names={"--table-name", "-n"}, description = "table name for Hoodie sample table")
@Parameter(names = {"--table-name", "-n"}, description = "table name for Hoodie sample table")
private String tableName = "hoodie_rt";
@Parameter(names={"--table-type", "-t"}, description = "One of COPY_ON_WRITE or MERGE_ON_READ")
@Parameter(names = {"--table-type", "-t"}, description = "One of COPY_ON_WRITE or MERGE_ON_READ")
private String tableType = HoodieTableType.COPY_ON_WRITE.name();
@Parameter(names = {"--help", "-h"}, help = true)
@@ -85,7 +83,9 @@ public class HoodieClientExample {
Path path = new Path(tablePath);
FileSystem fs = FSUtils.getFs();
if (!fs.exists(path)) {
HoodieTableMetaClient.initTableType(fs, tablePath, HoodieTableType.valueOf(tableType), tableName, HoodieAvroPayload.class.getName());
HoodieTableMetaClient
.initTableType(fs, tablePath, HoodieTableType.valueOf(tableType), tableName,
HoodieAvroPayload.class.getName());
}
// Create the write client to write some records in
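
As a usage sketch (illustrative only; the path and table name are the defaults shown above, imports as in the example class), the guard amounts to initializing the table type once before the first write:

FileSystem fs = FSUtils.getFs();
String tablePath = "file:///tmp/hoodie/sample-table";
if (!fs.exists(new Path(tablePath))) {
  // Initializes table metadata (type, name, payload class) under the base path.
  HoodieTableMetaClient.initTableType(fs, tablePath, HoodieTableType.COPY_ON_WRITE,
      "hoodie_rt", HoodieAvroPayload.class.getName());
}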

View File

@@ -16,8 +16,12 @@
package com.uber.hoodie;
import com.google.common.collect.Iterables;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import com.google.common.collect.Iterables;
import com.uber.hoodie.common.HoodieCleanStat;
import com.uber.hoodie.common.HoodieClientTestUtils;
import com.uber.hoodie.common.HoodieTestDataGenerator;
@@ -45,22 +49,6 @@ import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieRollbackException;
import com.uber.hoodie.index.HoodieIndex;
import com.uber.hoodie.table.HoodieTable;
import org.apache.avro.generic.GenericRecord;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.scheduler.SparkListener;
import org.apache.spark.scheduler.SparkListenerTaskEnd;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.util.AccumulatorV2;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
@@ -76,15 +64,24 @@ import java.util.Optional;
import java.util.Set;
import java.util.TreeSet;
import java.util.stream.Collectors;
import org.apache.avro.generic.GenericRecord;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.scheduler.SparkListener;
import org.apache.spark.scheduler.SparkListenerTaskEnd;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.util.AccumulatorV2;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import scala.collection.Iterator;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
private transient JavaSparkContext jsc = null;
private transient SQLContext sqlContext;
private String basePath = null;
@@ -115,7 +112,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
private HoodieWriteConfig.Builder getConfigBuilder() {
return HoodieWriteConfig.newBuilder().withPath(basePath)
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
.withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).build())
.withCompactionConfig(
HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).build())
.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build())
.forTable("test-trip-table").withIndexConfig(
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build());
@@ -129,9 +127,11 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
}
private void assertPartitionMetadata(String[] partitionPaths, FileSystem fs) throws IOException {
for (String partitionPath: partitionPaths) {
assertTrue(HoodiePartitionMetadata.hasPartitionMetadata(fs, new Path(basePath, partitionPath)));
HoodiePartitionMetadata pmeta = new HoodiePartitionMetadata(fs, new Path(basePath, partitionPath));
for (String partitionPath : partitionPaths) {
assertTrue(
HoodiePartitionMetadata.hasPartitionMetadata(fs, new Path(basePath, partitionPath)));
HoodiePartitionMetadata pmeta = new HoodiePartitionMetadata(fs,
new Path(basePath, partitionPath));
pmeta.readFromFS();
assertEquals(3, pmeta.getPartitionDepth());
}
@@ -140,13 +140,13 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
private void checkTaggedRecords(List<HoodieRecord> taggedRecords, String commitTime) {
for (HoodieRecord rec : taggedRecords) {
assertTrue("Record " + rec + " found with no location.", rec.isCurrentLocationKnown());
assertEquals("All records should have commit time "+ commitTime+", since updates were made",
assertEquals(
"All records should have commit time " + commitTime + ", since updates were made",
rec.getCurrentLocation().getCommitTime(), commitTime);
}
}
@Test
public void testFilterExist() throws Exception {
HoodieWriteConfig config = getConfig();
@@ -231,17 +231,21 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// verify that there is a commit
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline();
HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath())
.getCommitTimeline();
assertEquals("Expecting a single commit.", 1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
assertEquals("Latest commit should be 001", newCommitTime, timeline.lastInstant().get().getTimestamp());
assertEquals("Expecting a single commit.", 1,
timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
assertEquals("Latest commit should be 001", newCommitTime,
timeline.lastInstant().get().getTimestamp());
assertEquals("Must contain 200 records",
records.size(),
HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count());
// Should have 100 records in table (check using Index), all in locations marked at commit
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
List<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table).collect();
List<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table)
.collect();
checkTaggedRecords(taggedRecords, "001");
/**
@@ -265,8 +269,10 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// verify there are now 2 commits
timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline();
assertEquals("Expecting two commits.", timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), 2);
assertEquals("Latest commit should be 004", timeline.lastInstant().get().getTimestamp(), newCommitTime);
assertEquals("Expecting two commits.",
timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), 2);
assertEquals("Latest commit should be 004", timeline.lastInstant().get().getTimestamp(),
newCommitTime);
metaClient = new HoodieTableMetaClient(fs, basePath);
table = HoodieTable.getHoodieTable(metaClient, getConfig());
@@ -277,21 +283,20 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// Check the entire dataset has 100 records still
String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length];
for (int i=0; i < fullPartitionPaths.length; i++) {
for (int i = 0; i < fullPartitionPaths.length; i++) {
fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]);
}
assertEquals("Must contain 200 records",
200,
HoodieClientTestUtils.read(basePath, sqlContext, fs, fullPartitionPaths).count());
// Check that the incremental consumption from time 000
assertEquals("Incremental consumption from time 002, should give all records in commit 004",
HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
HoodieClientTestUtils.readSince(basePath, sqlContext, timeline, "002").count());
assertEquals("Incremental consumption from time 001, should give all records in commit 004",
HoodieClientTestUtils.readCommit(basePath, sqlContext,timeline, newCommitTime).count(),
HoodieClientTestUtils.readSince(basePath, sqlContext,timeline, "001").count());
HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
HoodieClientTestUtils.readSince(basePath, sqlContext, timeline, "001").count());
}
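
For reference, a compact sketch of the write-then-incremental-read flow these assertions exercise (client, jsc, records, timeline, basePath and sqlContext are assumed from the surrounding test, the commit id is illustrative, and timeline is assumed to have been reloaded after the commit):

String newCommitTime = "004";
client.startCommitWithTime(newCommitTime);
List<WriteStatus> statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();

// Incremental read: everything after commit "002" should equal the contents of commit "004".
long sinceCount = HoodieClientTestUtils.readSince(basePath, sqlContext, timeline, "002").count();
long commitCount = HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count();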
@Test
@@ -322,15 +327,19 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// verify that there is a commit
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline();
assertEquals("Expecting a single commit.", 1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
assertEquals("Latest commit should be 001", newCommitTime, timeline.lastInstant().get().getTimestamp());
HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath())
.getCommitTimeline();
assertEquals("Expecting a single commit.", 1,
timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
assertEquals("Latest commit should be 001", newCommitTime,
timeline.lastInstant().get().getTimestamp());
assertEquals("Must contain 200 records", fewRecordsForInsert.size(),
HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count());
// Should have 100 records in table (check using Index), all in locations marked at commit
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
List<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(fewRecordsForInsert, 1), table).collect();
List<HoodieRecord> taggedRecords = index
.tagLocation(jsc.parallelize(fewRecordsForInsert, 1), table).collect();
checkTaggedRecords(taggedRecords, "001");
/**
@@ -339,8 +348,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
newCommitTime = "004";
client.startCommitWithTime(newCommitTime);
fewRecordsForDelete = records.subList(0,50);
List<HoodieRecord> fewRecordsForUpdate = records.subList(50,100);
fewRecordsForDelete = records.subList(0, 50);
List<HoodieRecord> fewRecordsForUpdate = records.subList(50, 100);
records = dataGen.generateDeletesFromExistingRecords(fewRecordsForDelete);
records.addAll(fewRecordsForUpdate);
@@ -351,18 +360,19 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// verify there are now 2 commits
timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline();
assertEquals("Expecting two commits.", timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), 2);
assertEquals("Latest commit should be 004", timeline.lastInstant().get().getTimestamp(), newCommitTime);
assertEquals("Expecting two commits.",
timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), 2);
assertEquals("Latest commit should be 004", timeline.lastInstant().get().getTimestamp(),
newCommitTime);
// Check the entire dataset has 150 records(200-50) still
String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length];
for (int i=0; i < fullPartitionPaths.length; i++) {
for (int i = 0; i < fullPartitionPaths.length; i++) {
fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]);
}
assertEquals("Must contain 150 records", 150,
HoodieClientTestUtils.read(basePath, sqlContext, fs, fullPartitionPaths).count());
// Check that the incremental consumption from time 000
assertEquals("Incremental consumption from latest commit, should give 50 updated records",
50,
@@ -384,7 +394,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
.build()).build();
HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
FileSystem fs = FSUtils.getFs();
HoodieTestDataGenerator.writePartitionMetadata(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, basePath);
HoodieTestDataGenerator
.writePartitionMetadata(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, basePath);
/**
* Write 1 (only inserts)
@@ -393,7 +404,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
client.startCommitWithTime(newCommitTime);
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 200);
List<WriteStatus> statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
List<WriteStatus> statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime)
.collect();
assertNoWriteErrors(statuses);
/**
@@ -437,7 +449,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// Verify there are no errors
assertNoWriteErrors(statuses);
List<String> partitionPaths = FSUtils.getAllPartitionPaths(fs, cfg.getBasePath(), getConfig().shouldAssumeDatePartitioning());
List<String> partitionPaths = FSUtils
.getAllPartitionPaths(fs, cfg.getBasePath(), getConfig().shouldAssumeDatePartitioning());
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
final TableFileSystemView.ReadOptimizedView view = table.getROFileSystemView();
@@ -478,7 +491,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
.build()).build();
HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
FileSystem fs = FSUtils.getFs();
HoodieTestDataGenerator.writePartitionMetadata(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, basePath);
HoodieTestDataGenerator
.writePartitionMetadata(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, basePath);
/**
* Write 1 (only inserts)
@@ -515,7 +529,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
// Verify there are no errors
assertNoWriteErrors(statuses);
List<String> partitionPaths = FSUtils.getAllPartitionPaths(fs, cfg.getBasePath(), getConfig().shouldAssumeDatePartitioning());
List<String> partitionPaths = FSUtils
.getAllPartitionPaths(fs, cfg.getBasePath(), getConfig().shouldAssumeDatePartitioning());
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
final TableFileSystemView.ReadOptimizedView view1 = table.getROFileSystemView();
@@ -525,7 +540,6 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
}).collect(Collectors.toList());
assertEquals("The data files for commit 003 should be present", 3, dataFiles.size());
/**
* Write 4 (updates)
*/
@@ -546,7 +560,6 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
}).collect(Collectors.toList());
assertEquals("The data files for commit 004 should be present", 3, dataFiles.size());
// rolling back to a non existent savepoint must not succeed
try {
client.rollbackToSavepoint("001");
@@ -606,8 +619,10 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// verify that there is a commit
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline();
assertEquals("Expecting a single commit.", 1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath())
.getCommitTimeline();
assertEquals("Expecting a single commit.", 1,
timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
// Should have 100 records in table (check using Index), all in locations marked at commit
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
assertFalse(table.getCompletedCommitTimeline().empty());
@@ -617,7 +632,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
assertEquals("The clean instant should be the same as the commit instant", commitTime,
table.getCompletedCleanTimeline().getInstants().findFirst().get().getTimestamp());
List<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table).collect();
List<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table)
.collect();
checkTaggedRecords(taggedRecords, newCommitTime);
// Keep doing some writes and clean inline. Make sure we have expected number of files remaining.
@@ -641,18 +657,20 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// compute all the versions of all files, from time 0
HashMap<String, TreeSet<String>> fileIdToVersions = new HashMap<>();
for (HoodieInstant entry : timeline.getInstants().collect(Collectors.toList())) {
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(entry).get());
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
.fromBytes(timeline.getInstantDetails(entry).get());
for (HoodieWriteStat wstat : commitMetadata.getWriteStats(partitionPath)) {
if (!fileIdToVersions.containsKey(wstat.getFileId())) {
fileIdToVersions.put(wstat.getFileId(), new TreeSet<>());
}
fileIdToVersions.get(wstat.getFileId()).add(FSUtils.getCommitTime(new Path(wstat.getPath()).getName()));
fileIdToVersions.get(wstat.getFileId())
.add(FSUtils.getCommitTime(new Path(wstat.getPath()).getName()));
}
}
List<HoodieFileGroup> fileGroups = fsView.getAllFileGroups(partitionPath).collect(Collectors.toList());
List<HoodieFileGroup> fileGroups = fsView.getAllFileGroups(partitionPath)
.collect(Collectors.toList());
for (HoodieFileGroup fileGroup : fileGroups) {
// No file has no more than max versions
@@ -665,7 +683,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// Each file, has the latest N versions (i.e cleaning gets rid of older versions)
List<String> commitedVersions = new ArrayList<>(fileIdToVersions.get(fileId));
for (int i = 0; i < dataFiles.size(); i++) {
assertEquals("File " + fileId + " does not have latest versions on commits" + commitedVersions,
assertEquals(
"File " + fileId + " does not have latest versions on commits" + commitedVersions,
Iterables.get(dataFiles, i).getCommitTime(),
commitedVersions.get(commitedVersions.size() - 1 - i));
}
@@ -700,8 +719,10 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// verify that there is a commit
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline();
assertEquals("Expecting a single commit.", 1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath())
.getCommitTimeline();
assertEquals("Expecting a single commit.", 1,
timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
// Should have 100 records in table (check using Index), all in locations marked at commit
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
@@ -712,7 +733,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
assertEquals("The clean instant should be the same as the commit instant", commitTime,
table.getCompletedCleanTimeline().getInstants().findFirst().get().getTimestamp());
List<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table).collect();
List<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table)
.collect();
checkTaggedRecords(taggedRecords, newCommitTime);
// Keep doing some writes and clean inline. Make sure we have expected number of files remaining.
@@ -734,7 +756,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
activeTimeline.getInstants().collect(Collectors.toSet());
if (earliestRetainedCommit.isPresent()) {
acceptableCommits.removeAll(
activeTimeline.findInstantsInRange("000", earliestRetainedCommit.get().getTimestamp()).getInstants()
activeTimeline.findInstantsInRange("000", earliestRetainedCommit.get().getTimestamp())
.getInstants()
.collect(Collectors.toSet()));
acceptableCommits.add(earliestRetainedCommit.get());
}
@@ -742,7 +765,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
TableFileSystemView fsView = table1.getFileSystemView();
// Need to ensure the following
for (String partitionPath : dataGen.getPartitionPaths()) {
List<HoodieFileGroup> fileGroups = fsView.getAllFileGroups(partitionPath).collect(Collectors.toList());
List<HoodieFileGroup> fileGroups = fsView.getAllFileGroups(partitionPath)
.collect(Collectors.toList());
for (HoodieFileGroup fileGroup : fileGroups) {
Set<String> commitTimes = new HashSet<>();
fileGroup.getAllDataFiles().forEach(value -> {
@@ -765,10 +789,9 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
String commitTime3 = "20160506030611";
new File(basePath + "/.hoodie").mkdirs();
HoodieTestDataGenerator.writePartitionMetadata(FSUtils.getFs(),
new String[] {"2016/05/01", "2016/05/02", "2016/05/06"},
new String[]{"2016/05/01", "2016/05/02", "2016/05/06"},
basePath);
// Only first two have commit files
HoodieTestUtils.createCommitFiles(basePath, commitTime1, commitTime2);
// Third one has a .inflight intermediate commit file
@@ -816,7 +839,6 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
client.rollback(commitTime3);
assertFalse(HoodieTestUtils.doesInflightExist(basePath, commitTime3));
// Rollback commit2
client.rollback(commitTime2);
assertFalse(HoodieTestUtils.doesCommitExist(basePath, commitTime2));
@@ -839,7 +861,6 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime2, file22) ||
HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime2, file23));
// Let's rollback commit1, Check results
client.rollback(commitTime1);
assertFalse(HoodieTestUtils.doesCommitExist(basePath, commitTime1));
@@ -858,7 +879,7 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
String commitTime3 = "20160506030611";
new File(basePath + "/.hoodie").mkdirs();
HoodieTestDataGenerator.writePartitionMetadata(FSUtils.getFs(),
new String[] {"2016/05/01", "2016/05/02", "2016/05/06"},
new String[]{"2016/05/01", "2016/05/02", "2016/05/06"},
basePath);
// One good commit
@@ -940,26 +961,29 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
final String TEST_PARTITION_PATH = "2016/09/26";
final int INSERT_SPLIT_LIMIT = 100;
// setup the small file handling params
HoodieWriteConfig config = getSmallInsertWriteConfig(INSERT_SPLIT_LIMIT); // hold upto 200 records max
dataGen = new HoodieTestDataGenerator(new String[] {TEST_PARTITION_PATH});
HoodieWriteConfig config = getSmallInsertWriteConfig(
INSERT_SPLIT_LIMIT); // hold upto 200 records max
dataGen = new HoodieTestDataGenerator(new String[]{TEST_PARTITION_PATH});
HoodieWriteClient client = new HoodieWriteClient(jsc, config);
// Inserts => will write file1
String commitTime1 = "001";
client.startCommitWithTime(commitTime1);
List<HoodieRecord> inserts1 = dataGen.generateInserts(commitTime1, INSERT_SPLIT_LIMIT); // this writes ~500kb
List<HoodieRecord> inserts1 = dataGen
.generateInserts(commitTime1, INSERT_SPLIT_LIMIT); // this writes ~500kb
Set<String> keys1 = HoodieClientTestUtils.getRecordKeys(inserts1);
JavaRDD<HoodieRecord> insertRecordsRDD1 = jsc.parallelize(inserts1, 1);
List<WriteStatus> statuses= client.upsert(insertRecordsRDD1, commitTime1).collect();
List<WriteStatus> statuses = client.upsert(insertRecordsRDD1, commitTime1).collect();
assertNoWriteErrors(statuses);
assertEquals("Just 1 file needs to be added.", 1, statuses.size());
String file1 = statuses.get(0).getFileId();
assertEquals("file should contain 100 records",
ParquetUtils.readRowKeysFromParquet(new Path(basePath, TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime1, 0, file1))).size(),
ParquetUtils.readRowKeysFromParquet(new Path(basePath,
TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime1, 0, file1))).size(),
100);
// Update + Inserts such that they just expand file1
@@ -977,15 +1001,20 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
assertEquals("Just 1 file needs to be updated.", 1, statuses.size());
assertEquals("Existing file should be expanded", file1, statuses.get(0).getFileId());
assertEquals("Existing file should be expanded", commitTime1, statuses.get(0).getStat().getPrevCommit());
Path newFile = new Path(basePath, TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime2, 0, file1));
assertEquals("file should contain 140 records", ParquetUtils.readRowKeysFromParquet(newFile).size(), 140);
assertEquals("Existing file should be expanded", commitTime1,
statuses.get(0).getStat().getPrevCommit());
Path newFile = new Path(basePath,
TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime2, 0, file1));
assertEquals("file should contain 140 records",
ParquetUtils.readRowKeysFromParquet(newFile).size(), 140);
List<GenericRecord> records = ParquetUtils.readAvroRecords(newFile);
for (GenericRecord record: records) {
for (GenericRecord record : records) {
String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
assertEquals("only expect commit2", commitTime2, record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString());
assertTrue("key expected to be part of commit2", keys2.contains(recordKey) || keys1.contains(recordKey));
assertEquals("only expect commit2", commitTime2,
record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString());
assertTrue("key expected to be part of commit2",
keys2.contains(recordKey) || keys1.contains(recordKey));
}
// update + inserts such that file1 is updated and expanded, a new file2 is created.
@@ -1004,14 +1033,15 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
HoodieTable table = HoodieTable.getHoodieTable(metadata, config);
TableFileSystemView.ReadOptimizedView fileSystemView = table.getROFileSystemView();
List<HoodieDataFile> files = fileSystemView.getLatestDataFilesBeforeOrOn(TEST_PARTITION_PATH, commitTime3).collect(
List<HoodieDataFile> files = fileSystemView
.getLatestDataFilesBeforeOrOn(TEST_PARTITION_PATH, commitTime3).collect(
Collectors.toList());
int numTotalInsertsInCommit3 = 0;
for (HoodieDataFile file: files) {
for (HoodieDataFile file : files) {
if (file.getFileName().contains(file1)) {
assertEquals("Existing file should be expanded", commitTime3, file.getCommitTime());
records = ParquetUtils.readAvroRecords(new Path(file.getPath()));
for (GenericRecord record: records) {
for (GenericRecord record : records) {
String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
String recordCommitTime = record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString();
if (recordCommitTime.equals(commitTime3)) {
@@ -1023,13 +1053,15 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
}
}
}
assertEquals("All keys added in commit 2 must be updated in commit3 correctly", 0, keys2.size());
assertEquals("All keys added in commit 2 must be updated in commit3 correctly", 0,
keys2.size());
} else {
assertEquals("New file must be written for commit 3", commitTime3, file.getCommitTime());
records = ParquetUtils.readAvroRecords(new Path(file.getPath()));
for (GenericRecord record: records) {
for (GenericRecord record : records) {
String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
assertEquals("only expect commit3", commitTime3, record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString());
assertEquals("only expect commit3", commitTime3,
record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString());
assertTrue("key expected to be part of commit3", keys3.contains(recordKey));
}
numTotalInsertsInCommit3 += records.size();
@@ -1044,17 +1076,19 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
final String TEST_PARTITION_PATH = "2016/09/26";
final int INSERT_SPLIT_LIMIT = 100;
// setup the small file handling params
HoodieWriteConfig config = getSmallInsertWriteConfig(INSERT_SPLIT_LIMIT); // hold upto 200 records max
dataGen = new HoodieTestDataGenerator(new String[] {TEST_PARTITION_PATH});
HoodieWriteConfig config = getSmallInsertWriteConfig(
INSERT_SPLIT_LIMIT); // hold upto 200 records max
dataGen = new HoodieTestDataGenerator(new String[]{TEST_PARTITION_PATH});
HoodieWriteClient client = new HoodieWriteClient(jsc, config);
// Inserts => will write file1
String commitTime1 = "001";
client.startCommitWithTime(commitTime1);
List<HoodieRecord> inserts1 = dataGen.generateInserts(commitTime1, INSERT_SPLIT_LIMIT); // this writes ~500kb
List<HoodieRecord> inserts1 = dataGen
.generateInserts(commitTime1, INSERT_SPLIT_LIMIT); // this writes ~500kb
Set<String> keys1 = HoodieClientTestUtils.getRecordKeys(inserts1);
JavaRDD<HoodieRecord> insertRecordsRDD1 = jsc.parallelize(inserts1, 1);
List<WriteStatus> statuses= client.insert(insertRecordsRDD1, commitTime1).collect();
List<WriteStatus> statuses = client.insert(insertRecordsRDD1, commitTime1).collect();
assertNoWriteErrors(statuses);
assertPartitionMetadata(new String[]{TEST_PARTITION_PATH}, FSUtils.getFs());
@@ -1062,7 +1096,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
assertEquals("Just 1 file needs to be added.", 1, statuses.size());
String file1 = statuses.get(0).getFileId();
assertEquals("file should contain 100 records",
ParquetUtils.readRowKeysFromParquet(new Path(basePath, TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime1, 0, file1))).size(),
ParquetUtils.readRowKeysFromParquet(new Path(basePath,
TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime1, 0, file1))).size(),
100);
// Second, set of Inserts should just expand file1
@@ -1076,16 +1111,21 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
assertEquals("Just 1 file needs to be updated.", 1, statuses.size());
assertEquals("Existing file should be expanded", file1, statuses.get(0).getFileId());
assertEquals("Existing file should be expanded", commitTime1, statuses.get(0).getStat().getPrevCommit());
Path newFile = new Path(basePath, TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime2, 0, file1));
assertEquals("file should contain 140 records", ParquetUtils.readRowKeysFromParquet(newFile).size(), 140);
assertEquals("Existing file should be expanded", commitTime1,
statuses.get(0).getStat().getPrevCommit());
Path newFile = new Path(basePath,
TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime2, 0, file1));
assertEquals("file should contain 140 records",
ParquetUtils.readRowKeysFromParquet(newFile).size(), 140);
List<GenericRecord> records = ParquetUtils.readAvroRecords(newFile);
for (GenericRecord record: records) {
for (GenericRecord record : records) {
String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
String recCommitTime = record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString();
assertTrue("Record expected to be part of commit 1 or commit2", commitTime1.equals(recCommitTime) || commitTime2.equals(recCommitTime));
assertTrue("key expected to be part of commit 1 or commit2", keys2.contains(recordKey) || keys1.contains(recordKey));
assertTrue("Record expected to be part of commit 1 or commit2",
commitTime1.equals(recCommitTime) || commitTime2.equals(recCommitTime));
assertTrue("key expected to be part of commit 1 or commit2",
keys2.contains(recordKey) || keys1.contains(recordKey));
}
// Lots of inserts such that file1 is updated and expanded, a new file2 is created.
@@ -1097,7 +1137,6 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
assertNoWriteErrors(statuses);
assertEquals("2 files needs to be committed.", 2, statuses.size());
FileSystem fs = FSUtils.getFs();
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config);
@@ -1106,14 +1145,14 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
.collect(Collectors.toList());
assertEquals("Total of 2 valid data files", 2, files.size());
int totalInserts = 0;
for (HoodieDataFile file: files) {
for (HoodieDataFile file : files) {
assertEquals("All files must be at commit 3", commitTime3, file.getCommitTime());
records = ParquetUtils.readAvroRecords(new Path(file.getPath()));
totalInserts += records.size();
}
assertEquals("Total number of records must add up", totalInserts, inserts1.size() + inserts2.size() + insert3.size());
assertEquals("Total number of records must add up", totalInserts,
inserts1.size() + inserts2.size() + insert3.size());
}
@Test
@@ -1130,27 +1169,35 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
String file1P0C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "000");
String file1P1C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "000");
HoodieTable table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config);
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
config);
List<HoodieCleanStat> hoodieCleanStatsOne = table.clean(jsc);
assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsOne, partitionPaths[0]).getSuccessDeleteFiles().size());
assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsOne, partitionPaths[1]).getSuccessDeleteFiles().size());
assertEquals("Must not clean any files", 0,
getCleanStat(hoodieCleanStatsOne, partitionPaths[0]).getSuccessDeleteFiles().size());
assertEquals("Must not clean any files", 0,
getCleanStat(hoodieCleanStatsOne, partitionPaths[1]).getSuccessDeleteFiles().size());
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "000", file1P1C0));
// make next commit, with 1 insert & 1 update per partition
HoodieTestUtils.createCommitFiles(basePath, "001");
table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config);
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
config);
String file2P0C1 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "001"); // insert
String file2P1C1 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "001"); // insert
String file2P0C1 = HoodieTestUtils
.createNewDataFile(basePath, partitionPaths[0], "001"); // insert
String file2P1C1 = HoodieTestUtils
.createNewDataFile(basePath, partitionPaths[1], "001"); // insert
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "001", file1P0C0); // update
HoodieTestUtils.createDataFile(basePath, partitionPaths[1], "001", file1P1C0); // update
List<HoodieCleanStat> hoodieCleanStatsTwo = table.clean(jsc);
assertEquals("Must clean 1 file" , 1, getCleanStat(hoodieCleanStatsTwo, partitionPaths[0]).getSuccessDeleteFiles().size());
assertEquals("Must clean 1 file" , 1, getCleanStat(hoodieCleanStatsTwo, partitionPaths[1]).getSuccessDeleteFiles().size());
assertEquals("Must clean 1 file", 1,
getCleanStat(hoodieCleanStatsTwo, partitionPaths[0]).getSuccessDeleteFiles().size());
assertEquals("Must clean 1 file", 1,
getCleanStat(hoodieCleanStatsTwo, partitionPaths[1]).getSuccessDeleteFiles().size());
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "001", file2P1C1));
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0));
@@ -1159,14 +1206,16 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// make next commit, with 2 updates to existing files, and 1 insert
HoodieTestUtils.createCommitFiles(basePath, "002");
table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config);
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
config);
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file1P0C0); // update
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file2P0C1); // update
String file3P0C2 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "002");
List<HoodieCleanStat> hoodieCleanStatsThree = table.clean(jsc);
assertEquals("Must clean two files" , 2, getCleanStat(hoodieCleanStatsThree, partitionPaths[0]).getSuccessDeleteFiles().size());
assertEquals("Must clean two files", 2,
getCleanStat(hoodieCleanStatsThree, partitionPaths[0]).getSuccessDeleteFiles().size());
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file1P0C0));
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "002", file3P0C2));
@@ -1174,7 +1223,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// No cleaning on partially written file, with no commit.
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "003", file3P0C2); // update
List<HoodieCleanStat> hoodieCleanStatsFour = table.clean(jsc);
assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsFour, partitionPaths[0]).getSuccessDeleteFiles().size());
assertEquals("Must not clean any files", 0,
getCleanStat(hoodieCleanStatsFour, partitionPaths[0]).getSuccessDeleteFiles().size());
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "002", file3P0C2));
}
@@ -1187,31 +1237,39 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
.withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS)
.retainFileVersions(1).build()).build();
HoodieTableMetaClient metaClient = HoodieTestUtils.initTableType(basePath, HoodieTableType.MERGE_ON_READ);
HoodieTableMetaClient metaClient = HoodieTestUtils
.initTableType(basePath, HoodieTableType.MERGE_ON_READ);
// Make 3 files, one base file and 2 log files associated with base file
String file1P0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "000");
String file2P0L0 = HoodieTestUtils.createNewLogFile(basePath, partitionPaths[0], "000", file1P0, Optional.empty());
String file2P0L1 = HoodieTestUtils.createNewLogFile(basePath, partitionPaths[0], "000", file1P0, Optional.of(2));
String file2P0L0 = HoodieTestUtils
.createNewLogFile(basePath, partitionPaths[0], "000", file1P0, Optional.empty());
String file2P0L1 = HoodieTestUtils
.createNewLogFile(basePath, partitionPaths[0], "000", file1P0, Optional.of(2));
// make 1 compaction commit
HoodieTestUtils.createCompactionCommitFiles(basePath, "000");
// Make 4 files, one base file and 3 log files associated with base file
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "001", file1P0);
file2P0L0 = HoodieTestUtils.createNewLogFile(basePath, partitionPaths[0], "001", file1P0, Optional.empty());
file2P0L0 = HoodieTestUtils.createNewLogFile(basePath, partitionPaths[0], "001", file1P0, Optional.of(2));
file2P0L0 = HoodieTestUtils.createNewLogFile(basePath, partitionPaths[0], "001", file1P0, Optional.of(3));
file2P0L0 = HoodieTestUtils
.createNewLogFile(basePath, partitionPaths[0], "001", file1P0, Optional.empty());
file2P0L0 = HoodieTestUtils
.createNewLogFile(basePath, partitionPaths[0], "001", file1P0, Optional.of(2));
file2P0L0 = HoodieTestUtils
.createNewLogFile(basePath, partitionPaths[0], "001", file1P0, Optional.of(3));
// make 1 compaction commit
HoodieTestUtils.createCompactionCommitFiles(basePath, "001");
HoodieTable table = HoodieTable
.getHoodieTable(metaClient, config);
List<HoodieCleanStat> hoodieCleanStats = table.clean(jsc);
assertEquals("Must clean three files, one parquet and 2 log files" , 3, getCleanStat(hoodieCleanStats, partitionPaths[0]).getSuccessDeleteFiles().size());
assertEquals("Must clean three files, one parquet and 2 log files", 3,
getCleanStat(hoodieCleanStats, partitionPaths[0]).getSuccessDeleteFiles().size());
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0));
assertFalse(HoodieTestUtils.doesLogFileExist(basePath, partitionPaths[0], "000", file2P0L0, Optional.empty()));
assertFalse(HoodieTestUtils.doesLogFileExist(basePath, partitionPaths[0], "000", file2P0L0, Optional.of(2)));
assertFalse(HoodieTestUtils
.doesLogFileExist(basePath, partitionPaths[0], "000", file2P0L0, Optional.empty()));
assertFalse(HoodieTestUtils
.doesLogFileExist(basePath, partitionPaths[0], "000", file2P0L0, Optional.of(2)));
}
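
A minimal sketch of wiring the cleaner policy this test exercises into a write config (other required options such as schema and table name are omitted for brevity; the enclosing package of HoodieCleaningPolicy is not shown in this diff and is assumed to be on the classpath):

HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath)
    .withCompactionConfig(HoodieCompactionConfig.newBuilder()
        .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS)
        .retainFileVersions(1)   // keep only the newest version of each file group
        .build())
    .build();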
@Test
@@ -1229,27 +1287,35 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
String file1P1C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "000");
HoodieTable table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config);
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
config);
List<HoodieCleanStat> hoodieCleanStatsOne = table.clean(jsc);
assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsOne, partitionPaths[0]).getSuccessDeleteFiles().size());
assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsOne, partitionPaths[1]).getSuccessDeleteFiles().size());
assertEquals("Must not clean any files", 0,
getCleanStat(hoodieCleanStatsOne, partitionPaths[0]).getSuccessDeleteFiles().size());
assertEquals("Must not clean any files", 0,
getCleanStat(hoodieCleanStatsOne, partitionPaths[1]).getSuccessDeleteFiles().size());
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "000", file1P1C0));
// make next commit, with 1 insert & 1 update per partition
HoodieTestUtils.createCommitFiles(basePath, "001");
table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config);
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
config);
String file2P0C1 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "001"); // insert
String file2P1C1 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "001"); // insert
String file2P0C1 = HoodieTestUtils
.createNewDataFile(basePath, partitionPaths[0], "001"); // insert
String file2P1C1 = HoodieTestUtils
.createNewDataFile(basePath, partitionPaths[1], "001"); // insert
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "001", file1P0C0); // update
HoodieTestUtils.createDataFile(basePath, partitionPaths[1], "001", file1P1C0); // update
List<HoodieCleanStat> hoodieCleanStatsTwo = table.clean(jsc);
assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsTwo, partitionPaths[0]).getSuccessDeleteFiles().size());
assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsTwo, partitionPaths[1]).getSuccessDeleteFiles().size());
assertEquals("Must not clean any files", 0,
getCleanStat(hoodieCleanStatsTwo, partitionPaths[0]).getSuccessDeleteFiles().size());
assertEquals("Must not clean any files", 0,
getCleanStat(hoodieCleanStatsTwo, partitionPaths[1]).getSuccessDeleteFiles().size());
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "001", file2P1C1));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0));
@@ -1258,7 +1324,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// make next commit, with 2 updates to existing files, and 1 insert
HoodieTestUtils.createCommitFiles(basePath, "002");
table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config);
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
config);
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file1P0C0); // update
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file2P0C1); // update
@@ -1274,7 +1341,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// make next commit, with 2 updates to existing files, and 1 insert
HoodieTestUtils.createCommitFiles(basePath, "003");
table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config);
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
config);
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "003", file1P0C0); // update
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "003", file2P0C1); // update
@@ -1282,7 +1350,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
List<HoodieCleanStat> hoodieCleanStatsFour = table.clean(jsc);
assertEquals(
"Must not clean one old file", 1, getCleanStat(hoodieCleanStatsFour, partitionPaths[0]).getSuccessDeleteFiles().size());
"Must not clean one old file", 1,
getCleanStat(hoodieCleanStatsFour, partitionPaths[0]).getSuccessDeleteFiles().size());
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file1P0C0));
@@ -1295,7 +1364,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// No cleaning on partially written file, with no commit.
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "004", file3P0C2); // update
List<HoodieCleanStat> hoodieCleanStatsFive = table.clean(jsc);
assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsFive, partitionPaths[0]).getSuccessDeleteFiles().size());
assertEquals("Must not clean any files", 0,
getCleanStat(hoodieCleanStatsFive, partitionPaths[0]).getSuccessDeleteFiles().size());
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file1P0C0));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1));
}
@@ -1344,13 +1414,14 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
Iterator<AccumulatorV2<?, ?>> iterator = taskEnd.taskMetrics().accumulators()
.iterator();
while(iterator.hasNext()) {
while (iterator.hasNext()) {
AccumulatorV2 accumulator = iterator.next();
if (taskEnd.stageId() == 1 &&
accumulator.isRegistered() &&
accumulator.name().isDefined() &&
accumulator.name().get().equals("internal.metrics.shuffle.read.recordsRead")) {
stageOneShuffleReadTaskRecordsCountMap.put(taskEnd.taskInfo().taskId(), (Long) accumulator.value());
stageOneShuffleReadTaskRecordsCountMap
.put(taskEnd.taskInfo().taskId(), (Long) accumulator.value());
}
}
}
@@ -1378,22 +1449,27 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
updateAllFilesInPartition(filesP2C0, partitionPaths[2], "003");
HoodieTable table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config);
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
config);
List<HoodieCleanStat> hoodieCleanStats = table.clean(jsc);
assertEquals(100, getCleanStat(hoodieCleanStats, partitionPaths[0]).getSuccessDeleteFiles().size());
assertEquals(10, getCleanStat(hoodieCleanStats, partitionPaths[1]).getSuccessDeleteFiles().size());
assertEquals(10, getCleanStat(hoodieCleanStats, partitionPaths[2]).getSuccessDeleteFiles().size());
assertEquals(100,
getCleanStat(hoodieCleanStats, partitionPaths[0]).getSuccessDeleteFiles().size());
assertEquals(10,
getCleanStat(hoodieCleanStats, partitionPaths[1]).getSuccessDeleteFiles().size());
assertEquals(10,
getCleanStat(hoodieCleanStats, partitionPaths[2]).getSuccessDeleteFiles().size());
// 3 tasks are expected since the number of partitions is 3
assertEquals(3, stageOneShuffleReadTaskRecordsCountMap.keySet().size());
// Sum of all records processed = total number of files to clean
assertEquals(120, stageOneShuffleReadTaskRecordsCountMap
.values().stream().reduce((a,b) -> a + b).get().intValue());
.values().stream().reduce((a, b) -> a + b).get().intValue());
assertTrue("The skew in handling files to clean is not removed. "
+ "Each task should handle more records than the partitionPath with least files "
+ "and less records than the partitionPath with most files.",
stageOneShuffleReadTaskRecordsCountMap.values().stream().filter(a -> a > 10 && a < 100).count() == 3);
stageOneShuffleReadTaskRecordsCountMap.values().stream().filter(a -> a > 10 && a < 100)
.count() == 3);
}
public void testCommitWritesRelativePaths() throws Exception {
@@ -1454,7 +1530,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
}
}
private List<String> createFilesInPartition(String partitionPath, String commitTime, int numFiles) throws IOException {
private List<String> createFilesInPartition(String partitionPath, String commitTime, int numFiles)
throws IOException {
List<String> files = new ArrayList<>();
for (int i = 0; i < numFiles; i++) {
files.add(HoodieTestUtils.createNewDataFile(basePath, partitionPath, commitTime));

View File

@@ -29,15 +29,6 @@ import com.uber.hoodie.common.table.view.HoodieTableFileSystemView;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.exception.HoodieException;
import com.uber.hoodie.table.HoodieTable;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.SparkConf;
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
@@ -49,6 +40,12 @@ import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
/**
* Utility methods to aid testing inside the HoodieClient module.
@@ -66,14 +63,15 @@ public class HoodieClientTestUtils {
public static Set<String> getRecordKeys(List<HoodieRecord> hoodieRecords) {
Set<String> keys = new HashSet<>();
for (HoodieRecord rec: hoodieRecords) {
for (HoodieRecord rec : hoodieRecords) {
keys.add(rec.getRecordKey());
}
return keys;
}
private static void fakeMetaFile(String basePath, String commitTime, String suffix) throws IOException {
String parentPath = basePath + "/"+ HoodieTableMetaClient.METAFOLDER_NAME;
private static void fakeMetaFile(String basePath, String commitTime, String suffix)
throws IOException {
String parentPath = basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME;
new File(parentPath).mkdirs();
new File(parentPath + "/" + commitTime + suffix).createNewFile();
}
@@ -87,14 +85,17 @@ public class HoodieClientTestUtils {
fakeMetaFile(basePath, commitTime, HoodieTimeline.INFLIGHT_EXTENSION);
}
public static void fakeDataFile(String basePath, String partitionPath, String commitTime, String fileId) throws Exception {
public static void fakeDataFile(String basePath, String partitionPath, String commitTime,
String fileId) throws Exception {
fakeDataFile(basePath, partitionPath, commitTime, fileId, 0);
}
public static void fakeDataFile(String basePath, String partitionPath, String commitTime, String fileId, long length) throws Exception {
public static void fakeDataFile(String basePath, String partitionPath, String commitTime,
String fileId, long length) throws Exception {
String parentPath = String.format("%s/%s", basePath, partitionPath);
new File(parentPath).mkdirs();
String path = String.format("%s/%s", parentPath, FSUtils.makeDataFileName(commitTime, 0, fileId));
String path = String
.format("%s/%s", parentPath, FSUtils.makeDataFileName(commitTime, 0, fileId));
new File(path).createNewFile();
new RandomAccessFile(path, "rw").setLength(length);
}
@@ -129,7 +130,8 @@ public class HoodieClientTestUtils {
new HoodieException("No commit exists at " + commitTime);
}
try {
HashMap<String, String> paths = getLatestFileIDsToFullPath(basePath, commitTimeline, Arrays.asList(commitInstant));
HashMap<String, String> paths = getLatestFileIDsToFullPath(basePath, commitTimeline,
Arrays.asList(commitInstant));
return sqlContext.read()
.parquet(paths.values().toArray(new String[paths.size()]))
.filter(String.format("%s ='%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitTime));
@@ -150,12 +152,15 @@ public class HoodieClientTestUtils {
.getInstants().collect(Collectors.toList());
try {
// Go over the commit metadata, and obtain the new files that need to be read.
HashMap<String, String> fileIdToFullPath = getLatestFileIDsToFullPath(basePath, commitTimeline, commitsToReturn);
HashMap<String, String> fileIdToFullPath = getLatestFileIDsToFullPath(basePath,
commitTimeline, commitsToReturn);
return sqlContext.read()
.parquet(fileIdToFullPath.values().toArray(new String[fileIdToFullPath.size()]))
.filter(String.format("%s >'%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTime));
.filter(
String.format("%s >'%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTime));
} catch (IOException e) {
throw new HoodieException("Error pulling data incrementally from commitTimestamp :" + lastCommitTime, e);
throw new HoodieException(
"Error pulling data incrementally from commitTimestamp :" + lastCommitTime, e);
}
}
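The readCommit/readSince hunks above both end in the same pattern: resolve the latest parquet file per file id, then filter on the commit-time metadata column. A hedged usage sketch of that filter follows; the helper class, method name, and parquet paths are made up, while the metadata field constant and the reader/filter calls are the ones used above.

import com.uber.hoodie.common.model.HoodieRecord;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;

// Hypothetical helper: read everything written strictly after a given commit time
// from an already-resolved set of parquet paths.
class IncrementalReadSketch {

  static Dataset<Row> readAfter(SQLContext sqlContext, String[] parquetPaths, String lastCommitTime) {
    return sqlContext.read()
        .parquet(parquetPaths)
        // Every record is stamped with its commit time in the _hoodie_commit_time column,
        // so a plain SQL predicate is enough to pull the increment.
        .filter(String.format("%s > '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTime));
  }
}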
@@ -171,7 +176,8 @@ public class HoodieClientTestUtils {
HoodieTable hoodieTable = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(fs, basePath, true), null);
for (String path : paths) {
TableFileSystemView.ReadOptimizedView fileSystemView = new HoodieTableFileSystemView(hoodieTable.getMetaClient(),
TableFileSystemView.ReadOptimizedView fileSystemView = new HoodieTableFileSystemView(
hoodieTable.getMetaClient(),
hoodieTable.getCompletedCommitTimeline(), fs.globStatus(new Path(path)));
List<HoodieDataFile> latestFiles = fileSystemView.getLatestDataFiles().collect(
Collectors.toList());

View File

@@ -16,9 +16,16 @@
package com.uber.hoodie.common;
import static com.uber.hoodie.common.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.HoodieAvroUtils;
import com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.GenericRecordBuilder;
@@ -30,20 +37,13 @@ import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import static com.uber.hoodie.common.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA;
/**
* Utility methods to aid in testing MergeOnRead (workaround for HoodieReadClient for MOR)
*/
public class HoodieMergeOnReadTestUtils {
public static List<GenericRecord> getRecordsUsingInputFormat(List<String> inputPaths) throws IOException {
public static List<GenericRecord> getRecordsUsingInputFormat(List<String> inputPaths)
throws IOException {
JobConf jobConf = new JobConf();
Schema schema = HoodieAvroUtils.addMetadataFields(Schema.parse(TRIP_EXAMPLE_SCHEMA));
HoodieRealtimeInputFormat inputFormat = new HoodieRealtimeInputFormat();
@@ -75,10 +75,12 @@ public class HoodieMergeOnReadTestUtils {
}).get();
}
private static void setPropsForInputFormat(HoodieRealtimeInputFormat inputFormat, JobConf jobConf, Schema schema) {
private static void setPropsForInputFormat(HoodieRealtimeInputFormat inputFormat, JobConf jobConf,
Schema schema) {
List<Schema.Field> fields = schema.getFields();
String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
String postions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
String postions = fields.stream().map(f -> String.valueOf(f.pos()))
.collect(Collectors.joining(","));
Configuration conf = FSUtils.getFs().getConf();
jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);
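For reference, a hedged usage sketch of the helper whose reshaped signature appears above; the partition path and the wrapper class are illustrative, while the method reads merged records through HoodieRealtimeInputFormat with the column projection configured as shown.

import com.uber.hoodie.common.HoodieMergeOnReadTestUtils;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import org.apache.avro.generic.GenericRecord;

class MergeOnReadReadExample {

  static void printRowKeys(String basePath) throws IOException {
    // Read whatever is visible on the realtime view of one partition.
    List<GenericRecord> records = HoodieMergeOnReadTestUtils
        .getRecordsUsingInputFormat(Arrays.asList(basePath + "/2016/03/15"));
    for (GenericRecord record : records) {
      System.out.println(record.get("_row_key"));
    }
  }
}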

View File

@@ -16,17 +16,21 @@
package com.uber.hoodie.common;
import com.uber.hoodie.avro.model.HoodieCleanMetadata;
import com.uber.hoodie.common.model.HoodieCleaningPolicy;
import com.uber.hoodie.common.model.HoodieCommitMetadata;
import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodiePartitionMetadata;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.util.AvroUtils;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.HoodieAvroUtils;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.Random;
import java.util.UUID;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
@@ -34,15 +38,6 @@ import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import java.util.Random;
import java.util.UUID;
/**
* Class to be used in tests to keep generating test inserts and updates against a corpus.
*
@@ -51,6 +46,7 @@ import java.util.UUID;
public class HoodieTestDataGenerator {
static class KeyPartition {
HoodieKey key;
String partitionPath;
}
@@ -74,14 +70,17 @@ public class HoodieTestDataGenerator {
public static final String[] DEFAULT_PARTITION_PATHS = {"2016/03/15", "2015/03/16", "2015/03/17"};
public static void writePartitionMetadata(FileSystem fs, String[] partitionPaths, String basePath) {
for (String partitionPath: partitionPaths) {
new HoodiePartitionMetadata(fs, "000", new Path(basePath), new Path(basePath, partitionPath)).trySave(0);
public static void writePartitionMetadata(FileSystem fs, String[] partitionPaths,
String basePath) {
for (String partitionPath : partitionPaths) {
new HoodiePartitionMetadata(fs, "000", new Path(basePath), new Path(basePath, partitionPath))
.trySave(0);
}
}
private List<KeyPartition> existingKeysList = new ArrayList<>();
public static Schema avroSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA));
public static Schema avroSchema = HoodieAvroUtils
.addMetadataFields(new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA));
private static Random rand = new Random(46474747);
private String[] partitionPaths = DEFAULT_PARTITION_PATHS;
@@ -95,8 +94,8 @@ public class HoodieTestDataGenerator {
/**
* Generates new inserts, uniformly across the partition paths above. It also updates the list
* of existing keys.
* Generates new inserts, uniformly across the partition paths above. It also updates the list of
* existing keys.
*/
public List<HoodieRecord> generateInserts(String commitTime, int n) throws IOException {
List<HoodieRecord> inserts = new ArrayList<>();
@@ -119,9 +118,10 @@ public class HoodieTestDataGenerator {
return generateDeletesFromExistingRecords(inserts);
}
public List<HoodieRecord> generateDeletesFromExistingRecords(List<HoodieRecord> existingRecords) throws IOException {
public List<HoodieRecord> generateDeletesFromExistingRecords(List<HoodieRecord> existingRecords)
throws IOException {
List<HoodieRecord> deletes = new ArrayList<>();
for (HoodieRecord existingRecord: existingRecords) {
for (HoodieRecord existingRecord : existingRecords) {
HoodieRecord record = generateDeleteRecord(existingRecord);
deletes.add(record);
@@ -131,14 +131,17 @@ public class HoodieTestDataGenerator {
public HoodieRecord generateDeleteRecord(HoodieRecord existingRecord) throws IOException {
HoodieKey key = existingRecord.getKey();
TestRawTripPayload payload = new TestRawTripPayload(Optional.empty(), key.getRecordKey(), key.getPartitionPath(), null, true);
TestRawTripPayload payload = new TestRawTripPayload(Optional.empty(), key.getRecordKey(),
key.getPartitionPath(), null, true);
return new HoodieRecord(key, payload);
}
public List<HoodieRecord> generateUpdates(String commitTime, List<HoodieRecord> baseRecords) throws IOException {
public List<HoodieRecord> generateUpdates(String commitTime, List<HoodieRecord> baseRecords)
throws IOException {
List<HoodieRecord> updates = new ArrayList<>();
for (HoodieRecord baseRecord: baseRecords) {
HoodieRecord record = new HoodieRecord(baseRecord.getKey(), generateRandomValue(baseRecord.getKey(), commitTime));
for (HoodieRecord baseRecord : baseRecords) {
HoodieRecord record = new HoodieRecord(baseRecord.getKey(),
generateRandomValue(baseRecord.getKey(), commitTime));
updates.add(record);
}
return updates;
@@ -162,11 +165,13 @@ public class HoodieTestDataGenerator {
* Generates a new avro record of the above schema format, retaining the key if optionally
* provided.
*/
public static TestRawTripPayload generateRandomValue(HoodieKey key, String commitTime) throws IOException {
public static TestRawTripPayload generateRandomValue(HoodieKey key, String commitTime)
throws IOException {
GenericRecord rec = generateGenericRecord(key.getRecordKey(), "rider-" + commitTime,
"driver-" + commitTime, 0.0);
HoodieAvroUtils.addCommitMetadataToRecord(rec, commitTime, "-1");
return new TestRawTripPayload(rec.toString(), key.getRecordKey(), key.getPartitionPath(), TRIP_EXAMPLE_SCHEMA);
return new TestRawTripPayload(rec.toString(), key.getRecordKey(), key.getPartitionPath(),
TRIP_EXAMPLE_SCHEMA);
}
public static GenericRecord generateGenericRecord(String rowKey, String riderName,
@@ -186,7 +191,8 @@ public class HoodieTestDataGenerator {
public static void createCommitFile(String basePath, String commitTime) throws IOException {
Path commitFile =
new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline.makeCommitFileName(commitTime));
new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline
.makeCommitFileName(commitTime));
FileSystem fs = FSUtils.getFs();
FSDataOutputStream os = fs.create(commitFile, true);
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
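The generator API reshaped above is easiest to read as a sequence of calls: inserts for one commit, then updates and deletes derived from those inserts. A hedged usage sketch follows; the no-argument constructor and the wrapper class are assumptions, while the method signatures are the ones visible in the hunks.

import com.uber.hoodie.common.HoodieTestDataGenerator;
import com.uber.hoodie.common.model.HoodieRecord;
import java.io.IOException;
import java.util.List;

class DataGeneratorExample {

  static void generate() throws IOException {
    HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();

    // 100 brand new records spread uniformly over the default partition paths, tagged "001".
    List<HoodieRecord> inserts = dataGen.generateInserts("001", 100);

    // New payloads for the same keys, tagged with a later commit time.
    List<HoodieRecord> updates = dataGen.generateUpdates("002", inserts);

    // Delete payloads (empty insert value) for every record passed in.
    List<HoodieRecord> deletes = dataGen.generateDeletesFromExistingRecords(inserts);
  }
}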

View File

@@ -17,31 +17,32 @@
package com.uber.hoodie.common;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.avro.MercifulJsonConverter;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordPayload;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.StringWriter;
import java.util.HashMap;
import java.util.List;
import java.util.Map.Entry;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.commons.io.IOUtils;
import java.io.*;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Optional;
import java.util.zip.Deflater;
import java.util.zip.DeflaterOutputStream;
import java.util.zip.InflaterInputStream;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.commons.io.IOUtils;
/**
* Example row change event based on some example data used by testcases. The data avro schema is
* src/test/resources/schema1.
*/
public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayload> {
private transient static final ObjectMapper mapper = new ObjectMapper();
private String partitionPath;
private String rowKey;
@@ -51,7 +52,7 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
public TestRawTripPayload(Optional<String> jsonData, String rowKey, String partitionPath,
String schemaStr, Boolean isDeleted) throws IOException {
if(jsonData.isPresent()) {
if (jsonData.isPresent()) {
this.jsonDataCompressed = compressData(jsonData.get());
this.dataSize = jsonData.get().length();
}
@@ -61,7 +62,7 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
}
public TestRawTripPayload(String jsonData, String rowKey, String partitionPath,
String schemaStr)throws IOException {
String schemaStr) throws IOException {
this(Optional.of(jsonData), rowKey, partitionPath, schemaStr, false);
}
@@ -79,16 +80,20 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
}
@Override public TestRawTripPayload preCombine(TestRawTripPayload another) {
@Override
public TestRawTripPayload preCombine(TestRawTripPayload another) {
return another;
}
@Override public Optional<IndexedRecord> combineAndGetUpdateValue(IndexedRecord oldRec, Schema schema) throws IOException {
@Override
public Optional<IndexedRecord> combineAndGetUpdateValue(IndexedRecord oldRec, Schema schema)
throws IOException {
return this.getInsertValue(schema);
}
@Override public Optional<IndexedRecord> getInsertValue(Schema schema) throws IOException {
if(isDeleted){
@Override
public Optional<IndexedRecord> getInsertValue(Schema schema) throws IOException {
if (isDeleted) {
return Optional.empty();
} else {
MercifulJsonConverter jsonConverter = new MercifulJsonConverter(schema);
@@ -135,16 +140,17 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
}
/**
* A custom {@link WriteStatus} that merges passed metadata key value map
* to {@code WriteStatus.markSuccess()} and {@code WriteStatus.markFailure()}.
* A custom {@link WriteStatus} that merges passed metadata key value map to {@code
* WriteStatus.markSuccess()} and {@code WriteStatus.markFailure()}.
*/
public static class MetadataMergeWriteStatus extends WriteStatus {
private Map<String, String> mergedMetadataMap = new HashMap<>();
@Override
public void markSuccess(HoodieRecord record, Optional<Map<String, String>> recordMetadata) {
super.markSuccess(record, recordMetadata);
if(recordMetadata.isPresent()) {
if (recordMetadata.isPresent()) {
mergeMetadataMaps(recordMetadata.get(), mergedMetadataMap);
}
}
@@ -153,25 +159,27 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
public void markFailure(HoodieRecord record, Throwable t,
Optional<Map<String, String>> recordMetadata) {
super.markFailure(record, t, recordMetadata);
if(recordMetadata.isPresent()) {
if (recordMetadata.isPresent()) {
mergeMetadataMaps(recordMetadata.get(), mergedMetadataMap);
}
}
public static Map<String, String> mergeMetadataForWriteStatuses(List<WriteStatus> writeStatuses) {
public static Map<String, String> mergeMetadataForWriteStatuses(
List<WriteStatus> writeStatuses) {
Map<String, String> allWriteStatusMergedMetadataMap = new HashMap<>();
for (WriteStatus writeStatus : writeStatuses) {
MetadataMergeWriteStatus.mergeMetadataMaps(
((MetadataMergeWriteStatus)writeStatus).getMergedMetadataMap(),
((MetadataMergeWriteStatus) writeStatus).getMergedMetadataMap(),
allWriteStatusMergedMetadataMap);
}
return allWriteStatusMergedMetadataMap;
}
private static void mergeMetadataMaps(Map<String, String> mergeFromMap, Map<String, String> mergeToMap) {
private static void mergeMetadataMaps(Map<String, String> mergeFromMap,
Map<String, String> mergeToMap) {
for (Entry<String, String> entry : mergeFromMap.entrySet()) {
String key = entry.getKey();
if(!mergeToMap.containsKey(key)) {
if (!mergeToMap.containsKey(key)) {
mergeToMap.put(key, "0");
}
mergeToMap
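The custom WriteStatus above only takes effect if the write config is told to use it; that wiring appears again in the TestCopyOnWriteTable hunks later in this commit. A minimal sketch of it, with the base path and wrapper class purely illustrative:

import com.uber.hoodie.common.TestRawTripPayload.MetadataMergeWriteStatus;
import com.uber.hoodie.config.HoodieWriteConfig;

class MetadataMergeConfigExample {

  static HoodieWriteConfig buildConfig(String basePath) {
    // Every WriteStatus handed back by the client will then be a MetadataMergeWriteStatus,
    // carrying the merged per-record metadata maps for later inspection.
    return HoodieWriteConfig.newBuilder()
        .withPath(basePath)
        .withWriteStatusClass(MetadataMergeWriteStatus.class)
        .build();
  }
}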

View File

@@ -16,7 +16,7 @@
package com.uber.hoodie.config;
import static org.junit.Assert.*;
import static org.junit.Assert.assertEquals;
import com.google.common.collect.Maps;
import com.uber.hoodie.config.HoodieWriteConfig.Builder;
@@ -29,6 +29,7 @@ import java.util.Properties;
import org.junit.Test;
public class HoodieWriteConfigTest {
@Test
public void testPropertyLoading() throws IOException {
Builder builder = HoodieWriteConfig.newBuilder().withPath("/tmp");
@@ -46,9 +47,10 @@ public class HoodieWriteConfigTest {
HoodieWriteConfig config = builder.build();
assertEquals(config.getMaxCommitsToKeep(), 5);
assertEquals(config.getMinCommitsToKeep(), 2);
}
}
private ByteArrayOutputStream saveParamsIntoOutputStream(Map<String, String> params) throws IOException {
private ByteArrayOutputStream saveParamsIntoOutputStream(Map<String, String> params)
throws IOException {
Properties properties = new Properties();
properties.putAll(params);
ByteArrayOutputStream outStream = new ByteArrayOutputStream();

View File

@@ -16,31 +16,30 @@
package com.uber.hoodie.func;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.config.HoodieWriteConfig;
import static org.junit.Assert.fail;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.TestRawTripPayload;
import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordLocation;
import com.uber.hoodie.common.model.HoodieTestUtils;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.table.HoodieCopyOnWriteTable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.Path;
import org.junit.Before;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import static org.junit.Assert.fail;
public class TestUpdateMapFunction {
private String basePath = null;
@Before
@@ -90,7 +89,6 @@ public class TestUpdateMapFunction {
String fileId = insertResult.next().get(0).getFileId();
System.out.println(fileId);
table = new HoodieCopyOnWriteTable(config, metadata);
// New content with values for the newly added field
recordStr1 =

View File

@@ -16,17 +16,16 @@
package com.uber.hoodie.index;
import com.uber.hoodie.config.HoodieWriteConfig;
import static org.junit.Assert.assertTrue;
import com.uber.hoodie.config.HoodieIndexConfig;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.index.bloom.HoodieBloomIndex;
import com.uber.hoodie.index.hbase.HBaseIndex;
import org.junit.Test;
import static org.junit.Assert.*;
public class TestHoodieIndex {
@Test
public void testCreateIndex() throws Exception {
HoodieWriteConfig.Builder clientConfigBuilder = HoodieWriteConfig.newBuilder();

View File

@@ -18,28 +18,39 @@
package com.uber.hoodie.index.bloom;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import com.google.common.base.Optional;
import com.google.common.collect.Lists;
import com.uber.hoodie.common.HoodieClientTestUtils;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.config.HoodieIndexConfig;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.avro.HoodieAvroWriteSupport;
import com.uber.hoodie.common.BloomFilter;
import com.uber.hoodie.common.HoodieClientTestUtils;
import com.uber.hoodie.common.TestRawTripPayload;
import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieTestUtils;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.HoodieAvroUtils;
import com.uber.hoodie.index.bloom.BloomIndexFileInfo;
import com.uber.hoodie.index.bloom.HoodieBloomIndex;
import com.uber.hoodie.index.bloom.HoodieBloomIndexCheckFunction;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.io.storage.HoodieParquetConfig;
import com.uber.hoodie.io.storage.HoodieParquetWriter;
import com.uber.hoodie.table.HoodieTable;
import java.io.File;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.stream.Collectors;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.commons.io.IOUtils;
@@ -47,11 +58,8 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroSchemaConverter;
import org.apache.parquet.avro.AvroWriteSupport;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.api.WriteSupport;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
@@ -59,20 +67,10 @@ import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import org.mockito.Mockito;
import scala.Tuple2;
import java.io.File;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import static org.junit.Assert.*;
public class TestHoodieBloomIndex {
private JavaSparkContext jsc = null;
private String basePath = null;
private transient final FileSystem fs;
@@ -106,15 +104,20 @@ public class TestHoodieBloomIndex {
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
HoodieRecord record1 = new HoodieRecord(
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
HoodieRecord record2 = new HoodieRecord(
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
HoodieRecord record3 = new HoodieRecord(
new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
HoodieRecord record4 = new HoodieRecord(
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4));
JavaRDD<HoodieRecord> recordRDD = jsc
.parallelize(Arrays.asList(record1, record2, record3, record4));
// Load to memory
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
@@ -144,20 +147,31 @@ public class TestHoodieBloomIndex {
new File(basePath + "/2016/04/01").mkdirs();
new File(basePath + "/2015/03/12").mkdirs();
TestRawTripPayload rowChange1 = new TestRawTripPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
TestRawTripPayload rowChange2 = new TestRawTripPayload("{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
TestRawTripPayload rowChange3 = new TestRawTripPayload("{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
TestRawTripPayload rowChange4 = new TestRawTripPayload("{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
TestRawTripPayload rowChange1 = new TestRawTripPayload(
"{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
HoodieRecord record1 = new HoodieRecord(
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
TestRawTripPayload rowChange2 = new TestRawTripPayload(
"{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
HoodieRecord record2 = new HoodieRecord(
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
TestRawTripPayload rowChange3 = new TestRawTripPayload(
"{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
HoodieRecord record3 = new HoodieRecord(
new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
TestRawTripPayload rowChange4 = new TestRawTripPayload(
"{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
HoodieRecord record4 = new HoodieRecord(
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
writeParquetFile("2016/04/01","2_0_20160401010101.parquet", Lists.newArrayList(), schema, null, false);
writeParquetFile("2015/03/12","1_0_20150312101010.parquet", Lists.newArrayList(), schema, null, false);
writeParquetFile("2015/03/12","3_0_20150312101010.parquet", Arrays.asList(record1), schema, null, false);
writeParquetFile("2015/03/12","4_0_20150312101010.parquet", Arrays.asList(record2, record3, record4), schema, null, false);
writeParquetFile("2016/04/01", "2_0_20160401010101.parquet", Lists.newArrayList(), schema, null,
false);
writeParquetFile("2015/03/12", "1_0_20150312101010.parquet", Lists.newArrayList(), schema, null,
false);
writeParquetFile("2015/03/12", "3_0_20150312101010.parquet", Arrays.asList(record1), schema,
null, false);
writeParquetFile("2015/03/12", "4_0_20150312101010.parquet",
Arrays.asList(record2, record3, record4), schema, null, false);
List<String> partitions = Arrays.asList("2016/01/21", "2016/04/01", "2015/03/12");
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
@@ -185,9 +199,11 @@ public class TestHoodieBloomIndex {
List<Tuple2<String, BloomIndexFileInfo>> expected = Arrays.asList(
new Tuple2<>("2016/04/01", new BloomIndexFileInfo("2_0_20160401010101.parquet")),
new Tuple2<>("2015/03/12",new BloomIndexFileInfo("1_0_20150312101010.parquet")),
new Tuple2<>("2015/03/12",new BloomIndexFileInfo("3_0_20150312101010.parquet", "000", "000")),
new Tuple2<>("2015/03/12",new BloomIndexFileInfo("4_0_20150312101010.parquet", "001", "003"))
new Tuple2<>("2015/03/12", new BloomIndexFileInfo("1_0_20150312101010.parquet")),
new Tuple2<>("2015/03/12",
new BloomIndexFileInfo("3_0_20150312101010.parquet", "000", "000")),
new Tuple2<>("2015/03/12",
new BloomIndexFileInfo("4_0_20150312101010.parquet", "001", "003"))
);
assertEquals(expected, filesList);
}
@@ -200,7 +216,6 @@ public class TestHoodieBloomIndex {
.build();
HoodieBloomIndex index = new HoodieBloomIndex(config, jsc);
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo = new HashMap<>();
partitionToFileIndexInfo.put("2017/10/22", Arrays.asList(
new BloomIndexFileInfo("f1"),
@@ -212,14 +227,13 @@ public class TestHoodieBloomIndex {
JavaPairRDD<String, String> partitionRecordKeyPairRDD = jsc
.parallelize(Arrays.asList(
new Tuple2<>("2017/10/22","003"),
new Tuple2<>("2017/10/22","002"),
new Tuple2<>("2017/10/22","005"),
new Tuple2<>("2017/10/22","004")
new Tuple2<>("2017/10/22", "003"),
new Tuple2<>("2017/10/22", "002"),
new Tuple2<>("2017/10/22", "005"),
new Tuple2<>("2017/10/22", "004")
))
.mapToPair(t -> t);
List<Tuple2<String, Tuple2<String, HoodieKey>>> comparisonKeyList = index
.explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD)
.collect();
@@ -240,7 +254,8 @@ public class TestHoodieBloomIndex {
}
@Test
public void testCheckUUIDsAgainstOneFile() throws IOException, InterruptedException, ClassNotFoundException {
public void testCheckUUIDsAgainstOneFile()
throws IOException, InterruptedException, ClassNotFoundException {
// Create some records to use
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
@@ -248,19 +263,23 @@ public class TestHoodieBloomIndex {
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":32}";
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
HoodieRecord record1 = new HoodieRecord(
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
HoodieRecord record2 = new HoodieRecord(
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
HoodieRecord record3 = new HoodieRecord(
new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
HoodieRecord record4 = new HoodieRecord(
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
// We write record1, record2 to a parquet file, but the bloom filter contains (record1, record2, record3).
BloomFilter filter = new BloomFilter(10000, 0.0000001);
filter.add(record3.getRecordKey());
String filename = writeParquetFile("2016/01/31", Arrays.asList(record1, record2), schema, filter, true);
String filename = writeParquetFile("2016/01/31", Arrays.asList(record1, record2), schema,
filter, true);
// The bloom filter contains 3 records
assertTrue(filter.mightContain(record1.getRecordKey()));
@@ -299,7 +318,8 @@ public class TestHoodieBloomIndex {
try {
bloomIndex.tagLocation(recordRDD, table);
} catch (IllegalArgumentException e) {
fail("EmptyRDD should not result in IllegalArgumentException: Positive number of slices required");
fail(
"EmptyRDD should not result in IllegalArgumentException: Positive number of slices required");
}
}
@@ -313,14 +333,19 @@ public class TestHoodieBloomIndex {
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
HoodieRecord record1 = new HoodieRecord(
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
HoodieRecord record2 = new HoodieRecord(
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
HoodieRecord record3 = new HoodieRecord(
new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4));
HoodieRecord record4 = new HoodieRecord(
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
JavaRDD<HoodieRecord> recordRDD = jsc
.parallelize(Arrays.asList(record1, record2, record3, record4));
// Also create the metadata and config
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
@@ -390,7 +415,8 @@ public class TestHoodieBloomIndex {
// Let's tag
HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc);
JavaPairRDD<HoodieKey, Optional<String>> taggedRecordRDD = bloomIndex.fetchRecordLocation(keysRDD, table);
JavaPairRDD<HoodieKey, Optional<String>> taggedRecordRDD = bloomIndex
.fetchRecordLocation(keysRDD, table);
// Should not find any files
for (Tuple2<HoodieKey, Optional<String>> record : taggedRecordRDD.collect()) {
@@ -436,9 +462,11 @@ public class TestHoodieBloomIndex {
// We write record1 to a parquet file, using a bloom filter having both records
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
HoodieRecord record1 = new HoodieRecord(
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
HoodieRecord record2 = new HoodieRecord(
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
BloomFilter filter = new BloomFilter(10000, 0.0000001);
filter.add(record2.getRecordKey());
@@ -472,21 +500,22 @@ public class TestHoodieBloomIndex {
String fileId = UUID.randomUUID().toString();
String filename = FSUtils.makeDataFileName(commitTime, 1, fileId);
return writeParquetFile(partitionPath, filename, records, schema, filter, createCommitTime);
}
private String writeParquetFile(String partitionPath, String filename, List<HoodieRecord> records, Schema schema,
private String writeParquetFile(String partitionPath, String filename, List<HoodieRecord> records,
Schema schema,
BloomFilter filter, boolean createCommitTime) throws IOException {
if (filter == null) {
filter = new BloomFilter(10000, 0.0000001);
}
HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter);
HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(
new AvroSchemaConverter().convert(schema), schema, filter);
String commitTime = FSUtils.getCommitTime(filename);
HoodieParquetConfig config = new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP,
ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024, new Configuration());
ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024,
new Configuration());
HoodieParquetWriter writer = new HoodieParquetWriter(
commitTime,
new Path(basePath + "/" + partitionPath + "/" + filename),
@@ -496,7 +525,9 @@ public class TestHoodieBloomIndex {
for (HoodieRecord record : records) {
GenericRecord avroRecord = (GenericRecord) record.getData().getInsertValue(schema).get();
HoodieAvroUtils.addCommitMetadataToRecord(avroRecord, commitTime, "" + seqId++);
HoodieAvroUtils.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(), filename);
HoodieAvroUtils
.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(),
filename);
writer.writeAvro(record.getRecordKey(), avroRecord);
filter.add(record.getRecordKey());
}
@@ -505,7 +536,9 @@ public class TestHoodieBloomIndex {
if (createCommitTime) {
// Also make sure the commit is valid
new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME).mkdirs();
new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitTime + ".commit").createNewFile();
new File(
basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitTime + ".commit")
.createNewFile();
}
return filename;
}
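Several of the tests above hinge on one property of the bloom filter written into each parquet footer: it can return false positives but never false negatives, which is why the index still opens the candidate file to confirm a hit. A small sketch of that property; the constructor arguments are copied from the hunks, the keys and wrapper class are illustrative.

import com.uber.hoodie.common.BloomFilter;

class BloomFilterPropertyExample {

  static void demonstrate() {
    // 10000 expected entries, 0.0000001 target false-positive rate, as in the tests above.
    BloomFilter filter = new BloomFilter(10000, 0.0000001);
    filter.add("key-that-was-written");

    // Guaranteed true for any key that was added ...
    boolean hit = filter.mightContain("key-that-was-written");

    // ... but only probably false for a key that was never added; a rare true here is a
    // false positive, and the actual file still has to be read to rule the key out.
    boolean maybeFalsePositive = filter.mightContain("key-that-was-never-written");

    System.out.println(hit + " " + maybeFalsePositive);
  }
}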

View File

@@ -16,9 +16,11 @@
package com.uber.hoodie.io;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import com.uber.hoodie.avro.model.HoodieArchivedMetaEntry;
import com.uber.hoodie.common.HoodieTestDataGenerator;
import com.uber.hoodie.common.model.HoodieArchivedLogFile;
import com.uber.hoodie.common.model.HoodieLogFile;
import com.uber.hoodie.common.model.HoodieTestUtils;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
@@ -29,6 +31,11 @@ import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.config.HoodieCompactionConfig;
import com.uber.hoodie.config.HoodieWriteConfig;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.FileSystem;
@@ -37,16 +44,8 @@ import org.junit.Before;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
public class TestHoodieCommitArchiveLog {
private String basePath;
private FileSystem fs;
@@ -97,7 +96,8 @@ public class TestHoodieCommitArchiveLog {
HoodieTestUtils.createCleanFiles(basePath, "105");
//reload the timeline and get all the commits before archive

timeline = metadata.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants();
timeline = metadata.getActiveTimeline().reload().getAllCommitsTimeline()
.filterCompletedInstants();
List<HoodieInstant> originalCommits = timeline.getInstants().collect(Collectors.toList());
assertEquals("Loaded 6 commits and the count should match", 12, timeline.countInstants());
@@ -107,27 +107,30 @@ public class TestHoodieCommitArchiveLog {
assertTrue(archiveLog.archiveIfRequired());
//reload the timeline and remove the remaining commits
timeline = metadata.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants();
timeline = metadata.getActiveTimeline().reload().getAllCommitsTimeline()
.filterCompletedInstants();
originalCommits.removeAll(timeline.getInstants().collect(Collectors.toList()));
//read the file
HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(),
new HoodieLogFile(new Path(basePath + "/.hoodie/.commits_.archive.1")), HoodieArchivedMetaEntry.getClassSchema(), false);
new HoodieLogFile(new Path(basePath + "/.hoodie/.commits_.archive.1")),
HoodieArchivedMetaEntry.getClassSchema(), false);
int archivedRecordsCount = 0;
List<IndexedRecord> readRecords = new ArrayList<>();
//read the avro blocks and validate the number of records written in each avro block
while(reader.hasNext()) {
while (reader.hasNext()) {
HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next();
List<IndexedRecord> records = blk.getRecords();
readRecords.addAll(records);
assertEquals("Archived and read records for each block are same", 8, records.size());
archivedRecordsCount += records.size();
}
assertEquals("Total archived records and total read records are the same count", 8, archivedRecordsCount);
assertEquals("Total archived records and total read records are the same count", 8,
archivedRecordsCount);
//make sure the archived commits are the same as the (original commits - commits left)
List<String> readCommits = readRecords.stream().map(r -> (GenericRecord)r).map(r -> {
List<String> readCommits = readRecords.stream().map(r -> (GenericRecord) r).map(r -> {
return r.get("commitTime").toString();
}).collect(Collectors.toList());
Collections.sort(readCommits);
@@ -158,7 +161,8 @@ public class TestHoodieCommitArchiveLog {
boolean result = archiveLog.archiveIfRequired();
assertTrue(result);
timeline =
metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline().filterCompletedInstants();
metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline()
.filterCompletedInstants();
assertEquals("Should not archive commits when maxCommitsToKeep is 5", 4,
timeline.countInstants());
}
@@ -184,7 +188,8 @@ public class TestHoodieCommitArchiveLog {
boolean result = archiveLog.archiveIfRequired();
assertTrue(result);
timeline =
metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline().filterCompletedInstants();
metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline()
.filterCompletedInstants();
assertTrue("Archived commits should always be safe",
timeline.containsOrBeforeTimelineStarts("100"));
assertTrue("Archived commits should always be safe",
@@ -217,7 +222,8 @@ public class TestHoodieCommitArchiveLog {
boolean result = archiveLog.archiveIfRequired();
assertTrue(result);
timeline =
metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline().filterCompletedInstants();
metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline()
.filterCompletedInstants();
assertEquals(
"Since we have a savepoint at 101, we should never archive any commit after 101 (we only archive 100)",
5, timeline.countInstants());
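The read side of the archive exercised throughout this test can be summarised in a short sketch; every call below is taken from the hunks above, while the wrapper class is illustrative and the archive file name and base path are whatever the archiver produced.

import com.uber.hoodie.avro.model.HoodieArchivedMetaEntry;
import com.uber.hoodie.common.model.HoodieLogFile;
import com.uber.hoodie.common.table.log.HoodieLogFormat;
import com.uber.hoodie.common.table.log.block.HoodieAvroDataBlock;
import com.uber.hoodie.common.util.FSUtils;
import java.util.ArrayList;
import java.util.List;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.Path;

class ArchiveReadExample {

  static List<String> readArchivedCommitTimes(String basePath) throws Exception {
    // Open the first archive file written under the metadata folder.
    HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(),
        new HoodieLogFile(new Path(basePath + "/.hoodie/.commits_.archive.1")),
        HoodieArchivedMetaEntry.getClassSchema(), false);

    List<String> commitTimes = new ArrayList<>();
    // Each avro data block holds a batch of archived instants; pull the commit time out of each.
    while (reader.hasNext()) {
      HoodieAvroDataBlock block = (HoodieAvroDataBlock) reader.next();
      for (IndexedRecord record : block.getRecords()) {
        commitTimes.add(((GenericRecord) record).get("commitTime").toString());
      }
    }
    return commitTimes;
  }
}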

View File

@@ -16,7 +16,9 @@
package com.uber.hoodie.io;
import com.uber.hoodie.HoodieReadClient;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import com.uber.hoodie.HoodieWriteClient;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.HoodieClientTestUtils;
@@ -34,13 +36,16 @@ import com.uber.hoodie.config.HoodieCompactionConfig;
import com.uber.hoodie.config.HoodieIndexConfig;
import com.uber.hoodie.config.HoodieStorageConfig;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.index.bloom.HoodieBloomIndex;
import com.uber.hoodie.index.HoodieIndex;
import com.uber.hoodie.index.bloom.HoodieBloomIndex;
import com.uber.hoodie.io.compact.HoodieCompactor;
import com.uber.hoodie.io.compact.HoodieRealtimeTableCompactor;
import com.uber.hoodie.table.HoodieTable;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.FileSystem;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.junit.After;
@@ -48,15 +53,8 @@ import org.junit.Before;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.stream.Collectors;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
public class TestHoodieCompactor {
private transient JavaSparkContext jsc = null;
private String basePath = null;
private HoodieCompactor compactor;
@@ -194,7 +192,7 @@ public class TestHoodieCompactor {
List<FileSlice> groupedLogFiles = table.getRTFileSystemView()
.getLatestFileSlices(partitionPath)
.collect(Collectors.toList());
for (FileSlice slice: groupedLogFiles) {
for (FileSlice slice : groupedLogFiles) {
assertTrue(
"After compaction there should be no log files visiable on a Realtime view",
slice.getLogFiles().collect(Collectors.toList()).isEmpty());

View File

@@ -17,12 +17,10 @@
package com.uber.hoodie.io.strategy;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;
import com.beust.jcommander.internal.Lists;
import com.google.common.collect.Maps;
import com.uber.hoodie.config.HoodieCompactionConfig;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.io.compact.CompactionOperation;

View File

@@ -17,9 +17,7 @@
package com.uber.hoodie.io.strategy;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.util.FSUtils;
import java.util.UUID;
import org.apache.hadoop.fs.FileStatus;
public class TestHoodieDataFile extends HoodieDataFile {

View File

@@ -18,7 +18,6 @@ package com.uber.hoodie.io.strategy;
import com.uber.hoodie.common.model.HoodieLogFile;
import java.util.Optional;
import org.apache.hadoop.fs.Path;
public class TestHoodieLogFile extends HoodieLogFile {

View File

@@ -16,17 +16,17 @@
package com.uber.hoodie.metrics;
import com.uber.hoodie.config.HoodieWriteConfig;
import org.apache.commons.configuration.ConfigurationException;
import org.junit.Before;
import org.junit.Test;
import static org.junit.Assert.assertTrue;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
import com.uber.hoodie.config.HoodieWriteConfig;
import org.apache.commons.configuration.ConfigurationException;
import org.junit.Before;
import org.junit.Test;
public class TestHoodieMetrics {
private HoodieMetrics metrics = null;
@Before
@@ -40,6 +40,7 @@ public class TestHoodieMetrics {
@Test
public void testRegisterGauge() {
metrics.registerGauge("metric1", 123L);
assertTrue(Metrics.getInstance().getRegistry().getGauges().get("metric1").getValue().toString().equals("123"));
assertTrue(Metrics.getInstance().getRegistry().getGauges().get("metric1").getValue().toString()
.equals("123"));
}
}

View File

@@ -16,26 +16,37 @@
package com.uber.hoodie.table;
import com.uber.hoodie.common.TestRawTripPayload.MetadataMergeWriteStatus;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.config.HoodieWriteConfig;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.BloomFilter;
import com.uber.hoodie.common.HoodieClientTestUtils;
import com.uber.hoodie.common.HoodieTestDataGenerator;
import com.uber.hoodie.common.TestRawTripPayload;
import com.uber.hoodie.common.TestRawTripPayload.MetadataMergeWriteStatus;
import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordLocation;
import com.uber.hoodie.common.model.HoodieTestUtils;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.ParquetUtils;
import com.uber.hoodie.config.HoodieCompactionConfig;
import com.uber.hoodie.io.HoodieCreateHandle;
import com.uber.hoodie.config.HoodieStorageConfig;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.io.HoodieCreateHandle;
import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import org.apache.avro.generic.GenericRecord;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FileSystem;
@@ -47,22 +58,11 @@ import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.UUID;
import scala.Option;
import scala.Tuple2;
import static org.junit.Assert.*;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
public class TestCopyOnWriteTable {
private String basePath = null;
private transient JavaSparkContext jsc = null;
@@ -104,7 +104,8 @@ public class TestCopyOnWriteTable {
private HoodieWriteConfig.Builder makeHoodieClientConfigBuilder() throws Exception {
// Prepare the AvroParquetIO
String schemaStr = IOUtils.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8");
String schemaStr = IOUtils
.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8");
return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(schemaStr);
}
@@ -127,11 +128,17 @@ public class TestCopyOnWriteTable {
List<HoodieRecord> records = new ArrayList<>();
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
records.add(new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1));
records.add(
new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
rowChange1));
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
records.add(new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2));
records.add(
new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
rowChange2));
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
records.add(new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));
records.add(
new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
rowChange3));
// Insert new records
HoodieClientTestUtils.collectStatuses(table.handleInsert(firstCommitTime, records.iterator()));
@@ -159,7 +166,7 @@ public class TestCopyOnWriteTable {
List<GenericRecord> fileRecords = ParquetUtils.readAvroRecords(parquetFilePath);
GenericRecord newRecord;
int index = 0;
for (GenericRecord record: fileRecords) {
for (GenericRecord record : fileRecords) {
assertTrue(record.get("_row_key").toString().equals(records.get(index).getRecordKey()));
index++;
}
@@ -167,11 +174,15 @@ public class TestCopyOnWriteTable {
// We update the 1st record & add a new record
String updateRecordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
TestRawTripPayload updateRowChanges1 = new TestRawTripPayload(updateRecordStr1);
HoodieRecord updatedRecord1 = new HoodieRecord(new HoodieKey(updateRowChanges1.getRowKey(), updateRowChanges1.getPartitionPath()), updateRowChanges1);
updatedRecord1.setCurrentLocation(new HoodieRecordLocation(null, FSUtils.getFileId(parquetFile.getName())));
HoodieRecord updatedRecord1 = new HoodieRecord(
new HoodieKey(updateRowChanges1.getRowKey(), updateRowChanges1.getPartitionPath()),
updateRowChanges1);
updatedRecord1.setCurrentLocation(
new HoodieRecordLocation(null, FSUtils.getFileId(parquetFile.getName())));
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
HoodieRecord insertedRecord1 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
HoodieRecord insertedRecord1 = new HoodieRecord(
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
List<HoodieRecord> updatedRecords = Arrays.asList(updatedRecord1, insertedRecord1);
@@ -179,7 +190,9 @@ public class TestCopyOnWriteTable {
String newCommitTime = HoodieTestUtils.makeNewCommitTime();
metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
table = new HoodieCopyOnWriteTable(config, metadata);
Iterator<List<WriteStatus>> iter = table.handleUpdate(newCommitTime, updatedRecord1.getCurrentLocation().getFileId(), updatedRecords.iterator());
Iterator<List<WriteStatus>> iter = table
.handleUpdate(newCommitTime, updatedRecord1.getCurrentLocation().getFileId(),
updatedRecords.iterator());
// Check the updated file
File updatedParquetFile = null;
@@ -197,7 +210,8 @@ public class TestCopyOnWriteTable {
assertTrue(updatedParquetFile != null);
// Check whether the record has been updated
Path updatedParquetFilePath = new Path(updatedParquetFile.getAbsolutePath());
BloomFilter updatedFilter = ParquetUtils.readBloomFilterFromParquetMetadata(updatedParquetFilePath);
BloomFilter updatedFilter = ParquetUtils
.readBloomFilterFromParquetMetadata(updatedParquetFilePath);
for (HoodieRecord record : records) {
// No change to the _row_key
assertTrue(updatedFilter.mightContain(record.getRecordKey()));
@@ -206,7 +220,8 @@ public class TestCopyOnWriteTable {
assertTrue(updatedFilter.mightContain(insertedRecord1.getRecordKey()));
records.add(insertedRecord1);// add this so it can further check below
ParquetReader updatedReader = ParquetReader.builder(new AvroReadSupport<>(), updatedParquetFilePath).build();
ParquetReader updatedReader = ParquetReader
.builder(new AvroReadSupport<>(), updatedParquetFilePath).build();
index = 0;
while ((newRecord = (GenericRecord) updatedReader.read()) != null) {
assertTrue(newRecord.get("_row_key").toString().equals(records.get(index).getRecordKey()));
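The read-back pattern in this hunk is worth calling out on its own: the bloom filter answers "might this key be here", and the ParquetReader then streams the actual records to verify it. A hedged sketch of that verification step; the file path and wrapper class are illustrative, the reader construction matches the hunk.

import java.util.HashSet;
import java.util.Set;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroReadSupport;
import org.apache.parquet.hadoop.ParquetReader;

class ParquetReadBackExample {

  static Set<String> readRowKeys(Path parquetFilePath) throws Exception {
    Set<String> rowKeys = new HashSet<>();
    ParquetReader reader = ParquetReader.builder(new AvroReadSupport<>(), parquetFilePath).build();
    GenericRecord record;
    // read() returns null once the file is exhausted.
    while ((record = (GenericRecord) reader.read()) != null) {
      rowKeys.add(record.get("_row_key").toString());
    }
    reader.close();
    return rowKeys;
  }
}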
@@ -243,7 +258,8 @@ public class TestCopyOnWriteTable {
@Test
public void testMetadataAggregateFromWriteStatus() throws Exception {
// Prepare the AvroParquetIO
HoodieWriteConfig config = makeHoodieClientConfigBuilder().withWriteStatusClass(MetadataMergeWriteStatus.class).build();
HoodieWriteConfig config = makeHoodieClientConfigBuilder()
.withWriteStatusClass(MetadataMergeWriteStatus.class).build();
String firstCommitTime = HoodieTestUtils.makeNewCommitTime();
HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
@@ -256,11 +272,17 @@ public class TestCopyOnWriteTable {
List<HoodieRecord> records = new ArrayList<>();
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
records.add(new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1));
records.add(
new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
rowChange1));
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
records.add(new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2));
records.add(
new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
rowChange2));
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
records.add(new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));
records.add(
new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
rowChange3));
// Insert new records
List<WriteStatus> writeStatuses = HoodieClientTestUtils
@@ -286,7 +308,8 @@ public class TestCopyOnWriteTable {
records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z"));
// Simulate crash after first file
List<WriteStatus> statuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator()));
List<WriteStatus> statuses = HoodieClientTestUtils
.collectStatuses(table.handleInsert(commitTime, records.iterator()));
WriteStatus status = statuses.get(0);
Path partialFile = new Path(String.format("%s/%s/%s",
basePath,
@@ -299,7 +322,8 @@ public class TestCopyOnWriteTable {
records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z");
records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z"));
statuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator()));
statuses = HoodieClientTestUtils
.collectStatuses(table.handleInsert(commitTime, records.iterator()));
status = statuses.get(0);
Path retriedFIle = new Path(String.format("%s/%s/%s",
@@ -312,7 +336,8 @@ public class TestCopyOnWriteTable {
}
@Test public void testInsertRecords() throws Exception {
@Test
public void testInsertRecords() throws Exception {
HoodieWriteConfig config = makeHoodieClientConfig();
String commitTime = HoodieTestUtils.makeNewCommitTime();
HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
@@ -324,8 +349,8 @@ public class TestCopyOnWriteTable {
records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z"));
// Insert new records
List<WriteStatus> returnedStatuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator()));
List<WriteStatus> returnedStatuses = HoodieClientTestUtils
.collectStatuses(table.handleInsert(commitTime, records.iterator()));
// TODO: check the actual files and make sure 11 records total were written.
assertEquals(2, returnedStatuses.size());
@@ -343,7 +368,8 @@ public class TestCopyOnWriteTable {
records.addAll(newHoodieRecords(1, "2016-02-02T03:16:41.415Z"));
// Insert new records
returnedStatuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator()));
returnedStatuses = HoodieClientTestUtils
.collectStatuses(table.handleInsert(commitTime, records.iterator()));
assertEquals(3, returnedStatuses.size());
assertEquals("2016/01/31", returnedStatuses.get(0).getPartitionPath());
@@ -357,7 +383,8 @@ public class TestCopyOnWriteTable {
}
@Test public void testFileSizeUpsertRecords() throws Exception {
@Test
public void testFileSizeUpsertRecords() throws Exception {
HoodieWriteConfig config = makeHoodieClientConfigBuilder().withStorageConfig(
HoodieStorageConfig.newBuilder().limitFileSize(64 * 1024).parquetBlockSize(64 * 1024)
.parquetPageSize(64 * 1024).build()).build();
@@ -368,9 +395,11 @@ public class TestCopyOnWriteTable {
List<HoodieRecord> records = new ArrayList<>();
// Approx 1150 records are written for block size of 64KB
for (int i = 0; i < 2000; i++) {
String recordStr = "{\"_row_key\":\"" + UUID.randomUUID().toString() + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":" + i + "}";
String recordStr = "{\"_row_key\":\"" + UUID.randomUUID().toString()
+ "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":" + i + "}";
TestRawTripPayload rowChange = new TestRawTripPayload(recordStr);
records.add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()),
records
.add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()),
rowChange));
}
@@ -380,7 +409,8 @@ public class TestCopyOnWriteTable {
// Check the updated file
int counts = 0;
for (File file : new File(basePath + "/2016/01/31").listFiles()) {
if (file.getName().endsWith(".parquet") && FSUtils.getCommitTime(file.getName()).equals(commitTime)) {
if (file.getName().endsWith(".parquet") && FSUtils.getCommitTime(file.getName())
.equals(commitTime)) {
System.out.println(file.getName() + "-" + file.length());
counts++;
}
@@ -391,7 +421,6 @@ public class TestCopyOnWriteTable {
}
private List<HoodieCopyOnWriteTable.InsertBucket> testUpsertPartitioner(int smallFileSize,
int numInserts,
int numUpdates,
@@ -400,8 +429,10 @@ public class TestCopyOnWriteTable {
final String TEST_PARTITION_PATH = "2016/09/26";
HoodieWriteConfig config = makeHoodieClientConfigBuilder()
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
.compactionSmallFileSize(smallFileSize).insertSplitSize(100).autoTuneInsertSplits(autoSplitInserts).build())
.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1000 * 1024).build()).build();
.compactionSmallFileSize(smallFileSize).insertSplitSize(100)
.autoTuneInsertSplits(autoSplitInserts).build())
.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1000 * 1024).build())
.build();
HoodieClientTestUtils.fakeCommitFile(basePath, "001");
HoodieClientTestUtils.fakeDataFile(basePath, TEST_PARTITION_PATH, "001", "file1", fileSize);
@@ -409,10 +440,11 @@ public class TestCopyOnWriteTable {
HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[]{TEST_PARTITION_PATH});
HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(
new String[]{TEST_PARTITION_PATH});
List<HoodieRecord> insertRecords = dataGenerator.generateInserts("001", numInserts);
List<HoodieRecord> updateRecords = dataGenerator.generateUpdates("001", numUpdates);
for (HoodieRecord updateRec: updateRecords) {
for (HoodieRecord updateRec : updateRecords) {
updateRec.setCurrentLocation(new HoodieRecordLocation("001", "file1"));
}
List<HoodieRecord> records = new ArrayList<>();
@@ -430,7 +462,8 @@ public class TestCopyOnWriteTable {
assertEquals("Bucket 2 is INSERT", HoodieCopyOnWriteTable.BucketType.INSERT,
partitioner.getBucketInfo(2).bucketType);
assertEquals("Update record should have gone to the 1 update partiton", 0,
partitioner.getPartition(new Tuple2<>(updateRecords.get(0).getKey(), Option.apply(updateRecords.get(0).getCurrentLocation()))));
partitioner.getPartition(new Tuple2<>(updateRecords.get(0).getKey(),
Option.apply(updateRecords.get(0).getCurrentLocation()))));
return partitioner.getInsertBuckets(TEST_PARTITION_PATH);
}
@@ -438,7 +471,8 @@ public class TestCopyOnWriteTable {
@Test
public void testUpsertPartitioner() throws Exception {
// Inserts + Updates... Check all updates go together & inserts subsplit
List<HoodieCopyOnWriteTable.InsertBucket> insertBuckets = testUpsertPartitioner(0, 200, 100, 1024, false);
List<HoodieCopyOnWriteTable.InsertBucket> insertBuckets = testUpsertPartitioner(0, 200, 100,
1024, false);
assertEquals("Total of 2 insert buckets", 2, insertBuckets.size());
}
@@ -446,16 +480,21 @@ public class TestCopyOnWriteTable {
@Test
public void testUpsertPartitionerWithSmallInsertHandling() throws Exception {
// Inserts + Updates .. Check updates go together & inserts subsplit, after expanding smallest file
List<HoodieCopyOnWriteTable.InsertBucket> insertBuckets = testUpsertPartitioner(1000 * 1024, 400, 100, 800 * 1024, false);
List<HoodieCopyOnWriteTable.InsertBucket> insertBuckets = testUpsertPartitioner(1000 * 1024,
400, 100, 800 * 1024, false);
assertEquals("Total of 3 insert buckets", 3, insertBuckets.size());
assertEquals("First insert bucket must be same as update bucket", 0, insertBuckets.get(0).bucketNumber);
assertEquals("First insert bucket should have weight 0.5", 0.5, insertBuckets.get(0).weight, 0.01);
assertEquals("First insert bucket must be same as update bucket", 0,
insertBuckets.get(0).bucketNumber);
assertEquals("First insert bucket should have weight 0.5", 0.5, insertBuckets.get(0).weight,
0.01);
// Now with insert split size auto tuned
insertBuckets = testUpsertPartitioner(1000 * 1024, 2400, 100, 800 * 1024, true);
assertEquals("Total of 3 insert buckets", 3, insertBuckets.size());
assertEquals("First insert bucket must be same as update bucket", 0, insertBuckets.get(0).bucketNumber);
assertEquals("First insert bucket should have weight 0.5", 200.0/2400, insertBuckets.get(0).weight, 0.01);
assertEquals("First insert bucket must be same as update bucket", 0,
insertBuckets.get(0).bucketNumber);
assertEquals("First insert bucket should have weight 0.5", 200.0 / 2400,
insertBuckets.get(0).weight, 0.01);
}
@After
Some files were not shown because too many files have changed in this diff.