Reformatting code per Google Code Style all over

committed by vinoth chandar
parent 5a62480a92
commit e45679f5e2
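The diff below is a single formatting pass; no behavior is intended to change. The recurring edits are: annotations moved onto their own lines, a space after control-flow keywords (if (, for (, switch (), imports merged into one alphabetically ordered block, array initializers written as new String[]{...}, and long statements or signatures wrapped at roughly 100 columns. As a hedged illustration (this class is not part of the commit), the before/after shape looks like:

import java.util.ArrayList;
import java.util.List;
import org.springframework.core.annotation.Order;
import org.springframework.stereotype.Component;

// Before:
//   @Component @Order(1) public class StyleExample {
//     public void run() {
//       if(ready) { rows.add(new String[] {"a", "b"}); }
//     }
//   }
@Component
@Order(1)
public class StyleExample {

  private final List<String[]> rows = new ArrayList<>();
  private boolean ready = true;

  public void run() {
    if (ready) {
      rows.add(new String[]{"a", "b"});
    }
  }
}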
@@ -15,7 +15,9 @@
   ~ limitations under the License.
   -->

-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
   <parent>
     <artifactId>hoodie</artifactId>
     <groupId>com.uber.hoodie</groupId>
@@ -17,12 +17,12 @@
 package com.uber.hoodie.cli;

 import com.uber.hoodie.common.table.HoodieTableMetaClient;
+import java.io.IOException;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;

-import java.io.IOException;

 public class HoodieCLI {

   public static Configuration conf;
   public static FileSystem fs;
   public static CLIState state = CLIState.INIT;
@@ -43,7 +43,7 @@ public class HoodieCLI {
   }

   public static void initFS(boolean force) throws IOException {
-    if(fs == null || force) {
+    if (fs == null || force) {
       fs = FileSystem.get(conf);
     }
   }

@@ -17,7 +17,6 @@
 package com.uber.hoodie.cli;

 import dnl.utils.text.table.TextTable;
-
 import java.io.ByteArrayOutputStream;
 import java.io.PrintStream;
 import java.nio.charset.Charset;

@@ -16,7 +16,6 @@

 package com.uber.hoodie.cli;

 import com.uber.hoodie.common.table.HoodieTableConfig;
-
 import org.springframework.core.Ordered;
 import org.springframework.core.annotation.Order;
 import org.springframework.shell.plugin.support.DefaultPromptProvider;

@@ -22,9 +22,13 @@ import org.springframework.shell.plugin.support.DefaultBannerProvider;
 import org.springframework.shell.support.util.OsUtils;
 import org.springframework.stereotype.Component;

-@Component @Order(Ordered.HIGHEST_PRECEDENCE) public class HoodieSplashScreen
+@Component
+@Order(Ordered.HIGHEST_PRECEDENCE)
+public class HoodieSplashScreen
     extends DefaultBannerProvider {
-  private static String screen = "============================================" + OsUtils.LINE_SEPARATOR +
+
+  private static String screen =
+      "============================================" + OsUtils.LINE_SEPARATOR +
       "* *" + OsUtils.LINE_SEPARATOR +
       "* _ _ _ _ *" + OsUtils.LINE_SEPARATOR +
       "* | | | | | (_) *" + OsUtils.LINE_SEPARATOR +
@@ -49,7 +53,8 @@ import org.springframework.stereotype.Component;
     return "Welcome to Hoodie CLI. Please type help if you are looking for help. ";
   }

-  @Override public String getProviderName() {
+  @Override
+  public String getProviderName() {
     return "Hoodie Banner";
   }
 }

@@ -16,16 +16,14 @@

 package com.uber.hoodie.cli;

+import java.io.IOException;
 import org.springframework.shell.Bootstrap;
-
-import java.io.IOException;

 public class Main {

   /**
-   * Main class that delegates to Spring Shell's Bootstrap class in order to simplify debugging inside an IDE
-   *
-   * @param args
-   * @throws IOException
+   * Main class that delegates to Spring Shell's Bootstrap class in order to simplify debugging
+   * inside an IDE
    */
   public static void main(String[] args) throws IOException {
     Bootstrap.main(args);

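The same import reshuffle repeats in most files below: Google Java Style keeps a single import block in ASCII order, so the java.* imports that used to sit in their own trailing group move up between the com.* and org.* entries. A hypothetical file header before and after (not a file from this commit):

// Before: java.* imports isolated in a trailing group
//   import com.uber.hoodie.common.table.HoodieTableMetaClient;
//   import org.springframework.shell.Bootstrap;
//
//   import java.io.IOException;

// After: one block, ASCII-sorted
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import java.io.IOException;
import org.springframework.shell.Bootstrap;

public class ImportOrderSketch {
}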
@@ -24,6 +24,10 @@ import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
import com.uber.hoodie.common.table.log.HoodieLogFormat;
|
||||
import com.uber.hoodie.common.table.log.block.HoodieAvroDataBlock;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.avro.generic.IndexedRecord;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
@@ -34,11 +38,6 @@ import org.springframework.shell.core.annotation.CliCommand;
|
||||
import org.springframework.shell.core.annotation.CliOption;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Component
|
||||
public class ArchivedCommitsCommand implements CommandMarker {
|
||||
|
||||
@@ -49,13 +48,16 @@ public class ArchivedCommitsCommand implements CommandMarker {
|
||||
|
||||
@CliCommand(value = "show archived commits", help = "Read commits from archived files and show details")
|
||||
public String showCommits(
|
||||
@CliOption(key = {"limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10")
|
||||
@CliOption(key = {
|
||||
"limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10")
|
||||
final Integer limit) throws IOException {
|
||||
|
||||
System.out.println("===============> Showing only " + limit + " archived commits <===============");
|
||||
FileStatus [] fsStatuses = FSUtils.getFs().globStatus(new Path(HoodieCLI.tableMetadata.getBasePath() + "/.hoodie/.commits_.archive*"));
|
||||
System.out
|
||||
.println("===============> Showing only " + limit + " archived commits <===============");
|
||||
FileStatus[] fsStatuses = FSUtils.getFs().globStatus(
|
||||
new Path(HoodieCLI.tableMetadata.getBasePath() + "/.hoodie/.commits_.archive*"));
|
||||
List<String[]> allCommits = new ArrayList<>();
|
||||
for(FileStatus fs : fsStatuses) {
|
||||
for (FileStatus fs : fsStatuses) {
|
||||
//read the archived file
|
||||
HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(),
|
||||
new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema(), false);
|
||||
@@ -67,11 +69,13 @@ public class ArchivedCommitsCommand implements CommandMarker {
|
||||
List<IndexedRecord> records = blk.getRecords();
|
||||
readRecords.addAll(records);
|
||||
}
|
||||
List<String[]> readCommits = readRecords.stream().map(r -> (GenericRecord)r).map(r -> readCommit(r)).limit(limit).collect(Collectors.toList());
|
||||
List<String[]> readCommits = readRecords.stream().map(r -> (GenericRecord) r)
|
||||
.map(r -> readCommit(r)).limit(limit).collect(Collectors.toList());
|
||||
allCommits.addAll(readCommits);
|
||||
}
|
||||
return HoodiePrintHelper.print(
|
||||
new String[] {"CommitTime", "CommitType", "CommitDetails"}, allCommits.toArray(new String[allCommits.size()][]));
|
||||
new String[]{"CommitTime", "CommitType", "CommitDetails"},
|
||||
allCommits.toArray(new String[allCommits.size()][]));
|
||||
}
|
||||
|
||||
private String[] readCommit(GenericRecord record) {
|
||||
|
||||
@@ -24,21 +24,21 @@ import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
|
||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||
import com.uber.hoodie.common.util.AvroUtils;
|
||||
import org.springframework.shell.core.CommandMarker;
|
||||
import org.springframework.shell.core.annotation.CliAvailabilityIndicator;
|
||||
import org.springframework.shell.core.annotation.CliCommand;
|
||||
import org.springframework.shell.core.annotation.CliOption;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
import org.springframework.shell.core.CommandMarker;
|
||||
import org.springframework.shell.core.annotation.CliAvailabilityIndicator;
|
||||
import org.springframework.shell.core.annotation.CliCommand;
|
||||
import org.springframework.shell.core.annotation.CliOption;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
@Component
|
||||
public class CleansCommand implements CommandMarker {
|
||||
|
||||
@CliAvailabilityIndicator({"cleans show"})
|
||||
public boolean isShowAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
@@ -65,12 +65,12 @@ public class CleansCommand implements CommandMarker {
|
||||
HoodieInstant clean = cleans.get(i);
|
||||
HoodieCleanMetadata cleanMetadata =
|
||||
AvroUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(clean).get());
|
||||
rows[i] = new String[] {clean.getTimestamp(), cleanMetadata.getEarliestCommitToRetain(),
|
||||
rows[i] = new String[]{clean.getTimestamp(), cleanMetadata.getEarliestCommitToRetain(),
|
||||
String.valueOf(cleanMetadata.getTotalFilesDeleted()),
|
||||
String.valueOf(cleanMetadata.getTimeTakenInMillis())};
|
||||
}
|
||||
return HoodiePrintHelper.print(
|
||||
new String[] {"CleanTime", "EarliestCommandRetained", "Total Files Deleted",
|
||||
new String[]{"CleanTime", "EarliestCommandRetained", "Total Files Deleted",
|
||||
"Total Time Taken"}, rows);
|
||||
}
|
||||
|
||||
@@ -97,16 +97,17 @@ public class CleansCommand implements CommandMarker {
|
||||
HoodieCleanMetadata cleanMetadata =
|
||||
AvroUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(cleanInstant).get());
|
||||
List<String[]> rows = new ArrayList<>();
|
||||
for (Map.Entry<String, HoodieCleanPartitionMetadata> entry : cleanMetadata.getPartitionMetadata().entrySet()) {
|
||||
for (Map.Entry<String, HoodieCleanPartitionMetadata> entry : cleanMetadata
|
||||
.getPartitionMetadata().entrySet()) {
|
||||
String path = entry.getKey();
|
||||
HoodieCleanPartitionMetadata stats = entry.getValue();
|
||||
String policy = stats.getPolicy();
|
||||
String totalSuccessDeletedFiles = String.valueOf(stats.getSuccessDeleteFiles().size());
|
||||
String totalFailedDeletedFiles = String.valueOf(stats.getFailedDeleteFiles().size());
|
||||
rows.add(new String[] {path, policy, totalSuccessDeletedFiles, totalFailedDeletedFiles});
|
||||
rows.add(new String[]{path, policy, totalSuccessDeletedFiles, totalFailedDeletedFiles});
|
||||
}
|
||||
return HoodiePrintHelper.print(
|
||||
new String[] {"Partition Path", "Cleaning policy", "Total Files Successfully Deleted",
|
||||
new String[]{"Partition Path", "Cleaning policy", "Total Files Successfully Deleted",
|
||||
"Total Failed Deletions"}, rows.toArray(new String[rows.size()][]));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,7 +27,12 @@ import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
|
||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||
import com.uber.hoodie.common.util.NumericUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.spark.launcher.SparkLauncher;
|
||||
import org.springframework.shell.core.CommandMarker;
|
||||
import org.springframework.shell.core.annotation.CliAvailabilityIndicator;
|
||||
@@ -35,15 +40,9 @@ import org.springframework.shell.core.annotation.CliCommand;
|
||||
import org.springframework.shell.core.annotation.CliOption;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Component
|
||||
public class CommitsCommand implements CommandMarker {
|
||||
|
||||
@CliAvailabilityIndicator({"commits show"})
|
||||
public boolean isShowAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
@@ -70,7 +69,8 @@ public class CommitsCommand implements CommandMarker {
|
||||
"limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10")
|
||||
final Integer limit) throws IOException {
|
||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||
HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline().filterCompletedInstants();
|
||||
HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline()
|
||||
.filterCompletedInstants();
|
||||
List<HoodieInstant> commits = timeline.getInstants().collect(Collectors.toList());
|
||||
String[][] rows = new String[commits.size()][];
|
||||
Collections.reverse(commits);
|
||||
@@ -78,7 +78,7 @@ public class CommitsCommand implements CommandMarker {
|
||||
HoodieInstant commit = commits.get(i);
|
||||
HoodieCommitMetadata commitMetadata =
|
||||
HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(commit).get());
|
||||
rows[i] = new String[] {commit.getTimestamp(),
|
||||
rows[i] = new String[]{commit.getTimestamp(),
|
||||
NumericUtils.humanReadableByteCount(commitMetadata.fetchTotalBytesWritten()),
|
||||
String.valueOf(commitMetadata.fetchTotalFilesInsert()),
|
||||
String.valueOf(commitMetadata.fetchTotalFilesUpdated()),
|
||||
@@ -88,7 +88,7 @@ public class CommitsCommand implements CommandMarker {
|
||||
String.valueOf(commitMetadata.fetchTotalWriteErrors())};
|
||||
}
|
||||
return HoodiePrintHelper.print(
|
||||
new String[] {"CommitTime", "Total Written (B)", "Total Files Added",
|
||||
new String[]{"CommitTime", "Total Written (B)", "Total Files Added",
|
||||
"Total Files Updated", "Total Partitions Written", "Total Records Written",
|
||||
"Total Update Records Written", "Total Errors"}, rows);
|
||||
}
|
||||
@@ -108,8 +108,10 @@ public class CommitsCommand implements CommandMarker {
|
||||
@CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path")
|
||||
final String sparkPropertiesPath) throws Exception {
|
||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||
HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline().filterCompletedInstants();
|
||||
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
|
||||
HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline()
|
||||
.filterCompletedInstants();
|
||||
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION,
|
||||
commitTime);
|
||||
|
||||
if (!timeline.containsInstant(commitInstant)) {
|
||||
return "Commit " + commitTime + " not found in Commits " + timeline;
|
||||
@@ -135,8 +137,10 @@ public class CommitsCommand implements CommandMarker {
|
||||
@CliOption(key = {"commit"}, help = "Commit to show")
|
||||
final String commitTime) throws Exception {
|
||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||
HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline().filterCompletedInstants();
|
||||
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
|
||||
HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline()
|
||||
.filterCompletedInstants();
|
||||
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION,
|
||||
commitTime);
|
||||
|
||||
if (!timeline.containsInstant(commitInstant)) {
|
||||
return "Commit " + commitTime + " not found in Commits " + timeline;
|
||||
@@ -165,7 +169,7 @@ public class CommitsCommand implements CommandMarker {
|
||||
totalBytesWritten += stat.getTotalWriteBytes();
|
||||
totalWriteErrors += stat.getTotalWriteErrors();
|
||||
}
|
||||
rows.add(new String[] {path, String.valueOf(totalFilesAdded),
|
||||
rows.add(new String[]{path, String.valueOf(totalFilesAdded),
|
||||
String.valueOf(totalFilesUpdated), String.valueOf(totalRecordsInserted),
|
||||
String.valueOf(totalRecordsUpdated),
|
||||
NumericUtils.humanReadableByteCount(totalBytesWritten),
|
||||
@@ -173,7 +177,7 @@ public class CommitsCommand implements CommandMarker {
|
||||
|
||||
}
|
||||
return HoodiePrintHelper.print(
|
||||
new String[] {"Partition Path", "Total Files Added", "Total Files Updated",
|
||||
new String[]{"Partition Path", "Total Files Added", "Total Files Updated",
|
||||
"Total Records Inserted", "Total Records Updated", "Total Bytes Written",
|
||||
"Total Errors"}, rows.toArray(new String[rows.size()][]));
|
||||
}
|
||||
@@ -183,8 +187,10 @@ public class CommitsCommand implements CommandMarker {
|
||||
@CliOption(key = {"commit"}, help = "Commit to show")
|
||||
final String commitTime) throws Exception {
|
||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||
HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline().filterCompletedInstants();
|
||||
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
|
||||
HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline()
|
||||
.filterCompletedInstants();
|
||||
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION,
|
||||
commitTime);
|
||||
|
||||
if (!timeline.containsInstant(commitInstant)) {
|
||||
return "Commit " + commitTime + " not found in Commits " + timeline;
|
||||
@@ -197,14 +203,14 @@ public class CommitsCommand implements CommandMarker {
|
||||
String path = entry.getKey();
|
||||
List<HoodieWriteStat> stats = entry.getValue();
|
||||
for (HoodieWriteStat stat : stats) {
|
||||
rows.add(new String[] {path, stat.getFileId(), stat.getPrevCommit(),
|
||||
rows.add(new String[]{path, stat.getFileId(), stat.getPrevCommit(),
|
||||
String.valueOf(stat.getNumUpdateWrites()), String.valueOf(stat.getNumWrites()),
|
||||
String.valueOf(stat.getTotalWriteBytes()),
|
||||
String.valueOf(stat.getTotalWriteErrors())});
|
||||
}
|
||||
}
|
||||
return HoodiePrintHelper.print(
|
||||
new String[] {"Partition Path", "File ID", "Previous Commit", "Total Records Updated",
|
||||
new String[]{"Partition Path", "File ID", "Previous Commit", "Total Records Updated",
|
||||
"Total Records Written", "Total Bytes Written", "Total Errors"},
|
||||
rows.toArray(new String[rows.size()][]));
|
||||
}
|
||||
@@ -219,16 +225,23 @@ public class CommitsCommand implements CommandMarker {
|
||||
@CliOption(key = {"path"}, help = "Path of the dataset to compare to")
|
||||
final String path) throws Exception {
|
||||
HoodieTableMetaClient target = new HoodieTableMetaClient(HoodieCLI.fs, path);
|
||||
HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants();;
|
||||
HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsAndCompactionsTimeline()
|
||||
.filterCompletedInstants();
|
||||
;
|
||||
HoodieTableMetaClient source = HoodieCLI.tableMetadata;
|
||||
HoodieTimeline sourceTimeline = source.getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants();;
|
||||
HoodieTimeline sourceTimeline = source.getActiveTimeline().getCommitsAndCompactionsTimeline()
|
||||
.filterCompletedInstants();
|
||||
;
|
||||
String targetLatestCommit =
|
||||
targetTimeline.getInstants().iterator().hasNext() ? "0" : targetTimeline.lastInstant().get().getTimestamp();
|
||||
targetTimeline.getInstants().iterator().hasNext() ? "0"
|
||||
: targetTimeline.lastInstant().get().getTimestamp();
|
||||
String sourceLatestCommit =
|
||||
sourceTimeline.getInstants().iterator().hasNext() ? "0" : sourceTimeline.lastInstant().get().getTimestamp();
|
||||
sourceTimeline.getInstants().iterator().hasNext() ? "0"
|
||||
: sourceTimeline.lastInstant().get().getTimestamp();
|
||||
|
||||
if (sourceLatestCommit != null &&
|
||||
HoodieTimeline.compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) {
|
||||
HoodieTimeline
|
||||
.compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) {
|
||||
// source is behind the target
|
||||
List<String> commitsToCatchup =
|
||||
targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE)
|
||||
|
||||
@@ -18,15 +18,15 @@ package com.uber.hoodie.cli.commands;
|
||||
|
||||
import com.uber.hoodie.cli.HoodieCLI;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import java.io.IOException;
|
||||
import org.springframework.shell.core.CommandMarker;
|
||||
import org.springframework.shell.core.annotation.CliCommand;
|
||||
import org.springframework.shell.core.annotation.CliOption;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
@Component
|
||||
public class DatasetsCommand implements CommandMarker {
|
||||
|
||||
@CliCommand(value = "connect", help = "Connect to a hoodie dataset")
|
||||
public String connect(
|
||||
@CliOption(key = {"path"}, mandatory = true, help = "Base Path of the dataset")
|
||||
|
||||
@@ -68,7 +68,8 @@ public class HDFSParquetImportCommand implements CommandMarker {
|
||||
boolean initialized = HoodieCLI.initConf();
|
||||
HoodieCLI.initFS(initialized);
|
||||
String sparkPropertiesPath = Utils
|
||||
.getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
|
||||
.getDefaultPropertiesFile(
|
||||
scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
|
||||
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
||||
|
||||
sparkLauncher.addAppArgs(SparkCommand.IMPORT.toString(), srcPath, targetPath, tableName,
|
||||
|
||||
@@ -16,23 +16,23 @@
|
||||
|
||||
package com.uber.hoodie.cli.commands;
|
||||
|
||||
import com.uber.hoodie.cli.HoodieCLI;
|
||||
import com.uber.hoodie.cli.utils.CommitUtil;
|
||||
import com.uber.hoodie.cli.utils.HiveUtil;
|
||||
import com.uber.hoodie.cli.HoodieCLI;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import org.springframework.shell.core.CommandMarker;
|
||||
import org.springframework.shell.core.annotation.CliAvailabilityIndicator;
|
||||
import org.springframework.shell.core.annotation.CliCommand;
|
||||
import org.springframework.shell.core.annotation.CliOption;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Component
|
||||
public class HoodieSyncCommand implements CommandMarker {
|
||||
|
||||
@CliAvailabilityIndicator({"sync validate"})
|
||||
public boolean isSyncVerificationAvailable() {
|
||||
return HoodieCLI.tableMetadata != null && HoodieCLI.syncTableMetadata != null;
|
||||
@@ -70,14 +70,18 @@ public class HoodieSyncCommand implements CommandMarker {
|
||||
sourceCount = HiveUtil.countRecords(hiveServerUrl, source, srcDb, hiveUser, hivePass);
|
||||
targetCount = HiveUtil.countRecords(hiveServerUrl, target, tgtDb, hiveUser, hivePass);
|
||||
} else if ("latestPartitions".equals(mode)) {
|
||||
sourceCount = HiveUtil.countRecords(hiveServerUrl, source, srcDb, partitionCount, hiveUser, hivePass);
|
||||
targetCount = HiveUtil.countRecords(hiveServerUrl, target, tgtDb, partitionCount, hiveUser, hivePass);
|
||||
sourceCount = HiveUtil
|
||||
.countRecords(hiveServerUrl, source, srcDb, partitionCount, hiveUser, hivePass);
|
||||
targetCount = HiveUtil
|
||||
.countRecords(hiveServerUrl, target, tgtDb, partitionCount, hiveUser, hivePass);
|
||||
}
|
||||
|
||||
String targetLatestCommit =
|
||||
targetTimeline.getInstants().iterator().hasNext() ? "0" : targetTimeline.lastInstant().get().getTimestamp();
|
||||
targetTimeline.getInstants().iterator().hasNext() ? "0"
|
||||
: targetTimeline.lastInstant().get().getTimestamp();
|
||||
String sourceLatestCommit =
|
||||
sourceTimeline.getInstants().iterator().hasNext() ? "0" : sourceTimeline.lastInstant().get().getTimestamp();
|
||||
sourceTimeline.getInstants().iterator().hasNext() ? "0"
|
||||
: sourceTimeline.lastInstant().get().getTimestamp();
|
||||
|
||||
if (sourceLatestCommit != null && HoodieTimeline
|
||||
.compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) {
|
||||
|
||||
@@ -22,7 +22,8 @@ import com.uber.hoodie.cli.utils.InputStreamConsumer;
|
||||
import com.uber.hoodie.cli.utils.SparkUtil;
|
||||
import com.uber.hoodie.common.model.HoodiePartitionMetadata;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.spark.launcher.SparkLauncher;
|
||||
import org.springframework.shell.core.CommandMarker;
|
||||
@@ -31,9 +32,6 @@ import org.springframework.shell.core.annotation.CliCommand;
|
||||
import org.springframework.shell.core.annotation.CliOption;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
@Component
|
||||
public class RepairsCommand implements CommandMarker {
|
||||
|
||||
@@ -52,7 +50,8 @@ public class RepairsCommand implements CommandMarker {
|
||||
@CliOption(key = {
|
||||
"duplicatedPartitionPath"}, help = "Partition Path containing the duplicates", mandatory = true)
|
||||
final String duplicatedPartitionPath,
|
||||
@CliOption(key = {"repairedOutputPath"}, help = "Location to place the repaired files", mandatory = true)
|
||||
@CliOption(key = {
|
||||
"repairedOutputPath"}, help = "Location to place the repaired files", mandatory = true)
|
||||
final String repairedOutputPath,
|
||||
@CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path", mandatory = true)
|
||||
final String sparkPropertiesPath) throws Exception {
|
||||
@@ -71,7 +70,6 @@ public class RepairsCommand implements CommandMarker {
|
||||
}
|
||||
|
||||
|
||||
|
||||
@CliCommand(value = "repair addpartitionmeta", help = "Add partition metadata to a dataset, if not present")
|
||||
public String addPartitionMeta(
|
||||
@CliOption(key = {"dryrun"},
|
||||
@@ -79,17 +77,20 @@ public class RepairsCommand implements CommandMarker {
|
||||
unspecifiedDefaultValue = "true")
|
||||
final boolean dryRun) throws IOException {
|
||||
|
||||
String latestCommit = HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline().lastInstant().get().getTimestamp();
|
||||
String latestCommit = HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline()
|
||||
.lastInstant().get().getTimestamp();
|
||||
List<String> partitionPaths = FSUtils.getAllFoldersThreeLevelsDown(HoodieCLI.fs,
|
||||
HoodieCLI.tableMetadata.getBasePath());
|
||||
Path basePath = new Path(HoodieCLI.tableMetadata.getBasePath());
|
||||
String[][] rows = new String[partitionPaths.size() + 1][];
|
||||
|
||||
int ind = 0;
|
||||
for (String partition: partitionPaths) {
|
||||
for (String partition : partitionPaths) {
|
||||
Path partitionPath = new Path(basePath, partition);
|
||||
String[] row = new String[3];
|
||||
row[0] = partition; row[1] = "Yes"; row[2] = "None";
|
||||
row[0] = partition;
|
||||
row[1] = "Yes";
|
||||
row[2] = "None";
|
||||
if (!HoodiePartitionMetadata.hasPartitionMetadata(HoodieCLI.fs, partitionPath)) {
|
||||
row[1] = "No";
|
||||
if (!dryRun) {
|
||||
@@ -105,6 +106,6 @@ public class RepairsCommand implements CommandMarker {
|
||||
}
|
||||
|
||||
return HoodiePrintHelper.print(
|
||||
new String[] {"Partition Path", "Metadata Present?", "Action"}, rows);
|
||||
new String[]{"Partition Path", "Metadata Present?", "Action"}, rows);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,6 +27,10 @@ import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||
import com.uber.hoodie.config.HoodieIndexConfig;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.index.HoodieIndex;
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.launcher.SparkLauncher;
|
||||
import org.springframework.shell.core.CommandMarker;
|
||||
@@ -35,13 +39,9 @@ import org.springframework.shell.core.annotation.CliCommand;
|
||||
import org.springframework.shell.core.annotation.CliOption;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Component
|
||||
public class SavepointsCommand implements CommandMarker {
|
||||
|
||||
@CliAvailabilityIndicator({"savepoints show"})
|
||||
public boolean isShowAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
@@ -60,7 +60,8 @@ public class SavepointsCommand implements CommandMarker {
|
||||
|
||||
@CliAvailabilityIndicator({"savepoint rollback"})
|
||||
public boolean isRollbackToSavepointAvailable() {
|
||||
return HoodieCLI.tableMetadata != null && !HoodieCLI.tableMetadata.getActiveTimeline().getSavePointTimeline().filterCompletedInstants().empty();
|
||||
return HoodieCLI.tableMetadata != null && !HoodieCLI.tableMetadata.getActiveTimeline()
|
||||
.getSavePointTimeline().filterCompletedInstants().empty();
|
||||
}
|
||||
|
||||
@CliCommand(value = "savepoints show", help = "Show the savepoints")
|
||||
@@ -72,9 +73,9 @@ public class SavepointsCommand implements CommandMarker {
|
||||
Collections.reverse(commits);
|
||||
for (int i = 0; i < commits.size(); i++) {
|
||||
HoodieInstant commit = commits.get(i);
|
||||
rows[i] = new String[] {commit.getTimestamp()};
|
||||
rows[i] = new String[]{commit.getTimestamp()};
|
||||
}
|
||||
return HoodiePrintHelper.print(new String[] {"SavepointTime"}, rows);
|
||||
return HoodiePrintHelper.print(new String[]{"SavepointTime"}, rows);
|
||||
}
|
||||
|
||||
@CliCommand(value = "savepoint create", help = "Savepoint a commit")
|
||||
@@ -152,5 +153,4 @@ public class SavepointsCommand implements CommandMarker {
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
@@ -52,7 +52,7 @@ public class SparkMain {
|
||||
|
||||
JavaSparkContext jsc = SparkUtil.initJavaSparkConf("hoodie-cli-" + command);
|
||||
int returnCode = 0;
|
||||
switch(cmd) {
|
||||
switch (cmd) {
|
||||
case ROLLBACK:
|
||||
assert (args.length == 3);
|
||||
returnCode = rollback(jsc, args[1], args[2]);
|
||||
@@ -98,7 +98,7 @@ public class SparkMain {
|
||||
String basePath)
|
||||
throws Exception {
|
||||
DedupeSparkJob job = new DedupeSparkJob(basePath,
|
||||
duplicatedPartitionPath,repairedOutputPath,new SQLContext(jsc), FSUtils.getFs());
|
||||
duplicatedPartitionPath, repairedOutputPath, new SQLContext(jsc), FSUtils.getFs());
|
||||
job.fixDuplicates(true);
|
||||
return 0;
|
||||
}
|
||||
@@ -115,7 +115,8 @@ public class SparkMain {
|
||||
}
|
||||
}
|
||||
|
||||
private static int rollbackToSavepoint(JavaSparkContext jsc, String savepointTime, String basePath)
|
||||
private static int rollbackToSavepoint(JavaSparkContext jsc, String savepointTime,
|
||||
String basePath)
|
||||
throws Exception {
|
||||
HoodieWriteClient client = createHoodieClient(jsc, basePath);
|
||||
if (client.rollbackToSavepoint(savepointTime)) {
|
||||
|
||||
@@ -28,7 +28,10 @@ import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
|
||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import com.uber.hoodie.common.util.NumericUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.text.DecimalFormat;
|
||||
import java.util.HashMap;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
@@ -38,13 +41,9 @@ import org.springframework.shell.core.annotation.CliCommand;
|
||||
import org.springframework.shell.core.annotation.CliOption;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.text.DecimalFormat;
|
||||
import java.util.HashMap;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Component
|
||||
public class StatsCommand implements CommandMarker {
|
||||
|
||||
@CliAvailabilityIndicator({"stats wa"})
|
||||
public boolean isWriteAmpAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
@@ -64,13 +63,14 @@ public class StatsCommand implements CommandMarker {
|
||||
for (HoodieInstant commitTime : timeline.getInstants().collect(
|
||||
Collectors.toList())) {
|
||||
String waf = "0";
|
||||
HoodieCommitMetadata commit = HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(commitTime).get());
|
||||
HoodieCommitMetadata commit = HoodieCommitMetadata
|
||||
.fromBytes(activeTimeline.getInstantDetails(commitTime).get());
|
||||
if (commit.fetchTotalUpdateRecordsWritten() > 0) {
|
||||
waf = df.format(
|
||||
(float) commit.fetchTotalRecordsWritten() / commit
|
||||
.fetchTotalUpdateRecordsWritten());
|
||||
}
|
||||
rows[i++] = new String[] {commitTime.getTimestamp(),
|
||||
rows[i++] = new String[]{commitTime.getTimestamp(),
|
||||
String.valueOf(commit.fetchTotalUpdateRecordsWritten()),
|
||||
String.valueOf(commit.fetchTotalRecordsWritten()), waf};
|
||||
totalRecordsUpserted += commit.fetchTotalUpdateRecordsWritten();
|
||||
@@ -80,10 +80,10 @@ public class StatsCommand implements CommandMarker {
|
||||
if (totalRecordsUpserted > 0) {
|
||||
waf = df.format((float) totalRecordsWritten / totalRecordsUpserted);
|
||||
}
|
||||
rows[i] = new String[] {"Total", String.valueOf(totalRecordsUpserted),
|
||||
rows[i] = new String[]{"Total", String.valueOf(totalRecordsUpserted),
|
||||
String.valueOf(totalRecordsWritten), waf};
|
||||
return HoodiePrintHelper.print(
|
||||
new String[] {"CommitTime", "Total Upserted", "Total Written",
|
||||
new String[]{"CommitTime", "Total Upserted", "Total Written",
|
||||
"Write Amplifiation Factor"}, rows);
|
||||
|
||||
}
|
||||
@@ -105,7 +105,8 @@ public class StatsCommand implements CommandMarker {
|
||||
|
||||
@CliCommand(value = "stats filesizes", help = "File Sizes. Display summary stats on sizes of files")
|
||||
public String fileSizeStats(
|
||||
@CliOption(key = {"partitionPath"}, help = "regex to select files, eg: 2016/08/02", unspecifiedDefaultValue = "*/*/*")
|
||||
@CliOption(key = {
|
||||
"partitionPath"}, help = "regex to select files, eg: 2016/08/02", unspecifiedDefaultValue = "*/*/*")
|
||||
final String globRegex) throws IOException {
|
||||
|
||||
FileSystem fs = HoodieCLI.fs;
|
||||
@@ -118,7 +119,7 @@ public class StatsCommand implements CommandMarker {
|
||||
final int MAX_FILES = 1000000;
|
||||
Histogram globalHistogram = new Histogram(new UniformReservoir(MAX_FILES));
|
||||
HashMap<String, Histogram> commitHistoMap = new HashMap<String, Histogram>();
|
||||
for (FileStatus fileStatus: statuses) {
|
||||
for (FileStatus fileStatus : statuses) {
|
||||
String commitTime = FSUtils.getCommitTime(fileStatus.getPath().getName());
|
||||
long sz = fileStatus.getLen();
|
||||
if (!commitHistoMap.containsKey(commitTime)) {
|
||||
@@ -130,7 +131,7 @@ public class StatsCommand implements CommandMarker {
|
||||
|
||||
String[][] rows = new String[commitHistoMap.size() + 1][];
|
||||
int ind = 0;
|
||||
for (String commitTime: commitHistoMap.keySet()) {
|
||||
for (String commitTime : commitHistoMap.keySet()) {
|
||||
Snapshot s = commitHistoMap.get(commitTime).getSnapshot();
|
||||
rows[ind++] = printFileSizeHistogram(commitTime, s);
|
||||
}
|
||||
@@ -138,6 +139,7 @@ public class StatsCommand implements CommandMarker {
|
||||
rows[ind++] = printFileSizeHistogram("ALL", s);
|
||||
|
||||
return HoodiePrintHelper.print(
|
||||
new String[] {"CommitTime", "Min", "10th", "50th", "avg", "95th", "Max", "NumFiles", "StdDev"}, rows);
|
||||
new String[]{"CommitTime", "Min", "10th", "50th", "avg", "95th", "Max", "NumFiles",
|
||||
"StdDev"}, rows);
|
||||
}
|
||||
}
|
||||
|
||||
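For reference, the write-amplification figure computed in the StatsCommand hunks above is simply total records written divided by total records upserted, guarded against a zero denominator. A minimal standalone sketch with hypothetical totals (the DecimalFormat pattern is an assumption; it is not shown in this diff):

import java.text.DecimalFormat;

public class WriteAmplificationExample {

  public static void main(String[] args) {
    // Hypothetical totals; the command sums these across all completed commits.
    long totalRecordsUpserted = 250_000L;
    long totalRecordsWritten = 1_000_000L;

    // Same guard as in StatsCommand: report "0" when nothing was upserted.
    String waf = "0";
    DecimalFormat df = new DecimalFormat("#.00"); // assumed pattern, not part of the commit
    if (totalRecordsUpserted > 0) {
      waf = df.format((float) totalRecordsWritten / totalRecordsUpserted);
    }
    System.out.println("Write amplification factor: " + waf); // prints 4.00
  }
}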
@@ -23,9 +23,10 @@ import org.springframework.stereotype.Component;
|
||||
|
||||
@Component
|
||||
public class UtilsCommand implements CommandMarker {
|
||||
@CliCommand(value = "utils loadClass", help = "Load a class" )
|
||||
|
||||
@CliCommand(value = "utils loadClass", help = "Load a class")
|
||||
public String loadClass(
|
||||
@CliOption(key = {"class"}, help = "Check mode" ) final String clazz
|
||||
@CliOption(key = {"class"}, help = "Check mode") final String clazz
|
||||
) throws Exception {
|
||||
Class klass = Class.forName(clazz);
|
||||
return klass.getProtectionDomain().getCodeSource().getLocation().toExternalForm();
|
||||
|
||||
@@ -20,16 +20,17 @@ import com.uber.hoodie.common.model.HoodieCommitMetadata;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
public class CommitUtil {
|
||||
|
||||
public static long countNewRecords(HoodieTableMetaClient target, List<String> commitsToCatchup)
|
||||
throws IOException {
|
||||
long totalNew = 0;
|
||||
HoodieTimeline timeline = target.getActiveTimeline().reload().getCommitTimeline().filterCompletedInstants();
|
||||
for(String commit:commitsToCatchup) {
|
||||
HoodieTimeline timeline = target.getActiveTimeline().reload().getCommitTimeline()
|
||||
.filterCompletedInstants();
|
||||
for (String commit : commitsToCatchup) {
|
||||
HoodieCommitMetadata c = HoodieCommitMetadata.fromBytes(timeline
|
||||
.getInstantDetails(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commit))
|
||||
.get());
|
||||
|
||||
@@ -17,16 +17,16 @@
|
||||
package com.uber.hoodie.cli.utils;
|
||||
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import org.apache.commons.dbcp.BasicDataSource;
|
||||
import org.joda.time.DateTime;
|
||||
|
||||
import javax.sql.DataSource;
|
||||
import java.sql.Connection;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.sql.Statement;
|
||||
import javax.sql.DataSource;
|
||||
import org.apache.commons.dbcp.BasicDataSource;
|
||||
import org.joda.time.DateTime;
|
||||
|
||||
public class HiveUtil {
|
||||
|
||||
private static String driverName = "org.apache.hive.jdbc.HiveDriver";
|
||||
|
||||
static {
|
||||
@@ -39,7 +39,8 @@ public class HiveUtil {
|
||||
|
||||
private static Connection connection;
|
||||
|
||||
private static Connection getConnection(String jdbcUrl, String user, String pass) throws SQLException {
|
||||
private static Connection getConnection(String jdbcUrl, String user, String pass)
|
||||
throws SQLException {
|
||||
DataSource ds = getDatasource(jdbcUrl, user, pass);
|
||||
return ds.getConnection();
|
||||
}
|
||||
@@ -53,22 +54,25 @@ public class HiveUtil {
|
||||
return ds;
|
||||
}
|
||||
|
||||
public static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String dbName, String user, String pass) throws SQLException {
|
||||
public static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String dbName,
|
||||
String user, String pass) throws SQLException {
|
||||
Connection conn = HiveUtil.getConnection(jdbcUrl, user, pass);
|
||||
ResultSet rs = null;
|
||||
Statement stmt = conn.createStatement();
|
||||
try {
|
||||
//stmt.execute("set mapred.job.queue.name=<queue_name>");
|
||||
stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat" );
|
||||
stmt.execute("set hive.stats.autogather=false" );
|
||||
stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat");
|
||||
stmt.execute("set hive.stats.autogather=false");
|
||||
rs = stmt.executeQuery(
|
||||
"select count(`_hoodie_commit_time`) as cnt from " + dbName + "." + source.getTableConfig()
|
||||
"select count(`_hoodie_commit_time`) as cnt from " + dbName + "." + source
|
||||
.getTableConfig()
|
||||
.getTableName());
|
||||
long count = -1;
|
||||
if(rs.next()) {
|
||||
if (rs.next()) {
|
||||
count = rs.getLong("cnt");
|
||||
}
|
||||
System.out.println("Total records in " + source.getTableConfig().getTableName() + " is " + count);
|
||||
System.out
|
||||
.println("Total records in " + source.getTableConfig().getTableName() + " is " + count);
|
||||
return count;
|
||||
} finally {
|
||||
if (rs != null) {
|
||||
@@ -94,7 +98,8 @@ public class HiveUtil {
|
||||
return countRecords(jdbcUrl, source, srcDb, startDateStr, endDateStr, user, pass);
|
||||
}
|
||||
|
||||
private static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String srcDb, String startDateStr,
|
||||
private static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String srcDb,
|
||||
String startDateStr,
|
||||
String endDateStr, String user, String pass) throws SQLException {
|
||||
Connection conn = HiveUtil.getConnection(jdbcUrl, user, pass);
|
||||
ResultSet rs = null;
|
||||
@@ -107,7 +112,7 @@ public class HiveUtil {
|
||||
"select count(`_hoodie_commit_time`) as cnt from " + srcDb + "." + source.getTableConfig()
|
||||
.getTableName() + " where datestr>'" + startDateStr + "' and datestr<='"
|
||||
+ endDateStr + "'");
|
||||
if(rs.next()) {
|
||||
if (rs.next()) {
|
||||
return rs.getLong("cnt");
|
||||
}
|
||||
return -1;
|
||||
|
||||
@@ -23,8 +23,10 @@ import java.io.InputStreamReader;
|
||||
import java.util.logging.Logger;
|
||||
|
||||
public class InputStreamConsumer extends Thread {
|
||||
|
||||
protected final static Logger LOG = Logger.getLogger(InputStreamConsumer.class.getName());
|
||||
private InputStream is;
|
||||
|
||||
public InputStreamConsumer(InputStream is) {
|
||||
this.is = is;
|
||||
}
|
||||
@@ -35,8 +37,9 @@ public class InputStreamConsumer extends Thread {
|
||||
InputStreamReader isr = new InputStreamReader(is);
|
||||
BufferedReader br = new BufferedReader(isr);
|
||||
String line;
|
||||
while ( (line = br.readLine()) != null)
|
||||
while ((line = br.readLine()) != null) {
|
||||
LOG.info(line);
|
||||
}
|
||||
} catch (IOException ioe) {
|
||||
LOG.severe(ioe.toString());
|
||||
ioe.printStackTrace();
|
||||
|
||||
@@ -18,26 +18,20 @@ package com.uber.hoodie.cli.utils;
|
||||
|
||||
import com.uber.hoodie.HoodieWriteClient;
|
||||
import com.uber.hoodie.cli.commands.SparkMain;
|
||||
|
||||
import java.io.File;
|
||||
import java.net.URISyntaxException;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.launcher.SparkLauncher;
|
||||
|
||||
import java.io.File;
|
||||
import java.net.URISyntaxException;
|
||||
|
||||
public class SparkUtil {
|
||||
|
||||
public static Logger logger = Logger.getLogger(SparkUtil.class);
|
||||
public static final String DEFUALT_SPARK_MASTER = "yarn-client";
|
||||
|
||||
/**
|
||||
*
|
||||
* TODO: Need to fix a bunch of hardcoded stuff here eg: history server, spark distro
|
||||
*
|
||||
* @return
|
||||
* @throws URISyntaxException
|
||||
*/
|
||||
public static SparkLauncher initLauncher(String propertiesFile) throws URISyntaxException {
|
||||
String currentJar = new File(
|
||||
@@ -65,7 +59,8 @@ public class SparkUtil {
|
||||
// Configure hadoop conf
|
||||
sparkConf.set("spark.hadoop.mapred.output.compress", "true");
|
||||
sparkConf.set("spark.hadoop.mapred.output.compression.codec", "true");
|
||||
sparkConf.set("spark.hadoop.mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
|
||||
sparkConf.set("spark.hadoop.mapred.output.compression.codec",
|
||||
"org.apache.hadoop.io.compress.GzipCodec");
|
||||
sparkConf.set("spark.hadoop.mapred.output.compression.type", "BLOCK");
|
||||
|
||||
sparkConf = HoodieWriteClient.registerClasses(sparkConf);
|
||||
|
||||
@@ -21,6 +21,6 @@
|
||||
xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans.xsd
|
||||
http://www.springframework.org/schema/context http://www.springframework.org/schema/context/spring-context.xsd">
|
||||
|
||||
<context:component-scan base-package="com.uber.hoodie.cli" />
|
||||
<context:component-scan base-package="com.uber.hoodie.cli"/>
|
||||
|
||||
</beans>
|
||||
|
||||
@@ -34,7 +34,7 @@ import scala.collection.mutable._
|
||||
/**
|
||||
* Spark job to de-duplicate data present in a partition path
|
||||
*/
|
||||
class DedupeSparkJob (basePath: String,
|
||||
class DedupeSparkJob(basePath: String,
|
||||
duplicatedPartitionPath: String,
|
||||
repairOutputPath: String,
|
||||
sqlContext: SQLContext,
|
||||
@@ -50,8 +50,9 @@ class DedupeSparkJob (basePath: String,
|
||||
* @param tblName
|
||||
* @return
|
||||
*/
|
||||
def getDupeKeyDF(tblName: String) : DataFrame = {
|
||||
val dupeSql = s"""
|
||||
def getDupeKeyDF(tblName: String): DataFrame = {
|
||||
val dupeSql =
|
||||
s"""
|
||||
select `${HoodieRecord.RECORD_KEY_METADATA_FIELD}` as dupe_key,
|
||||
count(*) as dupe_cnt
|
||||
from ${tblName}
|
||||
@@ -69,7 +70,7 @@ class DedupeSparkJob (basePath: String,
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
private def planDuplicateFix() : HashMap[String, HashSet[String]] = {
|
||||
private def planDuplicateFix(): HashMap[String, HashSet[String]] = {
|
||||
|
||||
val tmpTableName = s"htbl_${System.currentTimeMillis()}"
|
||||
val dedupeTblName = s"${tmpTableName}_dupeKeys"
|
||||
@@ -78,17 +79,18 @@ class DedupeSparkJob (basePath: String,
|
||||
|
||||
val allFiles = fs.listStatus(new org.apache.hadoop.fs.Path(s"${basePath}/${duplicatedPartitionPath}"))
|
||||
val fsView = new HoodieTableFileSystemView(metadata, metadata.getActiveTimeline.getCommitTimeline.filterCompletedInstants(), allFiles)
|
||||
val latestFiles:java.util.List[HoodieDataFile] = fsView.getLatestDataFiles().collect(Collectors.toList[HoodieDataFile]())
|
||||
val latestFiles: java.util.List[HoodieDataFile] = fsView.getLatestDataFiles().collect(Collectors.toList[HoodieDataFile]())
|
||||
val filteredStatuses = latestFiles.map(f => f.getPath)
|
||||
LOG.info(s" List of files under partition: ${} => ${filteredStatuses.mkString(" ")}")
|
||||
|
||||
val df = sqlContext.parquetFile(filteredStatuses:_*)
|
||||
val df = sqlContext.parquetFile(filteredStatuses: _*)
|
||||
df.registerTempTable(tmpTableName)
|
||||
val dupeKeyDF = getDupeKeyDF(tmpTableName)
|
||||
dupeKeyDF.registerTempTable(dedupeTblName)
|
||||
|
||||
// Obtain necessary satellite information for duplicate rows
|
||||
val dupeDataSql = s"""
|
||||
val dupeDataSql =
|
||||
s"""
|
||||
SELECT `_hoodie_record_key`, `_hoodie_partition_path`, `_hoodie_file_name`, `_hoodie_commit_time`
|
||||
FROM ${tmpTableName} h
|
||||
JOIN ${dedupeTblName} d
|
||||
@@ -111,9 +113,9 @@ class DedupeSparkJob (basePath: String,
|
||||
|
||||
rows.foreach(r => {
|
||||
val c = r(3).asInstanceOf[String].toLong
|
||||
if (c != maxCommit){
|
||||
if (c != maxCommit) {
|
||||
val f = r(2).asInstanceOf[String].split("_")(0)
|
||||
if (!fileToDeleteKeyMap.contains(f)){
|
||||
if (!fileToDeleteKeyMap.contains(f)) {
|
||||
fileToDeleteKeyMap(f) = HashSet[String]()
|
||||
}
|
||||
fileToDeleteKeyMap(f).add(key)
|
||||
@@ -130,28 +132,30 @@ class DedupeSparkJob (basePath: String,
|
||||
val allFiles = fs.listStatus(new Path(s"${basePath}/${duplicatedPartitionPath}"))
|
||||
val fsView = new HoodieTableFileSystemView(metadata, metadata.getActiveTimeline.getCommitTimeline.filterCompletedInstants(), allFiles)
|
||||
|
||||
val latestFiles:java.util.List[HoodieDataFile] = fsView.getLatestDataFiles().collect(Collectors.toList[HoodieDataFile]())
|
||||
val latestFiles: java.util.List[HoodieDataFile] = fsView.getLatestDataFiles().collect(Collectors.toList[HoodieDataFile]())
|
||||
|
||||
val fileNameToPathMap = latestFiles.map(f => (f.getFileId, new Path(f.getPath))).toMap
|
||||
val dupeFixPlan = planDuplicateFix()
|
||||
|
||||
// 1. Copy all latest files into the temp fix path
|
||||
fileNameToPathMap.foreach{ case(fileName, filePath) => {
|
||||
fileNameToPathMap.foreach { case (fileName, filePath) => {
|
||||
val badSuffix = if (dupeFixPlan.contains(fileName)) ".bad" else ""
|
||||
val dstPath = new Path(s"${repairOutputPath}/${filePath.getName}${badSuffix}")
|
||||
LOG.info(s"Copying from ${filePath} to ${dstPath}")
|
||||
FileUtil.copy(fs, filePath, fs, dstPath, false, true, fs.getConf)
|
||||
}}
|
||||
}
|
||||
}
|
||||
|
||||
// 2. Remove duplicates from the bad files
|
||||
dupeFixPlan.foreach{case(fileName, keysToSkip) => {
|
||||
dupeFixPlan.foreach { case (fileName, keysToSkip) => {
|
||||
val commitTime = FSUtils.getCommitTime(fileNameToPathMap(fileName).getName)
|
||||
val badFilePath = new Path(s"${repairOutputPath}/${fileNameToPathMap(fileName).getName}.bad")
|
||||
val newFilePath = new Path(s"${repairOutputPath}/${fileNameToPathMap(fileName).getName}")
|
||||
LOG.info(" Skipping and writing new file for : " + fileName)
|
||||
SparkHelpers.skipKeysAndWriteNewFile(commitTime, fs, badFilePath, newFilePath, dupeFixPlan(fileName))
|
||||
fs.delete(badFilePath, false)
|
||||
}}
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Check that there are no duplicates anymore.
|
||||
val df = sqlContext.read.parquet(s"${repairOutputPath}/*.parquet")
|
||||
@@ -186,6 +190,7 @@ class DedupeSparkJob (basePath: String,
|
||||
LOG.info(s"[FOR REAL!!!] Copying from ${srcPath} to ${dstPath}")
|
||||
FileUtil.copy(fs, srcPath, fs, dstPath, false, true, fs.getConf)
|
||||
}
|
||||
}}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
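Condensed view of the dedupe planning logic touched in the DedupeSparkJob hunks above, as a hedged Java sketch (names simplified and hypothetical; the real job derives these values from Spark SQL rows): for every duplicated record key, the copy written by the latest commit is kept, and every other (file, key) pair is scheduled for removal before the files are rewritten.

import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class DedupePlanSketch {

  /** One duplicated row: record key, containing file, and the commit that wrote it. */
  static class Dupe {
    final String key;
    final String fileId;
    final long commitTime;

    Dupe(String key, String fileId, long commitTime) {
      this.key = key;
      this.fileId = fileId;
      this.commitTime = commitTime;
    }
  }

  /** For each file, the record keys to drop so only the latest commit's copy survives. */
  static Map<String, Set<String>> planDuplicateFix(Map<String, List<Dupe>> rowsByKey) {
    Map<String, Set<String>> fileToDeleteKeyMap = new HashMap<>();
    for (List<Dupe> rows : rowsByKey.values()) {
      long maxCommit = rows.stream().mapToLong(r -> r.commitTime).max().orElse(Long.MIN_VALUE);
      for (Dupe row : rows) {
        if (row.commitTime != maxCommit) {
          // Not written by the latest commit: schedule this key for removal from its file.
          fileToDeleteKeyMap.computeIfAbsent(row.fileId, f -> new HashSet<>()).add(row.key);
        }
      }
    }
    return fileToDeleteKeyMap;
  }
}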
@@ -17,9 +17,9 @@
|
||||
package com.uber.hoodie.cli
|
||||
|
||||
import com.uber.hoodie.avro.HoodieAvroWriteSupport
|
||||
import com.uber.hoodie.common.{BloomFilter, HoodieJsonPayload}
|
||||
import com.uber.hoodie.common.model.HoodieRecord
|
||||
import com.uber.hoodie.common.util.ParquetUtils
|
||||
import com.uber.hoodie.common.{BloomFilter, HoodieJsonPayload}
|
||||
import com.uber.hoodie.config.{HoodieIndexConfig, HoodieStorageConfig}
|
||||
import com.uber.hoodie.io.storage.{HoodieParquetConfig, HoodieParquetWriter}
|
||||
import org.apache.avro.Schema
|
||||
@@ -107,7 +107,7 @@ class SparkHelper(sqlContext: SQLContext, fs: FileSystem) {
|
||||
* @param file
|
||||
* @param sqlContext
|
||||
*/
|
||||
def getKeyCount(file: String, sqlContext: org.apache.spark.sql.SQLContext) ={
|
||||
def getKeyCount(file: String, sqlContext: org.apache.spark.sql.SQLContext) = {
|
||||
println(getRowKeyDF(file).collect().size)
|
||||
}
|
||||
|
||||
@@ -122,7 +122,7 @@ class SparkHelper(sqlContext: SQLContext, fs: FileSystem) {
|
||||
* @param file
|
||||
* @return
|
||||
*/
|
||||
def fileKeysAgainstBF(conf: Configuration, sqlContext: SQLContext, file: String) : Boolean = {
|
||||
def fileKeysAgainstBF(conf: Configuration, sqlContext: SQLContext, file: String): Boolean = {
|
||||
val bfStr = SparkHelpers.getBloomFilter(file, conf)
|
||||
val bf = new com.uber.hoodie.common.BloomFilter(bfStr)
|
||||
val foundCount = sqlContext.parquetFile(file)
|
||||
@@ -134,7 +134,7 @@ class SparkHelper(sqlContext: SQLContext, fs: FileSystem) {
|
||||
totalCount == foundCount
|
||||
}
|
||||
|
||||
def getDistinctKeyDF(paths: List[String]) : DataFrame = {
|
||||
sqlContext.read.parquet(paths:_*).select(s"`${HoodieRecord.RECORD_KEY_METADATA_FIELD}`").distinct()
|
||||
def getDistinctKeyDF(paths: List[String]): DataFrame = {
|
||||
sqlContext.read.parquet(paths: _*).select(s"`${HoodieRecord.RECORD_KEY_METADATA_FIELD}`").distinct()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,7 +15,9 @@
|
||||
~ limitations under the License.
|
||||
-->
|
||||
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>hoodie</artifactId>
|
||||
<groupId>com.uber.hoodie</groupId>
|
||||
|
||||
@@ -17,25 +17,19 @@
|
||||
package com.uber.hoodie;
|
||||
|
||||
import com.google.common.base.Optional;
|
||||
|
||||
import com.uber.hoodie.common.model.HoodieCommitMetadata;
|
||||
import com.uber.hoodie.common.model.HoodieDataFile;
|
||||
import com.uber.hoodie.common.model.HoodieKey;
|
||||
import com.uber.hoodie.common.model.HoodieRecord;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
import com.uber.hoodie.common.table.TableFileSystemView;
|
||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||
import com.uber.hoodie.common.table.view.HoodieTableFileSystemView;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.exception.HoodieException;
|
||||
import com.uber.hoodie.index.bloom.HoodieBloomIndex;
|
||||
|
||||
import com.uber.hoodie.table.HoodieTable;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.SparkConf;
|
||||
@@ -46,21 +40,10 @@ import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SQLContext;
|
||||
import org.apache.spark.sql.types.StructType;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* Provides an RDD based API for accessing/filtering Hoodie tables, based on keys.
|
||||
*
|
||||
*/
|
||||
public class HoodieReadClient implements Serializable {
|
||||
|
||||
@@ -70,8 +53,8 @@ public class HoodieReadClient implements Serializable {
|
||||
|
||||
private transient final FileSystem fs;
|
||||
/**
|
||||
* TODO: We need to persist the index type into hoodie.properties and be able to access the
|
||||
* index just with a simple basepath pointing to the dataset. Until, then just always assume a
|
||||
* TODO: We need to persist the index type into hoodie.properties and be able to access the index
|
||||
* just with a simple basepath pointing to the dataset. Until, then just always assume a
|
||||
* BloomIndex
|
||||
*/
|
||||
private transient final HoodieBloomIndex index;
|
||||
@@ -117,7 +100,8 @@ public class HoodieReadClient implements Serializable {
|
||||
|
||||
private void assertSqlContext() {
|
||||
if (!sqlContextOpt.isPresent()) {
|
||||
throw new IllegalStateException("SQLContext must be set, when performing dataframe operations");
|
||||
throw new IllegalStateException(
|
||||
"SQLContext must be set, when performing dataframe operations");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -158,10 +142,10 @@ public class HoodieReadClient implements Serializable {
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if the given [Keys] exists in the hoodie table and returns [Key,
|
||||
* Optional[FullFilePath]] If the optional FullFilePath value is not present, then the key is
|
||||
* not found. If the FullFilePath value is present, it is the path component (without scheme) of
|
||||
* the URI underlying file
|
||||
* Checks if the given [Keys] exists in the hoodie table and returns [Key, Optional[FullFilePath]]
|
||||
* If the optional FullFilePath value is not present, then the key is not found. If the
|
||||
* FullFilePath value is present, it is the path component (without scheme) of the URI underlying
|
||||
* file
|
||||
*/
|
||||
public JavaPairRDD<HoodieKey, Optional<String>> checkExists(JavaRDD<HoodieKey> hoodieKeys) {
|
||||
return index.fetchRecordLocation(hoodieKeys, hoodieTable);
|
||||
|
||||
@@ -50,10 +50,21 @@ import com.uber.hoodie.func.BulkInsertMapFunction;
import com.uber.hoodie.index.HoodieIndex;
import com.uber.hoodie.io.HoodieCommitArchiveLog;
import com.uber.hoodie.metrics.HoodieMetrics;
import com.uber.hoodie.table.UserDefinedBulkInsertPartitioner;
import com.uber.hoodie.table.HoodieTable;
import com.uber.hoodie.table.UserDefinedBulkInsertPartitioner;
import com.uber.hoodie.table.WorkloadProfile;
import com.uber.hoodie.table.WorkloadStat;
import java.io.IOException;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
@@ -66,25 +77,12 @@ import org.apache.spark.storage.StorageLevel;
import scala.Option;
import scala.Tuple2;

import java.io.IOException;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;

/**
 * Hoodie Write Client helps you build datasets on HDFS [insert()] and then
 * perform efficient mutations on a HDFS dataset [upsert()]
 *
 * Note that, at any given time, there can only be one Spark job performing
 * these operations on a Hoodie dataset.
 * Hoodie Write Client helps you build datasets on HDFS [insert()] and then perform efficient
 * mutations on a HDFS dataset [upsert()]
 *
 * Note that, at any given time, there can only be one Spark job performing these operations on a
 * Hoodie dataset.
 */
public class HoodieWriteClient<T extends HoodieRecordPayload> implements Serializable {

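To make the javadoc above concrete, here is a hedged end-to-end sketch of the write path. insert()/upsert(), commit() and the WriteStatus RDD follow this file; HoodieWriteConfig.newBuilder(), withPath()/forTable() and startCommit() are assumed entry points for this version.

import com.uber.hoodie.HoodieWriteClient;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.model.HoodieAvroPayload;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.config.HoodieWriteConfig;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class WriteClientExample {

  public static void upsertOnce(JavaSparkContext jsc, String basePath, String tableName,
      JavaRDD<HoodieRecord<HoodieAvroPayload>> records) throws Exception {
    HoodieWriteConfig config = HoodieWriteConfig.newBuilder()   // builder entry point assumed
        .withPath(basePath)
        .forTable(tableName)
        .build();
    HoodieWriteClient<HoodieAvroPayload> client = new HoodieWriteClient<>(jsc, config);
    String commitTime = client.startCommit();                   // open a new commit (assumed helper)
    JavaRDD<WriteStatus> statuses = client.upsert(records, commitTime);
    client.commit(commitTime, statuses);                        // publish only if the writes look good
  }
}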
@@ -102,7 +100,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
   * @param clientConfig
   * @throws Exception
   */
  public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig) throws Exception {
  public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig)
      throws Exception {
    this(jsc, clientConfig, false);
  }

@@ -111,7 +110,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
   * @param clientConfig
   * @param rollbackInFlight
   */
  public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig, boolean rollbackInFlight) {
  public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig,
      boolean rollbackInFlight) {
    this.fs = FSUtils.getFs();
    this.jsc = jsc;
    this.config = clientConfig;
@@ -170,8 +170,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
   * Inserts the given HoodieRecords, into the table. This API is intended to be used for normal
   * writes.
   *
   * This implementation skips the index check and is able to leverage benefits such as
   * small file handling/blocking alignment, as with upsert(), by profiling the workload
   * This implementation skips the index check and is able to leverage benefits such as small file
   * handling/blocking alignment, as with upsert(), by profiling the workload
   *
   * @param records HoodieRecords to insert
   * @param commitTime Commit Time handle
@@ -210,7 +210,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
   * @param commitTime Commit Time handle
   * @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts
   */
  public JavaRDD<WriteStatus> bulkInsert(JavaRDD<HoodieRecord<T>> records, final String commitTime) {
  public JavaRDD<WriteStatus> bulkInsert(JavaRDD<HoodieRecord<T>> records,
      final String commitTime) {
    return bulkInsert(records, commitTime, Option.empty());
  }

@@ -221,16 +222,18 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
   *
   * This implementation uses sortBy (which does range partitioning based on reservoir sampling) and
   * attempts to control the numbers of files with less memory compared to the {@link
   * HoodieWriteClient#insert(JavaRDD, String)}. Optionally it allows users to specify their own partitioner. If
   * specified then it will be used for repartitioning records. See {@link UserDefinedBulkInsertPartitioner}.
   * HoodieWriteClient#insert(JavaRDD, String)}. Optionally it allows users to specify their own
   * partitioner. If specified then it will be used for repartitioning records. See {@link
   * UserDefinedBulkInsertPartitioner}.
   *
   * @param records HoodieRecords to insert
   * @param commitTime Commit Time handle
   * @param bulkInsertPartitioner If specified then it will be used to partition input records before they are
   *                              inserted into hoodie.
   * @param bulkInsertPartitioner If specified then it will be used to partition input records
   * before they are inserted into hoodie.
   * @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts
   */
  public JavaRDD<WriteStatus> bulkInsert(JavaRDD<HoodieRecord<T>> records, final String commitTime,
  public JavaRDD<WriteStatus> bulkInsert(JavaRDD<HoodieRecord<T>> records,
      final String commitTime,
      Option<UserDefinedBulkInsertPartitioner> bulkInsertPartitioner) {
    writeContext = metrics.getCommitCtx();
    // Create a Hoodie table which encapsulated the commits and files visible
@@ -240,7 +243,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
    try {
      // De-dupe/merge if needed
      JavaRDD<HoodieRecord<T>> dedupedRecords =
          combineOnCondition(config.shouldCombineBeforeInsert(), records, config.getInsertShuffleParallelism());
          combineOnCondition(config.shouldCombineBeforeInsert(), records,
              config.getInsertShuffleParallelism());

      final JavaRDD<HoodieRecord<T>> repartitionedRecords;
      if (bulkInsertPartitioner.isDefined()) {
@@ -259,7 +263,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
        }, true, config.getBulkInsertShuffleParallelism());
      }
      JavaRDD<WriteStatus> writeStatusRDD = repartitionedRecords
          .mapPartitionsWithIndex(new BulkInsertMapFunction<T>(commitTime, config, table), true)
          .mapPartitionsWithIndex(new BulkInsertMapFunction<T>(commitTime, config, table),
              true)
          .flatMap(writeStatuses -> writeStatuses.iterator());

      return updateIndexAndCommitIfNeeded(writeStatusRDD, table, commitTime);
@@ -267,12 +272,13 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
      if (e instanceof HoodieInsertException) {
        throw e;
      }
      throw new HoodieInsertException("Failed to bulk insert for commit time " + commitTime, e);
      throw new HoodieInsertException("Failed to bulk insert for commit time " + commitTime,
          e);
    }
  }

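A short usage sketch for the two bulkInsert(...) overloads reformatted above; only their signatures come from this hunk, the surrounding method and the idea of passing a user defined partitioner are illustrative.

import com.uber.hoodie.HoodieWriteClient;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.model.HoodieAvroPayload;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.table.UserDefinedBulkInsertPartitioner;
import org.apache.spark.api.java.JavaRDD;
import scala.Option;

public class BulkInsertExample {

  public static JavaRDD<WriteStatus> load(HoodieWriteClient<HoodieAvroPayload> client,
      JavaRDD<HoodieRecord<HoodieAvroPayload>> records, String commitTime,
      UserDefinedBulkInsertPartitioner partitioner) {
    if (partitioner == null) {
      // Default path: sortBy based range partitioning, as described in the javadoc above.
      return client.bulkInsert(records, commitTime);
    }
    // Custom path: repartitioning is delegated to the supplied partitioner.
    return client.bulkInsert(records, commitTime, Option.apply(partitioner));
  }
}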
  private void commitOnAutoCommit(String commitTime, JavaRDD<WriteStatus> resultRDD) {
    if(config.shouldAutoCommit()) {
    if (config.shouldAutoCommit()) {
      logger.info("Auto commit enabled: Committing " + commitTime);
      boolean commitResult = commit(commitTime, resultRDD);
      if (!commitResult) {
@@ -286,24 +292,22 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
  private JavaRDD<HoodieRecord<T>> combineOnCondition(boolean condition,
      JavaRDD<HoodieRecord<T>> records,
      int parallelism) {
    if(condition) {
    if (condition) {
      return deduplicateRecords(records, parallelism);
    }
    return records;
  }

  /**
   *
   * Save the workload profile in an intermediate file (here re-using commit files)
   * This is useful when performing rollback for MOR datasets. Only updates are recorded
   * in the workload profile metadata since updates to log blocks are unknown across batches
   * Inserts (which are new parquet files) are rolled back based on commit time.
   * // TODO : Create a new WorkloadProfile metadata file instead of using HoodieCommitMetadata
   * @param profile
   * @param commitTime
   * @throws HoodieCommitException
   * Save the workload profile in an intermediate file (here re-using commit files) This is useful
   * when performing rollback for MOR datasets. Only updates are recorded in the workload profile
   * metadata since updates to log blocks are unknown across batches Inserts (which are new parquet
   * files) are rolled back based on commit time. // TODO : Create a new WorkloadProfile metadata
   * file instead of using HoodieCommitMetadata
   */
  private void saveWorkloadProfileMetadataToInflight(WorkloadProfile profile, HoodieTable<T> table, String commitTime) throws HoodieCommitException {
  private void saveWorkloadProfileMetadataToInflight(WorkloadProfile profile,
      HoodieTable<T> table,
      String commitTime) throws HoodieCommitException {
    try {
      HoodieCommitMetadata metadata = new HoodieCommitMetadata();
      profile.getPartitionPaths().stream().forEach(path -> {
@@ -320,8 +324,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
      Optional<HoodieInstant> instant = activeTimeline.filterInflights().lastInstant();
      activeTimeline.saveToInflight(instant.get(),
          Optional.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
    } catch(IOException io) {
      throw new HoodieCommitException("Failed to commit " + commitTime + " unable to save inflight metadata ", io);
    } catch (IOException io) {
      throw new HoodieCommitException(
          "Failed to commit " + commitTime + " unable to save inflight metadata ", io);
    }
  }

@@ -358,7 +363,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
    return updateIndexAndCommitIfNeeded(writeStatusRDD, hoodieTable, commitTime);
  }

  private Partitioner getPartitioner(HoodieTable table, boolean isUpsert, WorkloadProfile profile) {
  private Partitioner getPartitioner(HoodieTable table, boolean isUpsert,
      WorkloadProfile profile) {
    if (isUpsert) {
      return table.getUpsertPartitioner(profile);
    } else {
@@ -366,7 +372,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
    }
  }

  private JavaRDD<WriteStatus> updateIndexAndCommitIfNeeded(JavaRDD<WriteStatus> writeStatusRDD, HoodieTable<T> table, String commitTime) {
  private JavaRDD<WriteStatus> updateIndexAndCommitIfNeeded(JavaRDD<WriteStatus> writeStatusRDD,
      HoodieTable<T> table, String commitTime) {
    // Update the index back
    JavaRDD<WriteStatus> statuses = index.updateLocation(writeStatusRDD, table);
    // Trigger the insert and collect statuses
@@ -375,10 +382,13 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
    return statuses;
  }

  private JavaRDD<HoodieRecord<T>> partition(JavaRDD<HoodieRecord<T>> dedupedRecords, Partitioner partitioner) {
  private JavaRDD<HoodieRecord<T>> partition(JavaRDD<HoodieRecord<T>> dedupedRecords,
      Partitioner partitioner) {
    return dedupedRecords
        .mapToPair(record ->
            new Tuple2<>(new Tuple2<>(record.getKey(), Option.apply(record.getCurrentLocation())), record))
            new Tuple2<>(
                new Tuple2<>(record.getKey(), Option.apply(record.getCurrentLocation())),
                record))
        .partitionBy(partitioner)
        .map(tuple -> tuple._2());
  }
@@ -438,7 +448,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali

    // We cannot have unbounded commit files. Archive commits if we have to archive
    archiveLog.archiveIfRequired();
    if(config.isAutoClean()) {
    if (config.isAutoClean()) {
      // Call clean to cleanup if there is anything to cleanup after the commit,
      logger.info("Auto cleaning is enabled. Running cleaner now");
      clean(commitTime);
@@ -465,12 +475,12 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
  }

  /**
   * Savepoint a specific commit. Latest version of data files as of the passed in commitTime
   * will be referenced in the savepoint and will never be cleaned. The savepointed commit
   * will never be rolledback or archived.
   * Savepoint a specific commit. Latest version of data files as of the passed in commitTime will
   * be referenced in the savepoint and will never be cleaned. The savepointed commit will never be
   * rolledback or archived.
   *
   * This gives an option to rollback the state to the savepoint anytime.
   * Savepoint needs to be manually created and deleted.
   * This gives an option to rollback the state to the savepoint anytime. Savepoint needs to be
   * manually created and deleted.
   *
   * Savepoint should be on a commit that could not have been cleaned.
   *
@@ -491,12 +501,12 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
  }

  /**
   * Savepoint a specific commit. Latest version of data files as of the passed in commitTime
   * will be referenced in the savepoint and will never be cleaned. The savepointed commit
   * will never be rolledback or archived.
   * Savepoint a specific commit. Latest version of data files as of the passed in commitTime will
   * be referenced in the savepoint and will never be cleaned. The savepointed commit will never be
   * rolledback or archived.
   *
   * This gives an option to rollback the state to the savepoint anytime.
   * Savepoint needs to be manually created and deleted.
   * This gives an option to rollback the state to the savepoint anytime. Savepoint needs to be
   * manually created and deleted.
   *
   * Savepoint should be on a commit that could not have been cleaned.
   *
@@ -510,9 +520,11 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
        .getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
    Optional<HoodieInstant> cleanInstant = table.getCompletedCleanTimeline().lastInstant();

    HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
    if(!table.getCompletedCommitTimeline().containsInstant(commitInstant)) {
      throw new HoodieSavepointException("Could not savepoint non-existing commit " + commitInstant);
    HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION,
        commitTime);
    if (!table.getCompletedCommitTimeline().containsInstant(commitInstant)) {
      throw new HoodieSavepointException(
          "Could not savepoint non-existing commit " + commitInstant);
    }

    try {
@@ -534,7 +546,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
          + lastCommitRetained);

    Map<String, List<String>> latestFilesMap = jsc.parallelize(
        FSUtils.getAllPartitionPaths(fs, table.getMetaClient().getBasePath(), config.shouldAssumeDatePartitioning()))
        FSUtils.getAllPartitionPaths(fs, table.getMetaClient().getBasePath(),
            config.shouldAssumeDatePartitioning()))
        .mapToPair((PairFunction<String, String, List<String>>) partitionPath -> {
          // Scan all partitions files with this commit time
          logger.info("Collecting latest files in partition path " + partitionPath);
@@ -559,8 +572,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
  }

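For orientation, a hedged sketch of how the savepoint APIs documented above might be combined; the method names follow this file's javadoc, while the savepoint(user, comment) signature and return types are assumptions.

import com.uber.hoodie.HoodieWriteClient;
import com.uber.hoodie.common.model.HoodieAvroPayload;

public class SavepointExample {

  public static void guardRiskyWrites(HoodieWriteClient<HoodieAvroPayload> client,
      String savepointTime) {
    client.savepoint("ops-user", "before backfill");     // pin the latest data files against cleaning
    try {
      // ... run the risky backfill job here ...
    } catch (RuntimeException e) {
      // Rolls back recent commits and deletes their data files; schedule this during a downtime.
      client.rollbackToSavepoint(savepointTime);
    } finally {
      client.deleteSavepoint(savepointTime);              // allow the cleaner to reclaim files again
    }
  }
}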
  /**
   * Delete a savepoint that was created. Once the savepoint is deleted, the commit can be rolledback
   * and cleaner may clean up data files.
   * Delete a savepoint that was created. Once the savepoint is deleted, the commit can be
   * rolledback and cleaner may clean up data files.
   *
   * @param savepointTime - delete the savepoint
   * @return true if the savepoint was deleted successfully
@@ -586,9 +599,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
  }

  /**
   * Rollback the state to the savepoint.
   * WARNING: This rolls back recent commits and deletes data files. Queries accessing the files
   * will mostly fail. This should be done during a downtime.
   * Rollback the state to the savepoint. WARNING: This rolls back recent commits and deletes data
   * files. Queries accessing the files will mostly fail. This should be done during a downtime.
   *
   * @param savepointTime - savepoint time to rollback to
   * @return true if the savepoint was rolled back to successfully
@@ -616,7 +628,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali

    // Make sure the rollback was successful
    Optional<HoodieInstant> lastInstant =
        activeTimeline.reload().getCommitsAndCompactionsTimeline().filterCompletedInstants().lastInstant();
        activeTimeline.reload().getCommitsAndCompactionsTimeline().filterCompletedInstants()
            .lastInstant();
    Preconditions.checkArgument(lastInstant.isPresent());
    Preconditions.checkArgument(lastInstant.get().getTimestamp().equals(savepointTime),
        savepointTime + " is not the last commit after rolling back " + commitsToRollback
@@ -625,12 +638,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
  }

  /**
   * Rollback the (inflight/committed) record changes with the given commit time.
   * Three steps:
   * (1) Atomically unpublish this commit
   * (2) clean indexing data,
   * (3) clean new generated parquet files.
   * (4) Finally delete .commit or .inflight file,
   * Rollback the (inflight/committed) record changes with the given commit time. Three steps: (1)
   * Atomically unpublish this commit (2) clean indexing data, (3) clean new generated parquet
   * files. (4) Finally delete .commit or .inflight file,
   */
  public boolean rollback(final String commitTime) throws HoodieRollbackException {
    rollback(Lists.newArrayList(commitTime));
@@ -638,7 +648,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
  }

  private void rollback(List<String> commits) {
    if(commits.isEmpty()) {
    if (commits.isEmpty()) {
      logger.info("List of commits to rollback is empty");
      return;
    }
@@ -702,7 +712,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
    Optional<Long> durationInMs = Optional.empty();
    if (context != null) {
      durationInMs = Optional.of(metrics.getDurationInMs(context.stop()));
      Long numFilesDeleted = stats.stream().mapToLong(stat -> stat.getSuccessDeleteFiles().size()).sum();
      Long numFilesDeleted = stats.stream()
          .mapToLong(stat -> stat.getSuccessDeleteFiles().size())
          .sum();
      metrics.updateRollbackMetrics(durationInMs.get(), numFilesDeleted);
    }
    HoodieRollbackMetadata rollbackMetadata =
@@ -733,9 +745,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
  }

  /**
   * Clean up any stale/old files/data lying around (either on file storage or index storage)
   * based on the configurations and CleaningPolicy used. (typically files that no longer can be used
   * by a running query can be cleaned)
   * Clean up any stale/old files/data lying around (either on file storage or index storage) based
   * on the configurations and CleaningPolicy used. (typically files that no longer can be used by a
   * running query can be cleaned)
   */
  public void clean() throws HoodieIOException {
    String startCleanTime = HoodieActiveTimeline.createNewCommitTime();
@@ -743,9 +755,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
  }

  /**
   * Clean up any stale/old files/data lying around (either on file storage or index storage)
   * based on the configurations and CleaningPolicy used. (typically files that no longer can be used
   * by a running query can be cleaned)
   * Clean up any stale/old files/data lying around (either on file storage or index storage) based
   * on the configurations and CleaningPolicy used. (typically files that no longer can be used by a
   * running query can be cleaned)
   */
  private void clean(String startCleanTime) throws HoodieIOException {
    try {
@@ -811,14 +823,16 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
  }

  public static SparkConf registerClasses(SparkConf conf) {
    conf.registerKryoClasses(new Class[]{HoodieWriteConfig.class, HoodieRecord.class, HoodieKey.class});
    conf.registerKryoClasses(
        new Class[]{HoodieWriteConfig.class, HoodieRecord.class, HoodieKey.class});
    return conf;
  }

  /**
   * Deduplicate Hoodie records, using the given deduplication function.
   */
  private JavaRDD<HoodieRecord<T>> deduplicateRecords(JavaRDD<HoodieRecord<T>> records, int parallelism) {
  private JavaRDD<HoodieRecord<T>> deduplicateRecords(JavaRDD<HoodieRecord<T>> records,
      int parallelism) {
    return records
        .mapToPair(record -> new Tuple2<>(record.getKey(), record))
        .reduceByKey((rec1, rec2) -> {
@@ -833,8 +847,6 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali

  /**
   * Cleanup all inflight commits
   *
   * @throws IOException
   */
  private void rollbackInflightCommits() {
    HoodieTable<T> table = HoodieTable

@@ -19,7 +19,6 @@ package com.uber.hoodie;
import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieWriteStat;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
@@ -50,12 +49,14 @@ public class WriteStatus implements Serializable {
  private long totalErrorRecords = 0;

  /**
   * Mark write as success, optionally using given parameters for the purpose of calculating
   * some aggregate metrics. This method is not meant to cache passed arguments, since WriteStatus
   * Mark write as success, optionally using given parameters for the purpose of calculating some
   * aggregate metrics. This method is not meant to cache passed arguments, since WriteStatus
   * objects are collected in Spark Driver.
   *
   * @param record deflated {@code HoodieRecord} containing information that uniquely identifies it.
   * @param optionalRecordMetadata optional metadata related to data contained in {@link HoodieRecord} before deflation.
   * @param record deflated {@code HoodieRecord} containing information that uniquely identifies
   * it.
   * @param optionalRecordMetadata optional metadata related to data contained in {@link
   * HoodieRecord} before deflation.
   */
  public void markSuccess(HoodieRecord record,
      Optional<Map<String, String>> optionalRecordMetadata) {
@@ -64,12 +65,14 @@ public class WriteStatus implements Serializable {
  }

  /**
   * Mark write as failed, optionally using given parameters for the purpose of calculating
   * some aggregate metrics. This method is not meant to cache passed arguments, since WriteStatus
   * Mark write as failed, optionally using given parameters for the purpose of calculating some
   * aggregate metrics. This method is not meant to cache passed arguments, since WriteStatus
   * objects are collected in Spark Driver.
   *
   * @param record deflated {@code HoodieRecord} containing information that uniquely identifies it.
   * @param optionalRecordMetadata optional metadata related to data contained in {@link HoodieRecord} before deflation.
   * @param record deflated {@code HoodieRecord} containing information that uniquely identifies
   * it.
   * @param optionalRecordMetadata optional metadata related to data contained in {@link
   * HoodieRecord} before deflation.
   */
  public void markFailure(HoodieRecord record, Throwable t,
      Optional<Map<String, String>> optionalRecordMetadata) {
@@ -139,7 +142,9 @@ public class WriteStatus implements Serializable {
    return totalRecords;
  }

  public long getTotalErrorRecords() { return totalErrorRecords; }
  public long getTotalErrorRecords() {
    return totalErrorRecords;
  }

  @Override
  public String toString() {

@@ -17,14 +17,15 @@
package com.uber.hoodie.config;

import java.io.Serializable;
import java.util.Map;
import java.util.Properties;

/**
 * Default Way to load Hoodie config through a java.util.Properties
 */
public class DefaultHoodieConfig implements Serializable {

  protected final Properties props;

  public DefaultHoodieConfig(Properties props) {
    this.props = props;
  }
@@ -40,7 +41,8 @@ public class DefaultHoodieConfig implements Serializable {
    }
  }

  public static void setDefaultOnCondition(Properties props, boolean condition, DefaultHoodieConfig config) {
  public static void setDefaultOnCondition(Properties props, boolean condition,
      DefaultHoodieConfig config) {
    if (condition) {
      props.putAll(config.getProps());
    }

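A small sketch of the Properties-driven pattern above. The three-argument overload is exactly the one in this hunk; the per-key setDefaultOnCondition(props, condition, key, value) overload (used later by the config builders in this commit) is assumed to be a sibling static on DefaultHoodieConfig.

import java.util.Properties;

public class HoodieConfigExample {

  public static DefaultHoodieConfig buildWithDefaults() {
    Properties props = new Properties();
    props.setProperty("hoodie.cleaner.parallelism", "100");
    // Fill in a default only when the key has not been set explicitly.
    DefaultHoodieConfig.setDefaultOnCondition(props, !props.containsKey("hoodie.cleaner.policy"),
        "hoodie.cleaner.policy", "KEEP_LATEST_COMMITS");
    return new DefaultHoodieConfig(props);
  }
}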
@@ -19,21 +19,20 @@ package com.uber.hoodie.config;
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.uber.hoodie.common.model.HoodieAvroPayload;
|
||||
import com.uber.hoodie.common.model.HoodieCleaningPolicy;
|
||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||
import com.uber.hoodie.io.compact.strategy.CompactionStrategy;
|
||||
import com.uber.hoodie.io.compact.strategy.LogFileSizeBasedCompactionStrategy;
|
||||
|
||||
import javax.annotation.concurrent.Immutable;
|
||||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.util.Properties;
|
||||
import javax.annotation.concurrent.Immutable;
|
||||
|
||||
/**
|
||||
* Compaction related config
|
||||
*/
|
||||
@Immutable
|
||||
public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
||||
|
||||
public static final String CLEANER_POLICY_PROP = "hoodie.cleaner.policy";
|
||||
private static final String DEFAULT_CLEANER_POLICY =
|
||||
HoodieCleaningPolicy.KEEP_LATEST_COMMITS.name();
|
||||
@@ -66,7 +65,9 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
||||
public static final String DEFAULT_PARQUET_SMALL_FILE_LIMIT_BYTES = String.valueOf(0);
|
||||
|
||||
|
||||
/** Configs related to specific table types **/
|
||||
/**
|
||||
* Configs related to specific table types
|
||||
**/
|
||||
// Number of inserts, that will be put each partition/bucket for writing
|
||||
public static final String COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE = "hoodie.copyonwrite.insert.split.size";
|
||||
// The rationale to pick the insert parallelism is the following. Writing out 100MB files,
|
||||
@@ -82,7 +83,8 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
||||
// This value is used as a guessimate for the record size, if we can't determine this from previous commits
|
||||
public static final String COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = "hoodie.copyonwrite.record.size.estimate";
|
||||
// Used to determine how much more can be packed into a small file, before it exceeds the size limit.
|
||||
public static final String DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = String.valueOf(1024);
|
||||
public static final String DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = String
|
||||
.valueOf(1024);
|
||||
|
||||
public static final String CLEANER_PARALLELISM = "hoodie.cleaner.parallelism";
|
||||
public static final String DEFAULT_CLEANER_PARALLELISM = String.valueOf(200);
|
||||
@@ -93,7 +95,8 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
||||
|
||||
public static final String COMPACTION_STRATEGY_PROP = "hoodie.compaction.strategy";
|
||||
// 200GB of target IO per compaction
|
||||
public static final String DEFAULT_COMPACTION_STRATEGY = LogFileSizeBasedCompactionStrategy.class.getName();
|
||||
public static final String DEFAULT_COMPACTION_STRATEGY = LogFileSizeBasedCompactionStrategy.class
|
||||
.getName();
|
||||
|
||||
// used to merge records written to log file
|
||||
public static final String DEFAULT_PAYLOAD_CLASS = HoodieAvroPayload.class.getName();
|
||||
@@ -108,6 +111,7 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
||||
}
|
||||
|
||||
public static class Builder {
|
||||
|
||||
private final Properties props = new Properties();
|
||||
|
||||
public Builder fromFile(File propertiesFile) throws IOException {
|
||||
@@ -174,12 +178,14 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
||||
}
|
||||
|
||||
public Builder autoTuneInsertSplits(boolean autoTuneInsertSplits) {
|
||||
props.setProperty(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS, String.valueOf(autoTuneInsertSplits));
|
||||
props.setProperty(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS,
|
||||
String.valueOf(autoTuneInsertSplits));
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder approxRecordSize(int recordSizeEstimate) {
|
||||
props.setProperty(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE, String.valueOf(recordSizeEstimate));
|
||||
props.setProperty(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE,
|
||||
String.valueOf(recordSizeEstimate));
|
||||
return this;
|
||||
}
|
||||
|
||||
@@ -199,7 +205,8 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
||||
}
|
||||
|
||||
public Builder withTargetIOPerCompactionInMB(long targetIOPerCompactionInMB) {
|
||||
props.setProperty(TARGET_IO_PER_COMPACTION_IN_MB_PROP, String.valueOf(targetIOPerCompactionInMB));
|
||||
props.setProperty(TARGET_IO_PER_COMPACTION_IN_MB_PROP,
|
||||
String.valueOf(targetIOPerCompactionInMB));
|
||||
return this;
|
||||
}
|
||||
|
||||
@@ -228,7 +235,8 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
||||
setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS),
|
||||
COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS, DEFAULT_COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS);
|
||||
setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE),
|
||||
COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE, DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE);
|
||||
COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE,
|
||||
DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE);
|
||||
setDefaultOnCondition(props, !props.containsKey(CLEANER_PARALLELISM),
|
||||
CLEANER_PARALLELISM, DEFAULT_CLEANER_PARALLELISM);
|
||||
setDefaultOnCondition(props, !props.containsKey(COMPACTION_STRATEGY_PROP),
|
||||
|
||||
@@ -16,14 +16,12 @@
|
||||
|
||||
package com.uber.hoodie.config;
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.uber.hoodie.index.HoodieIndex;
|
||||
|
||||
import javax.annotation.concurrent.Immutable;
|
||||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.util.Properties;
|
||||
import javax.annotation.concurrent.Immutable;
|
||||
|
||||
/**
|
||||
* Indexing related config
|
||||
@@ -64,6 +62,7 @@ public class HoodieIndexConfig extends DefaultHoodieConfig {
|
||||
}
|
||||
|
||||
public static class Builder {
|
||||
|
||||
private final Properties props = new Properties();
|
||||
|
||||
public Builder fromFile(File propertiesFile) throws IOException {
|
||||
|
||||
@@ -17,12 +17,11 @@
|
||||
package com.uber.hoodie.config;
|
||||
|
||||
import com.uber.hoodie.metrics.MetricsReporterType;
|
||||
|
||||
import javax.annotation.concurrent.Immutable;
|
||||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.util.Properties;
|
||||
import javax.annotation.concurrent.Immutable;
|
||||
|
||||
/**
|
||||
* Fetch the configurations used by the Metrics system.
|
||||
@@ -56,6 +55,7 @@ public class HoodieMetricsConfig extends DefaultHoodieConfig {
|
||||
}
|
||||
|
||||
public static class Builder {
|
||||
|
||||
private final Properties props = new Properties();
|
||||
|
||||
public Builder fromFile(File propertiesFile) throws IOException {
|
||||
|
||||
@@ -16,17 +16,18 @@
|
||||
|
||||
package com.uber.hoodie.config;
|
||||
|
||||
import javax.annotation.concurrent.Immutable;
|
||||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.util.Properties;
|
||||
import javax.annotation.concurrent.Immutable;
|
||||
|
||||
/**
|
||||
* Storage related config
|
||||
*/
|
||||
@Immutable
|
||||
public class HoodieStorageConfig extends DefaultHoodieConfig {
|
||||
|
||||
public static final String PARQUET_FILE_MAX_BYTES = "hoodie.parquet.max.file.size";
|
||||
public static final String DEFAULT_PARQUET_FILE_MAX_BYTES = String.valueOf(120 * 1024 * 1024);
|
||||
public static final String PARQUET_BLOCK_SIZE_BYTES = "hoodie.parquet.block.size";
|
||||
@@ -43,6 +44,7 @@ public class HoodieStorageConfig extends DefaultHoodieConfig {
|
||||
}
|
||||
|
||||
public static class Builder {
|
||||
|
||||
private final Properties props = new Properties();
|
||||
|
||||
public Builder fromFile(File propertiesFile) throws IOException {
|
||||
|
||||
@@ -24,21 +24,21 @@ import com.uber.hoodie.common.util.ReflectionUtils;
|
||||
import com.uber.hoodie.index.HoodieIndex;
|
||||
import com.uber.hoodie.io.compact.strategy.CompactionStrategy;
|
||||
import com.uber.hoodie.metrics.MetricsReporterType;
|
||||
import org.apache.spark.storage.StorageLevel;
|
||||
|
||||
import javax.annotation.concurrent.Immutable;
|
||||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
import javax.annotation.concurrent.Immutable;
|
||||
import org.apache.spark.storage.StorageLevel;
|
||||
|
||||
/**
|
||||
* Class storing configs for the {@link com.uber.hoodie.HoodieWriteClient}
|
||||
*/
|
||||
@Immutable
|
||||
public class HoodieWriteConfig extends DefaultHoodieConfig {
|
||||
|
||||
private static final String BASE_PATH_PROP = "hoodie.base.path";
|
||||
private static final String AVRO_SCHEMA = "hoodie.avro.schema";
|
||||
public static final String TABLE_NAME = "hoodie.table.name";
|
||||
@@ -141,7 +141,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
||||
}
|
||||
|
||||
public int getParquetSmallFileLimit() {
|
||||
return Integer.parseInt(props.getProperty(HoodieCompactionConfig.PARQUET_SMALL_FILE_LIMIT_BYTES));
|
||||
return Integer
|
||||
.parseInt(props.getProperty(HoodieCompactionConfig.PARQUET_SMALL_FILE_LIMIT_BYTES));
|
||||
}
|
||||
|
||||
public int getCopyOnWriteInsertSplitSize() {
|
||||
@@ -177,11 +178,13 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
||||
}
|
||||
|
||||
public CompactionStrategy getCompactionStrategy() {
|
||||
return ReflectionUtils.loadClass(props.getProperty(HoodieCompactionConfig.COMPACTION_STRATEGY_PROP));
|
||||
return ReflectionUtils
|
||||
.loadClass(props.getProperty(HoodieCompactionConfig.COMPACTION_STRATEGY_PROP));
|
||||
}
|
||||
|
||||
public Long getTargetIOPerCompactionInMB() {
|
||||
return Long.parseLong(props.getProperty(HoodieCompactionConfig.TARGET_IO_PER_COMPACTION_IN_MB_PROP));
|
||||
return Long
|
||||
.parseLong(props.getProperty(HoodieCompactionConfig.TARGET_IO_PER_COMPACTION_IN_MB_PROP));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -216,7 +219,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
||||
}
|
||||
|
||||
public boolean getBloomIndexPruneByRanges() {
|
||||
return Boolean.parseBoolean(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_PRUNE_BY_RANGES_PROP));
|
||||
return Boolean
|
||||
.parseBoolean(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_PRUNE_BY_RANGES_PROP));
|
||||
}
|
||||
|
||||
public boolean getBloomIndexUseCaching() {
|
||||
@@ -271,8 +275,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
||||
}
|
||||
|
||||
|
||||
|
||||
public static class Builder {
|
||||
|
||||
private final Properties props = new Properties();
|
||||
private boolean isIndexConfigSet = false;
|
||||
private boolean isStorageConfigSet = false;
|
||||
@@ -371,7 +375,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
||||
}
|
||||
|
||||
public Builder withAssumeDatePartitioning(boolean assumeDatePartitioning) {
|
||||
props.setProperty(HOODIE_ASSUME_DATE_PARTITIONING_PROP, String.valueOf(assumeDatePartitioning));
|
||||
props.setProperty(HOODIE_ASSUME_DATE_PARTITIONING_PROP,
|
||||
String.valueOf(assumeDatePartitioning));
|
||||
return this;
|
||||
}
|
||||
|
||||
@@ -386,7 +391,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
||||
Preconditions.checkArgument(config.getBasePath() != null);
|
||||
setDefaultOnCondition(props, !props.containsKey(INSERT_PARALLELISM), INSERT_PARALLELISM,
|
||||
DEFAULT_PARALLELISM);
|
||||
setDefaultOnCondition(props, !props.containsKey(BULKINSERT_PARALLELISM), BULKINSERT_PARALLELISM,
|
||||
setDefaultOnCondition(props, !props.containsKey(BULKINSERT_PARALLELISM),
|
||||
BULKINSERT_PARALLELISM,
|
||||
DEFAULT_PARALLELISM);
|
||||
setDefaultOnCondition(props, !props.containsKey(UPSERT_PARALLELISM), UPSERT_PARALLELISM,
|
||||
DEFAULT_PARALLELISM);
|
||||
|
||||
@@ -17,11 +17,11 @@
package com.uber.hoodie.exception;

/**
 * <p>
 * Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a delta commit
 * </p>
 * <p> Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a delta
 * commit </p>
 */
public class HoodieAppendException extends HoodieException {

  public HoodieAppendException(String msg, Throwable e) {
    super(msg, e);
  }

@@ -17,11 +17,11 @@
package com.uber.hoodie.exception;

/**
 * <p>
 * Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a Commit
 * <p> Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a Commit
 * </p>
 */
public class HoodieCommitException extends HoodieException {

  public HoodieCommitException(String msg) {
    super(msg);
  }

@@ -17,6 +17,7 @@
package com.uber.hoodie.exception;

public class HoodieCompactionException extends HoodieException {

  public HoodieCompactionException(String msg) {
    super(msg);
  }

@@ -18,11 +18,10 @@ package com.uber.hoodie.exception;


/**
 * <p>
 * Exception thrown when dependent system is not available
 * </p>
 * <p> Exception thrown when dependent system is not available </p>
 */
public class HoodieDependentSystemUnavailableException extends HoodieException {

  public static final String HBASE = "HBASE";

  public HoodieDependentSystemUnavailableException(String system, String connectURL) {

@@ -16,14 +16,12 @@
package com.uber.hoodie.exception;

import java.io.IOException;

/**
 * <p>
 * Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a bulk insert
 * </p>
 * <p> Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a bulk
 * insert </p>
 */
public class HoodieInsertException extends HoodieException {

  public HoodieInsertException(String msg, Throwable e) {
    super(msg, e);
  }

@@ -17,11 +17,11 @@
package com.uber.hoodie.exception;

/**
 * <p>
 * Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a incremental upsert
 * </p>
 * <p> Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a
 * incremental upsert </p>
 */
public class HoodieUpsertException extends HoodieException {

  public HoodieUpsertException(String msg, Throwable e) {
    super(msg, e);
  }

@@ -16,16 +16,14 @@
|
||||
|
||||
package com.uber.hoodie.func;
|
||||
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.WriteStatus;
|
||||
import com.uber.hoodie.common.model.HoodieRecord;
|
||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.table.HoodieTable;
|
||||
import org.apache.spark.api.java.function.Function2;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import org.apache.spark.api.java.function.Function2;
|
||||
|
||||
|
||||
/**
|
||||
@@ -46,7 +44,8 @@ public class BulkInsertMapFunction<T extends HoodieRecordPayload>
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<List<WriteStatus>> call(Integer partition, Iterator<HoodieRecord<T>> sortedRecordItr)
|
||||
public Iterator<List<WriteStatus>> call(Integer partition,
|
||||
Iterator<HoodieRecord<T>> sortedRecordItr)
|
||||
throws Exception {
|
||||
return new LazyInsertIterable<>(sortedRecordItr, config, commitTime, hoodieTable);
|
||||
}
|
||||
|
||||
@@ -16,27 +16,26 @@
|
||||
|
||||
package com.uber.hoodie.func;
|
||||
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.WriteStatus;
|
||||
import com.uber.hoodie.common.model.HoodieRecord;
|
||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||
|
||||
import com.uber.hoodie.io.HoodieIOHandle;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.io.HoodieCreateHandle;
|
||||
import com.uber.hoodie.io.HoodieIOHandle;
|
||||
import com.uber.hoodie.table.HoodieTable;
|
||||
import org.apache.spark.TaskContext;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import org.apache.spark.TaskContext;
|
||||
|
||||
/**
|
||||
* Lazy Iterable, that writes a stream of HoodieRecords sorted by the partitionPath,
|
||||
* into new files.
|
||||
* Lazy Iterable, that writes a stream of HoodieRecords sorted by the partitionPath, into new
|
||||
* files.
|
||||
*/
|
||||
public class LazyInsertIterable<T extends HoodieRecordPayload> extends LazyIterableIterator<HoodieRecord<T>, List<WriteStatus>> {
|
||||
public class LazyInsertIterable<T extends HoodieRecordPayload> extends
|
||||
LazyIterableIterator<HoodieRecord<T>, List<WriteStatus>> {
|
||||
|
||||
private final HoodieWriteConfig hoodieConfig;
|
||||
private final String commitTime;
|
||||
@@ -53,11 +52,13 @@ public class LazyInsertIterable<T extends HoodieRecordPayload> extends LazyItera
|
||||
this.hoodieTable = hoodieTable;
|
||||
}
|
||||
|
||||
@Override protected void start() {
|
||||
@Override
|
||||
protected void start() {
|
||||
}
|
||||
|
||||
|
||||
@Override protected List<WriteStatus> computeNext() {
|
||||
@Override
|
||||
protected List<WriteStatus> computeNext() {
|
||||
List<WriteStatus> statuses = new ArrayList<>();
|
||||
|
||||
while (inputItr.hasNext()) {
|
||||
@@ -108,7 +109,8 @@ public class LazyInsertIterable<T extends HoodieRecordPayload> extends LazyItera
|
||||
return statuses;
|
||||
}
|
||||
|
||||
@Override protected void end() {
|
||||
@Override
|
||||
protected void end() {
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -31,6 +31,7 @@ import java.util.Iterator;
|
||||
* responsible for calling inputIterator.next() and doing the processing in computeNext()
|
||||
*/
|
||||
public abstract class LazyIterableIterator<I, O> implements Iterable<O>, Iterator<O> {
|
||||
|
||||
protected Iterator<I> inputItr = null;
|
||||
private boolean consumed = false;
|
||||
private boolean startCalled = false;
|
||||
@@ -56,7 +57,6 @@ public abstract class LazyIterableIterator<I, O> implements Iterable<O>, Iterato
|
||||
*/
|
||||
protected abstract void end();
|
||||
|
||||
|
||||
//////////////////
|
||||
// iterable implementation
|
||||
|
||||
@@ -87,8 +87,9 @@ public abstract class LazyIterableIterator<I, O> implements Iterable<O>, Iterato
|
||||
@Override
|
||||
public Iterator<O> iterator() {
|
||||
//check for consumed inputItr
|
||||
if (consumed)
|
||||
if (consumed) {
|
||||
throw new RuntimeException("Invalid repeated inputItr consumption.");
|
||||
}
|
||||
|
||||
//hand out self as inputItr exactly once (note: do not hand out the input
|
||||
//inputItr since it is consumed by the self inputItr implementation)
|
||||
|
||||
@@ -17,29 +17,26 @@
|
||||
package com.uber.hoodie.index;
|
||||
|
||||
import com.google.common.base.Optional;
|
||||
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.WriteStatus;
|
||||
import com.uber.hoodie.common.model.HoodieKey;
|
||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||
import com.uber.hoodie.common.model.HoodieRecord;
|
||||
|
||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.exception.HoodieIndexException;
|
||||
import com.uber.hoodie.index.bloom.HoodieBloomIndex;
|
||||
import com.uber.hoodie.index.bucketed.BucketedIndex;
|
||||
import com.uber.hoodie.index.hbase.HBaseIndex;
|
||||
import com.uber.hoodie.table.HoodieTable;
|
||||
import java.io.Serializable;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* Base class for different types of indexes to determine the mapping from uuid
|
||||
*
|
||||
*/
|
||||
public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Serializable {
|
||||
|
||||
protected transient JavaSparkContext jsc = null;
|
||||
|
||||
public enum IndexType {
|
||||
@@ -58,12 +55,9 @@ public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Seri
|
||||
|
||||
/**
|
||||
* Checks if the given [Keys] exists in the hoodie table and returns [Key, Optional[FullFilePath]]
|
||||
* If the optional FullFilePath value is not present, then the key is not found. If the FullFilePath
|
||||
* value is present, it is the path component (without scheme) of the URI underlying file
|
||||
*
|
||||
* @param hoodieKeys
|
||||
* @param table
|
||||
* @return
|
||||
* If the optional FullFilePath value is not present, then the key is not found. If the
|
||||
* FullFilePath value is present, it is the path component (without scheme) of the URI underlying
|
||||
* file
|
||||
*/
|
||||
public abstract JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(
|
||||
JavaRDD<HoodieKey> hoodieKeys, final HoodieTable<T> table);
|
||||
@@ -89,17 +83,17 @@ public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Seri
|
||||
public abstract boolean rollbackCommit(String commitTime);
|
||||
|
||||
/**
|
||||
* An index is `global` if {@link HoodieKey} to fileID mapping, does not depend on the `partitionPath`.
|
||||
* Such an implementation is able to obtain the same mapping, for two hoodie keys with same `recordKey`
|
||||
* but different `partitionPath`
|
||||
* An index is `global` if {@link HoodieKey} to fileID mapping, does not depend on the
|
||||
* `partitionPath`. Such an implementation is able to obtain the same mapping, for two hoodie keys
|
||||
* with same `recordKey` but different `partitionPath`
|
||||
*
|
||||
* @return whether or not, the index implementation is global in nature
|
||||
*/
|
||||
public abstract boolean isGlobal();
|
||||
|
||||
/**
|
||||
* This is used by storage to determine, if its safe to send inserts, straight to the log,
|
||||
* i.e having a {@link com.uber.hoodie.common.model.FileSlice}, with no data file.
|
||||
* This is used by storage to determine, if its safe to send inserts, straight to the log, i.e
|
||||
* having a {@link com.uber.hoodie.common.model.FileSlice}, with no data file.
|
||||
*
|
||||
* @return Returns true/false depending on whether the impl has this capability
|
||||
*/
|
||||
@@ -107,12 +101,8 @@ public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Seri
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* An index is "implicit" with respect to storage, if just writing new data to a file slice,
|
||||
* updates the index as well. This is used by storage, to save memory footprint in
|
||||
* certain cases.
|
||||
*
|
||||
* @return
|
||||
* updates the index as well. This is used by storage, to save memory footprint in certain cases.
|
||||
*/
|
||||
public abstract boolean isImplicitWithStorage();
|
||||
|
||||
|
||||
@@ -17,32 +17,27 @@
|
||||
package com.uber.hoodie.index;
|
||||
|
||||
import com.google.common.base.Optional;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.WriteStatus;
|
||||
import com.uber.hoodie.common.model.HoodieKey;
|
||||
import com.uber.hoodie.common.model.HoodieRecord;
|
||||
import com.uber.hoodie.common.model.HoodieRecordLocation;
|
||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.table.HoodieTable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.ConcurrentMap;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.Function;
|
||||
import org.apache.spark.api.java.function.Function2;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.ConcurrentMap;
|
||||
|
||||
|
||||
/**
|
||||
* Hoodie Index implementation backed by an in-memory Hash map.
|
||||
* <p>
|
||||
* ONLY USE FOR LOCAL TESTING
|
||||
* Hoodie Index implementation backed by an in-memory Hash map. <p> ONLY USE FOR LOCAL TESTING
|
||||
*/
|
||||
public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
||||
|
||||
@@ -64,6 +59,7 @@ public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieInde
|
||||
*/
|
||||
class LocationTagFunction
|
||||
implements Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<HoodieRecord<T>>> {
|
||||
|
||||
@Override
|
||||
public Iterator<HoodieRecord<T>> call(Integer partitionNum,
|
||||
Iterator<HoodieRecord<T>> hoodieRecordIterator) {
|
||||
@@ -115,8 +111,6 @@ public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieInde
|
||||
|
||||
/**
|
||||
* Only looks up by recordKey
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
@Override
|
||||
public boolean isGlobal() {
|
||||
@@ -125,8 +119,6 @@ public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieInde
|
||||
|
||||
/**
|
||||
* Mapping is available in HBase already.
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
@Override
|
||||
public boolean canIndexLogFiles() {
|
||||
@@ -135,8 +127,6 @@ public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieInde
|
||||
|
||||
/**
|
||||
* Index needs to be explicitly updated after storage write.
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
@Override
|
||||
public boolean isImplicitWithStorage() {
|
||||
|
||||
@@ -19,7 +19,6 @@
|
||||
package com.uber.hoodie.index.bloom;
|
||||
|
||||
import com.google.common.base.Objects;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
@@ -63,8 +62,6 @@ public class BloomIndexFileInfo implements Serializable {
|
||||
|
||||
/**
|
||||
* Does the given key fall within the range (inclusive)
|
||||
* @param recordKey
|
||||
* @return
|
||||
*/
|
||||
public boolean isKeyInRange(String recordKey) {
|
||||
return minRecordKey.compareTo(recordKey) <= 0 &&
|
||||
@@ -73,8 +70,12 @@ public class BloomIndexFileInfo implements Serializable {
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
if (this == o) {
|
||||
return true;
|
||||
}
|
||||
if (o == null || getClass() != o.getClass()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
BloomIndexFileInfo that = (BloomIndexFileInfo) o;
|
||||
return Objects.equal(that.fileName, fileName) &&
|
||||
|
||||
@@ -18,9 +18,12 @@
|
||||
|
||||
package com.uber.hoodie.index.bloom;
|
||||
|
||||
import static java.util.stream.Collectors.groupingBy;
|
||||
import static java.util.stream.Collectors.mapping;
|
||||
import static java.util.stream.Collectors.toList;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import com.google.common.base.Optional;
|
||||
|
||||
import com.uber.hoodie.WriteStatus;
|
||||
import com.uber.hoodie.common.model.HoodieDataFile;
|
||||
import com.uber.hoodie.common.model.HoodieKey;
|
||||
@@ -34,7 +37,10 @@ import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.exception.MetadataNotFoundException;
|
||||
import com.uber.hoodie.index.HoodieIndex;
|
||||
import com.uber.hoodie.table.HoodieTable;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
@@ -42,16 +48,8 @@ import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.storage.StorageLevel;
|
||||
|
||||
import scala.Tuple2;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static java.util.stream.Collectors.*;
|
||||
|
||||
/**
|
||||
* Indexing mechanism based on bloom filter. Each parquet file includes its row_key bloom filter in
|
||||
* its metadata.
@@ -64,14 +62,16 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
private static final int SPARK_MAXIMUM_BYTES_PER_PARTITION = 1500 * 1024 * 1024;
// this is how much a triplet of (partitionPath, fileId, recordKey) costs.
private static final int BYTES_PER_PARTITION_FILE_KEY_TRIPLET = 300;
private static int MAX_ITEMS_PER_SHUFFLE_PARTITION = SPARK_MAXIMUM_BYTES_PER_PARTITION / BYTES_PER_PARTITION_FILE_KEY_TRIPLET;
private static int MAX_ITEMS_PER_SHUFFLE_PARTITION =
SPARK_MAXIMUM_BYTES_PER_PARTITION / BYTES_PER_PARTITION_FILE_KEY_TRIPLET;

public HoodieBloomIndex(HoodieWriteConfig config, JavaSparkContext jsc) {
super(config, jsc);
}

@Override
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, final HoodieTable<T> hoodieTable) {
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD,
final HoodieTable<T> hoodieTable) {

// Step 0: cache the input record RDD
if (config.getBloomIndexUseCaching()) {
@@ -83,7 +83,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
.mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey()));

// Lookup indexes for all the partition/recordkey pair
JavaPairRDD<String, String> rowKeyFilenamePairRDD = lookupIndex(partitionRecordKeyPairRDD, hoodieTable);
JavaPairRDD<String, String> rowKeyFilenamePairRDD = lookupIndex(partitionRecordKeyPairRDD,
hoodieTable);

// Cache the result, for subsequent stages.
if (config.getBloomIndexUseCaching()) {
@@ -96,7 +97,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex

// Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys
// Cost: 4 sec.
JavaRDD<HoodieRecord<T>> taggedRecordRDD = tagLocationBacktoRecords(rowKeyFilenamePairRDD, recordRDD);
JavaRDD<HoodieRecord<T>> taggedRecordRDD = tagLocationBacktoRecords(rowKeyFilenamePairRDD,
recordRDD);

if (config.getBloomIndexUseCaching()) {
recordRDD.unpersist(); // unpersist the input Record RDD
@@ -135,8 +137,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
}

/**
* Lookup the location for each record key and return the pair<record_key,location> for all
* record keys already present and drop the record keys if not present
* Lookup the location for each record key and return the pair<record_key,location> for all record
* keys already present and drop the record keys if not present
*/
private JavaPairRDD<String, String> lookupIndex(
JavaPairRDD<String, String> partitionRecordKeyPairRDD, final HoodieTable<T> hoodieTable) {
@@ -145,25 +147,27 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
List<String> affectedPartitionPathList = new ArrayList<>(recordsPerPartition.keySet());

// Step 2: Load all involved files as <Partition, filename> pairs
List<Tuple2<String, BloomIndexFileInfo>> fileInfoList = loadInvolvedFiles(affectedPartitionPathList, hoodieTable);
List<Tuple2<String, BloomIndexFileInfo>> fileInfoList = loadInvolvedFiles(
affectedPartitionPathList, hoodieTable);
final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo = fileInfoList.stream()
.collect(groupingBy(Tuple2::_1, mapping(Tuple2::_2, toList())));

// Step 3: Obtain a RDD, for each incoming record, that already exists, with the file id, that contains it.
int parallelism = autoComputeParallelism(recordsPerPartition, partitionToFileInfo, partitionRecordKeyPairRDD);
return findMatchingFilesForRecordKeys(partitionToFileInfo, partitionRecordKeyPairRDD, parallelism);
int parallelism = autoComputeParallelism(recordsPerPartition, partitionToFileInfo,
partitionRecordKeyPairRDD);
return findMatchingFilesForRecordKeys(partitionToFileInfo, partitionRecordKeyPairRDD,
parallelism);
}

/**
* The index lookup can be skewed in three dimensions : #files, #partitions, #records
*
* To be able to smoothly handle skews, we need to compute how to split each partitions into
* subpartitions. We do it here, in a way that keeps the amount of each Spark join partition to
* < 2GB.
*
* If {@link com.uber.hoodie.config.HoodieIndexConfig#BLOOM_INDEX_PARALLELISM_PROP} is specified as a NON-zero number,
* then that is used explicitly.
* subpartitions. We do it here, in a way that keeps the amount of each Spark join partition to <
* 2GB.
*
* If {@link com.uber.hoodie.config.HoodieIndexConfig#BLOOM_INDEX_PARALLELISM_PROP} is specified
* as a NON-zero number, then that is used explicitly.
*/
private int autoComputeParallelism(final Map<String, Long> recordsPerPartition,
final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo,
@@ -172,7 +176,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
long totalComparisons = 0;
if (config.getBloomIndexPruneByRanges()) {
// we will just try exploding the input and then count to determine comparisons
totalComparisons = explodeRecordRDDWithFileComparisons(partitionToFileInfo, partitionRecordKeyPairRDD).count();
totalComparisons = explodeRecordRDDWithFileComparisons(partitionToFileInfo,
partitionRecordKeyPairRDD).count();
} else {
// if not pruning by ranges, then each file in a partition needs to compared against all
// records for a partition.
@@ -181,30 +186,36 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
long totalFiles = 0, totalRecords = 0;
for (String partitionPath : recordsPerPartition.keySet()) {
long numRecords = recordsPerPartition.get(partitionPath);
long numFiles = filesPerPartition.containsKey(partitionPath) ? filesPerPartition.get(partitionPath) : 1L;
long numFiles =
filesPerPartition.containsKey(partitionPath) ? filesPerPartition.get(partitionPath)
: 1L;

totalComparisons += numFiles * numRecords;
totalFiles += filesPerPartition.containsKey(partitionPath) ? filesPerPartition.get(partitionPath) : 0L;
totalFiles +=
filesPerPartition.containsKey(partitionPath) ? filesPerPartition.get(partitionPath)
: 0L;
totalRecords += numRecords;
}
logger.info("TotalRecords: " + totalRecords + ", TotalFiles: " + totalFiles + ", TotalAffectedPartitions:" + recordsPerPartition.size());
logger.info("TotalRecords: " + totalRecords + ", TotalFiles: " + totalFiles
+ ", TotalAffectedPartitions:" + recordsPerPartition.size());
}

// each partition will have an item per comparison.
int parallelism = (int) (totalComparisons/ MAX_ITEMS_PER_SHUFFLE_PARTITION + 1);
logger.info("Auto computed parallelism :" + parallelism + ", totalComparisons: " + totalComparisons);
int parallelism = (int) (totalComparisons / MAX_ITEMS_PER_SHUFFLE_PARTITION + 1);
logger.info(
"Auto computed parallelism :" + parallelism + ", totalComparisons: " + totalComparisons);
return parallelism;
}
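As a rough check of the sizing above (illustrative workload numbers, not from this commit): at 300 bytes per (partitionPath, fileId, recordKey) triplet and roughly 1.5GB per shuffle partition, one partition holds about 5.24 million items, so 20 million comparisons auto-compute to a parallelism of 4.

public class ParallelismSketch {
  public static void main(String[] args) {
    // Mirrors the constants in the hunk above: ~1.5GB per shuffle partition, 300 bytes per triplet.
    final int sparkMaxBytesPerPartition = 1500 * 1024 * 1024;
    final int bytesPerTriplet = 300;
    final int maxItemsPerShufflePartition = sparkMaxBytesPerPartition / bytesPerTriplet; // 5,242,880
    final long totalComparisons = 20_000_000L; // hypothetical workload
    int parallelism = (int) (totalComparisons / maxItemsPerShufflePartition + 1);
    System.out.println(parallelism); // prints 4
  }
}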

/**
* Its crucial to pick the right parallelism.
*
* totalSubPartitions : this is deemed safe limit, to be nice with Spark.
* inputParallelism : typically number of input file splits
* totalSubPartitions : this is deemed safe limit, to be nice with Spark. inputParallelism :
* typically number of input file splits
*
* We pick the max such that, we are always safe, but go higher if say a there are a lot of
* input files. (otherwise, we will fallback to number of partitions in input and end up with
* slow performance)
* We pick the max such that, we are always safe, but go higher if say a there are a lot of input
* files. (otherwise, we will fallback to number of partitions in input and end up with slow
* performance)
*/
private int determineParallelism(int inputParallelism, int totalSubPartitions) {
// If bloom index parallelism is set, use it to to check against the input parallelism and take the max
@@ -221,9 +232,11 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
* Load all involved files as <Partition, filename> pair RDD.
*/
@VisibleForTesting
List<Tuple2<String, BloomIndexFileInfo>> loadInvolvedFiles(List<String> partitions, final HoodieTable<T> hoodieTable) {
List<Tuple2<String, BloomIndexFileInfo>> loadInvolvedFiles(List<String> partitions,
final HoodieTable<T> hoodieTable) {
// Obtain the latest data files from all the partitions.
List<Tuple2<String, HoodieDataFile>> dataFilesList = jsc.parallelize(partitions, Math.max(partitions.size(), 1))
List<Tuple2<String, HoodieDataFile>> dataFilesList = jsc
.parallelize(partitions, Math.max(partitions.size(), 1))
.flatMapToPair(partitionPath -> {
java.util.Optional<HoodieInstant> latestCommitTime =
hoodieTable.getCommitTimeline().filterCompletedInstants().lastInstant();
@@ -243,8 +256,10 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
return jsc.parallelize(dataFilesList, Math.max(dataFilesList.size(), 1))
.mapToPair(ft -> {
try {
String[] minMaxKeys = ParquetUtils.readMinMaxRecordKeys(ft._2().getFileStatus().getPath());
return new Tuple2<>(ft._1(), new BloomIndexFileInfo(ft._2().getFileName(), minMaxKeys[0], minMaxKeys[1]));
String[] minMaxKeys = ParquetUtils
.readMinMaxRecordKeys(ft._2().getFileStatus().getPath());
return new Tuple2<>(ft._1(),
new BloomIndexFileInfo(ft._2().getFileName(), minMaxKeys[0], minMaxKeys[1]));
} catch (MetadataNotFoundException me) {
logger.warn("Unable to find range metadata in file :" + ft._2());
return new Tuple2<>(ft._1(), new BloomIndexFileInfo(ft._2().getFileName()));
@@ -266,8 +281,6 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex

/**
* This is not global, since we depend on the partitionPath to do the lookup
*
* @return
*/
@Override
public boolean isGlobal() {
@@ -276,8 +289,6 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex

/**
* No indexes into log files yet.
*
* @return
*/
@Override
public boolean canIndexLogFiles() {
@@ -286,8 +297,6 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex

/**
* Bloom filters are stored, into the same data files.
*
* @return
*/
@Override
public boolean isImplicitWithStorage() {
@@ -295,12 +304,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
}

/**
* if we dont have key ranges, then also we need to compare against the file. no other choice
* if we do, then only compare the file if the record key falls in range.

* @param indexInfo
* @param recordKey
* @return
* if we dont have key ranges, then also we need to compare against the file. no other choice if
* we do, then only compare the file if the record key falls in range.
*/
private boolean shouldCompareWithFile(BloomIndexFileInfo indexInfo, String recordKey) {
return !indexInfo.hasKeyRanges() || indexInfo.isKeyInRange(recordKey);
@@ -308,19 +313,16 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex


/**
* For each incoming record, produce N output records, 1 each for each file against which the record's key
* needs to be checked. For datasets, where the keys have a definite insert order (e.g: timestamp as prefix),
* the number of files to be compared gets cut down a lot from range pruning.
*
*
* @param partitionToFileIndexInfo
* @param partitionRecordKeyPairRDD
* @return
* For each incoming record, produce N output records, 1 each for each file against which the
* record's key needs to be checked. For datasets, where the keys have a definite insert order
* (e.g: timestamp as prefix), the number of files to be compared gets cut down a lot from range
* pruning.
*/
// sub-partition to ensure the records can be looked up against files & also prune file<=>record comparisons based on recordKey
// ranges in the index info.
@VisibleForTesting
JavaPairRDD<String, Tuple2<String, HoodieKey>> explodeRecordRDDWithFileComparisons(final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
JavaPairRDD<String, Tuple2<String, HoodieKey>> explodeRecordRDDWithFileComparisons(
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
JavaPairRDD<String, String> partitionRecordKeyPairRDD) {
return partitionRecordKeyPairRDD
.map(partitionRecordKeyPair -> {
@@ -329,13 +331,15 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex

List<BloomIndexFileInfo> indexInfos = partitionToFileIndexInfo.get(partitionPath);
List<Tuple2<String, Tuple2<String, HoodieKey>>> recordComparisons = new ArrayList<>();
if (indexInfos != null) { // could be null, if there are no files in a given partition yet.
if (indexInfos
!= null) { // could be null, if there are no files in a given partition yet.
// for each candidate file in partition, that needs to be compared.
for (BloomIndexFileInfo indexInfo : indexInfos) {
if (shouldCompareWithFile(indexInfo, recordKey)) {
recordComparisons.add(
new Tuple2<>(String.format("%s#%s", indexInfo.getFileName(), recordKey),
new Tuple2<>(indexInfo.getFileName(), new HoodieKey(recordKey, partitionPath))));
new Tuple2<>(indexInfo.getFileName(),
new HoodieKey(recordKey, partitionPath))));
}
}
}
@@ -347,22 +351,23 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
/**
* Find out <RowKey, filename> pair. All workload grouped by file-level.
*
* Join PairRDD(PartitionPath, RecordKey) and PairRDD(PartitionPath, File) & then repartition
* such that each RDD partition is a file, then for each file, we do
* (1) load bloom filter,
* (2) load rowKeys,
* (3) Tag rowKey
* Join PairRDD(PartitionPath, RecordKey) and PairRDD(PartitionPath, File) & then repartition such
* that each RDD partition is a file, then for each file, we do (1) load bloom filter, (2) load
* rowKeys, (3) Tag rowKey
*
* Make sure the parallelism is atleast the groupby parallelism for tagging location
*/
@VisibleForTesting
JavaPairRDD<String, String> findMatchingFilesForRecordKeys(final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
JavaPairRDD<String, String> findMatchingFilesForRecordKeys(
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
JavaPairRDD<String, String> partitionRecordKeyPairRDD,
int totalSubpartitions) {

int joinParallelism = determineParallelism(partitionRecordKeyPairRDD.partitions().size(), totalSubpartitions);
int joinParallelism = determineParallelism(partitionRecordKeyPairRDD.partitions().size(),
totalSubpartitions);

JavaPairRDD<String, Tuple2<String, HoodieKey>> fileSortedTripletRDD = explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD)
JavaPairRDD<String, Tuple2<String, HoodieKey>> fileSortedTripletRDD = explodeRecordRDDWithFileComparisons(
partitionToFileIndexInfo, partitionRecordKeyPairRDD)
// sort further based on filename, such that all checking for the file can happen within a single partition, on-the-fly
.sortByKey(true, joinParallelism);

@@ -382,7 +387,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
/**
* Tag the <rowKey, filename> back to the original HoodieRecord RDD.
*/
private JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(JavaPairRDD<String, String> rowKeyFilenamePairRDD,
private JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(
JavaPairRDD<String, String> rowKeyFilenamePairRDD,
JavaRDD<HoodieRecord<T>> recordRDD) {
JavaPairRDD<String, HoodieRecord<T>> rowKeyRecordPairRDD = recordRDD
.mapToPair(record -> new Tuple2<>(record.getRecordKey(), record));
@@ -404,7 +410,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
}

@Override
public JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD, HoodieTable<T> hoodieTable) {
public JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD,
HoodieTable<T> hoodieTable) {
return writeStatusRDD;
}
}

@@ -24,24 +24,22 @@ import com.uber.hoodie.common.util.ParquetUtils;
import com.uber.hoodie.exception.HoodieException;
import com.uber.hoodie.exception.HoodieIndexException;
import com.uber.hoodie.func.LazyIterableIterator;

import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.function.Function2;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.function.Function2;
import scala.Tuple2;

/**
* Function performing actual checking of RDD parition containing (fileId, hoodieKeys) against the
* actual files
*/
public class HoodieBloomIndexCheckFunction implements Function2<Integer, Iterator<Tuple2<String, Tuple2<String, HoodieKey>>>, Iterator<List<IndexLookupResult>>> {
public class HoodieBloomIndexCheckFunction implements
Function2<Integer, Iterator<Tuple2<String, Tuple2<String, HoodieKey>>>, Iterator<List<IndexLookupResult>>> {

private static Logger logger = LogManager.getLogger(HoodieBloomIndexCheckFunction.class);

@@ -54,7 +52,8 @@ public class HoodieBloomIndexCheckFunction implements Function2<Integer, Iterato
/**
* Given a list of row keys and one file, return only row keys existing in that file.
*/
public static List<String> checkCandidatesAgainstFile(List<String> candidateRecordKeys, Path filePath) throws HoodieIndexException {
public static List<String> checkCandidatesAgainstFile(List<String> candidateRecordKeys,
Path filePath) throws HoodieIndexException {
List<String> foundRecordKeys = new ArrayList<>();
try {
// Load all rowKeys from the file, to double-confirm
@@ -69,18 +68,20 @@ public class HoodieBloomIndexCheckFunction implements Function2<Integer, Iterato
foundRecordKeys.add(rowKey);
}
}
logger.info("After checking with row keys, we have " + foundRecordKeys.size() + " results, for file " + filePath + " => " + foundRecordKeys);
logger.info("After checking with row keys, we have " + foundRecordKeys.size()
+ " results, for file " + filePath + " => " + foundRecordKeys);
if (logger.isDebugEnabled()) {
logger.debug("Keys matching for file " + filePath + " => " + foundRecordKeys);
}
}
} catch (Exception e){
} catch (Exception e) {
throw new HoodieIndexException("Error checking candidate keys against file.", e);
}
return foundRecordKeys;
}

class LazyKeyCheckIterator extends LazyIterableIterator<Tuple2<String, Tuple2<String, HoodieKey>>, List<IndexLookupResult>> {
class LazyKeyCheckIterator extends
LazyIterableIterator<Tuple2<String, Tuple2<String, HoodieKey>>, List<IndexLookupResult>> {

private List<String> candidateRecordKeys;

@@ -90,7 +91,8 @@ public class HoodieBloomIndexCheckFunction implements Function2<Integer, Iterato

private String currentParitionPath;

LazyKeyCheckIterator(Iterator<Tuple2<String, Tuple2<String, HoodieKey>>> fileParitionRecordKeyTripletItr) {
LazyKeyCheckIterator(
Iterator<Tuple2<String, Tuple2<String, HoodieKey>>> fileParitionRecordKeyTripletItr) {
super(fileParitionRecordKeyTripletItr);
currentFile = null;
candidateRecordKeys = new ArrayList<>();
@@ -144,11 +146,15 @@ public class HoodieBloomIndexCheckFunction implements Function2<Integer, Iterato
} else {
// do the actual checking of file & break out
Path filePath = new Path(basePath + "/" + currentParitionPath + "/" + currentFile);
logger.info("#1 After bloom filter, the candidate row keys is reduced to " + candidateRecordKeys.size() + " for " + filePath);
logger.info(
"#1 After bloom filter, the candidate row keys is reduced to " + candidateRecordKeys
.size() + " for " + filePath);
if (logger.isDebugEnabled()) {
logger.debug("#The candidate row keys for " + filePath + " => " + candidateRecordKeys);
logger
.debug("#The candidate row keys for " + filePath + " => " + candidateRecordKeys);
}
ret.add(new IndexLookupResult(currentFile, checkCandidatesAgainstFile(candidateRecordKeys, filePath)));
ret.add(new IndexLookupResult(currentFile,
checkCandidatesAgainstFile(candidateRecordKeys, filePath)));

initState(fileName, partitionPath);
if (bloomFilter.mightContain(recordKey)) {
@@ -164,11 +170,14 @@ public class HoodieBloomIndexCheckFunction implements Function2<Integer, Iterato
// handle case, where we ran out of input, finish pending work, update return val
if (!inputItr.hasNext()) {
Path filePath = new Path(basePath + "/" + currentParitionPath + "/" + currentFile);
logger.info("#2 After bloom filter, the candidate row keys is reduced to " + candidateRecordKeys.size() + " for " + filePath);
logger.info(
"#2 After bloom filter, the candidate row keys is reduced to " + candidateRecordKeys
.size() + " for " + filePath);
if (logger.isDebugEnabled()) {
logger.debug("#The candidate row keys for " + filePath + " => " + candidateRecordKeys);
}
ret.add(new IndexLookupResult(currentFile, checkCandidatesAgainstFile(candidateRecordKeys, filePath)));
ret.add(new IndexLookupResult(currentFile,
checkCandidatesAgainstFile(candidateRecordKeys, filePath)));
}

} catch (Throwable e) {
@@ -189,7 +198,8 @@ public class HoodieBloomIndexCheckFunction implements Function2<Integer, Iterato

@Override
public Iterator<List<IndexLookupResult>> call(Integer partition,
Iterator<Tuple2<String, Tuple2<String, HoodieKey>>> fileParitionRecordKeyTripletItr) throws Exception {
Iterator<Tuple2<String, Tuple2<String, HoodieKey>>> fileParitionRecordKeyTripletItr)
throws Exception {
return new LazyKeyCheckIterator(fileParitionRecordKeyTripletItr);
}
}

@@ -19,7 +19,6 @@
package com.uber.hoodie.index.bucketed;

import com.google.common.base.Optional;

import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecord;
@@ -29,29 +28,22 @@ import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieIndexException;
import com.uber.hoodie.index.HoodieIndex;
import com.uber.hoodie.table.HoodieTable;

import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

/**
* An `stateless` index implementation that will using a deterministic mapping function to
* determine the fileID for a given record.
*
* Pros:
* - Fast
*
* Cons :
* - Need to tune the number of buckets per partition path manually (FIXME: Need to autotune this)
* - Could increase write amplification on copy-on-write storage since inserts always rewrite files
* - Not global.
*
* An `stateless` index implementation that will using a deterministic mapping function to determine
* the fileID for a given record.
*
* Pros: - Fast
*
* Cons : - Need to tune the number of buckets per partition path manually (FIXME: Need to autotune
* this) - Could increase write amplification on copy-on-write storage since inserts always rewrite
* files - Not global.
*/
public class BucketedIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
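The javadoc above describes a stateless, deterministic key-to-bucket mapping. The commit does not show getBucket() itself, so the following is only a guessed illustration of the idea (hash-modulo over a configured bucket count), not the project's actual implementation.

public class BucketMappingSketch {
  private static final int NUM_BUCKETS = 16; // hypothetical; would normally come from write config

  // Deterministic: the same record key always maps to the same bucket, with no stored index state.
  static String getBucket(String recordKey) {
    int bucket = Math.floorMod(recordKey.hashCode(), NUM_BUCKETS);
    return String.format("bucket-%02d", bucket);
  }

  public static void main(String[] args) {
    System.out.println(getBucket("key-001")); // stable across runs and executors
    System.out.println(getBucket("key-001")); // same bucket again
  }
}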

@@ -66,12 +58,14 @@ public class BucketedIndex<T extends HoodieRecordPayload> extends HoodieIndex<T>
}

@Override
public JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(JavaRDD<HoodieKey> hoodieKeys, HoodieTable<T> table) {
public JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(JavaRDD<HoodieKey> hoodieKeys,
HoodieTable<T> table) {
return hoodieKeys.mapToPair(hk -> new Tuple2<>(hk, Optional.of(getBucket(hk.getRecordKey()))));
}

@Override
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, HoodieTable<T> hoodieTable) throws HoodieIndexException {
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD,
HoodieTable<T> hoodieTable) throws HoodieIndexException {
return recordRDD.map(record -> {
String bucket = getBucket(record.getRecordKey());
//HACK(vc) a non-existent commit is provided here.
@@ -81,7 +75,8 @@ public class BucketedIndex<T extends HoodieRecordPayload> extends HoodieIndex<T>
}

@Override
public JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD, HoodieTable<T> hoodieTable) throws HoodieIndexException {
public JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD,
HoodieTable<T> hoodieTable) throws HoodieIndexException {
return writeStatusRDD;
}

@@ -93,8 +88,6 @@ public class BucketedIndex<T extends HoodieRecordPayload> extends HoodieIndex<T>

/**
* Bucketing is still done within each partition.
*
* @return
*/
@Override
public boolean isGlobal() {
@@ -102,10 +95,8 @@ public class BucketedIndex<T extends HoodieRecordPayload> extends HoodieIndex<T>
}

/**
* Since indexing is just a deterministic hash, we can identify file group correctly even without an index
* on the actual log file.
*
* @return
* Since indexing is just a deterministic hash, we can identify file group correctly even without
* an index on the actual log file.
*/
@Override
public boolean canIndexLogFiles() {
@@ -114,8 +105,6 @@ public class BucketedIndex<T extends HoodieRecordPayload> extends HoodieIndex<T>

/**
* Indexing is just a hash function.
*
* @return
*/
@Override
public boolean isImplicitWithStorage() {

@@ -19,24 +19,33 @@
package com.uber.hoodie.index.hbase;

import com.google.common.base.Optional;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordLocation;
import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.common.model.HoodieRecord;

import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.config.HoodieIndexConfig;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieDependentSystemUnavailableException;
import com.uber.hoodie.exception.HoodieIndexException;
import com.uber.hoodie.index.HoodieIndex;
import com.uber.hoodie.table.HoodieTable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
@@ -45,15 +54,11 @@ import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
* Hoodie Index implementation backed by HBase
*/
public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {

private final static byte[] SYSTEM_COLUMN_FAMILY = Bytes.toBytes("_s");
private final static byte[] COMMIT_TS_COLUMN = Bytes.toBytes("commit_ts");
private final static byte[] FILE_NAME_COLUMN = Bytes.toBytes("file_name");
@@ -144,9 +149,7 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
} catch (IOException e) {
throw new HoodieIndexException(
"Failed to Tag indexed locations because of exception with HBase Client", e);
}

finally {
} finally {
if (hTable != null) {
try {
hTable.close();
@@ -161,11 +164,14 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
}

@Override
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, HoodieTable<T> hoodieTable) {
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD,
HoodieTable<T> hoodieTable) {
return recordRDD.mapPartitionsWithIndex(this.new LocationTagFunction(hoodieTable), true);
}

class UpdateLocationTask implements Function2<Integer, Iterator<WriteStatus>, Iterator<WriteStatus>> {
class UpdateLocationTask implements
Function2<Integer, Iterator<WriteStatus>, Iterator<WriteStatus>> {

@Override
public Iterator<WriteStatus> call(Integer partition, Iterator<WriteStatus> statusIterator) {

@@ -187,7 +193,7 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
for (HoodieRecord rec : writeStatus.getWrittenRecords()) {
if (!writeStatus.isErrored(rec.getKey())) {
java.util.Optional<HoodieRecordLocation> loc = rec.getNewLocation();
if(loc.isPresent()) {
if (loc.isPresent()) {
Put put = new Put(Bytes.toBytes(rec.getRecordKey()));
put.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN,
Bytes.toBytes(loc.get().getCommitTime()));
@@ -244,8 +250,6 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {

/**
* Only looks up by recordKey
*
* @return
*/
@Override
public boolean isGlobal() {
@@ -254,8 +258,6 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {

/**
* Mapping is available in HBase already.
*
* @return
*/
@Override
public boolean canIndexLogFiles() {
@@ -264,8 +266,6 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {

/**
* Index needs to be explicitly updated after storage write.
*
* @return
*/
@Override
public boolean isImplicitWithStorage() {

@@ -36,13 +36,6 @@ import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieAppendException;
import com.uber.hoodie.exception.HoodieUpsertException;
import com.uber.hoodie.table.HoodieTable;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.TaskContext;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
@@ -50,13 +43,18 @@ import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.TaskContext;

/**
* IO Operation to append data onto an existing file.
*
* @param <T>
*/
public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOHandle<T> {

private static Logger logger = LogManager.getLogger(HoodieMergeHandle.class);
private static AtomicLong recordIndex = new AtomicLong(1);

@@ -133,7 +131,7 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOH
try {
Optional<IndexedRecord> avroRecord = hoodieRecord.getData().getInsertValue(schema);

if(avroRecord.isPresent()) {
if (avroRecord.isPresent()) {
String seqId = HoodieRecord.generateSequenceId(commitTime, TaskContext.getPartitionId(),
recordIndex.getAndIncrement());
HoodieAvroUtils
@@ -164,18 +162,19 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOH
metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME, commitTime);
records.stream().forEach(record -> {
Optional<IndexedRecord> indexedRecord = getIndexedRecord(record);
if(indexedRecord.isPresent()) {
if (indexedRecord.isPresent()) {
recordList.add(indexedRecord.get());
} else {
keysToDelete.add(record.getRecordKey());
}
});
try {
if(recordList.size() > 0) {
if (recordList.size() > 0) {
writer = writer.appendBlock(new HoodieAvroDataBlock(recordList, schema, metadata));
}
if(keysToDelete.size() > 0) {
writer = writer.appendBlock(new HoodieDeleteBlock(keysToDelete.stream().toArray(String[]::new), metadata));
if (keysToDelete.size() > 0) {
writer = writer.appendBlock(
new HoodieDeleteBlock(keysToDelete.stream().toArray(String[]::new), metadata));
}
} catch (Exception e) {
throw new HoodieAppendException(

@@ -27,27 +27,24 @@ import com.uber.hoodie.common.table.TableFileSystemView;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.table.HoodieTable;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;

/**
* Cleaner is responsible for garbage collecting older files in a given partition path, such that
* <p>
* 1) It provides sufficient time for existing queries running on older versions, to finish
* <p>
* 2) It bounds the growth of the files in the file system
* <p>
* TODO: Should all cleaning be done based on {@link com.uber.hoodie.common.model.HoodieCommitMetadata}
* <p> 1) It provides sufficient time for existing queries running on older versions, to finish <p>
* 2) It bounds the growth of the files in the file system <p> TODO: Should all cleaning be done
* based on {@link com.uber.hoodie.common.model.HoodieCommitMetadata}
*/
public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {

private static Logger logger = LogManager.getLogger(HoodieCleanHelper.class);

private final TableFileSystemView fileSystemView;
@@ -66,13 +63,9 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {


/**
* Selects the older versions of files for cleaning, such that it bounds the number of versions of each file.
* This policy is useful, if you are simply interested in querying the table, and you don't want too many
* versions for a single file (i.e run it with versionsRetained = 1)
*
* @param partitionPath
* @return
* @throws IOException
* Selects the older versions of files for cleaning, such that it bounds the number of versions of
* each file. This policy is useful, if you are simply interested in querying the table, and you
* don't want too many versions for a single file (i.e run it with versionsRetained = 1)
*/
private List<String> getFilesToCleanKeepingLatestVersions(String partitionPath)
throws IOException {
@@ -93,7 +86,7 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
// Skip this most recent version
FileSlice nextSlice = fileSliceIterator.next();
HoodieDataFile dataFile = nextSlice.getDataFile().get();
if(savepointedFiles.contains(dataFile.getFileName())) {
if (savepointedFiles.contains(dataFile.getFileName())) {
// do not clean up a savepoint data file
continue;
}
@@ -118,22 +111,15 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {


/**
* Selects the versions for file for cleaning, such that it
* <p>
* - Leaves the latest version of the file untouched
* - For older versions,
* - It leaves all the commits untouched which has occured in last <code>config.getCleanerCommitsRetained()</code> commits
* - It leaves ONE commit before this window. We assume that the max(query execution time) == commit_batch_time * config.getCleanerCommitsRetained(). This is 12 hours by default.
* This is essential to leave the file used by the query thats running for the max time.
* <p>
* This provides the effect of having lookback into all changes that happened in the last X
* commits. (eg: if you retain 24 commits, and commit batch time is 30 mins, then you have 12 hrs of lookback)
* <p>
* This policy is the default.
*
* @param partitionPath
* @return
* @throws IOException
* Selects the versions for file for cleaning, such that it <p> - Leaves the latest version of the
* file untouched - For older versions, - It leaves all the commits untouched which has occured in
* last <code>config.getCleanerCommitsRetained()</code> commits - It leaves ONE commit before this
* window. We assume that the max(query execution time) == commit_batch_time *
* config.getCleanerCommitsRetained(). This is 12 hours by default. This is essential to leave the
* file used by the query thats running for the max time. <p> This provides the effect of having
* lookback into all changes that happened in the last X commits. (eg: if you retain 24 commits,
* and commit batch time is 30 mins, then you have 12 hrs of lookback) <p> This policy is the
* default.
*/
private List<String> getFilesToCleanKeepingLatestCommits(String partitionPath)
throws IOException {
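A quick sanity check of the lookback arithmetic quoted in the comment above; the 24 commits and 30 minute batch interval are the example values from the comment itself, not configuration defaults taken from elsewhere.

public class CleanerLookbackSketch {
  public static void main(String[] args) {
    int commitsRetained = 24;     // e.g. config.getCleanerCommitsRetained()
    int commitBatchMinutes = 30;  // assumed ingestion cadence from the comment's example
    int lookbackHours = commitsRetained * commitBatchMinutes / 60;
    System.out.println(lookbackHours + " hours of lookback"); // prints: 12 hours of lookback
  }
}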
@@ -164,7 +150,7 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
for (FileSlice aSlice : fileSliceList) {
HoodieDataFile aFile = aSlice.getDataFile().get();
String fileCommitTime = aFile.getCommitTime();
if(savepointedFiles.contains(aFile.getFileName())) {
if (savepointedFiles.contains(aFile.getFileName())) {
// do not clean up a savepoint data file
continue;
}

@@ -39,6 +39,12 @@ import com.uber.hoodie.exception.HoodieCommitException;
import com.uber.hoodie.exception.HoodieException;
import com.uber.hoodie.exception.HoodieIOException;
import com.uber.hoodie.table.HoodieTable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.FileSystem;
@@ -46,17 +52,11 @@ import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
* Archiver to bound the growth of <action>.commit files
*/
public class HoodieCommitArchiveLog {

private static Logger log = LogManager.getLogger(HoodieCommitArchiveLog.class);

private final Path archiveFilePath;
@@ -73,7 +73,7 @@ public class HoodieCommitArchiveLog {

private HoodieLogFormat.Writer openWriter() {
try {
if(this.writer == null) {
if (this.writer == null) {
return HoodieLogFormat.newWriterBuilder()
.onParentPath(archiveFilePath.getParent())
.withFileId(archiveFilePath.getName())
@@ -83,17 +83,17 @@ public class HoodieCommitArchiveLog {
} else {
return this.writer;
}
} catch(InterruptedException | IOException e) {
} catch (InterruptedException | IOException e) {
throw new HoodieException("Unable to initialize HoodieLogFormat writer", e);
}
}

private void close() {
try {
if(this.writer != null) {
if (this.writer != null) {
this.writer.close();
}
} catch(IOException e) {
} catch (IOException e) {
throw new HoodieException("Unable to close HoodieLogFormat writer", e);
}
}
@@ -125,10 +125,12 @@ public class HoodieCommitArchiveLog {
int maxCommitsToKeep = config.getMaxCommitsToKeep();
int minCommitsToKeep = config.getMinCommitsToKeep();

HoodieTable table = HoodieTable.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
HoodieTable table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);

// GroupBy each action and limit each action timeline to maxCommitsToKeep
HoodieTimeline cleanAndRollbackTimeline = table.getActiveTimeline().getTimelineOfActions(Sets.newHashSet(HoodieTimeline.CLEAN_ACTION,
HoodieTimeline cleanAndRollbackTimeline = table.getActiveTimeline()
.getTimelineOfActions(Sets.newHashSet(HoodieTimeline.CLEAN_ACTION,
HoodieTimeline.ROLLBACK_ACTION));
Stream<HoodieInstant> instants = cleanAndRollbackTimeline.getInstants()
.collect(Collectors.groupingBy(s -> s.getAction()))
@@ -198,7 +200,7 @@ public class HoodieCommitArchiveLog {
}
HoodieAvroDataBlock block = new HoodieAvroDataBlock(records, wrapperSchema);
this.writer = writer.appendBlock(block);
} catch(Exception e) {
} catch (Exception e) {
throw new HoodieCommitException("Failed to archive commits", e);
}
}
@@ -207,40 +209,48 @@ public class HoodieCommitArchiveLog {
return archiveFilePath;
}

private IndexedRecord convertToAvroRecord(HoodieTimeline commitTimeline, HoodieInstant hoodieInstant) throws IOException {
private IndexedRecord convertToAvroRecord(HoodieTimeline commitTimeline,
HoodieInstant hoodieInstant) throws IOException {
HoodieArchivedMetaEntry archivedMetaWrapper = new HoodieArchivedMetaEntry();
archivedMetaWrapper.setCommitTime(hoodieInstant.getTimestamp());
switch(hoodieInstant.getAction()) {
case HoodieTimeline.CLEAN_ACTION:{
archivedMetaWrapper.setHoodieCleanMetadata(AvroUtils.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(), HoodieCleanMetadata.class));
switch (hoodieInstant.getAction()) {
case HoodieTimeline.CLEAN_ACTION: {
archivedMetaWrapper.setHoodieCleanMetadata(AvroUtils
.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(),
HoodieCleanMetadata.class));
archivedMetaWrapper.setActionType(ActionType.clean.name());
break;
}
case HoodieTimeline.COMMIT_ACTION:{
case HoodieTimeline.COMMIT_ACTION: {
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
.fromBytes(commitTimeline.getInstantDetails(hoodieInstant).get());
archivedMetaWrapper.setHoodieCommitMetadata(commitMetadataConverter(commitMetadata));
archivedMetaWrapper.setActionType(ActionType.commit.name());
break;
}
case HoodieTimeline.COMPACTION_ACTION:{
case HoodieTimeline.COMPACTION_ACTION: {
com.uber.hoodie.common.model.HoodieCompactionMetadata compactionMetadata = com.uber.hoodie.common.model.HoodieCompactionMetadata
.fromBytes(commitTimeline.getInstantDetails(hoodieInstant).get());
archivedMetaWrapper.setHoodieCompactionMetadata(compactionMetadataConverter(compactionMetadata));
archivedMetaWrapper
.setHoodieCompactionMetadata(compactionMetadataConverter(compactionMetadata));
archivedMetaWrapper.setActionType(ActionType.compaction.name());
break;
}
case HoodieTimeline.ROLLBACK_ACTION:{
archivedMetaWrapper.setHoodieRollbackMetadata(AvroUtils.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(), HoodieRollbackMetadata.class));
case HoodieTimeline.ROLLBACK_ACTION: {
archivedMetaWrapper.setHoodieRollbackMetadata(AvroUtils
.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(),
HoodieRollbackMetadata.class));
archivedMetaWrapper.setActionType(ActionType.rollback.name());
break;
}
case HoodieTimeline.SAVEPOINT_ACTION:{
archivedMetaWrapper.setHoodieSavePointMetadata(AvroUtils.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(), HoodieSavepointMetadata.class));
case HoodieTimeline.SAVEPOINT_ACTION: {
archivedMetaWrapper.setHoodieSavePointMetadata(AvroUtils
.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(),
HoodieSavepointMetadata.class));
archivedMetaWrapper.setActionType(ActionType.savepoint.name());
break;
}
case HoodieTimeline.DELTA_COMMIT_ACTION:{
case HoodieTimeline.DELTA_COMMIT_ACTION: {
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
.fromBytes(commitTimeline.getInstantDetails(hoodieInstant).get());
archivedMetaWrapper.setHoodieCommitMetadata(commitMetadataConverter(commitMetadata));
@@ -251,19 +261,23 @@ public class HoodieCommitArchiveLog {
return archivedMetaWrapper;
}

private com.uber.hoodie.avro.model.HoodieCommitMetadata commitMetadataConverter(HoodieCommitMetadata hoodieCommitMetadata) {
private com.uber.hoodie.avro.model.HoodieCommitMetadata commitMetadataConverter(
HoodieCommitMetadata hoodieCommitMetadata) {
ObjectMapper mapper = new ObjectMapper();
//Need this to ignore other public get() methods
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
com.uber.hoodie.avro.model.HoodieCommitMetadata avroMetaData =
mapper.convertValue(hoodieCommitMetadata, com.uber.hoodie.avro.model.HoodieCommitMetadata.class);
mapper.convertValue(hoodieCommitMetadata,
com.uber.hoodie.avro.model.HoodieCommitMetadata.class);
return avroMetaData;
}

private com.uber.hoodie.avro.model.HoodieCompactionMetadata compactionMetadataConverter(HoodieCompactionMetadata hoodieCompactionMetadata) {
private com.uber.hoodie.avro.model.HoodieCompactionMetadata compactionMetadataConverter(
HoodieCompactionMetadata hoodieCompactionMetadata) {
ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
com.uber.hoodie.avro.model.HoodieCompactionMetadata avroMetaData = mapper.convertValue(hoodieCompactionMetadata,
com.uber.hoodie.avro.model.HoodieCompactionMetadata avroMetaData = mapper
.convertValue(hoodieCompactionMetadata,
com.uber.hoodie.avro.model.HoodieCompactionMetadata.class);
return avroMetaData;
}

@@ -29,17 +29,17 @@ import com.uber.hoodie.exception.HoodieInsertException;
import com.uber.hoodie.io.storage.HoodieStorageWriter;
import com.uber.hoodie.io.storage.HoodieStorageWriterFactory;
import com.uber.hoodie.table.HoodieTable;
import java.io.IOException;
import java.util.Optional;
import java.util.UUID;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.TaskContext;

import java.io.IOException;
import java.util.Optional;
import java.util.UUID;

public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOHandle<T> {

private static Logger logger = LogManager.getLogger(HoodieCreateHandle.class);

private final WriteStatus status;
@@ -63,7 +63,8 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOH
new Path(config.getBasePath(), partitionPath));
partitionMetadata.trySave(TaskContext.getPartitionId());
this.storageWriter =
HoodieStorageWriterFactory.getStorageWriter(commitTime, path, hoodieTable, config, schema);
HoodieStorageWriterFactory
.getStorageWriter(commitTime, path, hoodieTable, config, schema);
} catch (IOException e) {
throw new HoodieInsertException(
"Failed to initialize HoodieStorageWriter for path " + path, e);
@@ -74,10 +75,8 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOH
/**
* Determines whether we can accept the incoming records, into the current file, depending on
*
* - Whether it belongs to the same partitionPath as existing records
* - Whether the current file written bytes lt max file size
*
* @return
* - Whether it belongs to the same partitionPath as existing records - Whether the current file
* written bytes lt max file size
*/
public boolean canWrite(HoodieRecord record) {
return storageWriter.canWrite() && record.getPartitionPath()
@@ -86,15 +85,13 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOH

/**
* Perform the actual writing of the given record into the backing file.
*
* @param record
*/
public void write(HoodieRecord record) {
Optional recordMetadata = record.getData().getMetadata();
try {
Optional<IndexedRecord> avroRecord = record.getData().getInsertValue(schema);

if(avroRecord.isPresent()) {
if (avroRecord.isPresent()) {
storageWriter.writeAvroWithMetadata(avroRecord.get(), record);
// update the new location of record, so we know where to find it next
record.setNewLocation(new HoodieRecordLocation(commitTime, status.getFileId()));
@@ -114,8 +111,6 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOH

/**
* Performs actions to durably, persist the current changes and returns a WriteStatus object
*
* @return
*/
public WriteStatus close() {
logger.info(

@@ -24,6 +24,7 @@ import com.uber.hoodie.common.util.HoodieAvroUtils;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieIOException;
import com.uber.hoodie.table.HoodieTable;
import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
@@ -31,9 +32,8 @@ import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;

import java.io.IOException;

public abstract class HoodieIOHandle<T extends HoodieRecordPayload> {

private static Logger logger = LogManager.getLogger(HoodieIOHandle.class);
protected final String commitTime;
protected final HoodieWriteConfig config;

@@ -16,19 +16,23 @@
|
||||
|
||||
package com.uber.hoodie.io;
|
||||
|
||||
import com.uber.hoodie.common.model.HoodiePartitionMetadata;
|
||||
import com.uber.hoodie.common.util.ReflectionUtils;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.WriteStatus;
|
||||
import com.uber.hoodie.common.model.HoodiePartitionMetadata;
|
||||
import com.uber.hoodie.common.model.HoodieRecord;
|
||||
import com.uber.hoodie.common.model.HoodieRecordLocation;
|
||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||
import com.uber.hoodie.common.model.HoodieWriteStat;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import com.uber.hoodie.common.util.ReflectionUtils;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.exception.HoodieUpsertException;
|
||||
import com.uber.hoodie.io.storage.HoodieStorageWriter;
|
||||
import com.uber.hoodie.io.storage.HoodieStorageWriterFactory;
|
||||
import com.uber.hoodie.table.HoodieTable;
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Optional;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.avro.generic.IndexedRecord;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
@@ -36,13 +40,9 @@ import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.TaskContext;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Optional;
|
||||
|
||||
@SuppressWarnings("Duplicates")
|
||||
public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHandle<T> {
|
||||
|
||||
private static Logger logger = LogManager.getLogger(HoodieMergeHandle.class);
|
||||
|
||||
private WriteStatus writeStatus;
|
||||
@@ -94,7 +94,7 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
|
||||
oldFilePath = new Path(
|
||||
config.getBasePath() + "/" + record.getPartitionPath() + "/"
|
||||
+ latestValidFilePath);
|
||||
String relativePath = new Path( record.getPartitionPath() + "/" + FSUtils
|
||||
String relativePath = new Path(record.getPartitionPath() + "/" + FSUtils
|
||||
.makeDataFileName(commitTime, TaskContext.getPartitionId(), fileId)).toString();
|
||||
newFilePath = new Path(config.getBasePath(), relativePath);
|
||||
|
||||
@@ -129,10 +129,11 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
|
||||
}
|
||||
|
||||
|
||||
private boolean writeUpdateRecord(HoodieRecord<T> hoodieRecord, Optional<IndexedRecord> indexedRecord) {
|
||||
private boolean writeUpdateRecord(HoodieRecord<T> hoodieRecord,
|
||||
Optional<IndexedRecord> indexedRecord) {
|
||||
Optional recordMetadata = hoodieRecord.getData().getMetadata();
|
||||
try {
|
||||
if(indexedRecord.isPresent()) {
|
||||
if (indexedRecord.isPresent()) {
|
||||
storageWriter.writeAvroWithMetadata(indexedRecord.get(), hoodieRecord);
|
||||
recordsWritten++;
|
||||
updatedRecordsWritten++;
|
||||
@@ -144,14 +145,15 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
|
||||
writeStatus.markSuccess(hoodieRecord, recordMetadata);
|
||||
return true;
|
||||
} catch (Exception e) {
|
||||
logger.error("Error writing record "+ hoodieRecord, e);
|
||||
logger.error("Error writing record " + hoodieRecord, e);
|
||||
writeStatus.markFailure(hoodieRecord, e, recordMetadata);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Go through an old record. Here if we detect a newer version shows up, we write the new one to the file.
|
||||
* Go through an old record. Here if we detect a newer version shows up, we write the new one to
|
||||
* the file.
|
||||
*/
|
||||
public void write(GenericRecord oldRecord) {
|
||||
String key = oldRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
|
||||
@@ -159,7 +161,8 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
|
||||
boolean copyOldRecord = true;
|
||||
if (keyToNewRecords.containsKey(key)) {
|
||||
try {
|
||||
Optional<IndexedRecord> combinedAvroRecord = hoodieRecord.getData().combineAndGetUpdateValue(oldRecord, schema);
|
||||
Optional<IndexedRecord> combinedAvroRecord = hoodieRecord.getData()
|
||||
.combineAndGetUpdateValue(oldRecord, schema);
|
||||
if (writeUpdateRecord(hoodieRecord, combinedAvroRecord)) {
|
||||
/* ONLY WHEN
|
||||
* 1) we have an update for this key AND
|
||||
@@ -171,7 +174,8 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
|
||||
}
|
||||
keyToNewRecords.remove(key);
|
||||
} catch (Exception e) {
|
||||
throw new HoodieUpsertException("Failed to combine/merge new record with old value in storage, for new record {"
|
||||
throw new HoodieUpsertException(
|
||||
"Failed to combine/merge new record with old value in storage, for new record {"
|
||||
+ keyToNewRecords.get(key) + "}, old value {" + oldRecord + "}", e);
|
||||
}
|
||||
}
|
||||
@@ -193,7 +197,7 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
|
||||
+ getOldFilePath() + " to new file " + newFilePath, e);
|
||||
throw new HoodieUpsertException(errMsg, e);
|
||||
}
|
||||
recordsWritten ++;
|
||||
recordsWritten++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
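The write(GenericRecord) path in HoodieMergeHandle above combines each incoming update with the stored copy of the record and either writes the merged value or carries the old record forward unchanged. A minimal sketch of that decision, using the field and method names visible in the diff (anything else is illustrative):

  // Sketch: merge an old stored record with a pending update, if one exists for its key.
  public void mergeOldRecord(GenericRecord oldRecord) throws IOException {
    String key = oldRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
    boolean copyOldRecord = true;
    if (keyToNewRecords.containsKey(key)) {
      HoodieRecord<T> newRecord = keyToNewRecords.get(key);
      Optional<IndexedRecord> merged =
          newRecord.getData().combineAndGetUpdateValue(oldRecord, schema);
      if (writeUpdateRecord(newRecord, merged)) {
        copyOldRecord = false;               // newer version written, drop the old copy
      }
      keyToNewRecords.remove(key);
    }
    if (copyOldRecord) {
      storageWriter.writeAvro(key, oldRecord);  // no update for this key, keep the old value
    }
  }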
@@ -18,7 +18,6 @@ package com.uber.hoodie.io.compact;

import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodieLogFile;

import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.io.compact.strategy.CompactionStrategy;
import java.io.Serializable;
@@ -27,8 +26,8 @@ import java.util.Map;
import java.util.stream.Collectors;

/**
 * Encapsulates all the needed information about a compaction
 * and make a decision whether this compaction is effective or not
 * Encapsulates all the needed information about a compaction and make a decision whether this
 * compaction is effective or not
 *
 * @see CompactionStrategy
 */

@@ -22,18 +22,17 @@ import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.table.HoodieTable;
import org.apache.spark.api.java.JavaSparkContext;

import java.io.Serializable;
import java.util.Date;
import org.apache.spark.api.java.JavaSparkContext;

/**
 * A HoodieCompactor runs compaction on a hoodie table
 */
public interface HoodieCompactor extends Serializable {

  /**
   * Compact the delta files with the data files
   * @throws Exception
   */
  HoodieCompactionMetadata compact(JavaSparkContext jsc, final HoodieWriteConfig config,
      HoodieTable hoodieTable) throws Exception;

@@ -16,14 +16,14 @@

package com.uber.hoodie.io.compact;

import static java.util.stream.Collectors.toList;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.model.CompactionWriteStat;
import com.uber.hoodie.common.model.HoodieAvroPayload;
import com.uber.hoodie.common.model.HoodieCompactionMetadata;
import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.common.model.HoodieTableType;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
@@ -36,7 +36,12 @@ import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieCompactionException;
import com.uber.hoodie.table.HoodieCopyOnWriteTable;
import com.uber.hoodie.table.HoodieTable;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import org.apache.avro.Schema;
@@ -46,18 +51,10 @@ import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;

import static java.util.stream.Collectors.*;

/**
 * HoodieRealtimeTableCompactor compacts a hoodie table with merge on read storage.
 * Computes all possible compactions, passes it through a CompactionFilter and executes
 * all the compactions and writes a new version of base files and make a normal commit
 * HoodieRealtimeTableCompactor compacts a hoodie table with merge on read storage. Computes all
 * possible compactions, passes it through a CompactionFilter and executes all the compactions and
 * writes a new version of base files and make a normal commit
 *
 * @see HoodieCompactor
 */
@@ -80,7 +77,8 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
    String compactionCommit = startCompactionCommit(hoodieTable);
    log.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommit);
    List<String> partitionPaths =
        FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(), config.shouldAssumeDatePartitioning());
        FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(),
            config.shouldAssumeDatePartitioning());

    log.info("Compaction looking for files to compact in " + partitionPaths + " partitions");
    List<CompactionOperation> operations =
@@ -156,7 +154,8 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
            HoodieTimeline.DELTA_COMMIT_ACTION))
        .filterCompletedInstants().lastInstant().get().getTimestamp();

    HoodieCompactedLogRecordScanner scanner = new HoodieCompactedLogRecordScanner(fs, metaClient.getBasePath(),
    HoodieCompactedLogRecordScanner scanner = new HoodieCompactedLogRecordScanner(fs,
        metaClient.getBasePath(),
        operation.getDeltaFilePaths(), readerSchema, maxInstantTime);
    if (!scanner.iterator().hasNext()) {
      return Lists.newArrayList();
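The class comment above describes the overall control flow of the realtime compactor; condensed into a hedged outline (helper methods and accessors not shown in the diff are illustrative, not the project's actual API):

  // Illustrative outline of a merge-on-read compaction pass.
  public HoodieCompactionMetadata compact(JavaSparkContext jsc, HoodieWriteConfig config,
      HoodieTable hoodieTable) throws Exception {
    String compactionCommit = startCompactionCommit(hoodieTable);        // mark the compaction commit
    List<String> partitionPaths = FSUtils.getAllPartitionPaths(
        hoodieTable.getMetaClient().getFs(), hoodieTable.getMetaClient().getBasePath(),
        config.shouldAssumeDatePartitioning());
    List<CompactionOperation> operations = buildOperations(partitionPaths);        // hypothetical helper
    operations = filterWithCompactionStrategy(config, operations);                 // hypothetical helper
    List<WriteStatus> statuses = executeCompactions(jsc, operations);              // hypothetical helper
    return commitCompaction(compactionCommit, statuses);                           // hypothetical helper
  }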
@@ -28,8 +28,8 @@ import java.util.Map;
import java.util.Optional;

/**
 * CompactionStrategy which looks at total IO to be done for the compaction (read + write)
 * and limits the list of compactions to be under a configured limit on the IO
 * CompactionStrategy which looks at total IO to be done for the compaction (read + write) and
 * limits the list of compactions to be under a configured limit on the IO
 *
 * @see CompactionStrategy
 */
@@ -62,7 +62,8 @@ public class BoundedIOCompactionStrategy implements CompactionStrategy {
  }

  @Override
  public List<CompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig, List<CompactionOperation> operations) {
  public List<CompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig,
      List<CompactionOperation> operations) {
    // Iterate through the operations in order and accept operations as long as we are within the IO limit
    // Preserves the original ordering of compactions
    List<CompactionOperation> finalOperations = Lists.newArrayList();
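The comment inside orderAndFilter describes a simple greedy pass over the operations; a hedged sketch of that loop (the config accessor and the per-operation IO estimate are assumptions for illustration):

  @Override
  public List<CompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig,
      List<CompactionOperation> operations) {
    long ioCapMb = writeConfig.getTargetIOPerCompactionInMB();   // assumed accessor name
    long ioSoFarMb = 0;
    List<CompactionOperation> accepted = new ArrayList<>();
    for (CompactionOperation op : operations) {
      long opIoMb = estimateIoMb(op);                            // assumed helper: read + write MB for op
      if (ioSoFarMb + opIoMb > ioCapMb) {
        break;                                                   // keep original order, stop at the cap
      }
      ioSoFarMb += opIoMb;
      accepted.add(op);
    }
    return accepted;
  }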
@@ -25,12 +25,12 @@ import java.util.List;
import java.util.Map;

/**
 * Strategy for compaction. Pluggable implementation of define how compaction should be done.
 * The implementations of this interface can capture the relevant metrics to order and filter
 * the final list of compaction operation to run in a single compaction.
 * Strategy for compaction. Pluggable implementation of define how compaction should be done. The
 * implementations of this interface can capture the relevant metrics to order and filter the final
 * list of compaction operation to run in a single compaction.
 *
 * Implementation of CompactionStrategy cannot hold any state.
 * Difference instantiations can be passed in every time
 * Implementation of CompactionStrategy cannot hold any state. Difference instantiations can be
 * passed in every time
 *
 * @see com.uber.hoodie.io.compact.HoodieRealtimeTableCompactor
 * @see CompactionOperation
@@ -38,8 +38,8 @@ import java.util.Map;
public interface CompactionStrategy extends Serializable {

  /**
   * Callback hook when a CompactionOperation is created. Individual strategies can
   * capture the metrics they need to decide on the priority.
   * Callback hook when a CompactionOperation is created. Individual strategies can capture the
   * metrics they need to decide on the priority.
   *
   * @param dataFile - Base file to compact
   * @param partitionPath - Partition path
@@ -50,8 +50,8 @@ public interface CompactionStrategy extends Serializable {
      List<HoodieLogFile> logFiles);

  /**
   * Order and Filter the list of compactions. Use the metrics captured with the
   * captureMetrics to order and filter out compactions
   * Order and Filter the list of compactions. Use the metrics captured with the captureMetrics to
   * order and filter out compactions
   *
   * @param writeConfig - HoodieWriteConfig - config for this compaction is passed in
   * @param operations - list of compactions collected
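Taken together, captureMetrics and orderAndFilter are the two extension points of the interface: one records per-operation metrics at creation time, the other uses them to pick and order compactions. A hypothetical stateless strategy might look like the sketch below; the return type of captureMetrics, the metric key, and op.getMetrics() are all assumptions, not taken from the diff:

  // Hypothetical strategy: count delta files per operation, then order largest-first.
  public class LargestDeltaFirstStrategy implements CompactionStrategy {

    private static final String NUM_DELTA_FILES = "NUM_DELTA_FILES";   // illustrative metric key

    @Override
    public Map<String, Object> captureMetrics(HoodieDataFile dataFile, String partitionPath,
        List<HoodieLogFile> logFiles) {
      Map<String, Object> metrics = new HashMap<>();
      metrics.put(NUM_DELTA_FILES, (long) logFiles.size());   // cheap proxy for pending work
      return metrics;
    }

    @Override
    public List<CompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig,
        List<CompactionOperation> operations) {
      return operations.stream()
          .sorted(Comparator.comparingLong(
              op -> -((Long) op.getMetrics().get(NUM_DELTA_FILES))))   // assumed accessor
          .collect(Collectors.toList());
    }
  }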
@@ -27,8 +27,8 @@ import java.util.Optional;
import java.util.stream.Collectors;

/**
 * LogFileSizeBasedCompactionStrategy orders the compactions based on the total log files size
 * and limits the compactions within a configured IO bound
 * LogFileSizeBasedCompactionStrategy orders the compactions based on the total log files size and
 * limits the compactions within a configured IO bound
 *
 * @see BoundedIOCompactionStrategy
 * @see CompactionStrategy

@@ -25,9 +25,9 @@ import java.util.List;
import java.util.Map;

/**
 * UnBoundedCompactionStrategy will not change ordering or filter any compaction.
 * It is a pass-through and will compact all the base files which has a log file.
 * This usually means no-intelligence on compaction.
 * UnBoundedCompactionStrategy will not change ordering or filter any compaction. It is a
 * pass-through and will compact all the base files which has a log file. This usually means
 * no-intelligence on compaction.
 *
 * @see CompactionStrategy
 */

@@ -17,11 +17,11 @@
package com.uber.hoodie.io.storage;

import com.uber.hoodie.avro.HoodieAvroWriteSupport;
import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

public class HoodieParquetConfig {

  private HoodieAvroWriteSupport writeSupport;
  private CompressionCodecName compressionCodecName;
  private int blockSize;

@@ -20,6 +20,8 @@ import com.uber.hoodie.avro.HoodieAvroWriteSupport;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.common.util.HoodieAvroUtils;
import java.io.IOException;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
@@ -30,17 +32,13 @@ import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.spark.TaskContext;

import java.io.IOException;
import java.util.concurrent.atomic.AtomicLong;

/**
 * HoodieParquetWriter extends the ParquetWriter to help limit the size of underlying file.
 * Provides a way to check if the current file can take more records with the <code>canWrite()</code>
 *
 * @param <T>
 * HoodieParquetWriter extends the ParquetWriter to help limit the size of underlying file. Provides
 * a way to check if the current file can take more records with the <code>canWrite()</code>
 */
public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends IndexedRecord>
    extends ParquetWriter<IndexedRecord> implements HoodieStorageWriter<R> {

  private static double STREAM_COMPRESSION_RATIO = 0.1;
  private static AtomicLong recordIndex = new AtomicLong(1);

@@ -101,7 +99,8 @@ public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends Indexe
    return fs.getBytesWritten(file) < maxFileSize;
  }

  @Override public void writeAvro(String key, IndexedRecord object) throws IOException {
  @Override
  public void writeAvro(String key, IndexedRecord object) throws IOException {
    super.write(object);
    writeSupport.add(key);
  }
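The class comment explains that canWrite() is the back-pressure signal for rolling over to a new file once the Parquet file approaches its size limit. A hedged usage sketch of a caller driving the writer (the record source, the roll-over helper and toAvro are illustrative):

  // Roll over to a new storage writer when the current file cannot take more records.
  while (recordIterator.hasNext()) {
    HoodieRecord<T> record = recordIterator.next();
    if (!storageWriter.canWrite()) {
      storageWriter.close();
      storageWriter = createNewWriterForNextFile();               // hypothetical roll-over helper
    }
    storageWriter.writeAvroWithMetadata(toAvro(record), record);  // toAvro is illustrative
  }
  storageWriter.close();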
@@ -17,13 +17,16 @@
package com.uber.hoodie.io.storage;

import com.uber.hoodie.common.model.HoodieRecord;
import java.io.IOException;
import org.apache.avro.generic.IndexedRecord;

import java.io.IOException;

public interface HoodieStorageWriter<R extends IndexedRecord> {

  void writeAvroWithMetadata(R newRecord, HoodieRecord record) throws IOException;

  boolean canWrite();

  void close() throws IOException;

  void writeAvro(String key, R oldRecord) throws IOException;
}

@@ -16,24 +16,24 @@

package com.uber.hoodie.io.storage;

import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.avro.HoodieAvroWriteSupport;
import com.uber.hoodie.common.BloomFilter;
import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.table.HoodieTable;
import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroSchemaConverter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

import java.io.IOException;

public class HoodieStorageWriterFactory {

  public static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieStorageWriter<R> getStorageWriter(
      String commitTime, Path path, HoodieTable<T> hoodieTable, HoodieWriteConfig config, Schema schema)
      String commitTime, Path path, HoodieTable<T> hoodieTable, HoodieWriteConfig config,
      Schema schema)
      throws IOException {
    //TODO - based on the metadata choose the implementation of HoodieStorageWriter
    // Currently only parquet is supported
@@ -16,17 +16,6 @@

package com.uber.hoodie.io.storage;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.fs.permission.AclEntry;
import org.apache.hadoop.fs.permission.AclStatus;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.Progressable;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
@@ -38,13 +27,41 @@ import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.CreateFlag;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.fs.FileChecksum;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FsServerDefaults;
import org.apache.hadoop.fs.FsStatus;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Options;
import org.apache.hadoop.fs.ParentNotDirectoryException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.fs.UnsupportedFileSystemException;
import org.apache.hadoop.fs.XAttrSetFlag;
import org.apache.hadoop.fs.permission.AclEntry;
import org.apache.hadoop.fs.permission.AclStatus;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.Progressable;

/**
 * HoodieWrapperFileSystem wraps the default file system.
 * It holds state about the open streams in the file system to support getting the
 * written size to each of the open streams.
 * HoodieWrapperFileSystem wraps the default file system. It holds state about the open streams in
 * the file system to support getting the written size to each of the open streams.
 */
public class HoodieWrapperFileSystem extends FileSystem {

  private static final Set<String> SUPPORT_SCHEMES;
  public static final String HOODIE_SCHEME_PREFIX = "hoodie-";

@@ -65,7 +82,8 @@ public class HoodieWrapperFileSystem extends FileSystem {
  private FileSystem fileSystem;
  private URI uri;

  @Override public void initialize(URI uri, Configuration conf) throws IOException {
  @Override
  public void initialize(URI uri, Configuration conf) throws IOException {
    // Get the default filesystem to decorate
    fileSystem = FileSystem.get(conf);
    // Do not need to explicitly initialize the default filesystem, its done already in the above FileSystem.get
@@ -74,15 +92,18 @@ public class HoodieWrapperFileSystem extends FileSystem {
    this.uri = uri;
  }

  @Override public URI getUri() {
  @Override
  public URI getUri() {
    return uri;
  }

  @Override public FSDataInputStream open(Path f, int bufferSize) throws IOException {
  @Override
  public FSDataInputStream open(Path f, int bufferSize) throws IOException {
    return fileSystem.open(convertToDefaultPath(f), bufferSize);
  }

  @Override public FSDataOutputStream create(Path f, FsPermission permission, boolean overwrite,
  @Override
  public FSDataOutputStream create(Path f, FsPermission permission, boolean overwrite,
      int bufferSize, short replication, long blockSize, Progressable progress)
      throws IOException {
    final Path translatedPath = convertToDefaultPath(f);
@@ -99,7 +120,8 @@ public class HoodieWrapperFileSystem extends FileSystem {

    SizeAwareFSDataOutputStream os =
        new SizeAwareFSDataOutputStream(fsDataOutputStream, new Runnable() {
          @Override public void run() {
          @Override
          public void run() {
            openStreams.remove(path.getName());
          }
        });
@@ -107,33 +129,40 @@ public class HoodieWrapperFileSystem extends FileSystem {
    return os;
  }
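The create() path above is where every output stream gets decorated: the raw FSDataOutputStream is wrapped in a SizeAwareFSDataOutputStream whose close callback unregisters it from the map of open streams. A hedged sketch of that wrapping step (the openStreams map is assumed to be keyed by file name):

  // Sketch: decorate a raw stream so its written size can be queried while it is open,
  // and drop it from the registry once it is closed.
  private FSDataOutputStream wrapOutputStream(final Path path, FSDataOutputStream rawStream)
      throws IOException {
    SizeAwareFSDataOutputStream os = new SizeAwareFSDataOutputStream(rawStream, new Runnable() {
      @Override
      public void run() {
        openStreams.remove(path.getName());   // stream closed, stop tracking it
      }
    });
    openStreams.put(path.getName(), os);      // getBytesWritten(path) can now consult this map
    return os;
  }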
  @Override public FSDataOutputStream create(Path f, boolean overwrite) throws IOException {
  @Override
  public FSDataOutputStream create(Path f, boolean overwrite) throws IOException {
    return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f), overwrite));
  }

  @Override public FSDataOutputStream create(Path f) throws IOException {
  @Override
  public FSDataOutputStream create(Path f) throws IOException {
    return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f)));
  }

  @Override public FSDataOutputStream create(Path f, Progressable progress) throws IOException {
  @Override
  public FSDataOutputStream create(Path f, Progressable progress) throws IOException {
    return fileSystem.create(convertToDefaultPath(f), progress);
  }

  @Override public FSDataOutputStream create(Path f, short replication) throws IOException {
  @Override
  public FSDataOutputStream create(Path f, short replication) throws IOException {
    return fileSystem.create(convertToDefaultPath(f), replication);
  }

  @Override public FSDataOutputStream create(Path f, short replication, Progressable progress)
  @Override
  public FSDataOutputStream create(Path f, short replication, Progressable progress)
      throws IOException {
    return fileSystem.create(convertToDefaultPath(f), replication, progress);
  }

  @Override public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize)
  @Override
  public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize)
      throws IOException {
    return fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize);
  }

  @Override public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize,
  @Override
  public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize,
      Progressable progress) throws IOException {
    return fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize, progress);
  }
@@ -173,91 +202,112 @@ public class HoodieWrapperFileSystem extends FileSystem {
  }

  @Override public FSDataOutputStream append(Path f, int bufferSize, Progressable progress)
  @Override
  public FSDataOutputStream append(Path f, int bufferSize, Progressable progress)
      throws IOException {
    return fileSystem.append(convertToDefaultPath(f), bufferSize, progress);
  }

  @Override public boolean rename(Path src, Path dst) throws IOException {
  @Override
  public boolean rename(Path src, Path dst) throws IOException {
    return fileSystem.rename(convertToDefaultPath(src), convertToDefaultPath(dst));
  }

  @Override public boolean delete(Path f, boolean recursive) throws IOException {
  @Override
  public boolean delete(Path f, boolean recursive) throws IOException {
    return fileSystem.delete(convertToDefaultPath(f), recursive);
  }

  @Override public FileStatus[] listStatus(Path f) throws FileNotFoundException, IOException {
  @Override
  public FileStatus[] listStatus(Path f) throws FileNotFoundException, IOException {
    return fileSystem.listStatus(convertToDefaultPath(f));
  }

  @Override public void setWorkingDirectory(Path new_dir) {
  @Override
  public void setWorkingDirectory(Path new_dir) {
    fileSystem.setWorkingDirectory(convertToDefaultPath(new_dir));
  }

  @Override public Path getWorkingDirectory() {
  @Override
  public Path getWorkingDirectory() {
    return convertToHoodiePath(fileSystem.getWorkingDirectory());
  }

  @Override public boolean mkdirs(Path f, FsPermission permission) throws IOException {
  @Override
  public boolean mkdirs(Path f, FsPermission permission) throws IOException {
    return fileSystem.mkdirs(convertToDefaultPath(f), permission);
  }

  @Override public FileStatus getFileStatus(Path f) throws IOException {
  @Override
  public FileStatus getFileStatus(Path f) throws IOException {
    return fileSystem.getFileStatus(convertToDefaultPath(f));
  }

  @Override public String getScheme() {
  @Override
  public String getScheme() {
    return uri.getScheme();
  }

  @Override public String getCanonicalServiceName() {
  @Override
  public String getCanonicalServiceName() {
    return fileSystem.getCanonicalServiceName();
  }

  @Override public String getName() {
  @Override
  public String getName() {
    return fileSystem.getName();
  }

  @Override public Path makeQualified(Path path) {
  @Override
  public Path makeQualified(Path path) {
    return convertToHoodiePath(fileSystem.makeQualified(convertToDefaultPath(path)));
  }

  @Override public Token<?> getDelegationToken(String renewer) throws IOException {
  @Override
  public Token<?> getDelegationToken(String renewer) throws IOException {
    return fileSystem.getDelegationToken(renewer);
  }

  @Override public Token<?>[] addDelegationTokens(String renewer, Credentials credentials)
  @Override
  public Token<?>[] addDelegationTokens(String renewer, Credentials credentials)
      throws IOException {
    return fileSystem.addDelegationTokens(renewer, credentials);
  }

  @Override public FileSystem[] getChildFileSystems() {
  @Override
  public FileSystem[] getChildFileSystems() {
    return fileSystem.getChildFileSystems();
  }

  @Override public BlockLocation[] getFileBlockLocations(FileStatus file, long start, long len)
  @Override
  public BlockLocation[] getFileBlockLocations(FileStatus file, long start, long len)
      throws IOException {
    return fileSystem.getFileBlockLocations(file, start, len);
  }

  @Override public BlockLocation[] getFileBlockLocations(Path p, long start, long len)
  @Override
  public BlockLocation[] getFileBlockLocations(Path p, long start, long len)
      throws IOException {
    return fileSystem.getFileBlockLocations(convertToDefaultPath(p), start, len);
  }

  @Override public FsServerDefaults getServerDefaults() throws IOException {
  @Override
  public FsServerDefaults getServerDefaults() throws IOException {
    return fileSystem.getServerDefaults();
  }

  @Override public FsServerDefaults getServerDefaults(Path p) throws IOException {
  @Override
  public FsServerDefaults getServerDefaults(Path p) throws IOException {
    return fileSystem.getServerDefaults(convertToDefaultPath(p));
  }

  @Override public Path resolvePath(Path p) throws IOException {
  @Override
  public Path resolvePath(Path p) throws IOException {
    return convertToHoodiePath(fileSystem.resolvePath(convertToDefaultPath(p)));
  }

  @Override public FSDataInputStream open(Path f) throws IOException {
  @Override
  public FSDataInputStream open(Path f) throws IOException {
    return fileSystem.open(convertToDefaultPath(f));
  }

@@ -278,7 +328,8 @@ public class HoodieWrapperFileSystem extends FileSystem {
        replication, blockSize, progress);
  }

  @Override public FSDataOutputStream createNonRecursive(Path f, FsPermission permission,
  @Override
  public FSDataOutputStream createNonRecursive(Path f, FsPermission permission,
      EnumSet<CreateFlag> flags, int bufferSize, short replication, long blockSize,
      Progressable progress) throws IOException {
    return fileSystem
@@ -286,122 +337,150 @@ public class HoodieWrapperFileSystem extends FileSystem {
        blockSize, progress);
  }

  @Override public boolean createNewFile(Path f) throws IOException {
  @Override
  public boolean createNewFile(Path f) throws IOException {
    return fileSystem.createNewFile(convertToDefaultPath(f));
  }

  @Override public FSDataOutputStream append(Path f) throws IOException {
  @Override
  public FSDataOutputStream append(Path f) throws IOException {
    return fileSystem.append(convertToDefaultPath(f));
  }

  @Override public FSDataOutputStream append(Path f, int bufferSize) throws IOException {
  @Override
  public FSDataOutputStream append(Path f, int bufferSize) throws IOException {
    return fileSystem.append(convertToDefaultPath(f), bufferSize);
  }

  @Override public void concat(Path trg, Path[] psrcs) throws IOException {
  @Override
  public void concat(Path trg, Path[] psrcs) throws IOException {
    Path[] psrcsNew = convertDefaults(psrcs);
    fileSystem.concat(convertToDefaultPath(trg), psrcsNew);
  }

  @Override public short getReplication(Path src) throws IOException {
  @Override
  public short getReplication(Path src) throws IOException {
    return fileSystem.getReplication(convertToDefaultPath(src));
  }

  @Override public boolean setReplication(Path src, short replication) throws IOException {
  @Override
  public boolean setReplication(Path src, short replication) throws IOException {
    return fileSystem.setReplication(convertToDefaultPath(src), replication);
  }

  @Override public boolean delete(Path f) throws IOException {
  @Override
  public boolean delete(Path f) throws IOException {
    return fileSystem.delete(convertToDefaultPath(f));
  }

  @Override public boolean deleteOnExit(Path f) throws IOException {
  @Override
  public boolean deleteOnExit(Path f) throws IOException {
    return fileSystem.deleteOnExit(convertToDefaultPath(f));
  }

  @Override public boolean cancelDeleteOnExit(Path f) {
  @Override
  public boolean cancelDeleteOnExit(Path f) {
    return fileSystem.cancelDeleteOnExit(convertToDefaultPath(f));
  }

  @Override public boolean exists(Path f) throws IOException {
  @Override
  public boolean exists(Path f) throws IOException {
    return fileSystem.exists(convertToDefaultPath(f));
  }

  @Override public boolean isDirectory(Path f) throws IOException {
  @Override
  public boolean isDirectory(Path f) throws IOException {
    return fileSystem.isDirectory(convertToDefaultPath(f));
  }

  @Override public boolean isFile(Path f) throws IOException {
  @Override
  public boolean isFile(Path f) throws IOException {
    return fileSystem.isFile(convertToDefaultPath(f));
  }

  @Override public long getLength(Path f) throws IOException {
  @Override
  public long getLength(Path f) throws IOException {
    return fileSystem.getLength(convertToDefaultPath(f));
  }

  @Override public ContentSummary getContentSummary(Path f) throws IOException {
  @Override
  public ContentSummary getContentSummary(Path f) throws IOException {
    return fileSystem.getContentSummary(convertToDefaultPath(f));
  }

  @Override public RemoteIterator<Path> listCorruptFileBlocks(Path path) throws IOException {
  @Override
  public RemoteIterator<Path> listCorruptFileBlocks(Path path) throws IOException {
    return fileSystem.listCorruptFileBlocks(convertToDefaultPath(path));
  }

  @Override public FileStatus[] listStatus(Path f, PathFilter filter)
  @Override
  public FileStatus[] listStatus(Path f, PathFilter filter)
      throws FileNotFoundException, IOException {
    return fileSystem.listStatus(convertToDefaultPath(f), filter);
  }

  @Override public FileStatus[] listStatus(Path[] files)
  @Override
  public FileStatus[] listStatus(Path[] files)
      throws FileNotFoundException, IOException {
    return fileSystem.listStatus(convertDefaults(files));
  }

  @Override public FileStatus[] listStatus(Path[] files, PathFilter filter)
  @Override
  public FileStatus[] listStatus(Path[] files, PathFilter filter)
      throws FileNotFoundException, IOException {
    return fileSystem.listStatus(convertDefaults(files), filter);
  }

  @Override public FileStatus[] globStatus(Path pathPattern) throws IOException {
  @Override
  public FileStatus[] globStatus(Path pathPattern) throws IOException {
    return fileSystem.globStatus(convertToDefaultPath(pathPattern));
  }

  @Override public FileStatus[] globStatus(Path pathPattern, PathFilter filter)
  @Override
  public FileStatus[] globStatus(Path pathPattern, PathFilter filter)
      throws IOException {
    return fileSystem.globStatus(convertToDefaultPath(pathPattern), filter);
  }

  @Override public RemoteIterator<LocatedFileStatus> listLocatedStatus(Path f)
  @Override
  public RemoteIterator<LocatedFileStatus> listLocatedStatus(Path f)
      throws FileNotFoundException, IOException {
    return fileSystem.listLocatedStatus(convertToDefaultPath(f));
  }

  @Override public RemoteIterator<LocatedFileStatus> listFiles(Path f, boolean recursive)
  @Override
  public RemoteIterator<LocatedFileStatus> listFiles(Path f, boolean recursive)
      throws FileNotFoundException, IOException {
    return fileSystem.listFiles(convertToDefaultPath(f), recursive);
  }

  @Override public Path getHomeDirectory() {
  @Override
  public Path getHomeDirectory() {
    return convertToHoodiePath(fileSystem.getHomeDirectory());
  }

  @Override public boolean mkdirs(Path f) throws IOException {
  @Override
  public boolean mkdirs(Path f) throws IOException {
    return fileSystem.mkdirs(convertToDefaultPath(f));
  }

  @Override public void copyFromLocalFile(Path src, Path dst) throws IOException {
  @Override
  public void copyFromLocalFile(Path src, Path dst) throws IOException {
    fileSystem.copyFromLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst));
  }

  @Override public void moveFromLocalFile(Path[] srcs, Path dst) throws IOException {
  @Override
  public void moveFromLocalFile(Path[] srcs, Path dst) throws IOException {
    fileSystem.moveFromLocalFile(convertDefaults(srcs), convertToDefaultPath(dst));
  }

  @Override public void moveFromLocalFile(Path src, Path dst) throws IOException {
  @Override
  public void moveFromLocalFile(Path src, Path dst) throws IOException {
    fileSystem.moveFromLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst));
  }

  @Override public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws IOException {
  @Override
  public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws IOException {
    fileSystem.copyFromLocalFile(delSrc, convertToDefaultPath(src), convertToDefaultPath(dst));
  }

@@ -412,21 +491,25 @@ public class HoodieWrapperFileSystem extends FileSystem {
        .copyFromLocalFile(delSrc, overwrite, convertDefaults(srcs), convertToDefaultPath(dst));
  }

  @Override public void copyFromLocalFile(boolean delSrc, boolean overwrite, Path src, Path dst)
  @Override
  public void copyFromLocalFile(boolean delSrc, boolean overwrite, Path src, Path dst)
      throws IOException {
    fileSystem.copyFromLocalFile(delSrc, overwrite, convertToDefaultPath(src),
        convertToDefaultPath(dst));
  }

  @Override public void copyToLocalFile(Path src, Path dst) throws IOException {
  @Override
  public void copyToLocalFile(Path src, Path dst) throws IOException {
    fileSystem.copyToLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst));
  }

  @Override public void moveToLocalFile(Path src, Path dst) throws IOException {
  @Override
  public void moveToLocalFile(Path src, Path dst) throws IOException {
    fileSystem.moveToLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst));
  }

  @Override public void copyToLocalFile(boolean delSrc, Path src, Path dst) throws IOException {
  @Override
  public void copyToLocalFile(boolean delSrc, Path src, Path dst) throws IOException {
    fileSystem.copyToLocalFile(delSrc, convertToDefaultPath(src), convertToDefaultPath(dst));
  }
@@ -437,193 +520,237 @@ public class HoodieWrapperFileSystem extends FileSystem {
        useRawLocalFileSystem);
  }

  @Override public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile)
  @Override
  public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile)
      throws IOException {
    return convertToHoodiePath(fileSystem.startLocalOutput(convertToDefaultPath(fsOutputFile),
        convertToDefaultPath(tmpLocalFile)));
  }

  @Override public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile)
  @Override
  public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile)
      throws IOException {
    fileSystem.completeLocalOutput(convertToDefaultPath(fsOutputFile),
        convertToDefaultPath(tmpLocalFile));
  }

  @Override public void close() throws IOException {
  @Override
  public void close() throws IOException {
    fileSystem.close();
  }

  @Override public long getUsed() throws IOException {
  @Override
  public long getUsed() throws IOException {
    return fileSystem.getUsed();
  }

  @Override public long getBlockSize(Path f) throws IOException {
  @Override
  public long getBlockSize(Path f) throws IOException {
    return fileSystem.getBlockSize(convertToDefaultPath(f));
  }

  @Override public long getDefaultBlockSize() {
  @Override
  public long getDefaultBlockSize() {
    return fileSystem.getDefaultBlockSize();
  }

  @Override public long getDefaultBlockSize(Path f) {
  @Override
  public long getDefaultBlockSize(Path f) {
    return fileSystem.getDefaultBlockSize(convertToDefaultPath(f));
  }

  @Override public short getDefaultReplication() {
  @Override
  public short getDefaultReplication() {
    return fileSystem.getDefaultReplication();
  }

  @Override public short getDefaultReplication(Path path) {
  @Override
  public short getDefaultReplication(Path path) {
    return fileSystem.getDefaultReplication(convertToDefaultPath(path));
  }

  @Override public void access(Path path, FsAction mode)
  @Override
  public void access(Path path, FsAction mode)
      throws AccessControlException, FileNotFoundException, IOException {
    fileSystem.access(convertToDefaultPath(path), mode);
  }

  @Override public void createSymlink(Path target, Path link, boolean createParent)
  @Override
  public void createSymlink(Path target, Path link, boolean createParent)
      throws AccessControlException, FileAlreadyExistsException, FileNotFoundException,
      ParentNotDirectoryException, UnsupportedFileSystemException, IOException {
    fileSystem
        .createSymlink(convertToDefaultPath(target), convertToDefaultPath(link), createParent);
  }

  @Override public FileStatus getFileLinkStatus(Path f)
  @Override
  public FileStatus getFileLinkStatus(Path f)
      throws AccessControlException, FileNotFoundException, UnsupportedFileSystemException,
      IOException {
    return fileSystem.getFileLinkStatus(convertToDefaultPath(f));
  }

  @Override public boolean supportsSymlinks() {
  @Override
  public boolean supportsSymlinks() {
    return fileSystem.supportsSymlinks();
  }

  @Override public Path getLinkTarget(Path f) throws IOException {
  @Override
  public Path getLinkTarget(Path f) throws IOException {
    return convertToHoodiePath(fileSystem.getLinkTarget(convertToDefaultPath(f)));
  }

  @Override public FileChecksum getFileChecksum(Path f) throws IOException {
  @Override
  public FileChecksum getFileChecksum(Path f) throws IOException {
    return fileSystem.getFileChecksum(convertToDefaultPath(f));
  }

  @Override public FileChecksum getFileChecksum(Path f, long length) throws IOException {
  @Override
  public FileChecksum getFileChecksum(Path f, long length) throws IOException {
    return fileSystem.getFileChecksum(convertToDefaultPath(f), length);
  }

  @Override public void setVerifyChecksum(boolean verifyChecksum) {
  @Override
  public void setVerifyChecksum(boolean verifyChecksum) {
    fileSystem.setVerifyChecksum(verifyChecksum);
  }

  @Override public void setWriteChecksum(boolean writeChecksum) {
  @Override
  public void setWriteChecksum(boolean writeChecksum) {
    fileSystem.setWriteChecksum(writeChecksum);
  }

  @Override public FsStatus getStatus() throws IOException {
  @Override
  public FsStatus getStatus() throws IOException {
    return fileSystem.getStatus();
  }

  @Override public FsStatus getStatus(Path p) throws IOException {
  @Override
  public FsStatus getStatus(Path p) throws IOException {
    return fileSystem.getStatus(convertToDefaultPath(p));
  }

  @Override public void setPermission(Path p, FsPermission permission) throws IOException {
  @Override
  public void setPermission(Path p, FsPermission permission) throws IOException {
    fileSystem.setPermission(convertToDefaultPath(p), permission);
  }

  @Override public void setOwner(Path p, String username, String groupname) throws IOException {
  @Override
  public void setOwner(Path p, String username, String groupname) throws IOException {
    fileSystem.setOwner(convertToDefaultPath(p), username, groupname);
  }

  @Override public void setTimes(Path p, long mtime, long atime) throws IOException {
  @Override
  public void setTimes(Path p, long mtime, long atime) throws IOException {
    fileSystem.setTimes(convertToDefaultPath(p), mtime, atime);
  }

  @Override public Path createSnapshot(Path path, String snapshotName) throws IOException {
  @Override
  public Path createSnapshot(Path path, String snapshotName) throws IOException {
    return convertToHoodiePath(
        fileSystem.createSnapshot(convertToDefaultPath(path), snapshotName));
  }

  @Override public void renameSnapshot(Path path, String snapshotOldName, String snapshotNewName)
  @Override
  public void renameSnapshot(Path path, String snapshotOldName, String snapshotNewName)
      throws IOException {
    fileSystem.renameSnapshot(convertToDefaultPath(path), snapshotOldName, snapshotNewName);
  }

  @Override public void deleteSnapshot(Path path, String snapshotName) throws IOException {
  @Override
  public void deleteSnapshot(Path path, String snapshotName) throws IOException {
    fileSystem.deleteSnapshot(convertToDefaultPath(path), snapshotName);
  }

  @Override public void modifyAclEntries(Path path, List<AclEntry> aclSpec) throws IOException {
  @Override
  public void modifyAclEntries(Path path, List<AclEntry> aclSpec) throws IOException {
    fileSystem.modifyAclEntries(convertToDefaultPath(path), aclSpec);
  }

  @Override public void removeAclEntries(Path path, List<AclEntry> aclSpec) throws IOException {
  @Override
  public void removeAclEntries(Path path, List<AclEntry> aclSpec) throws IOException {
    fileSystem.removeAclEntries(convertToDefaultPath(path), aclSpec);
  }

  @Override public void removeDefaultAcl(Path path) throws IOException {
  @Override
  public void removeDefaultAcl(Path path) throws IOException {
    fileSystem.removeDefaultAcl(convertToDefaultPath(path));
  }

  @Override public void removeAcl(Path path) throws IOException {
  @Override
  public void removeAcl(Path path) throws IOException {
    fileSystem.removeAcl(convertToDefaultPath(path));
  }

  @Override public void setAcl(Path path, List<AclEntry> aclSpec) throws IOException {
  @Override
  public void setAcl(Path path, List<AclEntry> aclSpec) throws IOException {
    fileSystem.setAcl(convertToDefaultPath(path), aclSpec);
  }

  @Override public AclStatus getAclStatus(Path path) throws IOException {
  @Override
  public AclStatus getAclStatus(Path path) throws IOException {
    return fileSystem.getAclStatus(convertToDefaultPath(path));
  }

  @Override public void setXAttr(Path path, String name, byte[] value) throws IOException {
  @Override
  public void setXAttr(Path path, String name, byte[] value) throws IOException {
    fileSystem.setXAttr(convertToDefaultPath(path), name, value);
  }

  @Override public void setXAttr(Path path, String name, byte[] value, EnumSet<XAttrSetFlag> flag)
  @Override
  public void setXAttr(Path path, String name, byte[] value, EnumSet<XAttrSetFlag> flag)
      throws IOException {
    fileSystem.setXAttr(convertToDefaultPath(path), name, value, flag);
  }

  @Override public byte[] getXAttr(Path path, String name) throws IOException {
  @Override
  public byte[] getXAttr(Path path, String name) throws IOException {
    return fileSystem.getXAttr(convertToDefaultPath(path), name);
  }

  @Override public Map<String, byte[]> getXAttrs(Path path) throws IOException {
  @Override
  public Map<String, byte[]> getXAttrs(Path path) throws IOException {
    return fileSystem.getXAttrs(convertToDefaultPath(path));
  }

  @Override public Map<String, byte[]> getXAttrs(Path path, List<String> names)
  @Override
  public Map<String, byte[]> getXAttrs(Path path, List<String> names)
      throws IOException {
    return fileSystem.getXAttrs(convertToDefaultPath(path), names);
  }

  @Override public List<String> listXAttrs(Path path) throws IOException {
  @Override
  public List<String> listXAttrs(Path path) throws IOException {
    return fileSystem.listXAttrs(convertToDefaultPath(path));
  }

  @Override public void removeXAttr(Path path, String name) throws IOException {
  @Override
  public void removeXAttr(Path path, String name) throws IOException {
    fileSystem.removeXAttr(convertToDefaultPath(path), name);
  }

  @Override public void setConf(Configuration conf) {
  @Override
  public void setConf(Configuration conf) {
    // ignore this. we will set conf on init
  }

  @Override public Configuration getConf() {
  @Override
  public Configuration getConf() {
    return fileSystem.getConf();
  }

  @Override public int hashCode() {
  @Override
  public int hashCode() {
    return fileSystem.hashCode();
  }

  @Override public boolean equals(Object obj) {
  @Override
  public boolean equals(Object obj) {
    return fileSystem.equals(obj);
  }

  @Override public String toString() {
  @Override
  public String toString() {
    return fileSystem.toString();
  }
@@ -16,16 +16,16 @@

package com.uber.hoodie.io.storage;

import org.apache.hadoop.fs.FSDataOutputStream;

import java.io.IOException;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.hadoop.fs.FSDataOutputStream;

/**
 * Wrapper over <code>FSDataOutputStream</code> to keep track of the size of the written bytes.
 * This gives a cheap way to check on the underlying file size.
 * Wrapper over <code>FSDataOutputStream</code> to keep track of the size of the written bytes. This
 * gives a cheap way to check on the underlying file size.
 */
public class SizeAwareFSDataOutputStream extends FSDataOutputStream {

  // A callback to call when the output stream is closed.
  private final Runnable closeCallback;
  // Keep track of the bytes written
@@ -37,17 +37,20 @@ public class SizeAwareFSDataOutputStream extends FSDataOutputStream {
    this.closeCallback = closeCallback;
  }

  @Override public synchronized void write(byte[] b, int off, int len) throws IOException {
  @Override
  public synchronized void write(byte[] b, int off, int len) throws IOException {
    bytesWritten.addAndGet(len);
    super.write(b, off, len);
  }

  @Override public void write(byte[] b) throws IOException {
  @Override
  public void write(byte[] b) throws IOException {
    bytesWritten.addAndGet(b.length);
    super.write(b);
  }

  @Override public void close() throws IOException {
  @Override
  public void close() throws IOException {
    super.close();
    closeCallback.run();
  }
@@ -22,7 +22,6 @@ import com.codahale.metrics.Timer;
import com.google.common.annotations.VisibleForTesting;
import com.uber.hoodie.common.model.HoodieCommitMetadata;
import com.uber.hoodie.config.HoodieWriteConfig;

import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;

@@ -30,6 +29,7 @@ import org.apache.log4j.Logger;
 * Wrapper for metrics-related operations.
 */
public class HoodieMetrics {

  private HoodieWriteConfig config = null;
  private String tableName = null;
  private static Logger logger = LogManager.getLogger(HoodieMetrics.class);
@@ -77,7 +77,8 @@ public class HoodieMetrics {
    return commitTimer == null ? null : commitTimer.time();
  }

  public void updateCommitMetrics(long commitEpochTimeInMs, long durationInMs, HoodieCommitMetadata metadata) {
  public void updateCommitMetrics(long commitEpochTimeInMs, long durationInMs,
      HoodieCommitMetadata metadata) {
    if (config.isMetricsOn()) {
      long totalPartitionsWritten = metadata.fetchTotalPartitionsWritten();
      long totalFilesInsert = metadata.fetchTotalFilesInsert();
@@ -91,8 +92,10 @@ public class HoodieMetrics {
      registerGauge(getMetricsName("commit", "totalFilesInsert"), totalFilesInsert);
      registerGauge(getMetricsName("commit", "totalFilesUpdate"), totalFilesUpdate);
      registerGauge(getMetricsName("commit", "totalRecordsWritten"), totalRecordsWritten);
      registerGauge(getMetricsName("commit", "totalUpdateRecordsWritten"), totalUpdateRecordsWritten);
      registerGauge(getMetricsName("commit", "totalInsertRecordsWritten"), totalInsertRecordsWritten);
      registerGauge(getMetricsName("commit", "totalUpdateRecordsWritten"),
          totalUpdateRecordsWritten);
      registerGauge(getMetricsName("commit", "totalInsertRecordsWritten"),
          totalInsertRecordsWritten);
      registerGauge(getMetricsName("commit", "totalBytesWritten"), totalBytesWritten);
      registerGauge(getMetricsName("commit", "commitTime"), commitEpochTimeInMs);
    }
@@ -139,8 +142,7 @@ public class HoodieMetrics {
  }

  /**
   * By default, the timer context returns duration with nano seconds.
   * Convert it to millisecond.
   * By default, the timer context returns duration with nano seconds. Convert it to millisecond.
   */
  public long getDurationInMs(long ctxDuration) {
    return ctxDuration / 1000000;
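getDurationInMs is the glue between the codahale Timer.Context, which reports elapsed time in nanoseconds, and the commit metrics, which are registered in milliseconds. A small usage sketch; the accessor used to obtain the timer context is an assumption, only the conversion and updateCommitMetrics appear in the diff:

  // Timing a commit with a codahale Timer and reporting the duration in milliseconds.
  final Timer.Context timerCtx = metrics.getCommitCtx();   // assumed accessor; may be null if metrics are off
  // ... perform the commit ...
  if (timerCtx != null) {
    long durationInMs = metrics.getDurationInMs(timerCtx.stop());   // Timer.Context.stop() returns nanoseconds
    metrics.updateCommitMetrics(System.currentTimeMillis(), durationInMs, commitMetadata);
  }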
@@ -22,6 +22,7 @@ import java.io.Closeable;
 * Used for testing.
 */
public class InMemoryMetricsReporter extends MetricsReporter {

  @Override
  public void start() {
  }

@@ -19,16 +19,15 @@ package com.uber.hoodie.metrics;
import com.codahale.metrics.MetricRegistry;
import com.google.common.io.Closeables;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.config.HoodieMetricsConfig;
import com.uber.hoodie.exception.HoodieException;
import org.apache.commons.configuration.ConfigurationException;

import java.io.Closeable;
import org.apache.commons.configuration.ConfigurationException;

/**
 * This is the main class of the metrics system.
 */
public class Metrics {

  private static volatile boolean initialized = false;
  private static Metrics metrics = null;
  private final MetricRegistry registry;

@@ -21,19 +21,18 @@ import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.graphite.Graphite;
import com.codahale.metrics.graphite.GraphiteReporter;
import com.uber.hoodie.config.HoodieWriteConfig;

import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;

import java.io.Closeable;
import java.net.InetSocketAddress;
import java.util.concurrent.TimeUnit;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;

/**
 * Implementation of Graphite reporter, which connects to the Graphite server,
 * and send metrics to that server.
 * Implementation of Graphite reporter, which connects to the Graphite server, and send metrics to
 * that server.
 */
public class MetricsGraphiteReporter extends MetricsReporter {

  private final MetricRegistry registry;
  private final GraphiteReporter graphiteReporter;
  private final HoodieWriteConfig config;
@@ -22,6 +22,7 @@ import java.io.Closeable;
|
||||
* Interface for implementing a Reporter.
|
||||
*/
|
||||
public abstract class MetricsReporter {
|
||||
|
||||
/**
|
||||
* Push out metrics at scheduled intervals
|
||||
*/
|
||||
|
||||
@@ -18,7 +18,6 @@ package com.uber.hoodie.metrics;
|
||||
|
||||
import com.codahale.metrics.MetricRegistry;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
@@ -26,6 +25,7 @@ import org.apache.log4j.Logger;
|
||||
* Factory class for creating MetricsReporter.
|
||||
*/
|
||||
public class MetricsReporterFactory {
|
||||
|
||||
private static Logger logger = LogManager.getLogger(MetricsReporterFactory.class);
|
||||
|
||||
public static MetricsReporter createReporter(HoodieWriteConfig config,
|
||||
|
||||
@@ -17,8 +17,8 @@
|
||||
package com.uber.hoodie.metrics;
|
||||
|
||||
/**
* Types of the reporter. Right now we only support Graphite.
* We can include JMX and CSV in the future.
* Types of the reporter. Right now we only support Graphite. We can include JMX and CSV in the
* future.
*/
public enum MetricsReporterType {
GRAPHITE,

@@ -70,28 +70,16 @@ import org.apache.spark.api.java.function.PairFlatMapFunction;
import scala.Option;
import scala.Tuple2;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;

/**
* Implementation of a very heavily read-optimized Hoodie Table where
*
* INSERTS - Produce new files, block aligned to desired size (or)
* Merge with the smallest existing file, to expand it
* INSERTS - Produce new files, block aligned to desired size (or) Merge with the smallest existing
* file, to expand it
*
* UPDATES - Produce a new version of the file, just replacing the updated records with new values
*
*/
public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends HoodieTable<T> {

public HoodieCopyOnWriteTable(HoodieWriteConfig config, HoodieTableMetaClient metaClient) {
super(config, metaClient);
}
@@ -107,6 +95,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
* Helper class for a small file's location and its actual size on disk
*/
class SmallFile implements Serializable {

HoodieRecordLocation location;
long sizeBytes;

@@ -121,11 +110,11 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
}

/**
* Helper class for an insert bucket along with the weight [0.0, 0.1]
* that defines the amount of incoming inserts that should be allocated to
* the bucket
* Helper class for an insert bucket along with the weight [0.0, 0.1] that defines the amount of
* incoming inserts that should be allocated to the bucket
*/
class InsertBucket implements Serializable {

int bucketNumber;
// fraction of total inserts, that should go into this bucket
double weight;
@@ -144,6 +133,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
* Helper class for a bucket's type (INSERT and UPDATE) and its file location
*/
class BucketInfo implements Serializable {

BucketType bucketType;
String fileLoc;

@@ -164,8 +154,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
class UpsertPartitioner extends Partitioner {

/**
* Total number of RDD partitions, is determined by total buckets we want to
* pack the incoming workload into
* Total number of RDD partitions, is determined by total buckets we want to pack the incoming
* workload into
*/
private int totalBuckets = 0;

@@ -181,8 +171,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi


/**
* Helps us pack inserts into 1 or more buckets depending on number of
* incoming records.
* Helps us pack inserts into 1 or more buckets depending on number of incoming records.
*/
private HashMap<String, List<InsertBucket>> partitionPathToInsertBuckets;

@@ -236,24 +225,28 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
if (pStat.getNumInserts() > 0) {

List<SmallFile> smallFiles = getSmallFiles(partitionPath);
logger.info("For partitionPath : "+ partitionPath + " Small Files => " + smallFiles);
logger.info("For partitionPath : " + partitionPath + " Small Files => " + smallFiles);

long totalUnassignedInserts = pStat.getNumInserts();
List<Integer> bucketNumbers = new ArrayList<>();
List<Long> recordsPerBucket = new ArrayList<>();

// first try packing this into one of the smallFiles
for (SmallFile smallFile: smallFiles) {
long recordsToAppend = Math.min((config.getParquetMaxFileSize() - smallFile.sizeBytes)/ averageRecordSize, totalUnassignedInserts);
if (recordsToAppend > 0 && totalUnassignedInserts > 0){
for (SmallFile smallFile : smallFiles) {
long recordsToAppend = Math
.min((config.getParquetMaxFileSize() - smallFile.sizeBytes) / averageRecordSize,
totalUnassignedInserts);
if (recordsToAppend > 0 && totalUnassignedInserts > 0) {
// create a new bucket or re-use an existing bucket
int bucket;
if (updateLocationToBucket.containsKey(smallFile.location.getFileId())) {
bucket = updateLocationToBucket.get(smallFile.location.getFileId());
logger.info("Assigning " + recordsToAppend + " inserts to existing update bucket "+ bucket);
logger.info("Assigning " + recordsToAppend + " inserts to existing update bucket "
+ bucket);
} else {
bucket = addUpdateBucket(smallFile.location.getFileId());
logger.info("Assigning " + recordsToAppend + " inserts to new update bucket "+ bucket);
logger.info(
"Assigning " + recordsToAppend + " inserts to new update bucket " + bucket);
}
bucketNumbers.add(bucket);
recordsPerBucket.add(recordsToAppend);
@@ -265,16 +258,17 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
if (totalUnassignedInserts > 0) {
long insertRecordsPerBucket = config.getCopyOnWriteInsertSplitSize();
if (config.shouldAutoTuneInsertSplits()) {
insertRecordsPerBucket = config.getParquetMaxFileSize()/averageRecordSize;
insertRecordsPerBucket = config.getParquetMaxFileSize() / averageRecordSize;
}

int insertBuckets = (int) Math.max(totalUnassignedInserts / insertRecordsPerBucket, 1L);
logger.info("After small file assignment: unassignedInserts => " + totalUnassignedInserts
logger
.info("After small file assignment: unassignedInserts => " + totalUnassignedInserts
+ ", totalInsertBuckets => " + insertBuckets
+ ", recordsPerBucket => " + insertRecordsPerBucket);
for (int b = 0; b < insertBuckets; b++) {
bucketNumbers.add(totalBuckets);
recordsPerBucket.add(totalUnassignedInserts/insertBuckets);
recordsPerBucket.add(totalUnassignedInserts / insertBuckets);
BucketInfo bucketInfo = new BucketInfo();
bucketInfo.bucketType = BucketType.INSERT;
bucketInfoMap.put(totalBuckets, bucketInfo);
@@ -287,10 +281,11 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
for (int i = 0; i < bucketNumbers.size(); i++) {
InsertBucket bkt = new InsertBucket();
bkt.bucketNumber = bucketNumbers.get(i);
bkt.weight = (1.0 * recordsPerBucket.get(i))/pStat.getNumInserts();
bkt.weight = (1.0 * recordsPerBucket.get(i)) / pStat.getNumInserts();
insertBuckets.add(bkt);
}
logger.info("Total insert buckets for partition path "+ partitionPath + " => " + insertBuckets);
logger.info(
"Total insert buckets for partition path " + partitionPath + " => " + insertBuckets);
partitionPathToInsertBuckets.put(partitionPath, insertBuckets);
}
}
@@ -299,9 +294,6 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi

/**
|
||||
* Returns a list of small files in the given partition path
|
||||
*
|
||||
* @param partitionPath
|
||||
* @return
|
||||
*/
|
||||
private List<SmallFile> getSmallFiles(String partitionPath) {
|
||||
List<SmallFile> smallFileLocations = new ArrayList<>();
|
||||
@@ -330,10 +322,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
}
|
||||
|
||||
/**
|
||||
* Obtains the average record size based on records written during last commit.
|
||||
* Used for estimating how many records pack into one file.
|
||||
*
|
||||
* @return
|
||||
* Obtains the average record size based on records written during last commit. Used for
|
||||
* estimating how many records pack into one file.
|
||||
*/
|
||||
private long averageBytesPerRecord() {
|
||||
long avgSize = 0L;
|
||||
@@ -375,13 +365,15 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
HoodieRecordLocation location = keyLocation._2().get();
|
||||
return updateLocationToBucket.get(location.getFileId());
|
||||
} else {
|
||||
List<InsertBucket> targetBuckets = partitionPathToInsertBuckets.get(keyLocation._1().getPartitionPath());
|
||||
List<InsertBucket> targetBuckets = partitionPathToInsertBuckets
|
||||
.get(keyLocation._1().getPartitionPath());
|
||||
// pick the target bucket to use based on the weights.
|
||||
double totalWeight = 0.0;
|
||||
final long totalInserts = Math.max(1, globalStat.getNumInserts());
|
||||
final long hashOfKey = Hashing.md5().hashString(keyLocation._1().getRecordKey(), StandardCharsets.UTF_8).asLong();
|
||||
final long hashOfKey = Hashing.md5()
|
||||
.hashString(keyLocation._1().getRecordKey(), StandardCharsets.UTF_8).asLong();
|
||||
final double r = 1.0 * Math.floorMod(hashOfKey, totalInserts) / totalInserts;
|
||||
for (InsertBucket insertBucket: targetBuckets) {
|
||||
for (InsertBucket insertBucket : targetBuckets) {
|
||||
totalWeight += insertBucket.weight;
|
||||
if (r <= totalWeight) {
|
||||
return insertBucket.bucketNumber;
|
||||
@@ -413,14 +405,14 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
}
|
||||
|
||||
|
||||
|
||||
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileLoc, Iterator<HoodieRecord<T>> recordItr)
|
||||
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileLoc,
|
||||
Iterator<HoodieRecord<T>> recordItr)
|
||||
throws IOException {
|
||||
// these are updates
|
||||
HoodieMergeHandle upsertHandle = getUpdateHandle(commitTime, fileLoc, recordItr);
|
||||
if (upsertHandle.getOldFilePath() == null) {
|
||||
throw new HoodieUpsertException("Error in finding the old file path at commit " +
|
||||
commitTime +" at fileLoc: " + fileLoc);
|
||||
commitTime + " at fileLoc: " + fileLoc);
|
||||
} else {
|
||||
Configuration conf = FSUtils.getFs().getConf();
|
||||
AvroReadSupport.setAvroReadSchema(conf, upsertHandle.getSchema());
|
||||
@@ -448,14 +440,17 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
logger.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath()
|
||||
+ ", " + upsertHandle.getWriteStatus());
|
||||
}
|
||||
return Collections.singletonList(Collections.singletonList(upsertHandle.getWriteStatus())).iterator();
|
||||
return Collections.singletonList(Collections.singletonList(upsertHandle.getWriteStatus()))
|
||||
.iterator();
|
||||
}
|
||||
|
||||
protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileLoc, Iterator<HoodieRecord<T>> recordItr) {
|
||||
protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileLoc,
|
||||
Iterator<HoodieRecord<T>> recordItr) {
|
||||
return new HoodieMergeHandle<>(config, commitTime, this, recordItr, fileLoc);
|
||||
}
|
||||
|
||||
public Iterator<List<WriteStatus>> handleInsert(String commitTime, Iterator<HoodieRecord<T>> recordItr) throws Exception {
|
||||
public Iterator<List<WriteStatus>> handleInsert(String commitTime,
|
||||
Iterator<HoodieRecord<T>> recordItr) throws Exception {
|
||||
return new LazyInsertIterable<>(recordItr, config, commitTime, this);
|
||||
}
|
||||
|
||||
@@ -473,7 +468,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
} else if (btype.equals(BucketType.UPDATE)) {
|
||||
return handleUpdate(commitTime, binfo.fileLoc, recordItr);
|
||||
} else {
|
||||
throw new HoodieUpsertException("Unknown bucketType " + btype + " for partition :" + partition);
|
||||
throw new HoodieUpsertException(
|
||||
"Unknown bucketType " + btype + " for partition :" + partition);
|
||||
}
|
||||
} catch (Throwable t) {
|
||||
String msg = "Error upserting bucketType " + btype + " for partition :" + partition;
|
||||
@@ -496,9 +492,9 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
}
|
||||
|
||||
/**
|
||||
* Performs cleaning of partition paths according to cleaning policy and returns the number
|
||||
* of files cleaned. Handles skews in partitions to clean by making files to clean as the
|
||||
* unit of task distribution.
|
||||
* Performs cleaning of partition paths according to cleaning policy and returns the number of
|
||||
* files cleaned. Handles skews in partitions to clean by making files to clean as the unit of
|
||||
* task distribution.
|
||||
*
|
||||
* @throws IllegalArgumentException if unknown cleaning policy is provided
|
||||
*/
|
||||
@@ -506,7 +502,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
public List<HoodieCleanStat> clean(JavaSparkContext jsc) {
|
||||
try {
|
||||
List<String> partitionsToClean =
|
||||
FSUtils.getAllPartitionPaths(getFs(), getMetaClient().getBasePath(), config.shouldAssumeDatePartitioning());
|
||||
FSUtils.getAllPartitionPaths(getFs(), getMetaClient().getBasePath(),
|
||||
config.shouldAssumeDatePartitioning());
|
||||
logger.info("Partitions to clean up : " + partitionsToClean + ", with policy " + config
|
||||
.getCleanerPolicy());
|
||||
if (partitionsToClean.isEmpty()) {
|
||||
@@ -520,19 +517,16 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* Common method used for cleaning out parquet files under a partition path during rollback of a set of commits
|
||||
* @param partitionPath
|
||||
* @param commits
|
||||
* @return
|
||||
* @throws IOException
|
||||
* Common method used for cleaning out parquet files under a partition path during rollback of a
|
||||
* set of commits
|
||||
*/
|
||||
protected Map<FileStatus, Boolean> deleteCleanedFiles(String partitionPath, List<String> commits) throws IOException {
|
||||
protected Map<FileStatus, Boolean> deleteCleanedFiles(String partitionPath, List<String> commits)
|
||||
throws IOException {
|
||||
logger.info("Cleaning path " + partitionPath);
|
||||
FileSystem fs = FSUtils.getFs();
|
||||
FileStatus[] toBeDeleted =
|
||||
fs.listStatus(new Path(config.getBasePath(), partitionPath), path -> {
|
||||
if(!path.toString().contains(".parquet")) {
|
||||
if (!path.toString().contains(".parquet")) {
|
||||
return false;
|
||||
}
|
||||
String fileCommitTime = FSUtils.getCommitTime(path.getName());
|
||||
@@ -548,10 +542,12 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<HoodieRollbackStat> rollback(JavaSparkContext jsc, List<String> commits) throws IOException {
|
||||
public List<HoodieRollbackStat> rollback(JavaSparkContext jsc, List<String> commits)
|
||||
throws IOException {
|
||||
String actionType = this.getCompactedCommitActionType();
|
||||
HoodieActiveTimeline activeTimeline = this.getActiveTimeline();
|
||||
List<String> inflights = this.getInflightCommitTimeline().getInstants().map(HoodieInstant::getTimestamp)
|
||||
List<String> inflights = this.getInflightCommitTimeline().getInstants()
|
||||
.map(HoodieInstant::getTimestamp)
|
||||
.collect(Collectors.toList());
|
||||
|
||||
// Atomically unpublish all the commits
|
||||
@@ -563,7 +559,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
// delete all the data files for all these commits
|
||||
logger.info("Clean out all parquet files generated for commits: " + commits);
|
||||
List<HoodieRollbackStat> stats = jsc.parallelize(
|
||||
FSUtils.getAllPartitionPaths(FSUtils.getFs(), this.getMetaClient().getBasePath(), config.shouldAssumeDatePartitioning()))
|
||||
FSUtils.getAllPartitionPaths(FSUtils.getFs(), this.getMetaClient().getBasePath(),
|
||||
config.shouldAssumeDatePartitioning()))
|
||||
.map((Function<String, HoodieRollbackStat>) partitionPath -> {
|
||||
// Scan all partitions files with this commit time
|
||||
Map<FileStatus, Boolean> results = deleteCleanedFiles(partitionPath, commits);
|
||||
@@ -579,6 +576,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
}
|
||||
|
||||
private static class PartitionCleanStat implements Serializable {
|
||||
|
||||
private final String partitionPath;
|
||||
private final List<String> deletePathPatterns = new ArrayList<>();
|
||||
private final List<String> successDeleteFiles = new ArrayList<>();
|
||||
@@ -613,7 +611,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
}
|
||||
}
|
||||
|
||||
private List<HoodieCleanStat> cleanPartitionPaths(List<String> partitionsToClean, JavaSparkContext jsc) {
|
||||
private List<HoodieCleanStat> cleanPartitionPaths(List<String> partitionsToClean,
|
||||
JavaSparkContext jsc) {
|
||||
int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism());
|
||||
logger.info("Using cleanerParallelism: " + cleanerParallelism);
|
||||
List<Tuple2<String, PartitionCleanStat>> partitionCleanStats = jsc
|
||||
@@ -621,7 +620,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
.flatMapToPair(getFilesToDeleteFunc(this, config))
|
||||
.repartition(cleanerParallelism) // repartition to remove skews
|
||||
.mapPartitionsToPair(deleteFilesFunc(this, config))
|
||||
.reduceByKey( // merge partition level clean stats below
|
||||
.reduceByKey(
|
||||
// merge partition level clean stats below
|
||||
(Function2<PartitionCleanStat, PartitionCleanStat, PartitionCleanStat>) (e1, e2) -> e1
|
||||
.merge(e2))
|
||||
.collect();
|
||||
|
||||
@@ -39,13 +39,6 @@ import com.uber.hoodie.exception.HoodieCompactionException;
|
||||
import com.uber.hoodie.exception.HoodieRollbackException;
|
||||
import com.uber.hoodie.io.HoodieAppendHandle;
|
||||
import com.uber.hoodie.io.compact.HoodieRealtimeTableCompactor;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.Function;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.UncheckedIOException;
|
||||
import java.util.Arrays;
|
||||
@@ -56,6 +49,12 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.Function;
|
||||
|
||||
|
||||
/**
|
||||
@@ -64,13 +63,15 @@ import java.util.stream.Collectors;
|
||||
* INSERTS - Same as HoodieCopyOnWriteTable - Produce new files, block aligned to desired size (or)
|
||||
* Merge with the smallest existing file, to expand it
|
||||
*
|
||||
* UPDATES - Appends the changes to a rolling log file maintained per file Id.
|
||||
* Compaction merges the log file into the base file.
|
||||
* UPDATES - Appends the changes to a rolling log file maintained per file Id. Compaction merges the
|
||||
* log file into the base file.
|
||||
*
|
||||
* WARNING - MOR table type does not support nested rollbacks, every rollback
|
||||
* must be followed by an attempted commit action
|
||||
* WARNING - MOR table type does not support nested rollbacks, every rollback must be followed by an
|
||||
* attempted commit action
|
||||
*/
|
||||
public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends HoodieCopyOnWriteTable<T> {
|
||||
public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
|
||||
HoodieCopyOnWriteTable<T> {
|
||||
|
||||
private static Logger logger = LogManager.getLogger(HoodieMergeOnReadTable.class);
|
||||
|
||||
public HoodieMergeOnReadTable(HoodieWriteConfig config,
|
||||
@@ -119,15 +120,17 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<HoodieRollbackStat> rollback(JavaSparkContext jsc, List<String> commits) throws IOException {
|
||||
public List<HoodieRollbackStat> rollback(JavaSparkContext jsc, List<String> commits)
|
||||
throws IOException {
|
||||
|
||||
//At the moment, MOR table type does not support nested rollbacks
|
||||
if(commits.size() > 1) {
|
||||
if (commits.size() > 1) {
|
||||
throw new UnsupportedOperationException("Nested Rollbacks are not supported");
|
||||
}
|
||||
Map<String, HoodieInstant> commitsAndCompactions =
|
||||
this.getActiveTimeline()
|
||||
.getTimelineOfActions(Sets.newHashSet(HoodieActiveTimeline.COMMIT_ACTION, HoodieActiveTimeline.COMPACTION_ACTION, HoodieActiveTimeline.DELTA_COMMIT_ACTION))
|
||||
.getTimelineOfActions(Sets.newHashSet(HoodieActiveTimeline.COMMIT_ACTION,
|
||||
HoodieActiveTimeline.COMPACTION_ACTION, HoodieActiveTimeline.DELTA_COMMIT_ACTION))
|
||||
.getInstants()
|
||||
.filter(i -> commits.contains(i.getTimestamp()))
|
||||
.collect(Collectors.toMap(i -> i.getTimestamp(), i -> i));
|
||||
@@ -149,11 +152,14 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
try {
|
||||
logger.info("Starting to rollback Commit/Compaction " + instant);
|
||||
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
|
||||
.fromBytes(this.getCommitTimeline().getInstantDetails(new HoodieInstant(true, instant.getAction(), instant.getTimestamp())).get());
|
||||
.fromBytes(this.getCommitTimeline().getInstantDetails(
|
||||
new HoodieInstant(true, instant.getAction(), instant.getTimestamp())).get());
|
||||
|
||||
stats = jsc.parallelize(commitMetadata.getPartitionToWriteStats().keySet().stream().collect(Collectors.toList()))
|
||||
stats = jsc.parallelize(commitMetadata.getPartitionToWriteStats().keySet().stream()
|
||||
.collect(Collectors.toList()))
|
||||
.map((Function<String, HoodieRollbackStat>) partitionPath -> {
|
||||
Map<FileStatus, Boolean> results = super.deleteCleanedFiles(partitionPath, Arrays.asList(commit));
|
||||
Map<FileStatus, Boolean> results = super
|
||||
.deleteCleanedFiles(partitionPath, Arrays.asList(commit));
|
||||
return HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
|
||||
.withDeletedFileResults(results).build();
|
||||
}).collect();
|
||||
@@ -167,40 +173,55 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
logger.info("Starting to rollback delta commit " + instant);
|
||||
|
||||
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
|
||||
.fromBytes(this.getCommitTimeline().getInstantDetails(new HoodieInstant(true, instant.getAction(), instant.getTimestamp())).get());
|
||||
.fromBytes(this.getCommitTimeline().getInstantDetails(
|
||||
new HoodieInstant(true, instant.getAction(), instant.getTimestamp())).get());
|
||||
|
||||
stats = jsc.parallelize(commitMetadata.getPartitionToWriteStats().keySet().stream().collect(Collectors.toList()))
|
||||
stats = jsc.parallelize(commitMetadata.getPartitionToWriteStats().keySet().stream()
|
||||
.collect(Collectors.toList()))
|
||||
.map((Function<String, HoodieRollbackStat>) partitionPath -> {
|
||||
// read commit file and (either append delete blocks or delete file)
|
||||
Map<FileStatus, Boolean> filesToDeletedStatus = new HashMap<>();
|
||||
Map<FileStatus, Long> filesToNumBlocksRollback = new HashMap<>();
|
||||
|
||||
// we do not know fileIds for inserts (first inserts are parquet files), delete all parquet files for the corresponding failed commit, if present (same as COW)
|
||||
filesToDeletedStatus = super.deleteCleanedFiles(partitionPath, Arrays.asList(commit));
|
||||
filesToDeletedStatus = super
|
||||
.deleteCleanedFiles(partitionPath, Arrays.asList(commit));
|
||||
|
||||
// append rollback blocks for updates
|
||||
commitMetadata.getPartitionToWriteStats().get(partitionPath).stream().filter(wStat -> wStat.getPrevCommit() != HoodieWriteStat.NULL_COMMIT).forEach(wStat -> {
|
||||
commitMetadata.getPartitionToWriteStats().get(partitionPath).stream()
|
||||
.filter(wStat -> wStat.getPrevCommit() != HoodieWriteStat.NULL_COMMIT)
|
||||
.forEach(wStat -> {
|
||||
HoodieLogFormat.Writer writer = null;
|
||||
try {
|
||||
writer = HoodieLogFormat.newWriterBuilder()
|
||||
.onParentPath(new Path(this.getMetaClient().getBasePath(), partitionPath))
|
||||
.onParentPath(
|
||||
new Path(this.getMetaClient().getBasePath(), partitionPath))
|
||||
.withFileId(wStat.getFileId()).overBaseCommit(wStat.getPrevCommit())
|
||||
.withFs(FSUtils.getFs()).withFileExtension(HoodieLogFile.DELTA_EXTENSION).build();
|
||||
.withFs(FSUtils.getFs())
|
||||
.withFileExtension(HoodieLogFile.DELTA_EXTENSION).build();
|
||||
Long numRollbackBlocks = 0L;
|
||||
// generate metadata
|
||||
Map<HoodieLogBlock.LogMetadataType, String> metadata = Maps.newHashMap();
|
||||
metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME, metaClient.getActiveTimeline().lastInstant().get().getTimestamp());
|
||||
metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME,
|
||||
metaClient.getActiveTimeline().lastInstant().get().getTimestamp());
|
||||
metadata.put(HoodieLogBlock.LogMetadataType.TARGET_INSTANT_TIME, commit);
|
||||
// if update belongs to an existing log file
|
||||
writer.appendBlock(new HoodieCommandBlock(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK, metadata));
|
||||
writer.appendBlock(new HoodieCommandBlock(
|
||||
HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK,
|
||||
metadata));
|
||||
numRollbackBlocks++;
|
||||
if(wStat.getNumDeletes() > 0) {
|
||||
writer.appendBlock(new HoodieCommandBlock(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK, metadata));
|
||||
if (wStat.getNumDeletes() > 0) {
|
||||
writer.appendBlock(new HoodieCommandBlock(
|
||||
HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK,
|
||||
metadata));
|
||||
numRollbackBlocks++;
|
||||
}
|
||||
filesToNumBlocksRollback.put(FSUtils.getFs().getFileStatus(writer.getLogFile().getPath()), numRollbackBlocks);
|
||||
filesToNumBlocksRollback
|
||||
.put(FSUtils.getFs().getFileStatus(writer.getLogFile().getPath()),
|
||||
numRollbackBlocks);
|
||||
} catch (IOException | InterruptedException io) {
|
||||
throw new HoodieRollbackException("Failed to rollback for commit " + commit, io);
|
||||
throw new HoodieRollbackException(
|
||||
"Failed to rollback for commit " + commit, io);
|
||||
} finally {
|
||||
try {
|
||||
writer.close();
|
||||
@@ -223,10 +244,12 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
}).flatMap(x -> x.stream()).collect(Collectors.toList());
|
||||
|
||||
commitsAndCompactions.entrySet().stream()
|
||||
.map(entry -> new HoodieInstant(true, entry.getValue().getAction(), entry.getValue().getTimestamp()))
|
||||
.map(entry -> new HoodieInstant(true, entry.getValue().getAction(),
|
||||
entry.getValue().getTimestamp()))
|
||||
.forEach(this.getActiveTimeline()::deleteInflight);
|
||||
|
||||
logger.debug("Time(in ms) taken to finish rollback " + (System.currentTimeMillis() - startTime));
|
||||
logger
|
||||
.debug("Time(in ms) taken to finish rollback " + (System.currentTimeMillis() - startTime));
|
||||
|
||||
return allRollbackStats;
|
||||
}
|
||||
|
||||
@@ -34,7 +34,6 @@ import com.uber.hoodie.common.util.AvroUtils;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.exception.HoodieCommitException;
|
||||
import com.uber.hoodie.exception.HoodieException;
|
||||
import com.uber.hoodie.exception.HoodieRollbackException;
|
||||
import com.uber.hoodie.exception.HoodieSavepointException;
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
@@ -43,8 +42,6 @@ import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
@@ -55,6 +52,7 @@ import org.apache.spark.api.java.JavaSparkContext;
|
||||
* Abstract implementation of a HoodieTable
|
||||
*/
|
||||
public abstract class HoodieTable<T extends HoodieRecordPayload> implements Serializable {
|
||||
|
||||
protected final HoodieWriteConfig config;
|
||||
protected final HoodieTableMetaClient metaClient;
|
||||
private static Logger logger = LogManager.getLogger(HoodieTable.class);
|
||||
@@ -65,27 +63,19 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
||||
}
|
||||
|
||||
/**
|
||||
* Provides a partitioner to perform the upsert operation, based on the
|
||||
* workload profile
|
||||
*
|
||||
* @return
|
||||
* Provides a partitioner to perform the upsert operation, based on the workload profile
|
||||
*/
|
||||
public abstract Partitioner getUpsertPartitioner(WorkloadProfile profile);
|
||||
|
||||
|
||||
/**
|
||||
* Provides a partitioner to perform the insert operation, based on the workload profile
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public abstract Partitioner getInsertPartitioner(WorkloadProfile profile);
|
||||
|
||||
|
||||
/**
|
||||
* Return whether this HoodieTable implementation can benefit from workload
|
||||
* profiling
|
||||
*
|
||||
* @return
|
||||
* Return whether this HoodieTable implementation can benefit from workload profiling
|
||||
*/
|
||||
public abstract boolean isWorkloadProfileNeeded();
|
||||
|
||||
@@ -103,8 +93,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
||||
|
||||
/**
|
||||
* Get the view of the file system for this table
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public TableFileSystemView getFileSystemView() {
|
||||
return new HoodieTableFileSystemView(metaClient, getCompletedCommitTimeline());
|
||||
@@ -112,8 +100,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
||||
|
||||
/**
|
||||
* Get the read optimized view of the file system for this table
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public TableFileSystemView.ReadOptimizedView getROFileSystemView() {
|
||||
return new HoodieTableFileSystemView(metaClient, getCompletedCommitTimeline());
|
||||
@@ -121,8 +107,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
||||
|
||||
/**
|
||||
* Get the real time view of the file system for this table
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public TableFileSystemView.RealtimeView getRTFileSystemView() {
|
||||
return new HoodieTableFileSystemView(metaClient, getCompletedCommitTimeline());
|
||||
@@ -130,8 +114,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
||||
|
||||
/**
|
||||
* Get the completed (commit + compaction) view of the file system for this table
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public TableFileSystemView getCompletedFileSystemView() {
|
||||
return new HoodieTableFileSystemView(metaClient, getCommitTimeline());
|
||||
@@ -139,7 +121,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
||||
|
||||
/**
|
||||
* Get only the completed (no-inflights) commit timeline
|
||||
* @return
|
||||
*/
|
||||
public HoodieTimeline getCompletedCommitTimeline() {
|
||||
return getCommitTimeline().filterCompletedInstants();
|
||||
@@ -147,7 +128,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
||||
|
||||
/**
|
||||
* Get only the inflights (no-completed) commit timeline
|
||||
* @return
|
||||
*/
|
||||
public HoodieTimeline getInflightCommitTimeline() {
|
||||
return getCommitTimeline().filterInflights();
|
||||
@@ -156,7 +136,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
||||
|
||||
/**
|
||||
* Get only the completed (no-inflights) clean timeline
|
||||
* @return
|
||||
*/
|
||||
public HoodieTimeline getCompletedCleanTimeline() {
|
||||
return getActiveTimeline().getCleanerTimeline().filterCompletedInstants();
|
||||
@@ -164,7 +143,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
||||
|
||||
/**
|
||||
* Get only the completed (no-inflights) savepoint timeline
|
||||
* @return
|
||||
*/
|
||||
public HoodieTimeline getCompletedSavepointTimeline() {
|
||||
return getActiveTimeline().getSavePointTimeline().filterCompletedInstants();
|
||||
@@ -172,7 +150,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
||||
|
||||
/**
|
||||
* Get the list of savepoints in this table
|
||||
* @return
|
||||
*/
|
||||
public List<String> getSavepoints() {
|
||||
return getCompletedSavepointTimeline().getInstants().map(HoodieInstant::getTimestamp)
|
||||
@@ -181,10 +158,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
||||
|
||||
/**
|
||||
* Get the list of data file names savepointed
|
||||
*
|
||||
* @param savepointTime
|
||||
* @return
|
||||
* @throws IOException
|
||||
*/
|
||||
public Stream<String> getSavepointedDataFiles(String savepointTime) {
|
||||
if (!getSavepoints().contains(savepointTime)) {
|
||||
@@ -211,8 +184,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
||||
|
||||
/**
|
||||
* Get the commit timeline visible for this table
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public HoodieTimeline getCommitTimeline() {
|
||||
switch (metaClient.getTableType()) {
|
||||
@@ -223,13 +194,12 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
||||
// Include commit action to be able to start doing a MOR over a COW dataset - no migration required
|
||||
return getActiveTimeline().getCommitsAndCompactionsTimeline();
|
||||
default:
|
||||
throw new HoodieException("Unsupported table type :"+ metaClient.getTableType());
|
||||
throw new HoodieException("Unsupported table type :" + metaClient.getTableType());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get only the completed (no-inflights) compaction commit timeline
|
||||
* @return
|
||||
*/
|
||||
public HoodieTimeline getCompletedCompactionCommitTimeline() {
|
||||
return getCompactionCommitTimeline().filterCompletedInstants();
|
||||
@@ -238,8 +208,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
||||
|
||||
/**
|
||||
* Get the compacted commit timeline visible for this table
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public HoodieTimeline getCompactionCommitTimeline() {
|
||||
switch (metaClient.getTableType()) {
|
||||
@@ -250,13 +218,12 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
||||
return getActiveTimeline().getTimelineOfActions(
|
||||
Sets.newHashSet(HoodieActiveTimeline.COMPACTION_ACTION));
|
||||
default:
|
||||
throw new HoodieException("Unsupported table type :"+ metaClient.getTableType());
|
||||
throw new HoodieException("Unsupported table type :" + metaClient.getTableType());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the commit action type
|
||||
* @return
|
||||
*/
|
||||
public String getCommitActionType() {
|
||||
switch (metaClient.getTableType()) {
|
||||
@@ -271,7 +238,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
||||
|
||||
/**
|
||||
* Gets the action type for a compaction commit
|
||||
* @return
|
||||
*/
|
||||
public String getCompactedCommitActionType() {
|
||||
switch (metaClient.getTableType()) {
|
||||
@@ -280,27 +246,18 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
||||
case MERGE_ON_READ:
|
||||
return HoodieTimeline.COMPACTION_ACTION;
|
||||
}
|
||||
throw new HoodieException("Unsupported table type :"+ metaClient.getTableType());
|
||||
throw new HoodieException("Unsupported table type :" + metaClient.getTableType());
|
||||
}
|
||||
/**
* Perform the ultimate IO for a given upserted (RDD) partition
*
* @param partition
* @param recordIterator
* @param partitioner
*/
public abstract Iterator<List<WriteStatus>> handleUpsertPartition(String commitTime,
Integer partition, Iterator<HoodieRecord<T>> recordIterator, Partitioner partitioner);

/**
* Perform the ultimate IO for a given inserted (RDD) partition
*
* @param partition
* @param recordIterator
* @param partitioner
*/
public abstract Iterator<List<WriteStatus>> handleInsertPartition(String commitTime,
Integer partition, Iterator<HoodieRecord<T>> recordIterator, Partitioner partitioner);
@@ -319,27 +276,21 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
}

/**
* Run Compaction on the table.
* Compaction arranges the data so that it is optimized for data access
* Run Compaction on the table. Compaction arranges the data so that it is optimized for data
* access
*/
public abstract Optional<HoodieCompactionMetadata> compact(JavaSparkContext jsc);

/**
* Clean partition paths according to cleaning policy and returns the number
* of files cleaned.
* Clean partition paths according to cleaning policy and returns the number of files cleaned.
*/
public abstract List<HoodieCleanStat> clean(JavaSparkContext jsc);

/**
* Rollback the (inflight/committed) record changes with the given commit time.
* Four steps:
* (1) Atomically unpublish this commit
* (2) clean indexing data
* (3) clean new generated parquet files / log blocks
* (4) Finally, delete .<action>.commit or .<action>.inflight file
* @param commits
* @return
* @throws HoodieRollbackException
* Rollback the (inflight/committed) record changes with the given commit time. Four steps: (1)
* Atomically unpublish this commit (2) clean indexing data (3) clean new generated parquet files
* / log blocks (4) Finally, delete .<action>.commit or .<action>.inflight file
*/
public abstract List<HoodieRollbackStat> rollback(JavaSparkContext jsc, List<String> commits) throws IOException;
public abstract List<HoodieRollbackStat> rollback(JavaSparkContext jsc, List<String> commits)
throws IOException;
}

@@ -20,13 +20,13 @@ import com.uber.hoodie.common.model.HoodieRecordPayload;
import org.apache.spark.api.java.JavaRDD;

/**
* Repartition input records into at least expected number of output spark partitions. It should give
* below guarantees
* - Output spark partition will have records from only one hoodie partition.
* - Average records per output spark partitions should be almost equal to (#inputRecords / #outputSparkPartitions)
* to avoid possible skews.
* Repartition input records into at least expected number of output spark partitions. It should
* give below guarantees - Output spark partition will have records from only one hoodie partition.
* - Average records per output spark partitions should be almost equal to (#inputRecords /
* #outputSparkPartitions) to avoid possible skews.
*/
public interface UserDefinedBulkInsertPartitioner<T extends HoodieRecordPayload> {

JavaRDD<HoodieRecord<T>> repartitionRecords(JavaRDD<HoodieRecord<T>> records, int outputSparkPartitions);
JavaRDD<HoodieRecord<T>> repartitionRecords(JavaRDD<HoodieRecord<T>> records,
int outputSparkPartitions);
}

@@ -20,15 +20,11 @@ package com.uber.hoodie.table;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordLocation;
import com.uber.hoodie.common.model.HoodieRecordPayload;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.PairFunction;

import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import org.apache.spark.api.java.JavaRDD;
import scala.Option;
import scala.Tuple2;

@@ -65,15 +61,18 @@ public class WorkloadProfile<T extends HoodieRecordPayload> implements Serializa

Map<Tuple2<String, Option<HoodieRecordLocation>>, Long> partitionLocationCounts = taggedRecords
.mapToPair(record ->
new Tuple2<>(new Tuple2<>(record.getPartitionPath(), Option.apply(record.getCurrentLocation())), record))
new Tuple2<>(
new Tuple2<>(record.getPartitionPath(), Option.apply(record.getCurrentLocation())),
record))
.countByKey();

for (Map.Entry<Tuple2<String, Option<HoodieRecordLocation>>, Long> e: partitionLocationCounts.entrySet()) {
for (Map.Entry<Tuple2<String, Option<HoodieRecordLocation>>, Long> e : partitionLocationCounts
.entrySet()) {
String partitionPath = e.getKey()._1();
Long count = e.getValue();
Option<HoodieRecordLocation> locOption = e.getKey()._2();

if (!partitionPathStatMap.containsKey(partitionPath)){
if (!partitionPathStatMap.containsKey(partitionPath)) {
partitionPathStatMap.put(partitionPath, new WorkloadStat());
}

@@ -97,7 +96,7 @@ public class WorkloadProfile<T extends HoodieRecordPayload> implements Serializa
return partitionPathStatMap.keySet();
}

public WorkloadStat getWorkloadStat(String partitionPath){
public WorkloadStat getWorkloadStat(String partitionPath) {
return partitionPathStatMap.get(partitionPath);
}


@@ -17,7 +17,6 @@
package com.uber.hoodie.table;

import com.uber.hoodie.common.model.HoodieRecordLocation;

import java.io.Serializable;
import java.util.HashMap;

@@ -25,6 +24,7 @@ import java.util.HashMap;
* Wraps stats about a single partition path.
*/
public class WorkloadStat implements Serializable {

private long numInserts = 0L;

private long numUpdates = 0L;

@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Set root logger level to DEBUG and its only appender to A1.
log4j.rootLogger=INFO, A1
# A1 is set to be a ConsoleAppender.

@@ -22,13 +22,12 @@ import com.uber.hoodie.common.HoodieTestDataGenerator;
|
||||
import com.uber.hoodie.common.model.HoodieAvroPayload;
|
||||
import com.uber.hoodie.common.model.HoodieRecord;
|
||||
import com.uber.hoodie.common.model.HoodieTableType;
|
||||
import com.uber.hoodie.common.table.HoodieTableConfig;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import com.uber.hoodie.config.HoodieIndexConfig;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.index.HoodieIndex;
|
||||
|
||||
import java.util.List;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.log4j.LogManager;
|
||||
@@ -36,7 +35,6 @@ import org.apache.log4j.Logger;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Driver program that uses the Hoodie client with synthetic workload, and performs basic
|
||||
@@ -44,13 +42,13 @@ import java.util.List;
|
||||
*/
|
||||
public class HoodieClientExample {
|
||||
|
||||
@Parameter(names={"--table-path", "-p"}, description = "path for Hoodie sample table")
|
||||
@Parameter(names = {"--table-path", "-p"}, description = "path for Hoodie sample table")
|
||||
private String tablePath = "file:///tmp/hoodie/sample-table";
|
||||
|
||||
@Parameter(names={"--table-name", "-n"}, description = "table name for Hoodie sample table")
|
||||
@Parameter(names = {"--table-name", "-n"}, description = "table name for Hoodie sample table")
|
||||
private String tableName = "hoodie_rt";
|
||||
|
||||
@Parameter(names={"--table-type", "-t"}, description = "One of COPY_ON_WRITE or MERGE_ON_READ")
|
||||
@Parameter(names = {"--table-type", "-t"}, description = "One of COPY_ON_WRITE or MERGE_ON_READ")
|
||||
private String tableType = HoodieTableType.COPY_ON_WRITE.name();
|
||||
|
||||
@Parameter(names = {"--help", "-h"}, help = true)
|
||||
@@ -85,7 +83,9 @@ public class HoodieClientExample {
|
||||
Path path = new Path(tablePath);
|
||||
FileSystem fs = FSUtils.getFs();
|
||||
if (!fs.exists(path)) {
|
||||
HoodieTableMetaClient.initTableType(fs, tablePath, HoodieTableType.valueOf(tableType), tableName, HoodieAvroPayload.class.getName());
|
||||
HoodieTableMetaClient
|
||||
.initTableType(fs, tablePath, HoodieTableType.valueOf(tableType), tableName,
|
||||
HoodieAvroPayload.class.getName());
|
||||
}
|
||||
|
||||
// Create the write client to write some records in
|
||||
|
||||
@@ -16,8 +16,12 @@
|
||||
|
||||
package com.uber.hoodie;
|
||||
|
||||
import com.google.common.collect.Iterables;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import static org.junit.Assert.fail;
|
||||
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.uber.hoodie.common.HoodieCleanStat;
|
||||
import com.uber.hoodie.common.HoodieClientTestUtils;
|
||||
import com.uber.hoodie.common.HoodieTestDataGenerator;
|
||||
@@ -45,22 +49,6 @@ import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.exception.HoodieRollbackException;
|
||||
import com.uber.hoodie.index.HoodieIndex;
|
||||
import com.uber.hoodie.table.HoodieTable;
|
||||
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.scheduler.SparkListener;
|
||||
import org.apache.spark.scheduler.SparkListenerTaskEnd;
|
||||
import org.apache.spark.sql.SQLContext;
|
||||
import org.apache.spark.util.AccumulatorV2;
|
||||
import org.junit.After;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import org.junit.rules.TemporaryFolder;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
@@ -76,15 +64,24 @@ import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.scheduler.SparkListener;
|
||||
import org.apache.spark.scheduler.SparkListenerTaskEnd;
|
||||
import org.apache.spark.sql.SQLContext;
|
||||
import org.apache.spark.util.AccumulatorV2;
|
||||
import org.junit.After;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import org.junit.rules.TemporaryFolder;
|
||||
import scala.collection.Iterator;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import static org.junit.Assert.fail;
|
||||
|
||||
public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
|
||||
private transient JavaSparkContext jsc = null;
|
||||
private transient SQLContext sqlContext;
|
||||
private String basePath = null;
|
||||
@@ -115,7 +112,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
private HoodieWriteConfig.Builder getConfigBuilder() {
|
||||
return HoodieWriteConfig.newBuilder().withPath(basePath)
|
||||
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
|
||||
.withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).build())
|
||||
.withCompactionConfig(
|
||||
HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).build())
|
||||
.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build())
|
||||
.forTable("test-trip-table").withIndexConfig(
|
||||
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build());
|
||||
@@ -129,9 +127,11 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
}
|
||||
|
||||
private void assertPartitionMetadata(String[] partitionPaths, FileSystem fs) throws IOException {
|
||||
for (String partitionPath: partitionPaths) {
|
||||
assertTrue(HoodiePartitionMetadata.hasPartitionMetadata(fs, new Path(basePath, partitionPath)));
|
||||
HoodiePartitionMetadata pmeta = new HoodiePartitionMetadata(fs, new Path(basePath, partitionPath));
|
||||
for (String partitionPath : partitionPaths) {
|
||||
assertTrue(
|
||||
HoodiePartitionMetadata.hasPartitionMetadata(fs, new Path(basePath, partitionPath)));
|
||||
HoodiePartitionMetadata pmeta = new HoodiePartitionMetadata(fs,
|
||||
new Path(basePath, partitionPath));
|
||||
pmeta.readFromFS();
|
||||
assertEquals(3, pmeta.getPartitionDepth());
|
||||
}
|
||||
@@ -140,13 +140,13 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
private void checkTaggedRecords(List<HoodieRecord> taggedRecords, String commitTime) {
|
||||
for (HoodieRecord rec : taggedRecords) {
|
||||
assertTrue("Record " + rec + " found with no location.", rec.isCurrentLocationKnown());
|
||||
assertEquals("All records should have commit time "+ commitTime+", since updates were made",
|
||||
assertEquals(
|
||||
"All records should have commit time " + commitTime + ", since updates were made",
|
||||
rec.getCurrentLocation().getCommitTime(), commitTime);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Test
|
||||
public void testFilterExist() throws Exception {
|
||||
HoodieWriteConfig config = getConfig();
|
||||
@@ -231,17 +231,21 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
|
||||
// verify that there is a commit
|
||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline();
|
||||
HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath())
|
||||
.getCommitTimeline();
|
||||
|
||||
assertEquals("Expecting a single commit.", 1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
|
||||
assertEquals("Latest commit should be 001", newCommitTime, timeline.lastInstant().get().getTimestamp());
|
||||
assertEquals("Expecting a single commit.", 1,
|
||||
timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
|
||||
assertEquals("Latest commit should be 001", newCommitTime,
|
||||
timeline.lastInstant().get().getTimestamp());
|
||||
assertEquals("Must contain 200 records",
|
||||
records.size(),
|
||||
HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count());
|
||||
// Should have 100 records in table (check using Index), all in locations marked at commit
|
||||
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
|
||||
|
||||
List<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table).collect();
|
||||
List<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table)
|
||||
.collect();
|
||||
checkTaggedRecords(taggedRecords, "001");
|
||||
|
||||
/**
|
||||
@@ -265,8 +269,10 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
|
||||
// verify there are now 2 commits
|
||||
timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline();
|
||||
assertEquals("Expecting two commits.", timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), 2);
|
||||
assertEquals("Latest commit should be 004", timeline.lastInstant().get().getTimestamp(), newCommitTime);
|
||||
assertEquals("Expecting two commits.",
|
||||
timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), 2);
|
||||
assertEquals("Latest commit should be 004", timeline.lastInstant().get().getTimestamp(),
|
||||
newCommitTime);
|
||||
|
||||
metaClient = new HoodieTableMetaClient(fs, basePath);
|
||||
table = HoodieTable.getHoodieTable(metaClient, getConfig());
|
||||
@@ -277,21 +283,20 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
|
||||
// Check the entire dataset has 100 records still
|
||||
String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length];
|
||||
for (int i=0; i < fullPartitionPaths.length; i++) {
|
||||
for (int i = 0; i < fullPartitionPaths.length; i++) {
|
||||
fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]);
|
||||
}
|
||||
assertEquals("Must contain 200 records",
|
||||
200,
|
||||
HoodieClientTestUtils.read(basePath, sqlContext, fs, fullPartitionPaths).count());
|
||||
|
||||
|
||||
// Check that the incremental consumption from time 000
|
||||
assertEquals("Incremental consumption from time 002, should give all records in commit 004",
|
||||
HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
|
||||
HoodieClientTestUtils.readSince(basePath, sqlContext, timeline, "002").count());
|
||||
assertEquals("Incremental consumption from time 001, should give all records in commit 004",
|
||||
HoodieClientTestUtils.readCommit(basePath, sqlContext,timeline, newCommitTime).count(),
|
||||
HoodieClientTestUtils.readSince(basePath, sqlContext,timeline, "001").count());
|
||||
HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
|
||||
HoodieClientTestUtils.readSince(basePath, sqlContext, timeline, "001").count());
|
||||
}
|
||||
|
||||
@Test
|
||||
@@ -322,15 +327,19 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {

// verify that there is a commit
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline();
assertEquals("Expecting a single commit.", 1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
assertEquals("Latest commit should be 001", newCommitTime, timeline.lastInstant().get().getTimestamp());
HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath())
.getCommitTimeline();
assertEquals("Expecting a single commit.", 1,
timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
assertEquals("Latest commit should be 001", newCommitTime,
timeline.lastInstant().get().getTimestamp());
assertEquals("Must contain 200 records", fewRecordsForInsert.size(),
HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count());
// Should have 100 records in table (check using Index), all in locations marked at commit
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());

List<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(fewRecordsForInsert, 1), table).collect();
List<HoodieRecord> taggedRecords = index
.tagLocation(jsc.parallelize(fewRecordsForInsert, 1), table).collect();
checkTaggedRecords(taggedRecords, "001");

/**
@@ -339,8 +348,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
newCommitTime = "004";
client.startCommitWithTime(newCommitTime);

fewRecordsForDelete = records.subList(0,50);
List<HoodieRecord> fewRecordsForUpdate = records.subList(50,100);
fewRecordsForDelete = records.subList(0, 50);
List<HoodieRecord> fewRecordsForUpdate = records.subList(50, 100);
records = dataGen.generateDeletesFromExistingRecords(fewRecordsForDelete);

records.addAll(fewRecordsForUpdate);
@@ -351,18 +360,19 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {

// verify there are now 2 commits
timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline();
assertEquals("Expecting two commits.", timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), 2);
assertEquals("Latest commit should be 004", timeline.lastInstant().get().getTimestamp(), newCommitTime);
assertEquals("Expecting two commits.",
timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), 2);
assertEquals("Latest commit should be 004", timeline.lastInstant().get().getTimestamp(),
newCommitTime);

// Check the entire dataset has 150 records(200-50) still
String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length];
for (int i=0; i < fullPartitionPaths.length; i++) {
for (int i = 0; i < fullPartitionPaths.length; i++) {
fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]);
}
assertEquals("Must contain 150 records", 150,
HoodieClientTestUtils.read(basePath, sqlContext, fs, fullPartitionPaths).count());


// Check that the incremental consumption from time 000
assertEquals("Incremental consumption from latest commit, should give 50 updated records",
50,
@@ -384,7 +394,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
.build()).build();
HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
FileSystem fs = FSUtils.getFs();
HoodieTestDataGenerator.writePartitionMetadata(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, basePath);
HoodieTestDataGenerator
.writePartitionMetadata(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, basePath);

/**
* Write 1 (only inserts)
@@ -393,7 +404,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
client.startCommitWithTime(newCommitTime);

List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 200);
List<WriteStatus> statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
List<WriteStatus> statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime)
.collect();
assertNoWriteErrors(statuses);

/**
@@ -437,7 +449,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// Verify there are no errors
assertNoWriteErrors(statuses);

List<String> partitionPaths = FSUtils.getAllPartitionPaths(fs, cfg.getBasePath(), getConfig().shouldAssumeDatePartitioning());
List<String> partitionPaths = FSUtils
.getAllPartitionPaths(fs, cfg.getBasePath(), getConfig().shouldAssumeDatePartitioning());
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
final TableFileSystemView.ReadOptimizedView view = table.getROFileSystemView();
@@ -478,7 +491,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
.build()).build();
HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
FileSystem fs = FSUtils.getFs();
HoodieTestDataGenerator.writePartitionMetadata(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, basePath);
HoodieTestDataGenerator
.writePartitionMetadata(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, basePath);

/**
* Write 1 (only inserts)
@@ -515,7 +529,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
// Verify there are no errors
assertNoWriteErrors(statuses);
List<String> partitionPaths = FSUtils.getAllPartitionPaths(fs, cfg.getBasePath(), getConfig().shouldAssumeDatePartitioning());
List<String> partitionPaths = FSUtils
.getAllPartitionPaths(fs, cfg.getBasePath(), getConfig().shouldAssumeDatePartitioning());
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
final TableFileSystemView.ReadOptimizedView view1 = table.getROFileSystemView();
@@ -525,7 +540,6 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
}).collect(Collectors.toList());
assertEquals("The data files for commit 003 should be present", 3, dataFiles.size());


/**
* Write 4 (updates)
*/
@@ -546,7 +560,6 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
}).collect(Collectors.toList());
assertEquals("The data files for commit 004 should be present", 3, dataFiles.size());


// rolling back to a non existent savepoint must not succeed
try {
client.rollbackToSavepoint("001");
@@ -606,8 +619,10 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {

// verify that there is a commit
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline();
assertEquals("Expecting a single commit.", 1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath())
.getCommitTimeline();
assertEquals("Expecting a single commit.", 1,
timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
// Should have 100 records in table (check using Index), all in locations marked at commit
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
assertFalse(table.getCompletedCommitTimeline().empty());
@@ -617,7 +632,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
assertEquals("The clean instant should be the same as the commit instant", commitTime,
table.getCompletedCleanTimeline().getInstants().findFirst().get().getTimestamp());

List<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table).collect();
List<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table)
.collect();
checkTaggedRecords(taggedRecords, newCommitTime);

// Keep doing some writes and clean inline. Make sure we have expected number of files remaining.
@@ -641,18 +657,20 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// compute all the versions of all files, from time 0
HashMap<String, TreeSet<String>> fileIdToVersions = new HashMap<>();
for (HoodieInstant entry : timeline.getInstants().collect(Collectors.toList())) {
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(entry).get());
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
.fromBytes(timeline.getInstantDetails(entry).get());

for (HoodieWriteStat wstat : commitMetadata.getWriteStats(partitionPath)) {
if (!fileIdToVersions.containsKey(wstat.getFileId())) {
fileIdToVersions.put(wstat.getFileId(), new TreeSet<>());
}
fileIdToVersions.get(wstat.getFileId()).add(FSUtils.getCommitTime(new Path(wstat.getPath()).getName()));
fileIdToVersions.get(wstat.getFileId())
.add(FSUtils.getCommitTime(new Path(wstat.getPath()).getName()));
}
}


List<HoodieFileGroup> fileGroups = fsView.getAllFileGroups(partitionPath).collect(Collectors.toList());
List<HoodieFileGroup> fileGroups = fsView.getAllFileGroups(partitionPath)
.collect(Collectors.toList());

for (HoodieFileGroup fileGroup : fileGroups) {
// No file has no more than max versions
@@ -665,7 +683,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// Each file, has the latest N versions (i.e cleaning gets rid of older versions)
List<String> commitedVersions = new ArrayList<>(fileIdToVersions.get(fileId));
for (int i = 0; i < dataFiles.size(); i++) {
assertEquals("File " + fileId + " does not have latest versions on commits" + commitedVersions,
assertEquals(
"File " + fileId + " does not have latest versions on commits" + commitedVersions,
Iterables.get(dataFiles, i).getCommitTime(),
commitedVersions.get(commitedVersions.size() - 1 - i));
}
@@ -700,8 +719,10 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {

// verify that there is a commit
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline();
assertEquals("Expecting a single commit.", 1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath())
.getCommitTimeline();
assertEquals("Expecting a single commit.", 1,
timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
// Should have 100 records in table (check using Index), all in locations marked at commit
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());

@@ -712,7 +733,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
assertEquals("The clean instant should be the same as the commit instant", commitTime,
table.getCompletedCleanTimeline().getInstants().findFirst().get().getTimestamp());

List<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table).collect();
List<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table)
.collect();
checkTaggedRecords(taggedRecords, newCommitTime);

// Keep doing some writes and clean inline. Make sure we have expected number of files remaining.
@@ -734,7 +756,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
activeTimeline.getInstants().collect(Collectors.toSet());
if (earliestRetainedCommit.isPresent()) {
acceptableCommits.removeAll(
activeTimeline.findInstantsInRange("000", earliestRetainedCommit.get().getTimestamp()).getInstants()
activeTimeline.findInstantsInRange("000", earliestRetainedCommit.get().getTimestamp())
.getInstants()
.collect(Collectors.toSet()));
acceptableCommits.add(earliestRetainedCommit.get());
}
@@ -742,7 +765,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
TableFileSystemView fsView = table1.getFileSystemView();
// Need to ensure the following
for (String partitionPath : dataGen.getPartitionPaths()) {
List<HoodieFileGroup> fileGroups = fsView.getAllFileGroups(partitionPath).collect(Collectors.toList());
List<HoodieFileGroup> fileGroups = fsView.getAllFileGroups(partitionPath)
.collect(Collectors.toList());
for (HoodieFileGroup fileGroup : fileGroups) {
Set<String> commitTimes = new HashSet<>();
fileGroup.getAllDataFiles().forEach(value -> {
@@ -765,10 +789,9 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
String commitTime3 = "20160506030611";
new File(basePath + "/.hoodie").mkdirs();
HoodieTestDataGenerator.writePartitionMetadata(FSUtils.getFs(),
new String[] {"2016/05/01", "2016/05/02", "2016/05/06"},
new String[]{"2016/05/01", "2016/05/02", "2016/05/06"},
basePath);


// Only first two have commit files
HoodieTestUtils.createCommitFiles(basePath, commitTime1, commitTime2);
// Third one has a .inflight intermediate commit file
@@ -816,7 +839,6 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
client.rollback(commitTime3);
assertFalse(HoodieTestUtils.doesInflightExist(basePath, commitTime3));


// Rollback commit2
client.rollback(commitTime2);
assertFalse(HoodieTestUtils.doesCommitExist(basePath, commitTime2));
@@ -839,7 +861,6 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime2, file22) ||
HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime2, file23));


// Let's rollback commit1, Check results
client.rollback(commitTime1);
assertFalse(HoodieTestUtils.doesCommitExist(basePath, commitTime1));
@@ -858,7 +879,7 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
String commitTime3 = "20160506030611";
new File(basePath + "/.hoodie").mkdirs();
HoodieTestDataGenerator.writePartitionMetadata(FSUtils.getFs(),
new String[] {"2016/05/01", "2016/05/02", "2016/05/06"},
new String[]{"2016/05/01", "2016/05/02", "2016/05/06"},
basePath);

// One good commit
@@ -940,26 +961,29 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
final String TEST_PARTITION_PATH = "2016/09/26";
final int INSERT_SPLIT_LIMIT = 100;
// setup the small file handling params
HoodieWriteConfig config = getSmallInsertWriteConfig(INSERT_SPLIT_LIMIT); // hold upto 200 records max
dataGen = new HoodieTestDataGenerator(new String[] {TEST_PARTITION_PATH});
HoodieWriteConfig config = getSmallInsertWriteConfig(
INSERT_SPLIT_LIMIT); // hold upto 200 records max
dataGen = new HoodieTestDataGenerator(new String[]{TEST_PARTITION_PATH});

HoodieWriteClient client = new HoodieWriteClient(jsc, config);

// Inserts => will write file1
String commitTime1 = "001";
client.startCommitWithTime(commitTime1);
List<HoodieRecord> inserts1 = dataGen.generateInserts(commitTime1, INSERT_SPLIT_LIMIT); // this writes ~500kb
List<HoodieRecord> inserts1 = dataGen
.generateInserts(commitTime1, INSERT_SPLIT_LIMIT); // this writes ~500kb
Set<String> keys1 = HoodieClientTestUtils.getRecordKeys(inserts1);

JavaRDD<HoodieRecord> insertRecordsRDD1 = jsc.parallelize(inserts1, 1);
List<WriteStatus> statuses= client.upsert(insertRecordsRDD1, commitTime1).collect();
List<WriteStatus> statuses = client.upsert(insertRecordsRDD1, commitTime1).collect();

assertNoWriteErrors(statuses);

assertEquals("Just 1 file needs to be added.", 1, statuses.size());
String file1 = statuses.get(0).getFileId();
assertEquals("file should contain 100 records",
ParquetUtils.readRowKeysFromParquet(new Path(basePath, TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime1, 0, file1))).size(),
ParquetUtils.readRowKeysFromParquet(new Path(basePath,
TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime1, 0, file1))).size(),
100);

// Update + Inserts such that they just expand file1
@@ -977,15 +1001,20 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {

assertEquals("Just 1 file needs to be updated.", 1, statuses.size());
assertEquals("Existing file should be expanded", file1, statuses.get(0).getFileId());
assertEquals("Existing file should be expanded", commitTime1, statuses.get(0).getStat().getPrevCommit());
Path newFile = new Path(basePath, TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime2, 0, file1));
assertEquals("file should contain 140 records", ParquetUtils.readRowKeysFromParquet(newFile).size(), 140);
assertEquals("Existing file should be expanded", commitTime1,
statuses.get(0).getStat().getPrevCommit());
Path newFile = new Path(basePath,
TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime2, 0, file1));
assertEquals("file should contain 140 records",
ParquetUtils.readRowKeysFromParquet(newFile).size(), 140);

List<GenericRecord> records = ParquetUtils.readAvroRecords(newFile);
for (GenericRecord record: records) {
for (GenericRecord record : records) {
String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
assertEquals("only expect commit2", commitTime2, record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString());
assertTrue("key expected to be part of commit2", keys2.contains(recordKey) || keys1.contains(recordKey));
assertEquals("only expect commit2", commitTime2,
record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString());
assertTrue("key expected to be part of commit2",
keys2.contains(recordKey) || keys1.contains(recordKey));
}

// update + inserts such that file1 is updated and expanded, a new file2 is created.
@@ -1004,14 +1033,15 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
HoodieTable table = HoodieTable.getHoodieTable(metadata, config);
TableFileSystemView.ReadOptimizedView fileSystemView = table.getROFileSystemView();
List<HoodieDataFile> files = fileSystemView.getLatestDataFilesBeforeOrOn(TEST_PARTITION_PATH, commitTime3).collect(
List<HoodieDataFile> files = fileSystemView
.getLatestDataFilesBeforeOrOn(TEST_PARTITION_PATH, commitTime3).collect(
Collectors.toList());
int numTotalInsertsInCommit3 = 0;
for (HoodieDataFile file: files) {
for (HoodieDataFile file : files) {
if (file.getFileName().contains(file1)) {
assertEquals("Existing file should be expanded", commitTime3, file.getCommitTime());
records = ParquetUtils.readAvroRecords(new Path(file.getPath()));
for (GenericRecord record: records) {
for (GenericRecord record : records) {
String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
String recordCommitTime = record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString();
if (recordCommitTime.equals(commitTime3)) {
@@ -1023,13 +1053,15 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
}
}
}
assertEquals("All keys added in commit 2 must be updated in commit3 correctly", 0, keys2.size());
assertEquals("All keys added in commit 2 must be updated in commit3 correctly", 0,
keys2.size());
} else {
assertEquals("New file must be written for commit 3", commitTime3, file.getCommitTime());
records = ParquetUtils.readAvroRecords(new Path(file.getPath()));
for (GenericRecord record: records) {
for (GenericRecord record : records) {
String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
assertEquals("only expect commit3", commitTime3, record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString());
assertEquals("only expect commit3", commitTime3,
record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString());
assertTrue("key expected to be part of commit3", keys3.contains(recordKey));
}
numTotalInsertsInCommit3 += records.size();
@@ -1044,17 +1076,19 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
final String TEST_PARTITION_PATH = "2016/09/26";
final int INSERT_SPLIT_LIMIT = 100;
// setup the small file handling params
HoodieWriteConfig config = getSmallInsertWriteConfig(INSERT_SPLIT_LIMIT); // hold upto 200 records max
dataGen = new HoodieTestDataGenerator(new String[] {TEST_PARTITION_PATH});
HoodieWriteConfig config = getSmallInsertWriteConfig(
INSERT_SPLIT_LIMIT); // hold upto 200 records max
dataGen = new HoodieTestDataGenerator(new String[]{TEST_PARTITION_PATH});
HoodieWriteClient client = new HoodieWriteClient(jsc, config);

// Inserts => will write file1
String commitTime1 = "001";
client.startCommitWithTime(commitTime1);
List<HoodieRecord> inserts1 = dataGen.generateInserts(commitTime1, INSERT_SPLIT_LIMIT); // this writes ~500kb
List<HoodieRecord> inserts1 = dataGen
.generateInserts(commitTime1, INSERT_SPLIT_LIMIT); // this writes ~500kb
Set<String> keys1 = HoodieClientTestUtils.getRecordKeys(inserts1);
JavaRDD<HoodieRecord> insertRecordsRDD1 = jsc.parallelize(inserts1, 1);
List<WriteStatus> statuses= client.insert(insertRecordsRDD1, commitTime1).collect();
List<WriteStatus> statuses = client.insert(insertRecordsRDD1, commitTime1).collect();

assertNoWriteErrors(statuses);
assertPartitionMetadata(new String[]{TEST_PARTITION_PATH}, FSUtils.getFs());
@@ -1062,7 +1096,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
assertEquals("Just 1 file needs to be added.", 1, statuses.size());
String file1 = statuses.get(0).getFileId();
assertEquals("file should contain 100 records",
ParquetUtils.readRowKeysFromParquet(new Path(basePath, TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime1, 0, file1))).size(),
ParquetUtils.readRowKeysFromParquet(new Path(basePath,
TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime1, 0, file1))).size(),
100);

// Second, set of Inserts should just expand file1
@@ -1076,16 +1111,21 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {

assertEquals("Just 1 file needs to be updated.", 1, statuses.size());
assertEquals("Existing file should be expanded", file1, statuses.get(0).getFileId());
assertEquals("Existing file should be expanded", commitTime1, statuses.get(0).getStat().getPrevCommit());
Path newFile = new Path(basePath, TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime2, 0, file1));
assertEquals("file should contain 140 records", ParquetUtils.readRowKeysFromParquet(newFile).size(), 140);
assertEquals("Existing file should be expanded", commitTime1,
statuses.get(0).getStat().getPrevCommit());
Path newFile = new Path(basePath,
TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime2, 0, file1));
assertEquals("file should contain 140 records",
ParquetUtils.readRowKeysFromParquet(newFile).size(), 140);

List<GenericRecord> records = ParquetUtils.readAvroRecords(newFile);
for (GenericRecord record: records) {
for (GenericRecord record : records) {
String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
String recCommitTime = record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString();
assertTrue("Record expected to be part of commit 1 or commit2", commitTime1.equals(recCommitTime) || commitTime2.equals(recCommitTime));
assertTrue("key expected to be part of commit 1 or commit2", keys2.contains(recordKey) || keys1.contains(recordKey));
assertTrue("Record expected to be part of commit 1 or commit2",
commitTime1.equals(recCommitTime) || commitTime2.equals(recCommitTime));
assertTrue("key expected to be part of commit 1 or commit2",
keys2.contains(recordKey) || keys1.contains(recordKey));
}

// Lots of inserts such that file1 is updated and expanded, a new file2 is created.
@@ -1097,7 +1137,6 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
assertNoWriteErrors(statuses);
assertEquals("2 files needs to be committed.", 2, statuses.size());


FileSystem fs = FSUtils.getFs();
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config);
@@ -1106,14 +1145,14 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
.collect(Collectors.toList());
assertEquals("Total of 2 valid data files", 2, files.size());


int totalInserts = 0;
for (HoodieDataFile file: files) {
for (HoodieDataFile file : files) {
assertEquals("All files must be at commit 3", commitTime3, file.getCommitTime());
records = ParquetUtils.readAvroRecords(new Path(file.getPath()));
totalInserts += records.size();
}
assertEquals("Total number of records must add up", totalInserts, inserts1.size() + inserts2.size() + insert3.size());
assertEquals("Total number of records must add up", totalInserts,
inserts1.size() + inserts2.size() + insert3.size());
}

@Test
@@ -1130,27 +1169,35 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
String file1P0C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "000");
String file1P1C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "000");
HoodieTable table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config);
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
config);

List<HoodieCleanStat> hoodieCleanStatsOne = table.clean(jsc);
assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsOne, partitionPaths[0]).getSuccessDeleteFiles().size());
assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsOne, partitionPaths[1]).getSuccessDeleteFiles().size());
assertEquals("Must not clean any files", 0,
getCleanStat(hoodieCleanStatsOne, partitionPaths[0]).getSuccessDeleteFiles().size());
assertEquals("Must not clean any files", 0,
getCleanStat(hoodieCleanStatsOne, partitionPaths[1]).getSuccessDeleteFiles().size());
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "000", file1P1C0));

// make next commit, with 1 insert & 1 update per partition
HoodieTestUtils.createCommitFiles(basePath, "001");
table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config);
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
config);

String file2P0C1 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "001"); // insert
String file2P1C1 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "001"); // insert
String file2P0C1 = HoodieTestUtils
.createNewDataFile(basePath, partitionPaths[0], "001"); // insert
String file2P1C1 = HoodieTestUtils
.createNewDataFile(basePath, partitionPaths[1], "001"); // insert
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "001", file1P0C0); // update
HoodieTestUtils.createDataFile(basePath, partitionPaths[1], "001", file1P1C0); // update

List<HoodieCleanStat> hoodieCleanStatsTwo = table.clean(jsc);
assertEquals("Must clean 1 file" , 1, getCleanStat(hoodieCleanStatsTwo, partitionPaths[0]).getSuccessDeleteFiles().size());
assertEquals("Must clean 1 file" , 1, getCleanStat(hoodieCleanStatsTwo, partitionPaths[1]).getSuccessDeleteFiles().size());
assertEquals("Must clean 1 file", 1,
getCleanStat(hoodieCleanStatsTwo, partitionPaths[0]).getSuccessDeleteFiles().size());
assertEquals("Must clean 1 file", 1,
getCleanStat(hoodieCleanStatsTwo, partitionPaths[1]).getSuccessDeleteFiles().size());
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "001", file2P1C1));
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0));
@@ -1159,14 +1206,16 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// make next commit, with 2 updates to existing files, and 1 insert
HoodieTestUtils.createCommitFiles(basePath, "002");
table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config);
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
config);

HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file1P0C0); // update
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file2P0C1); // update
String file3P0C2 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "002");

List<HoodieCleanStat> hoodieCleanStatsThree = table.clean(jsc);
assertEquals("Must clean two files" , 2, getCleanStat(hoodieCleanStatsThree, partitionPaths[0]).getSuccessDeleteFiles().size());
assertEquals("Must clean two files", 2,
getCleanStat(hoodieCleanStatsThree, partitionPaths[0]).getSuccessDeleteFiles().size());
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file1P0C0));
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "002", file3P0C2));
@@ -1174,7 +1223,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// No cleaning on partially written file, with no commit.
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "003", file3P0C2); // update
List<HoodieCleanStat> hoodieCleanStatsFour = table.clean(jsc);
assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsFour, partitionPaths[0]).getSuccessDeleteFiles().size());
assertEquals("Must not clean any files", 0,
getCleanStat(hoodieCleanStatsFour, partitionPaths[0]).getSuccessDeleteFiles().size());
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "002", file3P0C2));
}

@@ -1187,31 +1237,39 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
.withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS)
.retainFileVersions(1).build()).build();


HoodieTableMetaClient metaClient = HoodieTestUtils.initTableType(basePath, HoodieTableType.MERGE_ON_READ);
HoodieTableMetaClient metaClient = HoodieTestUtils
.initTableType(basePath, HoodieTableType.MERGE_ON_READ);

// Make 3 files, one base file and 2 log files associated with base file
String file1P0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "000");
String file2P0L0 = HoodieTestUtils.createNewLogFile(basePath, partitionPaths[0], "000", file1P0, Optional.empty());
String file2P0L1 = HoodieTestUtils.createNewLogFile(basePath, partitionPaths[0], "000", file1P0, Optional.of(2));
String file2P0L0 = HoodieTestUtils
.createNewLogFile(basePath, partitionPaths[0], "000", file1P0, Optional.empty());
String file2P0L1 = HoodieTestUtils
.createNewLogFile(basePath, partitionPaths[0], "000", file1P0, Optional.of(2));
// make 1 compaction commit
HoodieTestUtils.createCompactionCommitFiles(basePath, "000");

// Make 4 files, one base file and 3 log files associated with base file
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "001", file1P0);
file2P0L0 = HoodieTestUtils.createNewLogFile(basePath, partitionPaths[0], "001", file1P0, Optional.empty());
file2P0L0 = HoodieTestUtils.createNewLogFile(basePath, partitionPaths[0], "001", file1P0, Optional.of(2));
file2P0L0 = HoodieTestUtils.createNewLogFile(basePath, partitionPaths[0], "001", file1P0, Optional.of(3));
file2P0L0 = HoodieTestUtils
.createNewLogFile(basePath, partitionPaths[0], "001", file1P0, Optional.empty());
file2P0L0 = HoodieTestUtils
.createNewLogFile(basePath, partitionPaths[0], "001", file1P0, Optional.of(2));
file2P0L0 = HoodieTestUtils
.createNewLogFile(basePath, partitionPaths[0], "001", file1P0, Optional.of(3));
// make 1 compaction commit
HoodieTestUtils.createCompactionCommitFiles(basePath, "001");

HoodieTable table = HoodieTable
.getHoodieTable(metaClient, config);
List<HoodieCleanStat> hoodieCleanStats = table.clean(jsc);
assertEquals("Must clean three files, one parquet and 2 log files" , 3, getCleanStat(hoodieCleanStats, partitionPaths[0]).getSuccessDeleteFiles().size());
assertEquals("Must clean three files, one parquet and 2 log files", 3,
getCleanStat(hoodieCleanStats, partitionPaths[0]).getSuccessDeleteFiles().size());
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0));
assertFalse(HoodieTestUtils.doesLogFileExist(basePath, partitionPaths[0], "000", file2P0L0, Optional.empty()));
assertFalse(HoodieTestUtils.doesLogFileExist(basePath, partitionPaths[0], "000", file2P0L0, Optional.of(2)));
assertFalse(HoodieTestUtils
.doesLogFileExist(basePath, partitionPaths[0], "000", file2P0L0, Optional.empty()));
assertFalse(HoodieTestUtils
.doesLogFileExist(basePath, partitionPaths[0], "000", file2P0L0, Optional.of(2)));
}

@Test
@@ -1229,27 +1287,35 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
String file1P1C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "000");

HoodieTable table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config);
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
config);

List<HoodieCleanStat> hoodieCleanStatsOne = table.clean(jsc);
assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsOne, partitionPaths[0]).getSuccessDeleteFiles().size());
assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsOne, partitionPaths[1]).getSuccessDeleteFiles().size());
assertEquals("Must not clean any files", 0,
getCleanStat(hoodieCleanStatsOne, partitionPaths[0]).getSuccessDeleteFiles().size());
assertEquals("Must not clean any files", 0,
getCleanStat(hoodieCleanStatsOne, partitionPaths[1]).getSuccessDeleteFiles().size());
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "000", file1P1C0));

// make next commit, with 1 insert & 1 update per partition
HoodieTestUtils.createCommitFiles(basePath, "001");
table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config);
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
config);

String file2P0C1 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "001"); // insert
String file2P1C1 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "001"); // insert
String file2P0C1 = HoodieTestUtils
.createNewDataFile(basePath, partitionPaths[0], "001"); // insert
String file2P1C1 = HoodieTestUtils
.createNewDataFile(basePath, partitionPaths[1], "001"); // insert
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "001", file1P0C0); // update
HoodieTestUtils.createDataFile(basePath, partitionPaths[1], "001", file1P1C0); // update

List<HoodieCleanStat> hoodieCleanStatsTwo = table.clean(jsc);
assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsTwo, partitionPaths[0]).getSuccessDeleteFiles().size());
assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsTwo, partitionPaths[1]).getSuccessDeleteFiles().size());
assertEquals("Must not clean any files", 0,
getCleanStat(hoodieCleanStatsTwo, partitionPaths[0]).getSuccessDeleteFiles().size());
assertEquals("Must not clean any files", 0,
getCleanStat(hoodieCleanStatsTwo, partitionPaths[1]).getSuccessDeleteFiles().size());
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "001", file2P1C1));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0));
@@ -1258,7 +1324,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// make next commit, with 2 updates to existing files, and 1 insert
HoodieTestUtils.createCommitFiles(basePath, "002");
table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config);
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
config);

HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file1P0C0); // update
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file2P0C1); // update
@@ -1274,7 +1341,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// make next commit, with 2 updates to existing files, and 1 insert
HoodieTestUtils.createCommitFiles(basePath, "003");
table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config);
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
config);

HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "003", file1P0C0); // update
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "003", file2P0C1); // update
@@ -1282,7 +1350,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {

List<HoodieCleanStat> hoodieCleanStatsFour = table.clean(jsc);
assertEquals(
"Must not clean one old file", 1, getCleanStat(hoodieCleanStatsFour, partitionPaths[0]).getSuccessDeleteFiles().size());
"Must not clean one old file", 1,
getCleanStat(hoodieCleanStatsFour, partitionPaths[0]).getSuccessDeleteFiles().size());

assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file1P0C0));
@@ -1295,7 +1364,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
// No cleaning on partially written file, with no commit.
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "004", file3P0C2); // update
List<HoodieCleanStat> hoodieCleanStatsFive = table.clean(jsc);
assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsFive, partitionPaths[0]).getSuccessDeleteFiles().size());
assertEquals("Must not clean any files", 0,
getCleanStat(hoodieCleanStatsFive, partitionPaths[0]).getSuccessDeleteFiles().size());
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file1P0C0));
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1));
}
@@ -1344,13 +1414,14 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {

Iterator<AccumulatorV2<?, ?>> iterator = taskEnd.taskMetrics().accumulators()
.iterator();
while(iterator.hasNext()) {
while (iterator.hasNext()) {
AccumulatorV2 accumulator = iterator.next();
if (taskEnd.stageId() == 1 &&
accumulator.isRegistered() &&
accumulator.name().isDefined() &&
accumulator.name().get().equals("internal.metrics.shuffle.read.recordsRead")) {
stageOneShuffleReadTaskRecordsCountMap.put(taskEnd.taskInfo().taskId(), (Long) accumulator.value());
stageOneShuffleReadTaskRecordsCountMap
.put(taskEnd.taskInfo().taskId(), (Long) accumulator.value());
}
}
}
@@ -1378,22 +1449,27 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
updateAllFilesInPartition(filesP2C0, partitionPaths[2], "003");

HoodieTable table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config);
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
config);
List<HoodieCleanStat> hoodieCleanStats = table.clean(jsc);

assertEquals(100, getCleanStat(hoodieCleanStats, partitionPaths[0]).getSuccessDeleteFiles().size());
assertEquals(10, getCleanStat(hoodieCleanStats, partitionPaths[1]).getSuccessDeleteFiles().size());
assertEquals(10, getCleanStat(hoodieCleanStats, partitionPaths[2]).getSuccessDeleteFiles().size());
assertEquals(100,
getCleanStat(hoodieCleanStats, partitionPaths[0]).getSuccessDeleteFiles().size());
assertEquals(10,
getCleanStat(hoodieCleanStats, partitionPaths[1]).getSuccessDeleteFiles().size());
assertEquals(10,
getCleanStat(hoodieCleanStats, partitionPaths[2]).getSuccessDeleteFiles().size());

// 3 tasks are expected since the number of partitions is 3
assertEquals(3, stageOneShuffleReadTaskRecordsCountMap.keySet().size());
// Sum of all records processed = total number of files to clean
assertEquals(120, stageOneShuffleReadTaskRecordsCountMap
.values().stream().reduce((a,b) -> a + b).get().intValue());
.values().stream().reduce((a, b) -> a + b).get().intValue());
assertTrue("The skew in handling files to clean is not removed. "
+ "Each task should handle more records than the partitionPath with least files "
+ "and less records than the partitionPath with most files.",
stageOneShuffleReadTaskRecordsCountMap.values().stream().filter(a -> a > 10 && a < 100).count() == 3);
stageOneShuffleReadTaskRecordsCountMap.values().stream().filter(a -> a > 10 && a < 100)
.count() == 3);
}

public void testCommitWritesRelativePaths() throws Exception {
@@ -1454,7 +1530,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
}
}

private List<String> createFilesInPartition(String partitionPath, String commitTime, int numFiles) throws IOException {
private List<String> createFilesInPartition(String partitionPath, String commitTime, int numFiles)
throws IOException {
List<String> files = new ArrayList<>();
for (int i = 0; i < numFiles; i++) {
files.add(HoodieTestUtils.createNewDataFile(basePath, partitionPath, commitTime));

@@ -29,15 +29,6 @@ import com.uber.hoodie.common.table.view.HoodieTableFileSystemView;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.exception.HoodieException;
import com.uber.hoodie.table.HoodieTable;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;

import org.apache.spark.SparkConf;

import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
@@ -49,6 +40,12 @@ import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;

/**
* Utility methods to aid testing inside the HoodieClient module.
@@ -66,14 +63,15 @@ public class HoodieClientTestUtils {

public static Set<String> getRecordKeys(List<HoodieRecord> hoodieRecords) {
Set<String> keys = new HashSet<>();
for (HoodieRecord rec: hoodieRecords) {
for (HoodieRecord rec : hoodieRecords) {
keys.add(rec.getRecordKey());
}
return keys;
}

private static void fakeMetaFile(String basePath, String commitTime, String suffix) throws IOException {
String parentPath = basePath + "/"+ HoodieTableMetaClient.METAFOLDER_NAME;
private static void fakeMetaFile(String basePath, String commitTime, String suffix)
throws IOException {
String parentPath = basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME;
new File(parentPath).mkdirs();
new File(parentPath + "/" + commitTime + suffix).createNewFile();
}
@@ -87,14 +85,17 @@ public class HoodieClientTestUtils {
fakeMetaFile(basePath, commitTime, HoodieTimeline.INFLIGHT_EXTENSION);
}

public static void fakeDataFile(String basePath, String partitionPath, String commitTime, String fileId) throws Exception {
public static void fakeDataFile(String basePath, String partitionPath, String commitTime,
String fileId) throws Exception {
fakeDataFile(basePath, partitionPath, commitTime, fileId, 0);
}

public static void fakeDataFile(String basePath, String partitionPath, String commitTime, String fileId, long length) throws Exception {
public static void fakeDataFile(String basePath, String partitionPath, String commitTime,
String fileId, long length) throws Exception {
String parentPath = String.format("%s/%s", basePath, partitionPath);
new File(parentPath).mkdirs();
String path = String.format("%s/%s", parentPath, FSUtils.makeDataFileName(commitTime, 0, fileId));
String path = String
.format("%s/%s", parentPath, FSUtils.makeDataFileName(commitTime, 0, fileId));
new File(path).createNewFile();
new RandomAccessFile(path, "rw").setLength(length);
}
@@ -129,7 +130,8 @@ public class HoodieClientTestUtils {
new HoodieException("No commit exists at " + commitTime);
}
try {
HashMap<String, String> paths = getLatestFileIDsToFullPath(basePath, commitTimeline, Arrays.asList(commitInstant));
HashMap<String, String> paths = getLatestFileIDsToFullPath(basePath, commitTimeline,
Arrays.asList(commitInstant));
return sqlContext.read()
.parquet(paths.values().toArray(new String[paths.size()]))
.filter(String.format("%s ='%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitTime));
@@ -150,12 +152,15 @@ public class HoodieClientTestUtils {
.getInstants().collect(Collectors.toList());
try {
// Go over the commit metadata, and obtain the new files that need to be read.
HashMap<String, String> fileIdToFullPath = getLatestFileIDsToFullPath(basePath, commitTimeline, commitsToReturn);
HashMap<String, String> fileIdToFullPath = getLatestFileIDsToFullPath(basePath,
commitTimeline, commitsToReturn);
return sqlContext.read()
.parquet(fileIdToFullPath.values().toArray(new String[fileIdToFullPath.size()]))
.filter(String.format("%s >'%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTime));
.filter(
String.format("%s >'%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTime));
} catch (IOException e) {
throw new HoodieException("Error pulling data incrementally from commitTimestamp :" + lastCommitTime, e);
throw new HoodieException(
"Error pulling data incrementally from commitTimestamp :" + lastCommitTime, e);
}
}

@@ -171,7 +176,8 @@ public class HoodieClientTestUtils {
HoodieTable hoodieTable = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(fs, basePath, true), null);
for (String path : paths) {
TableFileSystemView.ReadOptimizedView fileSystemView = new HoodieTableFileSystemView(hoodieTable.getMetaClient(),
TableFileSystemView.ReadOptimizedView fileSystemView = new HoodieTableFileSystemView(
hoodieTable.getMetaClient(),
hoodieTable.getCompletedCommitTimeline(), fs.globStatus(new Path(path)));
List<HoodieDataFile> latestFiles = fileSystemView.getLatestDataFiles().collect(
Collectors.toList());

@@ -16,9 +16,16 @@

package com.uber.hoodie.common;

import static com.uber.hoodie.common.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA;

import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.HoodieAvroUtils;
import com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.GenericRecordBuilder;
@@ -30,20 +37,13 @@ import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

import static com.uber.hoodie.common.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA;

/**
* Utility methods to aid in testing MergeOnRead (workaround for HoodieReadClient for MOR)
*/
public class HoodieMergeOnReadTestUtils {

public static List<GenericRecord> getRecordsUsingInputFormat(List<String> inputPaths) throws IOException {
public static List<GenericRecord> getRecordsUsingInputFormat(List<String> inputPaths)
throws IOException {
JobConf jobConf = new JobConf();
Schema schema = HoodieAvroUtils.addMetadataFields(Schema.parse(TRIP_EXAMPLE_SCHEMA));
HoodieRealtimeInputFormat inputFormat = new HoodieRealtimeInputFormat();
@@ -75,10 +75,12 @@ public class HoodieMergeOnReadTestUtils {
}).get();
}

private static void setPropsForInputFormat(HoodieRealtimeInputFormat inputFormat, JobConf jobConf, Schema schema) {
private static void setPropsForInputFormat(HoodieRealtimeInputFormat inputFormat, JobConf jobConf,
Schema schema) {
List<Schema.Field> fields = schema.getFields();
String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
String postions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
String postions = fields.stream().map(f -> String.valueOf(f.pos()))
.collect(Collectors.joining(","));
Configuration conf = FSUtils.getFs().getConf();
jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);

@@ -16,17 +16,21 @@

package com.uber.hoodie.common;

import com.uber.hoodie.avro.model.HoodieCleanMetadata;
import com.uber.hoodie.common.model.HoodieCleaningPolicy;
import com.uber.hoodie.common.model.HoodieCommitMetadata;
import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodiePartitionMetadata;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.util.AvroUtils;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.HoodieAvroUtils;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.Random;
import java.util.UUID;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
@@ -34,15 +38,6 @@ import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import java.util.Random;
import java.util.UUID;

/**
* Class to be used in tests to keep generating test inserts and updates against a corpus.
*
@@ -51,6 +46,7 @@ import java.util.UUID;
public class HoodieTestDataGenerator {

static class KeyPartition {

HoodieKey key;
String partitionPath;
}
@@ -74,14 +70,17 @@ public class HoodieTestDataGenerator {
public static final String[] DEFAULT_PARTITION_PATHS = {"2016/03/15", "2015/03/16", "2015/03/17"};

public static void writePartitionMetadata(FileSystem fs, String[] partitionPaths, String basePath) {
for (String partitionPath: partitionPaths) {
new HoodiePartitionMetadata(fs, "000", new Path(basePath), new Path(basePath, partitionPath)).trySave(0);
public static void writePartitionMetadata(FileSystem fs, String[] partitionPaths,
String basePath) {
for (String partitionPath : partitionPaths) {
new HoodiePartitionMetadata(fs, "000", new Path(basePath), new Path(basePath, partitionPath))
.trySave(0);
}
}

private List<KeyPartition> existingKeysList = new ArrayList<>();
public static Schema avroSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA));
public static Schema avroSchema = HoodieAvroUtils
.addMetadataFields(new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA));
private static Random rand = new Random(46474747);
private String[] partitionPaths = DEFAULT_PARTITION_PATHS;

@@ -95,8 +94,8 @@ public class HoodieTestDataGenerator {

/**
* Generates new inserts, uniformly across the partition paths above. It also updates the list
* of existing keys.
* Generates new inserts, uniformly across the partition paths above. It also updates the list of
* existing keys.
*/
public List<HoodieRecord> generateInserts(String commitTime, int n) throws IOException {
List<HoodieRecord> inserts = new ArrayList<>();
@@ -119,9 +118,10 @@ public class HoodieTestDataGenerator {
return generateDeletesFromExistingRecords(inserts);
}

public List<HoodieRecord> generateDeletesFromExistingRecords(List<HoodieRecord> existingRecords) throws IOException {
public List<HoodieRecord> generateDeletesFromExistingRecords(List<HoodieRecord> existingRecords)
throws IOException {
List<HoodieRecord> deletes = new ArrayList<>();
for (HoodieRecord existingRecord: existingRecords) {
for (HoodieRecord existingRecord : existingRecords) {
HoodieRecord record = generateDeleteRecord(existingRecord);
deletes.add(record);

@@ -131,14 +131,17 @@ public class HoodieTestDataGenerator {

public HoodieRecord generateDeleteRecord(HoodieRecord existingRecord) throws IOException {
HoodieKey key = existingRecord.getKey();
TestRawTripPayload payload = new TestRawTripPayload(Optional.empty(), key.getRecordKey(), key.getPartitionPath(), null, true);
TestRawTripPayload payload = new TestRawTripPayload(Optional.empty(), key.getRecordKey(),
key.getPartitionPath(), null, true);
return new HoodieRecord(key, payload);
}

public List<HoodieRecord> generateUpdates(String commitTime, List<HoodieRecord> baseRecords) throws IOException {
public List<HoodieRecord> generateUpdates(String commitTime, List<HoodieRecord> baseRecords)
throws IOException {
List<HoodieRecord> updates = new ArrayList<>();
for (HoodieRecord baseRecord: baseRecords) {
HoodieRecord record = new HoodieRecord(baseRecord.getKey(), generateRandomValue(baseRecord.getKey(), commitTime));
for (HoodieRecord baseRecord : baseRecords) {
HoodieRecord record = new HoodieRecord(baseRecord.getKey(),
generateRandomValue(baseRecord.getKey(), commitTime));
updates.add(record);
}
return updates;
@@ -162,11 +165,13 @@ public class HoodieTestDataGenerator {
* Generates a new avro record of the above schema format, retaining the key if optionally
* provided.
*/
public static TestRawTripPayload generateRandomValue(HoodieKey key, String commitTime) throws IOException {
public static TestRawTripPayload generateRandomValue(HoodieKey key, String commitTime)
throws IOException {
GenericRecord rec = generateGenericRecord(key.getRecordKey(), "rider-" + commitTime,
"driver-" + commitTime, 0.0);
HoodieAvroUtils.addCommitMetadataToRecord(rec, commitTime, "-1");
return new TestRawTripPayload(rec.toString(), key.getRecordKey(), key.getPartitionPath(), TRIP_EXAMPLE_SCHEMA);
return new TestRawTripPayload(rec.toString(), key.getRecordKey(), key.getPartitionPath(),
TRIP_EXAMPLE_SCHEMA);
}

public static GenericRecord generateGenericRecord(String rowKey, String riderName,
@@ -186,7 +191,8 @@ public class HoodieTestDataGenerator {

public static void createCommitFile(String basePath, String commitTime) throws IOException {
Path commitFile =
new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline.makeCommitFileName(commitTime));
new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline
.makeCommitFileName(commitTime));
FileSystem fs = FSUtils.getFs();
FSDataOutputStream os = fs.create(commitFile, true);
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();

@@ -17,31 +17,32 @@
|
||||
package com.uber.hoodie.common;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import com.uber.hoodie.WriteStatus;
|
||||
import com.uber.hoodie.avro.MercifulJsonConverter;
|
||||
import com.uber.hoodie.common.model.HoodieRecord;
|
||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.StringWriter;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map.Entry;
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.generic.IndexedRecord;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.Optional;
|
||||
import java.util.zip.Deflater;
|
||||
import java.util.zip.DeflaterOutputStream;
|
||||
import java.util.zip.InflaterInputStream;
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.generic.IndexedRecord;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
||||
/**
|
||||
* Example row change event based on some example data used by testcases. The data avro schema is
|
||||
* src/test/resources/schema1.
|
||||
*/
|
||||
public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayload> {
|
||||
|
||||
private transient static final ObjectMapper mapper = new ObjectMapper();
|
||||
private String partitionPath;
|
||||
private String rowKey;
|
||||
@@ -51,7 +52,7 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
|
||||
|
||||
public TestRawTripPayload(Optional<String> jsonData, String rowKey, String partitionPath,
|
||||
String schemaStr, Boolean isDeleted) throws IOException {
|
||||
if(jsonData.isPresent()) {
|
||||
if (jsonData.isPresent()) {
|
||||
this.jsonDataCompressed = compressData(jsonData.get());
|
||||
this.dataSize = jsonData.get().length();
|
||||
}
|
||||
@@ -61,7 +62,7 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
|
||||
}
|
||||
|
||||
public TestRawTripPayload(String jsonData, String rowKey, String partitionPath,
|
||||
String schemaStr)throws IOException {
|
||||
String schemaStr) throws IOException {
|
||||
this(Optional.of(jsonData), rowKey, partitionPath, schemaStr, false);
|
||||
}
|
||||
|
||||
@@ -79,16 +80,20 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
|
||||
}
|
||||
|
||||
|
||||
@Override public TestRawTripPayload preCombine(TestRawTripPayload another) {
|
||||
@Override
|
||||
public TestRawTripPayload preCombine(TestRawTripPayload another) {
|
||||
return another;
|
||||
}
|
||||
|
||||
@Override public Optional<IndexedRecord> combineAndGetUpdateValue(IndexedRecord oldRec, Schema schema) throws IOException {
|
||||
@Override
|
||||
public Optional<IndexedRecord> combineAndGetUpdateValue(IndexedRecord oldRec, Schema schema)
|
||||
throws IOException {
|
||||
return this.getInsertValue(schema);
|
||||
}
|
||||
|
||||
@Override public Optional<IndexedRecord> getInsertValue(Schema schema) throws IOException {
|
||||
if(isDeleted){
|
||||
@Override
|
||||
public Optional<IndexedRecord> getInsertValue(Schema schema) throws IOException {
|
||||
if (isDeleted) {
|
||||
return Optional.empty();
|
||||
} else {
|
||||
MercifulJsonConverter jsonConverter = new MercifulJsonConverter(schema);
|
||||
@@ -135,16 +140,17 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
|
||||
}
|
||||
|
||||
/**
|
||||
* A custom {@link WriteStatus} that merges passed metadata key value map
|
||||
* to {@code WriteStatus.markSuccess()} and {@code WriteStatus.markFailure()}.
|
||||
* A custom {@link WriteStatus} that merges passed metadata key value map to {@code
|
||||
* WriteStatus.markSuccess()} and {@code WriteStatus.markFailure()}.
|
||||
*/
|
||||
public static class MetadataMergeWriteStatus extends WriteStatus {
|
||||
|
||||
private Map<String, String> mergedMetadataMap = new HashMap<>();
|
||||
|
||||
@Override
|
||||
public void markSuccess(HoodieRecord record, Optional<Map<String, String>> recordMetadata) {
|
||||
super.markSuccess(record, recordMetadata);
|
||||
if(recordMetadata.isPresent()) {
|
||||
if (recordMetadata.isPresent()) {
|
||||
mergeMetadataMaps(recordMetadata.get(), mergedMetadataMap);
|
||||
}
|
||||
}
|
||||
@@ -153,25 +159,27 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
|
||||
public void markFailure(HoodieRecord record, Throwable t,
|
||||
Optional<Map<String, String>> recordMetadata) {
|
||||
super.markFailure(record, t, recordMetadata);
|
||||
if(recordMetadata.isPresent()) {
|
||||
if (recordMetadata.isPresent()) {
|
||||
mergeMetadataMaps(recordMetadata.get(), mergedMetadataMap);
|
||||
}
|
||||
}
|
||||
|
||||
public static Map<String, String> mergeMetadataForWriteStatuses(List<WriteStatus> writeStatuses) {
|
||||
public static Map<String, String> mergeMetadataForWriteStatuses(
|
||||
List<WriteStatus> writeStatuses) {
|
||||
Map<String, String> allWriteStatusMergedMetadataMap = new HashMap<>();
|
||||
for (WriteStatus writeStatus : writeStatuses) {
|
||||
MetadataMergeWriteStatus.mergeMetadataMaps(
|
||||
((MetadataMergeWriteStatus)writeStatus).getMergedMetadataMap(),
|
||||
((MetadataMergeWriteStatus) writeStatus).getMergedMetadataMap(),
|
||||
allWriteStatusMergedMetadataMap);
|
||||
}
|
||||
return allWriteStatusMergedMetadataMap;
|
||||
}
|
||||
|
||||
private static void mergeMetadataMaps(Map<String, String> mergeFromMap, Map<String, String> mergeToMap) {
|
||||
private static void mergeMetadataMaps(Map<String, String> mergeFromMap,
|
||||
Map<String, String> mergeToMap) {
|
||||
for (Entry<String, String> entry : mergeFromMap.entrySet()) {
|
||||
String key = entry.getKey();
|
||||
if(!mergeToMap.containsKey(key)) {
|
||||
if (!mergeToMap.containsKey(key)) {
|
||||
mergeToMap.put(key, "0");
|
||||
}
|
||||
mergeToMap
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
|
||||
package com.uber.hoodie.config;
|
||||
|
||||
import static org.junit.Assert.*;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
import com.google.common.collect.Maps;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig.Builder;
|
||||
@@ -29,6 +29,7 @@ import java.util.Properties;
|
||||
import org.junit.Test;
|
||||
|
||||
public class HoodieWriteConfigTest {
|
||||
|
||||
@Test
|
||||
public void testPropertyLoading() throws IOException {
|
||||
Builder builder = HoodieWriteConfig.newBuilder().withPath("/tmp");
|
||||
@@ -46,9 +47,10 @@ public class HoodieWriteConfigTest {
|
||||
HoodieWriteConfig config = builder.build();
|
||||
assertEquals(config.getMaxCommitsToKeep(), 5);
|
||||
assertEquals(config.getMinCommitsToKeep(), 2);
|
||||
}
|
||||
}
|
||||
|
||||
private ByteArrayOutputStream saveParamsIntoOutputStream(Map<String, String> params) throws IOException {
|
||||
private ByteArrayOutputStream saveParamsIntoOutputStream(Map<String, String> params)
|
||||
throws IOException {
|
||||
Properties properties = new Properties();
|
||||
properties.putAll(params);
|
||||
ByteArrayOutputStream outStream = new ByteArrayOutputStream();
|
||||
|
||||
@@ -16,31 +16,30 @@
|
||||
|
||||
package com.uber.hoodie.func;
|
||||
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import static org.junit.Assert.fail;
|
||||
|
||||
import com.uber.hoodie.WriteStatus;
|
||||
import com.uber.hoodie.common.TestRawTripPayload;
|
||||
import com.uber.hoodie.common.model.HoodieKey;
|
||||
import com.uber.hoodie.common.model.HoodieRecord;
|
||||
import com.uber.hoodie.common.model.HoodieRecordLocation;
|
||||
import com.uber.hoodie.common.model.HoodieTestUtils;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.table.HoodieCopyOnWriteTable;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import org.junit.rules.TemporaryFolder;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import static org.junit.Assert.fail;
|
||||
|
||||
public class TestUpdateMapFunction {
|
||||
|
||||
private String basePath = null;
|
||||
|
||||
@Before
|
||||
@@ -90,7 +89,6 @@ public class TestUpdateMapFunction {
|
||||
String fileId = insertResult.next().get(0).getFileId();
|
||||
System.out.println(fileId);
|
||||
|
||||
|
||||
table = new HoodieCopyOnWriteTable(config, metadata);
|
||||
// New content with values for the newly added field
|
||||
recordStr1 =
|
||||
|
||||
@@ -16,17 +16,16 @@
|
||||
|
||||
package com.uber.hoodie.index;
|
||||
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import com.uber.hoodie.config.HoodieIndexConfig;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.index.bloom.HoodieBloomIndex;
|
||||
import com.uber.hoodie.index.hbase.HBaseIndex;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
import static org.junit.Assert.*;
|
||||
|
||||
public class TestHoodieIndex {
|
||||
|
||||
@Test
|
||||
public void testCreateIndex() throws Exception {
|
||||
HoodieWriteConfig.Builder clientConfigBuilder = HoodieWriteConfig.newBuilder();
|
||||
|
||||
@@ -18,28 +18,39 @@
|
||||
|
||||
package com.uber.hoodie.index.bloom;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertNotNull;
|
||||
import static org.junit.Assert.assertNull;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import static org.junit.Assert.fail;
|
||||
|
||||
import com.google.common.base.Optional;
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
import com.uber.hoodie.common.HoodieClientTestUtils;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.config.HoodieIndexConfig;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.avro.HoodieAvroWriteSupport;
|
||||
import com.uber.hoodie.common.BloomFilter;
|
||||
import com.uber.hoodie.common.HoodieClientTestUtils;
|
||||
import com.uber.hoodie.common.TestRawTripPayload;
|
||||
import com.uber.hoodie.common.model.HoodieKey;
|
||||
import com.uber.hoodie.common.model.HoodieRecord;
|
||||
import com.uber.hoodie.common.model.HoodieTestUtils;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import com.uber.hoodie.common.util.HoodieAvroUtils;
|
||||
|
||||
import com.uber.hoodie.index.bloom.BloomIndexFileInfo;
|
||||
import com.uber.hoodie.index.bloom.HoodieBloomIndex;
|
||||
import com.uber.hoodie.index.bloom.HoodieBloomIndexCheckFunction;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.io.storage.HoodieParquetConfig;
|
||||
import com.uber.hoodie.io.storage.HoodieParquetWriter;
|
||||
import com.uber.hoodie.table.HoodieTable;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Arrays;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.UUID;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
@@ -47,11 +58,8 @@ import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.parquet.avro.AvroSchemaConverter;
|
||||
import org.apache.parquet.avro.AvroWriteSupport;
|
||||
import org.apache.parquet.hadoop.ParquetWriter;
|
||||
import org.apache.parquet.hadoop.api.WriteSupport;
|
||||
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
@@ -59,20 +67,10 @@ import org.junit.After;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import org.junit.rules.TemporaryFolder;
|
||||
import org.mockito.Mockito;
|
||||
|
||||
import scala.Tuple2;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import static org.junit.Assert.*;
|
||||
|
||||
public class TestHoodieBloomIndex {
|
||||
|
||||
private JavaSparkContext jsc = null;
|
||||
private String basePath = null;
|
||||
private transient final FileSystem fs;
|
||||
@@ -106,15 +104,20 @@ public class TestHoodieBloomIndex {
|
||||
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
|
||||
|
||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
||||
HoodieRecord record1 = new HoodieRecord(
|
||||
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
||||
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
||||
HoodieRecord record2 = new HoodieRecord(
|
||||
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
||||
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
||||
HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
|
||||
HoodieRecord record3 = new HoodieRecord(
|
||||
new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
|
||||
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
|
||||
HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
|
||||
HoodieRecord record4 = new HoodieRecord(
|
||||
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
|
||||
|
||||
JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4));
|
||||
JavaRDD<HoodieRecord> recordRDD = jsc
|
||||
.parallelize(Arrays.asList(record1, record2, record3, record4));
|
||||
|
||||
// Load to memory
|
||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
|
||||
@@ -144,20 +147,31 @@ public class TestHoodieBloomIndex {
|
||||
new File(basePath + "/2016/04/01").mkdirs();
|
||||
new File(basePath + "/2015/03/12").mkdirs();
|
||||
|
||||
TestRawTripPayload rowChange1 = new TestRawTripPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
|
||||
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
||||
TestRawTripPayload rowChange2 = new TestRawTripPayload("{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
|
||||
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
||||
TestRawTripPayload rowChange3 = new TestRawTripPayload("{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
|
||||
HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
|
||||
TestRawTripPayload rowChange4 = new TestRawTripPayload("{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
|
||||
HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
|
||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(
|
||||
"{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
|
||||
HoodieRecord record1 = new HoodieRecord(
|
||||
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(
|
||||
"{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
|
||||
HoodieRecord record2 = new HoodieRecord(
|
||||
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
||||
TestRawTripPayload rowChange3 = new TestRawTripPayload(
|
||||
"{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
|
||||
HoodieRecord record3 = new HoodieRecord(
|
||||
new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
|
||||
TestRawTripPayload rowChange4 = new TestRawTripPayload(
|
||||
"{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
|
||||
HoodieRecord record4 = new HoodieRecord(
|
||||
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
|
||||
|
||||
|
||||
writeParquetFile("2016/04/01","2_0_20160401010101.parquet", Lists.newArrayList(), schema, null, false);
|
||||
writeParquetFile("2015/03/12","1_0_20150312101010.parquet", Lists.newArrayList(), schema, null, false);
|
||||
writeParquetFile("2015/03/12","3_0_20150312101010.parquet", Arrays.asList(record1), schema, null, false);
|
||||
writeParquetFile("2015/03/12","4_0_20150312101010.parquet", Arrays.asList(record2, record3, record4), schema, null, false);
|
||||
writeParquetFile("2016/04/01", "2_0_20160401010101.parquet", Lists.newArrayList(), schema, null,
|
||||
false);
|
||||
writeParquetFile("2015/03/12", "1_0_20150312101010.parquet", Lists.newArrayList(), schema, null,
|
||||
false);
|
||||
writeParquetFile("2015/03/12", "3_0_20150312101010.parquet", Arrays.asList(record1), schema,
|
||||
null, false);
|
||||
writeParquetFile("2015/03/12", "4_0_20150312101010.parquet",
|
||||
Arrays.asList(record2, record3, record4), schema, null, false);
|
||||
|
||||
List<String> partitions = Arrays.asList("2016/01/21", "2016/04/01", "2015/03/12");
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
@@ -185,9 +199,11 @@ public class TestHoodieBloomIndex {
|
||||
|
||||
List<Tuple2<String, BloomIndexFileInfo>> expected = Arrays.asList(
|
||||
new Tuple2<>("2016/04/01", new BloomIndexFileInfo("2_0_20160401010101.parquet")),
|
||||
new Tuple2<>("2015/03/12",new BloomIndexFileInfo("1_0_20150312101010.parquet")),
|
||||
new Tuple2<>("2015/03/12",new BloomIndexFileInfo("3_0_20150312101010.parquet", "000", "000")),
|
||||
new Tuple2<>("2015/03/12",new BloomIndexFileInfo("4_0_20150312101010.parquet", "001", "003"))
|
||||
new Tuple2<>("2015/03/12", new BloomIndexFileInfo("1_0_20150312101010.parquet")),
|
||||
new Tuple2<>("2015/03/12",
|
||||
new BloomIndexFileInfo("3_0_20150312101010.parquet", "000", "000")),
|
||||
new Tuple2<>("2015/03/12",
|
||||
new BloomIndexFileInfo("4_0_20150312101010.parquet", "001", "003"))
|
||||
);
|
||||
assertEquals(expected, filesList);
|
||||
}
|
||||
@@ -200,7 +216,6 @@ public class TestHoodieBloomIndex {
|
||||
.build();
|
||||
HoodieBloomIndex index = new HoodieBloomIndex(config, jsc);
|
||||
|
||||
|
||||
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo = new HashMap<>();
|
||||
partitionToFileIndexInfo.put("2017/10/22", Arrays.asList(
|
||||
new BloomIndexFileInfo("f1"),
|
||||
@@ -212,14 +227,13 @@ public class TestHoodieBloomIndex {
|
||||
|
||||
JavaPairRDD<String, String> partitionRecordKeyPairRDD = jsc
|
||||
.parallelize(Arrays.asList(
|
||||
new Tuple2<>("2017/10/22","003"),
|
||||
new Tuple2<>("2017/10/22","002"),
|
||||
new Tuple2<>("2017/10/22","005"),
|
||||
new Tuple2<>("2017/10/22","004")
|
||||
new Tuple2<>("2017/10/22", "003"),
|
||||
new Tuple2<>("2017/10/22", "002"),
|
||||
new Tuple2<>("2017/10/22", "005"),
|
||||
new Tuple2<>("2017/10/22", "004")
|
||||
))
|
||||
.mapToPair(t -> t);
|
||||
|
||||
|
||||
List<Tuple2<String, Tuple2<String, HoodieKey>>> comparisonKeyList = index
|
||||
.explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD)
|
||||
.collect();
|
||||
@@ -240,7 +254,8 @@ public class TestHoodieBloomIndex {
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCheckUUIDsAgainstOneFile() throws IOException, InterruptedException, ClassNotFoundException {
|
||||
public void testCheckUUIDsAgainstOneFile()
|
||||
throws IOException, InterruptedException, ClassNotFoundException {
|
||||
|
||||
// Create some records to use
|
||||
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
||||
@@ -248,19 +263,23 @@ public class TestHoodieBloomIndex {
|
||||
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
||||
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":32}";
|
||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
||||
HoodieRecord record1 = new HoodieRecord(
|
||||
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
||||
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
||||
HoodieRecord record2 = new HoodieRecord(
|
||||
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
||||
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
||||
HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
|
||||
HoodieRecord record3 = new HoodieRecord(
|
||||
new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
|
||||
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
|
||||
HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
|
||||
|
||||
HoodieRecord record4 = new HoodieRecord(
|
||||
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
|
||||
|
||||
// We write record1, record2 to a parquet file, but the bloom filter contains (record1, record2, record3).
|
||||
BloomFilter filter = new BloomFilter(10000, 0.0000001);
|
||||
filter.add(record3.getRecordKey());
|
||||
String filename = writeParquetFile("2016/01/31", Arrays.asList(record1, record2), schema, filter, true);
|
||||
String filename = writeParquetFile("2016/01/31", Arrays.asList(record1, record2), schema,
|
||||
filter, true);
|
||||
|
||||
// The bloom filter contains 3 records
|
||||
assertTrue(filter.mightContain(record1.getRecordKey()));
|
||||
@@ -299,7 +318,8 @@ public class TestHoodieBloomIndex {
|
||||
try {
|
||||
bloomIndex.tagLocation(recordRDD, table);
|
||||
} catch (IllegalArgumentException e) {
|
||||
fail("EmptyRDD should not result in IllegalArgumentException: Positive number of slices required");
|
||||
fail(
|
||||
"EmptyRDD should not result in IllegalArgumentException: Positive number of slices required");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -313,14 +333,19 @@ public class TestHoodieBloomIndex {
|
||||
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
||||
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
|
||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
||||
HoodieRecord record1 = new HoodieRecord(
|
||||
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
||||
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
||||
HoodieRecord record2 = new HoodieRecord(
|
||||
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
||||
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
||||
HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
|
||||
HoodieRecord record3 = new HoodieRecord(
|
||||
new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
|
||||
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
|
||||
HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
|
||||
JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4));
|
||||
HoodieRecord record4 = new HoodieRecord(
|
||||
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
|
||||
JavaRDD<HoodieRecord> recordRDD = jsc
|
||||
.parallelize(Arrays.asList(record1, record2, record3, record4));
|
||||
|
||||
// Also create the metadata and config
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
@@ -390,7 +415,8 @@ public class TestHoodieBloomIndex {
|
||||
|
||||
// Let's tag
|
||||
HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc);
|
||||
JavaPairRDD<HoodieKey, Optional<String>> taggedRecordRDD = bloomIndex.fetchRecordLocation(keysRDD, table);
|
||||
JavaPairRDD<HoodieKey, Optional<String>> taggedRecordRDD = bloomIndex
|
||||
.fetchRecordLocation(keysRDD, table);
|
||||
|
||||
// Should not find any files
|
||||
for (Tuple2<HoodieKey, Optional<String>> record : taggedRecordRDD.collect()) {
|
||||
@@ -436,9 +462,11 @@ public class TestHoodieBloomIndex {
|
||||
|
||||
// We write record1 to a parquet file, using a bloom filter having both records
|
||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
||||
HoodieRecord record1 = new HoodieRecord(
|
||||
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
||||
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
||||
HoodieRecord record2 = new HoodieRecord(
|
||||
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
||||
|
||||
BloomFilter filter = new BloomFilter(10000, 0.0000001);
|
||||
filter.add(record2.getRecordKey());
|
||||
@@ -472,21 +500,22 @@ public class TestHoodieBloomIndex {
|
||||
String fileId = UUID.randomUUID().toString();
|
||||
String filename = FSUtils.makeDataFileName(commitTime, 1, fileId);
|
||||
|
||||
|
||||
return writeParquetFile(partitionPath, filename, records, schema, filter, createCommitTime);
|
||||
}
|
||||
|
||||
private String writeParquetFile(String partitionPath, String filename, List<HoodieRecord> records, Schema schema,
|
||||
private String writeParquetFile(String partitionPath, String filename, List<HoodieRecord> records,
|
||||
Schema schema,
|
||||
BloomFilter filter, boolean createCommitTime) throws IOException {
|
||||
|
||||
|
||||
if (filter == null) {
|
||||
filter = new BloomFilter(10000, 0.0000001);
|
||||
}
|
||||
HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter);
|
||||
HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(
|
||||
new AvroSchemaConverter().convert(schema), schema, filter);
|
||||
String commitTime = FSUtils.getCommitTime(filename);
|
||||
HoodieParquetConfig config = new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP,
|
||||
ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024, new Configuration());
|
||||
ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024,
|
||||
new Configuration());
|
||||
HoodieParquetWriter writer = new HoodieParquetWriter(
|
||||
commitTime,
|
||||
new Path(basePath + "/" + partitionPath + "/" + filename),
|
||||
@@ -496,7 +525,9 @@ public class TestHoodieBloomIndex {
|
||||
for (HoodieRecord record : records) {
|
||||
GenericRecord avroRecord = (GenericRecord) record.getData().getInsertValue(schema).get();
|
||||
HoodieAvroUtils.addCommitMetadataToRecord(avroRecord, commitTime, "" + seqId++);
|
||||
HoodieAvroUtils.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(), filename);
|
||||
HoodieAvroUtils
|
||||
.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(),
|
||||
filename);
|
||||
writer.writeAvro(record.getRecordKey(), avroRecord);
|
||||
filter.add(record.getRecordKey());
|
||||
}
|
||||
@@ -505,7 +536,9 @@ public class TestHoodieBloomIndex {
|
||||
if (createCommitTime) {
|
||||
// Also make sure the commit is valid
|
||||
new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME).mkdirs();
|
||||
new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitTime + ".commit").createNewFile();
|
||||
new File(
|
||||
basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitTime + ".commit")
|
||||
.createNewFile();
|
||||
}
|
||||
return filename;
|
||||
}
|
||||
|
||||
@@ -16,9 +16,11 @@
|
||||
|
||||
package com.uber.hoodie.io;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import com.uber.hoodie.avro.model.HoodieArchivedMetaEntry;
|
||||
import com.uber.hoodie.common.HoodieTestDataGenerator;
|
||||
import com.uber.hoodie.common.model.HoodieArchivedLogFile;
|
||||
import com.uber.hoodie.common.model.HoodieLogFile;
|
||||
import com.uber.hoodie.common.model.HoodieTestUtils;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
@@ -29,6 +31,11 @@ import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import com.uber.hoodie.config.HoodieCompactionConfig;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.avro.generic.IndexedRecord;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
@@ -37,16 +44,8 @@ import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import org.junit.rules.TemporaryFolder;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
public class TestHoodieCommitArchiveLog {
|
||||
|
||||
private String basePath;
|
||||
private FileSystem fs;
|
||||
|
||||
@@ -97,7 +96,8 @@ public class TestHoodieCommitArchiveLog {
|
||||
HoodieTestUtils.createCleanFiles(basePath, "105");
|
||||
|
||||
//reload the timeline and get all the commits before archive
timeline = metadata.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants();
|
||||
timeline = metadata.getActiveTimeline().reload().getAllCommitsTimeline()
|
||||
.filterCompletedInstants();
|
||||
List<HoodieInstant> originalCommits = timeline.getInstants().collect(Collectors.toList());
|
||||
|
||||
assertEquals("Loaded 6 commits and the count should match", 12, timeline.countInstants());
|
||||
@@ -107,27 +107,30 @@ public class TestHoodieCommitArchiveLog {
|
||||
assertTrue(archiveLog.archiveIfRequired());
|
||||
|
||||
//reload the timeline and remove the remaining commits
|
||||
timeline = metadata.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants();
|
||||
timeline = metadata.getActiveTimeline().reload().getAllCommitsTimeline()
|
||||
.filterCompletedInstants();
|
||||
originalCommits.removeAll(timeline.getInstants().collect(Collectors.toList()));
|
||||
|
||||
//read the file
|
||||
HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(),
|
||||
new HoodieLogFile(new Path(basePath + "/.hoodie/.commits_.archive.1")), HoodieArchivedMetaEntry.getClassSchema(), false);
|
||||
new HoodieLogFile(new Path(basePath + "/.hoodie/.commits_.archive.1")),
|
||||
HoodieArchivedMetaEntry.getClassSchema(), false);
|
||||
|
||||
int archivedRecordsCount = 0;
|
||||
List<IndexedRecord> readRecords = new ArrayList<>();
|
||||
//read the avro blocks and validate the number of records written in each avro block
|
||||
while(reader.hasNext()) {
|
||||
while (reader.hasNext()) {
|
||||
HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next();
|
||||
List<IndexedRecord> records = blk.getRecords();
|
||||
readRecords.addAll(records);
|
||||
assertEquals("Archived and read records for each block are same", 8, records.size());
|
||||
archivedRecordsCount += records.size();
|
||||
}
|
||||
assertEquals("Total archived records and total read records are the same count", 8, archivedRecordsCount);
|
||||
assertEquals("Total archived records and total read records are the same count", 8,
|
||||
archivedRecordsCount);
|
||||
|
||||
//make sure the archived commits are the same as the (originalcommits - commitsleft)
|
||||
List<String> readCommits = readRecords.stream().map(r -> (GenericRecord)r).map(r -> {
|
||||
List<String> readCommits = readRecords.stream().map(r -> (GenericRecord) r).map(r -> {
|
||||
return r.get("commitTime").toString();
|
||||
}).collect(Collectors.toList());
|
||||
Collections.sort(readCommits);
|
||||
@@ -158,7 +161,8 @@ public class TestHoodieCommitArchiveLog {
|
||||
boolean result = archiveLog.archiveIfRequired();
|
||||
assertTrue(result);
|
||||
timeline =
|
||||
metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline().filterCompletedInstants();
|
||||
metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline()
|
||||
.filterCompletedInstants();
|
||||
assertEquals("Should not archive commits when maxCommitsToKeep is 5", 4,
|
||||
timeline.countInstants());
|
||||
}
|
||||
@@ -184,7 +188,8 @@ public class TestHoodieCommitArchiveLog {
|
||||
boolean result = archiveLog.archiveIfRequired();
|
||||
assertTrue(result);
|
||||
timeline =
|
||||
metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline().filterCompletedInstants();
|
||||
metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline()
|
||||
.filterCompletedInstants();
|
||||
assertTrue("Archived commits should always be safe",
|
||||
timeline.containsOrBeforeTimelineStarts("100"));
|
||||
assertTrue("Archived commits should always be safe",
|
||||
@@ -217,7 +222,8 @@ public class TestHoodieCommitArchiveLog {
|
||||
boolean result = archiveLog.archiveIfRequired();
|
||||
assertTrue(result);
|
||||
timeline =
|
||||
metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline().filterCompletedInstants();
|
||||
metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline()
|
||||
.filterCompletedInstants();
|
||||
assertEquals(
|
||||
"Since we have a savepoint at 101, we should never archive any commit after 101 (we only archive 100)",
|
||||
5, timeline.countInstants());
|
||||
|
||||
@@ -16,7 +16,9 @@
|
||||
|
||||
package com.uber.hoodie.io;
|
||||
|
||||
import com.uber.hoodie.HoodieReadClient;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import com.uber.hoodie.HoodieWriteClient;
|
||||
import com.uber.hoodie.WriteStatus;
|
||||
import com.uber.hoodie.common.HoodieClientTestUtils;
|
||||
@@ -34,13 +36,16 @@ import com.uber.hoodie.config.HoodieCompactionConfig;
|
||||
import com.uber.hoodie.config.HoodieIndexConfig;
|
||||
import com.uber.hoodie.config.HoodieStorageConfig;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.index.bloom.HoodieBloomIndex;
|
||||
import com.uber.hoodie.index.HoodieIndex;
|
||||
import com.uber.hoodie.index.bloom.HoodieBloomIndex;
|
||||
import com.uber.hoodie.io.compact.HoodieCompactor;
|
||||
import com.uber.hoodie.io.compact.HoodieRealtimeTableCompactor;
|
||||
import com.uber.hoodie.table.HoodieTable;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.junit.After;
|
||||
@@ -48,15 +53,8 @@ import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import org.junit.rules.TemporaryFolder;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
public class TestHoodieCompactor {
|
||||
|
||||
private transient JavaSparkContext jsc = null;
|
||||
private String basePath = null;
|
||||
private HoodieCompactor compactor;
|
||||
@@ -194,7 +192,7 @@ public class TestHoodieCompactor {
|
||||
List<FileSlice> groupedLogFiles = table.getRTFileSystemView()
|
||||
.getLatestFileSlices(partitionPath)
|
||||
.collect(Collectors.toList());
|
||||
for (FileSlice slice: groupedLogFiles) {
|
||||
for (FileSlice slice : groupedLogFiles) {
|
||||
assertTrue(
|
||||
"After compaction there should be no log files visiable on a Realtime view",
|
||||
slice.getLogFiles().collect(Collectors.toList()).isEmpty());
|
||||
|
||||
@@ -17,12 +17,10 @@
|
||||
package com.uber.hoodie.io.strategy;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertThat;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import com.beust.jcommander.internal.Lists;
|
||||
import com.google.common.collect.Maps;
|
||||
|
||||
import com.uber.hoodie.config.HoodieCompactionConfig;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.io.compact.CompactionOperation;
|
||||
|
||||
@@ -17,9 +17,7 @@
|
||||
package com.uber.hoodie.io.strategy;
|
||||
|
||||
import com.uber.hoodie.common.model.HoodieDataFile;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import java.util.UUID;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
|
||||
public class TestHoodieDataFile extends HoodieDataFile {
|
||||
|
||||
|
||||
@@ -18,7 +18,6 @@ package com.uber.hoodie.io.strategy;
|
||||
|
||||
import com.uber.hoodie.common.model.HoodieLogFile;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.hadoop.fs.Path;
|
||||
|
||||
public class TestHoodieLogFile extends HoodieLogFile {
|
||||
|
||||
@@ -16,17 +16,17 @@
|
||||
|
||||
package com.uber.hoodie.metrics;
|
||||
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
|
||||
import org.apache.commons.configuration.ConfigurationException;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import static org.mockito.Mockito.mock;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import org.apache.commons.configuration.ConfigurationException;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
public class TestHoodieMetrics {
|
||||
|
||||
private HoodieMetrics metrics = null;
|
||||
|
||||
@Before
|
||||
@@ -40,6 +40,7 @@ public class TestHoodieMetrics {
|
||||
@Test
|
||||
public void testRegisterGauge() {
|
||||
metrics.registerGauge("metric1", 123L);
|
||||
assertTrue(Metrics.getInstance().getRegistry().getGauges().get("metric1").getValue().toString().equals("123"));
|
||||
assertTrue(Metrics.getInstance().getRegistry().getGauges().get("metric1").getValue().toString()
|
||||
.equals("123"));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,26 +16,37 @@
|
||||
|
||||
package com.uber.hoodie.table;
|
||||
|
||||
import com.uber.hoodie.common.TestRawTripPayload.MetadataMergeWriteStatus;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import static org.mockito.Mockito.mock;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
import com.uber.hoodie.WriteStatus;
|
||||
import com.uber.hoodie.common.BloomFilter;
|
||||
import com.uber.hoodie.common.HoodieClientTestUtils;
|
||||
import com.uber.hoodie.common.HoodieTestDataGenerator;
|
||||
import com.uber.hoodie.common.TestRawTripPayload;
|
||||
import com.uber.hoodie.common.TestRawTripPayload.MetadataMergeWriteStatus;
|
||||
import com.uber.hoodie.common.model.HoodieKey;
|
||||
import com.uber.hoodie.common.model.HoodieRecord;
|
||||
import com.uber.hoodie.common.model.HoodieRecordLocation;
|
||||
import com.uber.hoodie.common.model.HoodieTestUtils;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import com.uber.hoodie.common.util.ParquetUtils;
|
||||
|
||||
import com.uber.hoodie.config.HoodieCompactionConfig;
|
||||
import com.uber.hoodie.io.HoodieCreateHandle;
|
||||
import com.uber.hoodie.config.HoodieStorageConfig;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.io.HoodieCreateHandle;
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.UUID;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
@@ -47,22 +58,11 @@ import org.junit.After;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import org.junit.rules.TemporaryFolder;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
|
||||
import scala.Option;
|
||||
import scala.Tuple2;
|
||||
|
||||
import static org.junit.Assert.*;
|
||||
import static org.mockito.Mockito.mock;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
public class TestCopyOnWriteTable {
|
||||
|
||||
private String basePath = null;
|
||||
private transient JavaSparkContext jsc = null;
|
||||
|
||||
@@ -104,7 +104,8 @@ public class TestCopyOnWriteTable {
|
||||
|
||||
private HoodieWriteConfig.Builder makeHoodieClientConfigBuilder() throws Exception {
|
||||
// Prepare the AvroParquetIO
|
||||
String schemaStr = IOUtils.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8");
|
||||
String schemaStr = IOUtils
|
||||
.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8");
|
||||
return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(schemaStr);
|
||||
}
|
||||
|
||||
@@ -127,11 +128,17 @@ public class TestCopyOnWriteTable {
|
||||
|
||||
List<HoodieRecord> records = new ArrayList<>();
|
||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||
records.add(new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1));
|
||||
records.add(
|
||||
new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
|
||||
rowChange1));
|
||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
||||
records.add(new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2));
|
||||
records.add(
|
||||
new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
|
||||
rowChange2));
|
||||
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
||||
records.add(new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));
|
||||
records.add(
|
||||
new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
|
||||
rowChange3));
|
||||
|
||||
// Insert new records
|
||||
HoodieClientTestUtils.collectStatuses(table.handleInsert(firstCommitTime, records.iterator()));
|
||||
@@ -159,7 +166,7 @@ public class TestCopyOnWriteTable {
|
||||
List<GenericRecord> fileRecords = ParquetUtils.readAvroRecords(parquetFilePath);
|
||||
GenericRecord newRecord;
|
||||
int index = 0;
|
||||
for (GenericRecord record: fileRecords) {
|
||||
for (GenericRecord record : fileRecords) {
|
||||
assertTrue(record.get("_row_key").toString().equals(records.get(index).getRecordKey()));
|
||||
index++;
|
||||
}
|
||||
@@ -167,11 +174,15 @@ public class TestCopyOnWriteTable {
|
||||
// We update the 1st record & add a new record
|
||||
String updateRecordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
||||
TestRawTripPayload updateRowChanges1 = new TestRawTripPayload(updateRecordStr1);
|
||||
HoodieRecord updatedRecord1 = new HoodieRecord(new HoodieKey(updateRowChanges1.getRowKey(), updateRowChanges1.getPartitionPath()), updateRowChanges1);
|
||||
updatedRecord1.setCurrentLocation(new HoodieRecordLocation(null, FSUtils.getFileId(parquetFile.getName())));
|
||||
HoodieRecord updatedRecord1 = new HoodieRecord(
|
||||
new HoodieKey(updateRowChanges1.getRowKey(), updateRowChanges1.getPartitionPath()),
|
||||
updateRowChanges1);
|
||||
updatedRecord1.setCurrentLocation(
|
||||
new HoodieRecordLocation(null, FSUtils.getFileId(parquetFile.getName())));
|
||||
|
||||
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
|
||||
HoodieRecord insertedRecord1 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
|
||||
HoodieRecord insertedRecord1 = new HoodieRecord(
|
||||
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
|
||||
|
||||
List<HoodieRecord> updatedRecords = Arrays.asList(updatedRecord1, insertedRecord1);
|
||||
|
||||
@@ -179,7 +190,9 @@ public class TestCopyOnWriteTable {
|
||||
String newCommitTime = HoodieTestUtils.makeNewCommitTime();
|
||||
metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
|
||||
table = new HoodieCopyOnWriteTable(config, metadata);
|
||||
Iterator<List<WriteStatus>> iter = table.handleUpdate(newCommitTime, updatedRecord1.getCurrentLocation().getFileId(), updatedRecords.iterator());
|
||||
Iterator<List<WriteStatus>> iter = table
|
||||
.handleUpdate(newCommitTime, updatedRecord1.getCurrentLocation().getFileId(),
|
||||
updatedRecords.iterator());
|
||||
|
||||
// Check the updated file
|
||||
File updatedParquetFile = null;
|
||||
@@ -197,7 +210,8 @@ public class TestCopyOnWriteTable {
|
||||
assertTrue(updatedParquetFile != null);
|
||||
// Check whether the record has been updated
|
||||
Path updatedParquetFilePath = new Path(updatedParquetFile.getAbsolutePath());
|
||||
BloomFilter updatedFilter = ParquetUtils.readBloomFilterFromParquetMetadata(updatedParquetFilePath);
|
||||
BloomFilter updatedFilter = ParquetUtils
|
||||
.readBloomFilterFromParquetMetadata(updatedParquetFilePath);
|
||||
for (HoodieRecord record : records) {
|
||||
// No change to the _row_key
|
||||
assertTrue(updatedFilter.mightContain(record.getRecordKey()));
|
||||
@@ -206,7 +220,8 @@ public class TestCopyOnWriteTable {
|
||||
assertTrue(updatedFilter.mightContain(insertedRecord1.getRecordKey()));
|
||||
records.add(insertedRecord1);// add this so it can further check below

ParquetReader updatedReader = ParquetReader.builder(new AvroReadSupport<>(), updatedParquetFilePath).build();
ParquetReader updatedReader = ParquetReader
.builder(new AvroReadSupport<>(), updatedParquetFilePath).build();
index = 0;
while ((newRecord = (GenericRecord) updatedReader.read()) != null) {
assertTrue(newRecord.get("_row_key").toString().equals(records.get(index).getRecordKey()));
@@ -243,7 +258,8 @@ public class TestCopyOnWriteTable {
@Test
public void testMetadataAggregateFromWriteStatus() throws Exception {
// Prepare the AvroParquetIO
HoodieWriteConfig config = makeHoodieClientConfigBuilder().withWriteStatusClass(MetadataMergeWriteStatus.class).build();
HoodieWriteConfig config = makeHoodieClientConfigBuilder()
.withWriteStatusClass(MetadataMergeWriteStatus.class).build();
String firstCommitTime = HoodieTestUtils.makeNewCommitTime();
HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);

@@ -256,11 +272,17 @@ public class TestCopyOnWriteTable {

List<HoodieRecord> records = new ArrayList<>();
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
records.add(new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1));
records.add(
new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
rowChange1));
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
records.add(new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2));
records.add(
new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
rowChange2));
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
records.add(new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));
records.add(
new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
rowChange3));

// Insert new records
List<WriteStatus> writeStatuses = HoodieClientTestUtils
@@ -286,7 +308,8 @@ public class TestCopyOnWriteTable {
records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z"));

// Simulate crash after first file
List<WriteStatus> statuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator()));
List<WriteStatus> statuses = HoodieClientTestUtils
.collectStatuses(table.handleInsert(commitTime, records.iterator()));
WriteStatus status = statuses.get(0);
Path partialFile = new Path(String.format("%s/%s/%s",
basePath,
@@ -299,7 +322,8 @@ public class TestCopyOnWriteTable {
records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z");
records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z"));

statuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator()));
statuses = HoodieClientTestUtils
.collectStatuses(table.handleInsert(commitTime, records.iterator()));
status = statuses.get(0);

Path retriedFIle = new Path(String.format("%s/%s/%s",
@@ -312,7 +336,8 @@ public class TestCopyOnWriteTable {
}


@Test public void testInsertRecords() throws Exception {
@Test
public void testInsertRecords() throws Exception {
HoodieWriteConfig config = makeHoodieClientConfig();
String commitTime = HoodieTestUtils.makeNewCommitTime();
HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
@@ -324,8 +349,8 @@ public class TestCopyOnWriteTable {
records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z"));

// Insert new records
List<WriteStatus> returnedStatuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator()));

List<WriteStatus> returnedStatuses = HoodieClientTestUtils
.collectStatuses(table.handleInsert(commitTime, records.iterator()));

// TODO: check the actual files and make sure 11 records, total were written.
assertEquals(2, returnedStatuses.size());
@@ -343,7 +368,8 @@ public class TestCopyOnWriteTable {
records.addAll(newHoodieRecords(1, "2016-02-02T03:16:41.415Z"));

// Insert new records
returnedStatuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator()));
returnedStatuses = HoodieClientTestUtils
.collectStatuses(table.handleInsert(commitTime, records.iterator()));

assertEquals(3, returnedStatuses.size());
assertEquals("2016/01/31", returnedStatuses.get(0).getPartitionPath());
@@ -357,7 +383,8 @@ public class TestCopyOnWriteTable {

}

@Test public void testFileSizeUpsertRecords() throws Exception {
@Test
public void testFileSizeUpsertRecords() throws Exception {
HoodieWriteConfig config = makeHoodieClientConfigBuilder().withStorageConfig(
HoodieStorageConfig.newBuilder().limitFileSize(64 * 1024).parquetBlockSize(64 * 1024)
.parquetPageSize(64 * 1024).build()).build();
@@ -368,9 +395,11 @@ public class TestCopyOnWriteTable {
List<HoodieRecord> records = new ArrayList<>();
// Approx 1150 records are written for block size of 64KB
for (int i = 0; i < 2000; i++) {
String recordStr = "{\"_row_key\":\"" + UUID.randomUUID().toString() + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":" + i + "}";
String recordStr = "{\"_row_key\":\"" + UUID.randomUUID().toString()
+ "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":" + i + "}";
TestRawTripPayload rowChange = new TestRawTripPayload(recordStr);
records.add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()),
records
.add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()),
rowChange));
}

@@ -380,7 +409,8 @@ public class TestCopyOnWriteTable {
// Check the updated file
int counts = 0;
for (File file : new File(basePath + "/2016/01/31").listFiles()) {
if (file.getName().endsWith(".parquet") && FSUtils.getCommitTime(file.getName()).equals(commitTime)) {
if (file.getName().endsWith(".parquet") && FSUtils.getCommitTime(file.getName())
.equals(commitTime)) {
System.out.println(file.getName() + "-" + file.length());
counts++;
}
@@ -391,7 +421,6 @@ public class TestCopyOnWriteTable {
}



private List<HoodieCopyOnWriteTable.InsertBucket> testUpsertPartitioner(int smallFileSize,
int numInserts,
int numUpdates,
@@ -400,8 +429,10 @@ public class TestCopyOnWriteTable {
final String TEST_PARTITION_PATH = "2016/09/26";
HoodieWriteConfig config = makeHoodieClientConfigBuilder()
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
.compactionSmallFileSize(smallFileSize).insertSplitSize(100).autoTuneInsertSplits(autoSplitInserts).build())
.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1000 * 1024).build()).build();
.compactionSmallFileSize(smallFileSize).insertSplitSize(100)
.autoTuneInsertSplits(autoSplitInserts).build())
.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1000 * 1024).build())
.build();

HoodieClientTestUtils.fakeCommitFile(basePath, "001");
HoodieClientTestUtils.fakeDataFile(basePath, TEST_PARTITION_PATH, "001", "file1", fileSize);
@@ -409,10 +440,11 @@ public class TestCopyOnWriteTable {
HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);

HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[]{TEST_PARTITION_PATH});
HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(
new String[]{TEST_PARTITION_PATH});
List<HoodieRecord> insertRecords = dataGenerator.generateInserts("001", numInserts);
List<HoodieRecord> updateRecords = dataGenerator.generateUpdates("001", numUpdates);
for (HoodieRecord updateRec: updateRecords) {
for (HoodieRecord updateRec : updateRecords) {
updateRec.setCurrentLocation(new HoodieRecordLocation("001", "file1"));
}
List<HoodieRecord> records = new ArrayList<>();
@@ -430,7 +462,8 @@ public class TestCopyOnWriteTable {
assertEquals("Bucket 2 is INSERT", HoodieCopyOnWriteTable.BucketType.INSERT,
partitioner.getBucketInfo(2).bucketType);
assertEquals("Update record should have gone to the 1 update partiton", 0,
partitioner.getPartition(new Tuple2<>(updateRecords.get(0).getKey(), Option.apply(updateRecords.get(0).getCurrentLocation()))));
partitioner.getPartition(new Tuple2<>(updateRecords.get(0).getKey(),
Option.apply(updateRecords.get(0).getCurrentLocation()))));
return partitioner.getInsertBuckets(TEST_PARTITION_PATH);
}

@@ -438,7 +471,8 @@ public class TestCopyOnWriteTable {
@Test
public void testUpsertPartitioner() throws Exception {
// Inserts + Updates... Check all updates go together & inserts subsplit
List<HoodieCopyOnWriteTable.InsertBucket> insertBuckets = testUpsertPartitioner(0, 200, 100, 1024, false);
List<HoodieCopyOnWriteTable.InsertBucket> insertBuckets = testUpsertPartitioner(0, 200, 100,
1024, false);
assertEquals("Total of 2 insert buckets", 2, insertBuckets.size());
}

@@ -446,16 +480,21 @@ public class TestCopyOnWriteTable {
@Test
public void testUpsertPartitionerWithSmallInsertHandling() throws Exception {
// Inserts + Updates .. Check updates go together & inserts subsplit, after expanding smallest file
List<HoodieCopyOnWriteTable.InsertBucket> insertBuckets = testUpsertPartitioner(1000 * 1024, 400, 100, 800 * 1024, false);
List<HoodieCopyOnWriteTable.InsertBucket> insertBuckets = testUpsertPartitioner(1000 * 1024,
400, 100, 800 * 1024, false);
assertEquals("Total of 3 insert buckets", 3, insertBuckets.size());
assertEquals("First insert bucket must be same as update bucket", 0, insertBuckets.get(0).bucketNumber);
assertEquals("First insert bucket should have weight 0.5", 0.5, insertBuckets.get(0).weight, 0.01);
assertEquals("First insert bucket must be same as update bucket", 0,
insertBuckets.get(0).bucketNumber);
assertEquals("First insert bucket should have weight 0.5", 0.5, insertBuckets.get(0).weight,
0.01);

// Now with insert split size auto tuned
insertBuckets = testUpsertPartitioner(1000 * 1024, 2400, 100, 800 * 1024, true);
assertEquals("Total of 3 insert buckets", 3, insertBuckets.size());
assertEquals("First insert bucket must be same as update bucket", 0, insertBuckets.get(0).bucketNumber);
assertEquals("First insert bucket should have weight 0.5", 200.0/2400, insertBuckets.get(0).weight, 0.01);
assertEquals("First insert bucket must be same as update bucket", 0,
insertBuckets.get(0).bucketNumber);
assertEquals("First insert bucket should have weight 0.5", 200.0 / 2400,
insertBuckets.get(0).weight, 0.01);
}

@After
Some files were not shown because too many files have changed in this diff