Reformatting code per Google Code Style all over
This commit is contained in:
committed by
vinoth chandar
parent
5a62480a92
commit
e45679f5e2
@@ -15,7 +15,9 @@
|
|||||||
~ limitations under the License.
|
~ limitations under the License.
|
||||||
-->
|
-->
|
||||||
|
|
||||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>hoodie</artifactId>
|
<artifactId>hoodie</artifactId>
|
||||||
<groupId>com.uber.hoodie</groupId>
|
<groupId>com.uber.hoodie</groupId>
|
||||||
|
|||||||
@@ -17,12 +17,12 @@
|
|||||||
package com.uber.hoodie.cli;
|
package com.uber.hoodie.cli;
|
||||||
|
|
||||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||||
|
import java.io.IOException;
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
public class HoodieCLI {
|
public class HoodieCLI {
|
||||||
|
|
||||||
public static Configuration conf;
|
public static Configuration conf;
|
||||||
public static FileSystem fs;
|
public static FileSystem fs;
|
||||||
public static CLIState state = CLIState.INIT;
|
public static CLIState state = CLIState.INIT;
|
||||||
@@ -43,7 +43,7 @@ public class HoodieCLI {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public static void initFS(boolean force) throws IOException {
|
public static void initFS(boolean force) throws IOException {
|
||||||
if(fs == null || force) {
|
if (fs == null || force) {
|
||||||
fs = FileSystem.get(conf);
|
fs = FileSystem.get(conf);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,7 +17,6 @@
|
|||||||
package com.uber.hoodie.cli;
|
package com.uber.hoodie.cli;
|
||||||
|
|
||||||
import dnl.utils.text.table.TextTable;
|
import dnl.utils.text.table.TextTable;
|
||||||
|
|
||||||
import java.io.ByteArrayOutputStream;
|
import java.io.ByteArrayOutputStream;
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
import java.nio.charset.Charset;
|
import java.nio.charset.Charset;
|
||||||
|
|||||||
@@ -16,7 +16,6 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.cli;
|
package com.uber.hoodie.cli;
|
||||||
|
|
||||||
import com.uber.hoodie.common.table.HoodieTableConfig;
|
|
||||||
import org.springframework.core.Ordered;
|
import org.springframework.core.Ordered;
|
||||||
import org.springframework.core.annotation.Order;
|
import org.springframework.core.annotation.Order;
|
||||||
import org.springframework.shell.plugin.support.DefaultPromptProvider;
|
import org.springframework.shell.plugin.support.DefaultPromptProvider;
|
||||||
|
|||||||
@@ -22,9 +22,13 @@ import org.springframework.shell.plugin.support.DefaultBannerProvider;
|
|||||||
import org.springframework.shell.support.util.OsUtils;
|
import org.springframework.shell.support.util.OsUtils;
|
||||||
import org.springframework.stereotype.Component;
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
@Component @Order(Ordered.HIGHEST_PRECEDENCE) public class HoodieSplashScreen
|
@Component
|
||||||
|
@Order(Ordered.HIGHEST_PRECEDENCE)
|
||||||
|
public class HoodieSplashScreen
|
||||||
extends DefaultBannerProvider {
|
extends DefaultBannerProvider {
|
||||||
private static String screen = "============================================" + OsUtils.LINE_SEPARATOR +
|
|
||||||
|
private static String screen =
|
||||||
|
"============================================" + OsUtils.LINE_SEPARATOR +
|
||||||
"* *" + OsUtils.LINE_SEPARATOR +
|
"* *" + OsUtils.LINE_SEPARATOR +
|
||||||
"* _ _ _ _ *" + OsUtils.LINE_SEPARATOR +
|
"* _ _ _ _ *" + OsUtils.LINE_SEPARATOR +
|
||||||
"* | | | | | (_) *" + OsUtils.LINE_SEPARATOR +
|
"* | | | | | (_) *" + OsUtils.LINE_SEPARATOR +
|
||||||
@@ -49,7 +53,8 @@ import org.springframework.stereotype.Component;
|
|||||||
return "Welcome to Hoodie CLI. Please type help if you are looking for help. ";
|
return "Welcome to Hoodie CLI. Please type help if you are looking for help. ";
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public String getProviderName() {
|
@Override
|
||||||
|
public String getProviderName() {
|
||||||
return "Hoodie Banner";
|
return "Hoodie Banner";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -16,16 +16,14 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.cli;
|
package com.uber.hoodie.cli;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
import org.springframework.shell.Bootstrap;
|
import org.springframework.shell.Bootstrap;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
public class Main {
|
public class Main {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Main class that delegates to Spring Shell's Bootstrap class in order to simplify debugging inside an IDE
|
* Main class that delegates to Spring Shell's Bootstrap class in order to simplify debugging
|
||||||
*
|
* inside an IDE
|
||||||
* @param args
|
|
||||||
* @throws IOException
|
|
||||||
*/
|
*/
|
||||||
public static void main(String[] args) throws IOException {
|
public static void main(String[] args) throws IOException {
|
||||||
Bootstrap.main(args);
|
Bootstrap.main(args);
|
||||||
|
|||||||
@@ -24,6 +24,10 @@ import com.uber.hoodie.common.table.HoodieTimeline;
|
|||||||
import com.uber.hoodie.common.table.log.HoodieLogFormat;
|
import com.uber.hoodie.common.table.log.HoodieLogFormat;
|
||||||
import com.uber.hoodie.common.table.log.block.HoodieAvroDataBlock;
|
import com.uber.hoodie.common.table.log.block.HoodieAvroDataBlock;
|
||||||
import com.uber.hoodie.common.util.FSUtils;
|
import com.uber.hoodie.common.util.FSUtils;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
import org.apache.avro.generic.GenericRecord;
|
import org.apache.avro.generic.GenericRecord;
|
||||||
import org.apache.avro.generic.IndexedRecord;
|
import org.apache.avro.generic.IndexedRecord;
|
||||||
import org.apache.hadoop.fs.FileStatus;
|
import org.apache.hadoop.fs.FileStatus;
|
||||||
@@ -34,11 +38,6 @@ import org.springframework.shell.core.annotation.CliCommand;
|
|||||||
import org.springframework.shell.core.annotation.CliOption;
|
import org.springframework.shell.core.annotation.CliOption;
|
||||||
import org.springframework.stereotype.Component;
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
@Component
|
@Component
|
||||||
public class ArchivedCommitsCommand implements CommandMarker {
|
public class ArchivedCommitsCommand implements CommandMarker {
|
||||||
|
|
||||||
@@ -49,13 +48,16 @@ public class ArchivedCommitsCommand implements CommandMarker {
|
|||||||
|
|
||||||
@CliCommand(value = "show archived commits", help = "Read commits from archived files and show details")
|
@CliCommand(value = "show archived commits", help = "Read commits from archived files and show details")
|
||||||
public String showCommits(
|
public String showCommits(
|
||||||
@CliOption(key = {"limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10")
|
@CliOption(key = {
|
||||||
|
"limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10")
|
||||||
final Integer limit) throws IOException {
|
final Integer limit) throws IOException {
|
||||||
|
|
||||||
System.out.println("===============> Showing only " + limit + " archived commits <===============");
|
System.out
|
||||||
FileStatus [] fsStatuses = FSUtils.getFs().globStatus(new Path(HoodieCLI.tableMetadata.getBasePath() + "/.hoodie/.commits_.archive*"));
|
.println("===============> Showing only " + limit + " archived commits <===============");
|
||||||
|
FileStatus[] fsStatuses = FSUtils.getFs().globStatus(
|
||||||
|
new Path(HoodieCLI.tableMetadata.getBasePath() + "/.hoodie/.commits_.archive*"));
|
||||||
List<String[]> allCommits = new ArrayList<>();
|
List<String[]> allCommits = new ArrayList<>();
|
||||||
for(FileStatus fs : fsStatuses) {
|
for (FileStatus fs : fsStatuses) {
|
||||||
//read the archived file
|
//read the archived file
|
||||||
HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(),
|
HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(),
|
||||||
new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema(), false);
|
new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema(), false);
|
||||||
@@ -67,11 +69,13 @@ public class ArchivedCommitsCommand implements CommandMarker {
|
|||||||
List<IndexedRecord> records = blk.getRecords();
|
List<IndexedRecord> records = blk.getRecords();
|
||||||
readRecords.addAll(records);
|
readRecords.addAll(records);
|
||||||
}
|
}
|
||||||
List<String[]> readCommits = readRecords.stream().map(r -> (GenericRecord)r).map(r -> readCommit(r)).limit(limit).collect(Collectors.toList());
|
List<String[]> readCommits = readRecords.stream().map(r -> (GenericRecord) r)
|
||||||
|
.map(r -> readCommit(r)).limit(limit).collect(Collectors.toList());
|
||||||
allCommits.addAll(readCommits);
|
allCommits.addAll(readCommits);
|
||||||
}
|
}
|
||||||
return HoodiePrintHelper.print(
|
return HoodiePrintHelper.print(
|
||||||
new String[] {"CommitTime", "CommitType", "CommitDetails"}, allCommits.toArray(new String[allCommits.size()][]));
|
new String[]{"CommitTime", "CommitType", "CommitDetails"},
|
||||||
|
allCommits.toArray(new String[allCommits.size()][]));
|
||||||
}
|
}
|
||||||
|
|
||||||
private String[] readCommit(GenericRecord record) {
|
private String[] readCommit(GenericRecord record) {
|
||||||
|
|||||||
@@ -24,21 +24,21 @@ import com.uber.hoodie.common.table.HoodieTimeline;
|
|||||||
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
|
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
|
||||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||||
import com.uber.hoodie.common.util.AvroUtils;
|
import com.uber.hoodie.common.util.AvroUtils;
|
||||||
import org.springframework.shell.core.CommandMarker;
|
|
||||||
import org.springframework.shell.core.annotation.CliAvailabilityIndicator;
|
|
||||||
import org.springframework.shell.core.annotation.CliCommand;
|
|
||||||
import org.springframework.shell.core.annotation.CliOption;
|
|
||||||
import org.springframework.stereotype.Component;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
import org.springframework.shell.core.CommandMarker;
|
||||||
|
import org.springframework.shell.core.annotation.CliAvailabilityIndicator;
|
||||||
|
import org.springframework.shell.core.annotation.CliCommand;
|
||||||
|
import org.springframework.shell.core.annotation.CliOption;
|
||||||
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
@Component
|
@Component
|
||||||
public class CleansCommand implements CommandMarker {
|
public class CleansCommand implements CommandMarker {
|
||||||
|
|
||||||
@CliAvailabilityIndicator({"cleans show"})
|
@CliAvailabilityIndicator({"cleans show"})
|
||||||
public boolean isShowAvailable() {
|
public boolean isShowAvailable() {
|
||||||
return HoodieCLI.tableMetadata != null;
|
return HoodieCLI.tableMetadata != null;
|
||||||
@@ -65,12 +65,12 @@ public class CleansCommand implements CommandMarker {
|
|||||||
HoodieInstant clean = cleans.get(i);
|
HoodieInstant clean = cleans.get(i);
|
||||||
HoodieCleanMetadata cleanMetadata =
|
HoodieCleanMetadata cleanMetadata =
|
||||||
AvroUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(clean).get());
|
AvroUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(clean).get());
|
||||||
rows[i] = new String[] {clean.getTimestamp(), cleanMetadata.getEarliestCommitToRetain(),
|
rows[i] = new String[]{clean.getTimestamp(), cleanMetadata.getEarliestCommitToRetain(),
|
||||||
String.valueOf(cleanMetadata.getTotalFilesDeleted()),
|
String.valueOf(cleanMetadata.getTotalFilesDeleted()),
|
||||||
String.valueOf(cleanMetadata.getTimeTakenInMillis())};
|
String.valueOf(cleanMetadata.getTimeTakenInMillis())};
|
||||||
}
|
}
|
||||||
return HoodiePrintHelper.print(
|
return HoodiePrintHelper.print(
|
||||||
new String[] {"CleanTime", "EarliestCommandRetained", "Total Files Deleted",
|
new String[]{"CleanTime", "EarliestCommandRetained", "Total Files Deleted",
|
||||||
"Total Time Taken"}, rows);
|
"Total Time Taken"}, rows);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -97,16 +97,17 @@ public class CleansCommand implements CommandMarker {
|
|||||||
HoodieCleanMetadata cleanMetadata =
|
HoodieCleanMetadata cleanMetadata =
|
||||||
AvroUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(cleanInstant).get());
|
AvroUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(cleanInstant).get());
|
||||||
List<String[]> rows = new ArrayList<>();
|
List<String[]> rows = new ArrayList<>();
|
||||||
for (Map.Entry<String, HoodieCleanPartitionMetadata> entry : cleanMetadata.getPartitionMetadata().entrySet()) {
|
for (Map.Entry<String, HoodieCleanPartitionMetadata> entry : cleanMetadata
|
||||||
|
.getPartitionMetadata().entrySet()) {
|
||||||
String path = entry.getKey();
|
String path = entry.getKey();
|
||||||
HoodieCleanPartitionMetadata stats = entry.getValue();
|
HoodieCleanPartitionMetadata stats = entry.getValue();
|
||||||
String policy = stats.getPolicy();
|
String policy = stats.getPolicy();
|
||||||
String totalSuccessDeletedFiles = String.valueOf(stats.getSuccessDeleteFiles().size());
|
String totalSuccessDeletedFiles = String.valueOf(stats.getSuccessDeleteFiles().size());
|
||||||
String totalFailedDeletedFiles = String.valueOf(stats.getFailedDeleteFiles().size());
|
String totalFailedDeletedFiles = String.valueOf(stats.getFailedDeleteFiles().size());
|
||||||
rows.add(new String[] {path, policy, totalSuccessDeletedFiles, totalFailedDeletedFiles});
|
rows.add(new String[]{path, policy, totalSuccessDeletedFiles, totalFailedDeletedFiles});
|
||||||
}
|
}
|
||||||
return HoodiePrintHelper.print(
|
return HoodiePrintHelper.print(
|
||||||
new String[] {"Partition Path", "Cleaning policy", "Total Files Successfully Deleted",
|
new String[]{"Partition Path", "Cleaning policy", "Total Files Successfully Deleted",
|
||||||
"Total Failed Deletions"}, rows.toArray(new String[rows.size()][]));
|
"Total Failed Deletions"}, rows.toArray(new String[rows.size()][]));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -27,7 +27,12 @@ import com.uber.hoodie.common.table.HoodieTimeline;
|
|||||||
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
|
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
|
||||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||||
import com.uber.hoodie.common.util.NumericUtils;
|
import com.uber.hoodie.common.util.NumericUtils;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
import org.apache.spark.launcher.SparkLauncher;
|
import org.apache.spark.launcher.SparkLauncher;
|
||||||
import org.springframework.shell.core.CommandMarker;
|
import org.springframework.shell.core.CommandMarker;
|
||||||
import org.springframework.shell.core.annotation.CliAvailabilityIndicator;
|
import org.springframework.shell.core.annotation.CliAvailabilityIndicator;
|
||||||
@@ -35,15 +40,9 @@ import org.springframework.shell.core.annotation.CliCommand;
|
|||||||
import org.springframework.shell.core.annotation.CliOption;
|
import org.springframework.shell.core.annotation.CliOption;
|
||||||
import org.springframework.stereotype.Component;
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
@Component
|
@Component
|
||||||
public class CommitsCommand implements CommandMarker {
|
public class CommitsCommand implements CommandMarker {
|
||||||
|
|
||||||
@CliAvailabilityIndicator({"commits show"})
|
@CliAvailabilityIndicator({"commits show"})
|
||||||
public boolean isShowAvailable() {
|
public boolean isShowAvailable() {
|
||||||
return HoodieCLI.tableMetadata != null;
|
return HoodieCLI.tableMetadata != null;
|
||||||
@@ -70,7 +69,8 @@ public class CommitsCommand implements CommandMarker {
|
|||||||
"limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10")
|
"limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10")
|
||||||
final Integer limit) throws IOException {
|
final Integer limit) throws IOException {
|
||||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||||
HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline().filterCompletedInstants();
|
HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline()
|
||||||
|
.filterCompletedInstants();
|
||||||
List<HoodieInstant> commits = timeline.getInstants().collect(Collectors.toList());
|
List<HoodieInstant> commits = timeline.getInstants().collect(Collectors.toList());
|
||||||
String[][] rows = new String[commits.size()][];
|
String[][] rows = new String[commits.size()][];
|
||||||
Collections.reverse(commits);
|
Collections.reverse(commits);
|
||||||
@@ -78,7 +78,7 @@ public class CommitsCommand implements CommandMarker {
|
|||||||
HoodieInstant commit = commits.get(i);
|
HoodieInstant commit = commits.get(i);
|
||||||
HoodieCommitMetadata commitMetadata =
|
HoodieCommitMetadata commitMetadata =
|
||||||
HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(commit).get());
|
HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(commit).get());
|
||||||
rows[i] = new String[] {commit.getTimestamp(),
|
rows[i] = new String[]{commit.getTimestamp(),
|
||||||
NumericUtils.humanReadableByteCount(commitMetadata.fetchTotalBytesWritten()),
|
NumericUtils.humanReadableByteCount(commitMetadata.fetchTotalBytesWritten()),
|
||||||
String.valueOf(commitMetadata.fetchTotalFilesInsert()),
|
String.valueOf(commitMetadata.fetchTotalFilesInsert()),
|
||||||
String.valueOf(commitMetadata.fetchTotalFilesUpdated()),
|
String.valueOf(commitMetadata.fetchTotalFilesUpdated()),
|
||||||
@@ -88,7 +88,7 @@ public class CommitsCommand implements CommandMarker {
|
|||||||
String.valueOf(commitMetadata.fetchTotalWriteErrors())};
|
String.valueOf(commitMetadata.fetchTotalWriteErrors())};
|
||||||
}
|
}
|
||||||
return HoodiePrintHelper.print(
|
return HoodiePrintHelper.print(
|
||||||
new String[] {"CommitTime", "Total Written (B)", "Total Files Added",
|
new String[]{"CommitTime", "Total Written (B)", "Total Files Added",
|
||||||
"Total Files Updated", "Total Partitions Written", "Total Records Written",
|
"Total Files Updated", "Total Partitions Written", "Total Records Written",
|
||||||
"Total Update Records Written", "Total Errors"}, rows);
|
"Total Update Records Written", "Total Errors"}, rows);
|
||||||
}
|
}
|
||||||
@@ -108,8 +108,10 @@ public class CommitsCommand implements CommandMarker {
|
|||||||
@CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path")
|
@CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path")
|
||||||
final String sparkPropertiesPath) throws Exception {
|
final String sparkPropertiesPath) throws Exception {
|
||||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||||
HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline().filterCompletedInstants();
|
HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline()
|
||||||
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
|
.filterCompletedInstants();
|
||||||
|
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION,
|
||||||
|
commitTime);
|
||||||
|
|
||||||
if (!timeline.containsInstant(commitInstant)) {
|
if (!timeline.containsInstant(commitInstant)) {
|
||||||
return "Commit " + commitTime + " not found in Commits " + timeline;
|
return "Commit " + commitTime + " not found in Commits " + timeline;
|
||||||
@@ -135,8 +137,10 @@ public class CommitsCommand implements CommandMarker {
|
|||||||
@CliOption(key = {"commit"}, help = "Commit to show")
|
@CliOption(key = {"commit"}, help = "Commit to show")
|
||||||
final String commitTime) throws Exception {
|
final String commitTime) throws Exception {
|
||||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||||
HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline().filterCompletedInstants();
|
HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline()
|
||||||
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
|
.filterCompletedInstants();
|
||||||
|
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION,
|
||||||
|
commitTime);
|
||||||
|
|
||||||
if (!timeline.containsInstant(commitInstant)) {
|
if (!timeline.containsInstant(commitInstant)) {
|
||||||
return "Commit " + commitTime + " not found in Commits " + timeline;
|
return "Commit " + commitTime + " not found in Commits " + timeline;
|
||||||
@@ -165,7 +169,7 @@ public class CommitsCommand implements CommandMarker {
|
|||||||
totalBytesWritten += stat.getTotalWriteBytes();
|
totalBytesWritten += stat.getTotalWriteBytes();
|
||||||
totalWriteErrors += stat.getTotalWriteErrors();
|
totalWriteErrors += stat.getTotalWriteErrors();
|
||||||
}
|
}
|
||||||
rows.add(new String[] {path, String.valueOf(totalFilesAdded),
|
rows.add(new String[]{path, String.valueOf(totalFilesAdded),
|
||||||
String.valueOf(totalFilesUpdated), String.valueOf(totalRecordsInserted),
|
String.valueOf(totalFilesUpdated), String.valueOf(totalRecordsInserted),
|
||||||
String.valueOf(totalRecordsUpdated),
|
String.valueOf(totalRecordsUpdated),
|
||||||
NumericUtils.humanReadableByteCount(totalBytesWritten),
|
NumericUtils.humanReadableByteCount(totalBytesWritten),
|
||||||
@@ -173,7 +177,7 @@ public class CommitsCommand implements CommandMarker {
|
|||||||
|
|
||||||
}
|
}
|
||||||
return HoodiePrintHelper.print(
|
return HoodiePrintHelper.print(
|
||||||
new String[] {"Partition Path", "Total Files Added", "Total Files Updated",
|
new String[]{"Partition Path", "Total Files Added", "Total Files Updated",
|
||||||
"Total Records Inserted", "Total Records Updated", "Total Bytes Written",
|
"Total Records Inserted", "Total Records Updated", "Total Bytes Written",
|
||||||
"Total Errors"}, rows.toArray(new String[rows.size()][]));
|
"Total Errors"}, rows.toArray(new String[rows.size()][]));
|
||||||
}
|
}
|
||||||
@@ -183,8 +187,10 @@ public class CommitsCommand implements CommandMarker {
|
|||||||
@CliOption(key = {"commit"}, help = "Commit to show")
|
@CliOption(key = {"commit"}, help = "Commit to show")
|
||||||
final String commitTime) throws Exception {
|
final String commitTime) throws Exception {
|
||||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||||
HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline().filterCompletedInstants();
|
HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline()
|
||||||
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
|
.filterCompletedInstants();
|
||||||
|
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION,
|
||||||
|
commitTime);
|
||||||
|
|
||||||
if (!timeline.containsInstant(commitInstant)) {
|
if (!timeline.containsInstant(commitInstant)) {
|
||||||
return "Commit " + commitTime + " not found in Commits " + timeline;
|
return "Commit " + commitTime + " not found in Commits " + timeline;
|
||||||
@@ -197,14 +203,14 @@ public class CommitsCommand implements CommandMarker {
|
|||||||
String path = entry.getKey();
|
String path = entry.getKey();
|
||||||
List<HoodieWriteStat> stats = entry.getValue();
|
List<HoodieWriteStat> stats = entry.getValue();
|
||||||
for (HoodieWriteStat stat : stats) {
|
for (HoodieWriteStat stat : stats) {
|
||||||
rows.add(new String[] {path, stat.getFileId(), stat.getPrevCommit(),
|
rows.add(new String[]{path, stat.getFileId(), stat.getPrevCommit(),
|
||||||
String.valueOf(stat.getNumUpdateWrites()), String.valueOf(stat.getNumWrites()),
|
String.valueOf(stat.getNumUpdateWrites()), String.valueOf(stat.getNumWrites()),
|
||||||
String.valueOf(stat.getTotalWriteBytes()),
|
String.valueOf(stat.getTotalWriteBytes()),
|
||||||
String.valueOf(stat.getTotalWriteErrors())});
|
String.valueOf(stat.getTotalWriteErrors())});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return HoodiePrintHelper.print(
|
return HoodiePrintHelper.print(
|
||||||
new String[] {"Partition Path", "File ID", "Previous Commit", "Total Records Updated",
|
new String[]{"Partition Path", "File ID", "Previous Commit", "Total Records Updated",
|
||||||
"Total Records Written", "Total Bytes Written", "Total Errors"},
|
"Total Records Written", "Total Bytes Written", "Total Errors"},
|
||||||
rows.toArray(new String[rows.size()][]));
|
rows.toArray(new String[rows.size()][]));
|
||||||
}
|
}
|
||||||
@@ -219,16 +225,23 @@ public class CommitsCommand implements CommandMarker {
|
|||||||
@CliOption(key = {"path"}, help = "Path of the dataset to compare to")
|
@CliOption(key = {"path"}, help = "Path of the dataset to compare to")
|
||||||
final String path) throws Exception {
|
final String path) throws Exception {
|
||||||
HoodieTableMetaClient target = new HoodieTableMetaClient(HoodieCLI.fs, path);
|
HoodieTableMetaClient target = new HoodieTableMetaClient(HoodieCLI.fs, path);
|
||||||
HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants();;
|
HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsAndCompactionsTimeline()
|
||||||
|
.filterCompletedInstants();
|
||||||
|
;
|
||||||
HoodieTableMetaClient source = HoodieCLI.tableMetadata;
|
HoodieTableMetaClient source = HoodieCLI.tableMetadata;
|
||||||
HoodieTimeline sourceTimeline = source.getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants();;
|
HoodieTimeline sourceTimeline = source.getActiveTimeline().getCommitsAndCompactionsTimeline()
|
||||||
|
.filterCompletedInstants();
|
||||||
|
;
|
||||||
String targetLatestCommit =
|
String targetLatestCommit =
|
||||||
targetTimeline.getInstants().iterator().hasNext() ? "0" : targetTimeline.lastInstant().get().getTimestamp();
|
targetTimeline.getInstants().iterator().hasNext() ? "0"
|
||||||
|
: targetTimeline.lastInstant().get().getTimestamp();
|
||||||
String sourceLatestCommit =
|
String sourceLatestCommit =
|
||||||
sourceTimeline.getInstants().iterator().hasNext() ? "0" : sourceTimeline.lastInstant().get().getTimestamp();
|
sourceTimeline.getInstants().iterator().hasNext() ? "0"
|
||||||
|
: sourceTimeline.lastInstant().get().getTimestamp();
|
||||||
|
|
||||||
if (sourceLatestCommit != null &&
|
if (sourceLatestCommit != null &&
|
||||||
HoodieTimeline.compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) {
|
HoodieTimeline
|
||||||
|
.compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) {
|
||||||
// source is behind the target
|
// source is behind the target
|
||||||
List<String> commitsToCatchup =
|
List<String> commitsToCatchup =
|
||||||
targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE)
|
targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE)
|
||||||
|
|||||||
@@ -18,15 +18,15 @@ package com.uber.hoodie.cli.commands;
|
|||||||
|
|
||||||
import com.uber.hoodie.cli.HoodieCLI;
|
import com.uber.hoodie.cli.HoodieCLI;
|
||||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||||
|
import java.io.IOException;
|
||||||
import org.springframework.shell.core.CommandMarker;
|
import org.springframework.shell.core.CommandMarker;
|
||||||
import org.springframework.shell.core.annotation.CliCommand;
|
import org.springframework.shell.core.annotation.CliCommand;
|
||||||
import org.springframework.shell.core.annotation.CliOption;
|
import org.springframework.shell.core.annotation.CliOption;
|
||||||
import org.springframework.stereotype.Component;
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
@Component
|
@Component
|
||||||
public class DatasetsCommand implements CommandMarker {
|
public class DatasetsCommand implements CommandMarker {
|
||||||
|
|
||||||
@CliCommand(value = "connect", help = "Connect to a hoodie dataset")
|
@CliCommand(value = "connect", help = "Connect to a hoodie dataset")
|
||||||
public String connect(
|
public String connect(
|
||||||
@CliOption(key = {"path"}, mandatory = true, help = "Base Path of the dataset")
|
@CliOption(key = {"path"}, mandatory = true, help = "Base Path of the dataset")
|
||||||
|
|||||||
@@ -68,7 +68,8 @@ public class HDFSParquetImportCommand implements CommandMarker {
|
|||||||
boolean initialized = HoodieCLI.initConf();
|
boolean initialized = HoodieCLI.initConf();
|
||||||
HoodieCLI.initFS(initialized);
|
HoodieCLI.initFS(initialized);
|
||||||
String sparkPropertiesPath = Utils
|
String sparkPropertiesPath = Utils
|
||||||
.getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
|
.getDefaultPropertiesFile(
|
||||||
|
scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
|
||||||
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
||||||
|
|
||||||
sparkLauncher.addAppArgs(SparkCommand.IMPORT.toString(), srcPath, targetPath, tableName,
|
sparkLauncher.addAppArgs(SparkCommand.IMPORT.toString(), srcPath, targetPath, tableName,
|
||||||
|
|||||||
@@ -16,23 +16,23 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.cli.commands;
|
package com.uber.hoodie.cli.commands;
|
||||||
|
|
||||||
|
import com.uber.hoodie.cli.HoodieCLI;
|
||||||
import com.uber.hoodie.cli.utils.CommitUtil;
|
import com.uber.hoodie.cli.utils.CommitUtil;
|
||||||
import com.uber.hoodie.cli.utils.HiveUtil;
|
import com.uber.hoodie.cli.utils.HiveUtil;
|
||||||
import com.uber.hoodie.cli.HoodieCLI;
|
|
||||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
import org.springframework.shell.core.CommandMarker;
|
import org.springframework.shell.core.CommandMarker;
|
||||||
import org.springframework.shell.core.annotation.CliAvailabilityIndicator;
|
import org.springframework.shell.core.annotation.CliAvailabilityIndicator;
|
||||||
import org.springframework.shell.core.annotation.CliCommand;
|
import org.springframework.shell.core.annotation.CliCommand;
|
||||||
import org.springframework.shell.core.annotation.CliOption;
|
import org.springframework.shell.core.annotation.CliOption;
|
||||||
import org.springframework.stereotype.Component;
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
@Component
|
@Component
|
||||||
public class HoodieSyncCommand implements CommandMarker {
|
public class HoodieSyncCommand implements CommandMarker {
|
||||||
|
|
||||||
@CliAvailabilityIndicator({"sync validate"})
|
@CliAvailabilityIndicator({"sync validate"})
|
||||||
public boolean isSyncVerificationAvailable() {
|
public boolean isSyncVerificationAvailable() {
|
||||||
return HoodieCLI.tableMetadata != null && HoodieCLI.syncTableMetadata != null;
|
return HoodieCLI.tableMetadata != null && HoodieCLI.syncTableMetadata != null;
|
||||||
@@ -70,14 +70,18 @@ public class HoodieSyncCommand implements CommandMarker {
|
|||||||
sourceCount = HiveUtil.countRecords(hiveServerUrl, source, srcDb, hiveUser, hivePass);
|
sourceCount = HiveUtil.countRecords(hiveServerUrl, source, srcDb, hiveUser, hivePass);
|
||||||
targetCount = HiveUtil.countRecords(hiveServerUrl, target, tgtDb, hiveUser, hivePass);
|
targetCount = HiveUtil.countRecords(hiveServerUrl, target, tgtDb, hiveUser, hivePass);
|
||||||
} else if ("latestPartitions".equals(mode)) {
|
} else if ("latestPartitions".equals(mode)) {
|
||||||
sourceCount = HiveUtil.countRecords(hiveServerUrl, source, srcDb, partitionCount, hiveUser, hivePass);
|
sourceCount = HiveUtil
|
||||||
targetCount = HiveUtil.countRecords(hiveServerUrl, target, tgtDb, partitionCount, hiveUser, hivePass);
|
.countRecords(hiveServerUrl, source, srcDb, partitionCount, hiveUser, hivePass);
|
||||||
|
targetCount = HiveUtil
|
||||||
|
.countRecords(hiveServerUrl, target, tgtDb, partitionCount, hiveUser, hivePass);
|
||||||
}
|
}
|
||||||
|
|
||||||
String targetLatestCommit =
|
String targetLatestCommit =
|
||||||
targetTimeline.getInstants().iterator().hasNext() ? "0" : targetTimeline.lastInstant().get().getTimestamp();
|
targetTimeline.getInstants().iterator().hasNext() ? "0"
|
||||||
|
: targetTimeline.lastInstant().get().getTimestamp();
|
||||||
String sourceLatestCommit =
|
String sourceLatestCommit =
|
||||||
sourceTimeline.getInstants().iterator().hasNext() ? "0" : sourceTimeline.lastInstant().get().getTimestamp();
|
sourceTimeline.getInstants().iterator().hasNext() ? "0"
|
||||||
|
: sourceTimeline.lastInstant().get().getTimestamp();
|
||||||
|
|
||||||
if (sourceLatestCommit != null && HoodieTimeline
|
if (sourceLatestCommit != null && HoodieTimeline
|
||||||
.compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) {
|
.compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) {
|
||||||
|
|||||||
@@ -22,7 +22,8 @@ import com.uber.hoodie.cli.utils.InputStreamConsumer;
|
|||||||
import com.uber.hoodie.cli.utils.SparkUtil;
|
import com.uber.hoodie.cli.utils.SparkUtil;
|
||||||
import com.uber.hoodie.common.model.HoodiePartitionMetadata;
|
import com.uber.hoodie.common.model.HoodiePartitionMetadata;
|
||||||
import com.uber.hoodie.common.util.FSUtils;
|
import com.uber.hoodie.common.util.FSUtils;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.List;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
import org.apache.spark.launcher.SparkLauncher;
|
import org.apache.spark.launcher.SparkLauncher;
|
||||||
import org.springframework.shell.core.CommandMarker;
|
import org.springframework.shell.core.CommandMarker;
|
||||||
@@ -31,9 +32,6 @@ import org.springframework.shell.core.annotation.CliCommand;
|
|||||||
import org.springframework.shell.core.annotation.CliOption;
|
import org.springframework.shell.core.annotation.CliOption;
|
||||||
import org.springframework.stereotype.Component;
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
@Component
|
@Component
|
||||||
public class RepairsCommand implements CommandMarker {
|
public class RepairsCommand implements CommandMarker {
|
||||||
|
|
||||||
@@ -52,7 +50,8 @@ public class RepairsCommand implements CommandMarker {
|
|||||||
@CliOption(key = {
|
@CliOption(key = {
|
||||||
"duplicatedPartitionPath"}, help = "Partition Path containing the duplicates", mandatory = true)
|
"duplicatedPartitionPath"}, help = "Partition Path containing the duplicates", mandatory = true)
|
||||||
final String duplicatedPartitionPath,
|
final String duplicatedPartitionPath,
|
||||||
@CliOption(key = {"repairedOutputPath"}, help = "Location to place the repaired files", mandatory = true)
|
@CliOption(key = {
|
||||||
|
"repairedOutputPath"}, help = "Location to place the repaired files", mandatory = true)
|
||||||
final String repairedOutputPath,
|
final String repairedOutputPath,
|
||||||
@CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path", mandatory = true)
|
@CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path", mandatory = true)
|
||||||
final String sparkPropertiesPath) throws Exception {
|
final String sparkPropertiesPath) throws Exception {
|
||||||
@@ -71,7 +70,6 @@ public class RepairsCommand implements CommandMarker {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@CliCommand(value = "repair addpartitionmeta", help = "Add partition metadata to a dataset, if not present")
|
@CliCommand(value = "repair addpartitionmeta", help = "Add partition metadata to a dataset, if not present")
|
||||||
public String addPartitionMeta(
|
public String addPartitionMeta(
|
||||||
@CliOption(key = {"dryrun"},
|
@CliOption(key = {"dryrun"},
|
||||||
@@ -79,17 +77,20 @@ public class RepairsCommand implements CommandMarker {
|
|||||||
unspecifiedDefaultValue = "true")
|
unspecifiedDefaultValue = "true")
|
||||||
final boolean dryRun) throws IOException {
|
final boolean dryRun) throws IOException {
|
||||||
|
|
||||||
String latestCommit = HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline().lastInstant().get().getTimestamp();
|
String latestCommit = HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline()
|
||||||
|
.lastInstant().get().getTimestamp();
|
||||||
List<String> partitionPaths = FSUtils.getAllFoldersThreeLevelsDown(HoodieCLI.fs,
|
List<String> partitionPaths = FSUtils.getAllFoldersThreeLevelsDown(HoodieCLI.fs,
|
||||||
HoodieCLI.tableMetadata.getBasePath());
|
HoodieCLI.tableMetadata.getBasePath());
|
||||||
Path basePath = new Path(HoodieCLI.tableMetadata.getBasePath());
|
Path basePath = new Path(HoodieCLI.tableMetadata.getBasePath());
|
||||||
String[][] rows = new String[partitionPaths.size() + 1][];
|
String[][] rows = new String[partitionPaths.size() + 1][];
|
||||||
|
|
||||||
int ind = 0;
|
int ind = 0;
|
||||||
for (String partition: partitionPaths) {
|
for (String partition : partitionPaths) {
|
||||||
Path partitionPath = new Path(basePath, partition);
|
Path partitionPath = new Path(basePath, partition);
|
||||||
String[] row = new String[3];
|
String[] row = new String[3];
|
||||||
row[0] = partition; row[1] = "Yes"; row[2] = "None";
|
row[0] = partition;
|
||||||
|
row[1] = "Yes";
|
||||||
|
row[2] = "None";
|
||||||
if (!HoodiePartitionMetadata.hasPartitionMetadata(HoodieCLI.fs, partitionPath)) {
|
if (!HoodiePartitionMetadata.hasPartitionMetadata(HoodieCLI.fs, partitionPath)) {
|
||||||
row[1] = "No";
|
row[1] = "No";
|
||||||
if (!dryRun) {
|
if (!dryRun) {
|
||||||
@@ -105,6 +106,6 @@ public class RepairsCommand implements CommandMarker {
|
|||||||
}
|
}
|
||||||
|
|
||||||
return HoodiePrintHelper.print(
|
return HoodiePrintHelper.print(
|
||||||
new String[] {"Partition Path", "Metadata Present?", "Action"}, rows);
|
new String[]{"Partition Path", "Metadata Present?", "Action"}, rows);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -27,6 +27,10 @@ import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
|||||||
import com.uber.hoodie.config.HoodieIndexConfig;
|
import com.uber.hoodie.config.HoodieIndexConfig;
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.index.HoodieIndex;
|
import com.uber.hoodie.index.HoodieIndex;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
import org.apache.spark.launcher.SparkLauncher;
|
import org.apache.spark.launcher.SparkLauncher;
|
||||||
import org.springframework.shell.core.CommandMarker;
|
import org.springframework.shell.core.CommandMarker;
|
||||||
@@ -35,13 +39,9 @@ import org.springframework.shell.core.annotation.CliCommand;
|
|||||||
import org.springframework.shell.core.annotation.CliOption;
|
import org.springframework.shell.core.annotation.CliOption;
|
||||||
import org.springframework.stereotype.Component;
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
@Component
|
@Component
|
||||||
public class SavepointsCommand implements CommandMarker {
|
public class SavepointsCommand implements CommandMarker {
|
||||||
|
|
||||||
@CliAvailabilityIndicator({"savepoints show"})
|
@CliAvailabilityIndicator({"savepoints show"})
|
||||||
public boolean isShowAvailable() {
|
public boolean isShowAvailable() {
|
||||||
return HoodieCLI.tableMetadata != null;
|
return HoodieCLI.tableMetadata != null;
|
||||||
@@ -60,7 +60,8 @@ public class SavepointsCommand implements CommandMarker {
|
|||||||
|
|
||||||
@CliAvailabilityIndicator({"savepoint rollback"})
|
@CliAvailabilityIndicator({"savepoint rollback"})
|
||||||
public boolean isRollbackToSavepointAvailable() {
|
public boolean isRollbackToSavepointAvailable() {
|
||||||
return HoodieCLI.tableMetadata != null && !HoodieCLI.tableMetadata.getActiveTimeline().getSavePointTimeline().filterCompletedInstants().empty();
|
return HoodieCLI.tableMetadata != null && !HoodieCLI.tableMetadata.getActiveTimeline()
|
||||||
|
.getSavePointTimeline().filterCompletedInstants().empty();
|
||||||
}
|
}
|
||||||
|
|
||||||
@CliCommand(value = "savepoints show", help = "Show the savepoints")
|
@CliCommand(value = "savepoints show", help = "Show the savepoints")
|
||||||
@@ -72,9 +73,9 @@ public class SavepointsCommand implements CommandMarker {
|
|||||||
Collections.reverse(commits);
|
Collections.reverse(commits);
|
||||||
for (int i = 0; i < commits.size(); i++) {
|
for (int i = 0; i < commits.size(); i++) {
|
||||||
HoodieInstant commit = commits.get(i);
|
HoodieInstant commit = commits.get(i);
|
||||||
rows[i] = new String[] {commit.getTimestamp()};
|
rows[i] = new String[]{commit.getTimestamp()};
|
||||||
}
|
}
|
||||||
return HoodiePrintHelper.print(new String[] {"SavepointTime"}, rows);
|
return HoodiePrintHelper.print(new String[]{"SavepointTime"}, rows);
|
||||||
}
|
}
|
||||||
|
|
||||||
@CliCommand(value = "savepoint create", help = "Savepoint a commit")
|
@CliCommand(value = "savepoint create", help = "Savepoint a commit")
|
||||||
@@ -152,5 +153,4 @@ public class SavepointsCommand implements CommandMarker {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -52,7 +52,7 @@ public class SparkMain {
|
|||||||
|
|
||||||
JavaSparkContext jsc = SparkUtil.initJavaSparkConf("hoodie-cli-" + command);
|
JavaSparkContext jsc = SparkUtil.initJavaSparkConf("hoodie-cli-" + command);
|
||||||
int returnCode = 0;
|
int returnCode = 0;
|
||||||
switch(cmd) {
|
switch (cmd) {
|
||||||
case ROLLBACK:
|
case ROLLBACK:
|
||||||
assert (args.length == 3);
|
assert (args.length == 3);
|
||||||
returnCode = rollback(jsc, args[1], args[2]);
|
returnCode = rollback(jsc, args[1], args[2]);
|
||||||
@@ -98,7 +98,7 @@ public class SparkMain {
|
|||||||
String basePath)
|
String basePath)
|
||||||
throws Exception {
|
throws Exception {
|
||||||
DedupeSparkJob job = new DedupeSparkJob(basePath,
|
DedupeSparkJob job = new DedupeSparkJob(basePath,
|
||||||
duplicatedPartitionPath,repairedOutputPath,new SQLContext(jsc), FSUtils.getFs());
|
duplicatedPartitionPath, repairedOutputPath, new SQLContext(jsc), FSUtils.getFs());
|
||||||
job.fixDuplicates(true);
|
job.fixDuplicates(true);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@@ -115,7 +115,8 @@ public class SparkMain {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static int rollbackToSavepoint(JavaSparkContext jsc, String savepointTime, String basePath)
|
private static int rollbackToSavepoint(JavaSparkContext jsc, String savepointTime,
|
||||||
|
String basePath)
|
||||||
throws Exception {
|
throws Exception {
|
||||||
HoodieWriteClient client = createHoodieClient(jsc, basePath);
|
HoodieWriteClient client = createHoodieClient(jsc, basePath);
|
||||||
if (client.rollbackToSavepoint(savepointTime)) {
|
if (client.rollbackToSavepoint(savepointTime)) {
|
||||||
|
|||||||
@@ -28,7 +28,10 @@ import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
|
|||||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||||
import com.uber.hoodie.common.util.FSUtils;
|
import com.uber.hoodie.common.util.FSUtils;
|
||||||
import com.uber.hoodie.common.util.NumericUtils;
|
import com.uber.hoodie.common.util.NumericUtils;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.text.DecimalFormat;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
import org.apache.hadoop.fs.FileStatus;
|
import org.apache.hadoop.fs.FileStatus;
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
@@ -38,13 +41,9 @@ import org.springframework.shell.core.annotation.CliCommand;
|
|||||||
import org.springframework.shell.core.annotation.CliOption;
|
import org.springframework.shell.core.annotation.CliOption;
|
||||||
import org.springframework.stereotype.Component;
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.text.DecimalFormat;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
@Component
|
@Component
|
||||||
public class StatsCommand implements CommandMarker {
|
public class StatsCommand implements CommandMarker {
|
||||||
|
|
||||||
@CliAvailabilityIndicator({"stats wa"})
|
@CliAvailabilityIndicator({"stats wa"})
|
||||||
public boolean isWriteAmpAvailable() {
|
public boolean isWriteAmpAvailable() {
|
||||||
return HoodieCLI.tableMetadata != null;
|
return HoodieCLI.tableMetadata != null;
|
||||||
@@ -64,13 +63,14 @@ public class StatsCommand implements CommandMarker {
|
|||||||
for (HoodieInstant commitTime : timeline.getInstants().collect(
|
for (HoodieInstant commitTime : timeline.getInstants().collect(
|
||||||
Collectors.toList())) {
|
Collectors.toList())) {
|
||||||
String waf = "0";
|
String waf = "0";
|
||||||
HoodieCommitMetadata commit = HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(commitTime).get());
|
HoodieCommitMetadata commit = HoodieCommitMetadata
|
||||||
|
.fromBytes(activeTimeline.getInstantDetails(commitTime).get());
|
||||||
if (commit.fetchTotalUpdateRecordsWritten() > 0) {
|
if (commit.fetchTotalUpdateRecordsWritten() > 0) {
|
||||||
waf = df.format(
|
waf = df.format(
|
||||||
(float) commit.fetchTotalRecordsWritten() / commit
|
(float) commit.fetchTotalRecordsWritten() / commit
|
||||||
.fetchTotalUpdateRecordsWritten());
|
.fetchTotalUpdateRecordsWritten());
|
||||||
}
|
}
|
||||||
rows[i++] = new String[] {commitTime.getTimestamp(),
|
rows[i++] = new String[]{commitTime.getTimestamp(),
|
||||||
String.valueOf(commit.fetchTotalUpdateRecordsWritten()),
|
String.valueOf(commit.fetchTotalUpdateRecordsWritten()),
|
||||||
String.valueOf(commit.fetchTotalRecordsWritten()), waf};
|
String.valueOf(commit.fetchTotalRecordsWritten()), waf};
|
||||||
totalRecordsUpserted += commit.fetchTotalUpdateRecordsWritten();
|
totalRecordsUpserted += commit.fetchTotalUpdateRecordsWritten();
|
||||||
@@ -80,10 +80,10 @@ public class StatsCommand implements CommandMarker {
|
|||||||
if (totalRecordsUpserted > 0) {
|
if (totalRecordsUpserted > 0) {
|
||||||
waf = df.format((float) totalRecordsWritten / totalRecordsUpserted);
|
waf = df.format((float) totalRecordsWritten / totalRecordsUpserted);
|
||||||
}
|
}
|
||||||
rows[i] = new String[] {"Total", String.valueOf(totalRecordsUpserted),
|
rows[i] = new String[]{"Total", String.valueOf(totalRecordsUpserted),
|
||||||
String.valueOf(totalRecordsWritten), waf};
|
String.valueOf(totalRecordsWritten), waf};
|
||||||
return HoodiePrintHelper.print(
|
return HoodiePrintHelper.print(
|
||||||
new String[] {"CommitTime", "Total Upserted", "Total Written",
|
new String[]{"CommitTime", "Total Upserted", "Total Written",
|
||||||
"Write Amplifiation Factor"}, rows);
|
"Write Amplifiation Factor"}, rows);
|
||||||
|
|
||||||
}
|
}
|
||||||
@@ -105,7 +105,8 @@ public class StatsCommand implements CommandMarker {
|
|||||||
|
|
||||||
@CliCommand(value = "stats filesizes", help = "File Sizes. Display summary stats on sizes of files")
|
@CliCommand(value = "stats filesizes", help = "File Sizes. Display summary stats on sizes of files")
|
||||||
public String fileSizeStats(
|
public String fileSizeStats(
|
||||||
@CliOption(key = {"partitionPath"}, help = "regex to select files, eg: 2016/08/02", unspecifiedDefaultValue = "*/*/*")
|
@CliOption(key = {
|
||||||
|
"partitionPath"}, help = "regex to select files, eg: 2016/08/02", unspecifiedDefaultValue = "*/*/*")
|
||||||
final String globRegex) throws IOException {
|
final String globRegex) throws IOException {
|
||||||
|
|
||||||
FileSystem fs = HoodieCLI.fs;
|
FileSystem fs = HoodieCLI.fs;
|
||||||
@@ -118,7 +119,7 @@ public class StatsCommand implements CommandMarker {
|
|||||||
final int MAX_FILES = 1000000;
|
final int MAX_FILES = 1000000;
|
||||||
Histogram globalHistogram = new Histogram(new UniformReservoir(MAX_FILES));
|
Histogram globalHistogram = new Histogram(new UniformReservoir(MAX_FILES));
|
||||||
HashMap<String, Histogram> commitHistoMap = new HashMap<String, Histogram>();
|
HashMap<String, Histogram> commitHistoMap = new HashMap<String, Histogram>();
|
||||||
for (FileStatus fileStatus: statuses) {
|
for (FileStatus fileStatus : statuses) {
|
||||||
String commitTime = FSUtils.getCommitTime(fileStatus.getPath().getName());
|
String commitTime = FSUtils.getCommitTime(fileStatus.getPath().getName());
|
||||||
long sz = fileStatus.getLen();
|
long sz = fileStatus.getLen();
|
||||||
if (!commitHistoMap.containsKey(commitTime)) {
|
if (!commitHistoMap.containsKey(commitTime)) {
|
||||||
@@ -130,7 +131,7 @@ public class StatsCommand implements CommandMarker {
|
|||||||
|
|
||||||
String[][] rows = new String[commitHistoMap.size() + 1][];
|
String[][] rows = new String[commitHistoMap.size() + 1][];
|
||||||
int ind = 0;
|
int ind = 0;
|
||||||
for (String commitTime: commitHistoMap.keySet()) {
|
for (String commitTime : commitHistoMap.keySet()) {
|
||||||
Snapshot s = commitHistoMap.get(commitTime).getSnapshot();
|
Snapshot s = commitHistoMap.get(commitTime).getSnapshot();
|
||||||
rows[ind++] = printFileSizeHistogram(commitTime, s);
|
rows[ind++] = printFileSizeHistogram(commitTime, s);
|
||||||
}
|
}
|
||||||
@@ -138,6 +139,7 @@ public class StatsCommand implements CommandMarker {
|
|||||||
rows[ind++] = printFileSizeHistogram("ALL", s);
|
rows[ind++] = printFileSizeHistogram("ALL", s);
|
||||||
|
|
||||||
return HoodiePrintHelper.print(
|
return HoodiePrintHelper.print(
|
||||||
new String[] {"CommitTime", "Min", "10th", "50th", "avg", "95th", "Max", "NumFiles", "StdDev"}, rows);
|
new String[]{"CommitTime", "Min", "10th", "50th", "avg", "95th", "Max", "NumFiles",
|
||||||
|
"StdDev"}, rows);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -23,9 +23,10 @@ import org.springframework.stereotype.Component;
|
|||||||
|
|
||||||
@Component
|
@Component
|
||||||
public class UtilsCommand implements CommandMarker {
|
public class UtilsCommand implements CommandMarker {
|
||||||
@CliCommand(value = "utils loadClass", help = "Load a class" )
|
|
||||||
|
@CliCommand(value = "utils loadClass", help = "Load a class")
|
||||||
public String loadClass(
|
public String loadClass(
|
||||||
@CliOption(key = {"class"}, help = "Check mode" ) final String clazz
|
@CliOption(key = {"class"}, help = "Check mode") final String clazz
|
||||||
) throws Exception {
|
) throws Exception {
|
||||||
Class klass = Class.forName(clazz);
|
Class klass = Class.forName(clazz);
|
||||||
return klass.getProtectionDomain().getCodeSource().getLocation().toExternalForm();
|
return klass.getProtectionDomain().getCodeSource().getLocation().toExternalForm();
|
||||||
|
|||||||
@@ -20,16 +20,17 @@ import com.uber.hoodie.common.model.HoodieCommitMetadata;
|
|||||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
public class CommitUtil {
|
public class CommitUtil {
|
||||||
|
|
||||||
public static long countNewRecords(HoodieTableMetaClient target, List<String> commitsToCatchup)
|
public static long countNewRecords(HoodieTableMetaClient target, List<String> commitsToCatchup)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
long totalNew = 0;
|
long totalNew = 0;
|
||||||
HoodieTimeline timeline = target.getActiveTimeline().reload().getCommitTimeline().filterCompletedInstants();
|
HoodieTimeline timeline = target.getActiveTimeline().reload().getCommitTimeline()
|
||||||
for(String commit:commitsToCatchup) {
|
.filterCompletedInstants();
|
||||||
|
for (String commit : commitsToCatchup) {
|
||||||
HoodieCommitMetadata c = HoodieCommitMetadata.fromBytes(timeline
|
HoodieCommitMetadata c = HoodieCommitMetadata.fromBytes(timeline
|
||||||
.getInstantDetails(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commit))
|
.getInstantDetails(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commit))
|
||||||
.get());
|
.get());
|
||||||
|
|||||||
@@ -17,16 +17,16 @@
|
|||||||
package com.uber.hoodie.cli.utils;
|
package com.uber.hoodie.cli.utils;
|
||||||
|
|
||||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||||
import org.apache.commons.dbcp.BasicDataSource;
|
|
||||||
import org.joda.time.DateTime;
|
|
||||||
|
|
||||||
import javax.sql.DataSource;
|
|
||||||
import java.sql.Connection;
|
import java.sql.Connection;
|
||||||
import java.sql.ResultSet;
|
import java.sql.ResultSet;
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
import java.sql.Statement;
|
import java.sql.Statement;
|
||||||
|
import javax.sql.DataSource;
|
||||||
|
import org.apache.commons.dbcp.BasicDataSource;
|
||||||
|
import org.joda.time.DateTime;
|
||||||
|
|
||||||
public class HiveUtil {
|
public class HiveUtil {
|
||||||
|
|
||||||
private static String driverName = "org.apache.hive.jdbc.HiveDriver";
|
private static String driverName = "org.apache.hive.jdbc.HiveDriver";
|
||||||
|
|
||||||
static {
|
static {
|
||||||
@@ -39,7 +39,8 @@ public class HiveUtil {
|
|||||||
|
|
||||||
private static Connection connection;
|
private static Connection connection;
|
||||||
|
|
||||||
private static Connection getConnection(String jdbcUrl, String user, String pass) throws SQLException {
|
private static Connection getConnection(String jdbcUrl, String user, String pass)
|
||||||
|
throws SQLException {
|
||||||
DataSource ds = getDatasource(jdbcUrl, user, pass);
|
DataSource ds = getDatasource(jdbcUrl, user, pass);
|
||||||
return ds.getConnection();
|
return ds.getConnection();
|
||||||
}
|
}
|
||||||
@@ -53,22 +54,25 @@ public class HiveUtil {
|
|||||||
return ds;
|
return ds;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String dbName, String user, String pass) throws SQLException {
|
public static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String dbName,
|
||||||
|
String user, String pass) throws SQLException {
|
||||||
Connection conn = HiveUtil.getConnection(jdbcUrl, user, pass);
|
Connection conn = HiveUtil.getConnection(jdbcUrl, user, pass);
|
||||||
ResultSet rs = null;
|
ResultSet rs = null;
|
||||||
Statement stmt = conn.createStatement();
|
Statement stmt = conn.createStatement();
|
||||||
try {
|
try {
|
||||||
//stmt.execute("set mapred.job.queue.name=<queue_name>");
|
//stmt.execute("set mapred.job.queue.name=<queue_name>");
|
||||||
stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat" );
|
stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat");
|
||||||
stmt.execute("set hive.stats.autogather=false" );
|
stmt.execute("set hive.stats.autogather=false");
|
||||||
rs = stmt.executeQuery(
|
rs = stmt.executeQuery(
|
||||||
"select count(`_hoodie_commit_time`) as cnt from " + dbName + "." + source.getTableConfig()
|
"select count(`_hoodie_commit_time`) as cnt from " + dbName + "." + source
|
||||||
|
.getTableConfig()
|
||||||
.getTableName());
|
.getTableName());
|
||||||
long count = -1;
|
long count = -1;
|
||||||
if(rs.next()) {
|
if (rs.next()) {
|
||||||
count = rs.getLong("cnt");
|
count = rs.getLong("cnt");
|
||||||
}
|
}
|
||||||
System.out.println("Total records in " + source.getTableConfig().getTableName() + " is " + count);
|
System.out
|
||||||
|
.println("Total records in " + source.getTableConfig().getTableName() + " is " + count);
|
||||||
return count;
|
return count;
|
||||||
} finally {
|
} finally {
|
||||||
if (rs != null) {
|
if (rs != null) {
|
||||||
@@ -94,7 +98,8 @@ public class HiveUtil {
|
|||||||
return countRecords(jdbcUrl, source, srcDb, startDateStr, endDateStr, user, pass);
|
return countRecords(jdbcUrl, source, srcDb, startDateStr, endDateStr, user, pass);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String srcDb, String startDateStr,
|
private static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String srcDb,
|
||||||
|
String startDateStr,
|
||||||
String endDateStr, String user, String pass) throws SQLException {
|
String endDateStr, String user, String pass) throws SQLException {
|
||||||
Connection conn = HiveUtil.getConnection(jdbcUrl, user, pass);
|
Connection conn = HiveUtil.getConnection(jdbcUrl, user, pass);
|
||||||
ResultSet rs = null;
|
ResultSet rs = null;
|
||||||
@@ -107,7 +112,7 @@ public class HiveUtil {
|
|||||||
"select count(`_hoodie_commit_time`) as cnt from " + srcDb + "." + source.getTableConfig()
|
"select count(`_hoodie_commit_time`) as cnt from " + srcDb + "." + source.getTableConfig()
|
||||||
.getTableName() + " where datestr>'" + startDateStr + "' and datestr<='"
|
.getTableName() + " where datestr>'" + startDateStr + "' and datestr<='"
|
||||||
+ endDateStr + "'");
|
+ endDateStr + "'");
|
||||||
if(rs.next()) {
|
if (rs.next()) {
|
||||||
return rs.getLong("cnt");
|
return rs.getLong("cnt");
|
||||||
}
|
}
|
||||||
return -1;
|
return -1;
|
||||||
|
|||||||
@@ -23,8 +23,10 @@ import java.io.InputStreamReader;
|
|||||||
import java.util.logging.Logger;
|
import java.util.logging.Logger;
|
||||||
|
|
||||||
public class InputStreamConsumer extends Thread {
|
public class InputStreamConsumer extends Thread {
|
||||||
|
|
||||||
protected final static Logger LOG = Logger.getLogger(InputStreamConsumer.class.getName());
|
protected final static Logger LOG = Logger.getLogger(InputStreamConsumer.class.getName());
|
||||||
private InputStream is;
|
private InputStream is;
|
||||||
|
|
||||||
public InputStreamConsumer(InputStream is) {
|
public InputStreamConsumer(InputStream is) {
|
||||||
this.is = is;
|
this.is = is;
|
||||||
}
|
}
|
||||||
@@ -35,8 +37,9 @@ public class InputStreamConsumer extends Thread {
|
|||||||
InputStreamReader isr = new InputStreamReader(is);
|
InputStreamReader isr = new InputStreamReader(is);
|
||||||
BufferedReader br = new BufferedReader(isr);
|
BufferedReader br = new BufferedReader(isr);
|
||||||
String line;
|
String line;
|
||||||
while ( (line = br.readLine()) != null)
|
while ((line = br.readLine()) != null) {
|
||||||
LOG.info(line);
|
LOG.info(line);
|
||||||
|
}
|
||||||
} catch (IOException ioe) {
|
} catch (IOException ioe) {
|
||||||
LOG.severe(ioe.toString());
|
LOG.severe(ioe.toString());
|
||||||
ioe.printStackTrace();
|
ioe.printStackTrace();
|
||||||
|
|||||||
@@ -18,26 +18,20 @@ package com.uber.hoodie.cli.utils;
|
|||||||
|
|
||||||
import com.uber.hoodie.HoodieWriteClient;
|
import com.uber.hoodie.HoodieWriteClient;
|
||||||
import com.uber.hoodie.cli.commands.SparkMain;
|
import com.uber.hoodie.cli.commands.SparkMain;
|
||||||
|
import java.io.File;
|
||||||
|
import java.net.URISyntaxException;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
import org.apache.spark.launcher.SparkLauncher;
|
import org.apache.spark.launcher.SparkLauncher;
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.net.URISyntaxException;
|
|
||||||
|
|
||||||
public class SparkUtil {
|
public class SparkUtil {
|
||||||
|
|
||||||
public static Logger logger = Logger.getLogger(SparkUtil.class);
|
public static Logger logger = Logger.getLogger(SparkUtil.class);
|
||||||
public static final String DEFUALT_SPARK_MASTER = "yarn-client";
|
public static final String DEFUALT_SPARK_MASTER = "yarn-client";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
|
||||||
* TODO: Need to fix a bunch of hardcoded stuff here eg: history server, spark distro
|
* TODO: Need to fix a bunch of hardcoded stuff here eg: history server, spark distro
|
||||||
*
|
|
||||||
* @return
|
|
||||||
* @throws URISyntaxException
|
|
||||||
*/
|
*/
|
||||||
public static SparkLauncher initLauncher(String propertiesFile) throws URISyntaxException {
|
public static SparkLauncher initLauncher(String propertiesFile) throws URISyntaxException {
|
||||||
String currentJar = new File(
|
String currentJar = new File(
|
||||||
@@ -65,7 +59,8 @@ public class SparkUtil {
|
|||||||
// Configure hadoop conf
|
// Configure hadoop conf
|
||||||
sparkConf.set("spark.hadoop.mapred.output.compress", "true");
|
sparkConf.set("spark.hadoop.mapred.output.compress", "true");
|
||||||
sparkConf.set("spark.hadoop.mapred.output.compression.codec", "true");
|
sparkConf.set("spark.hadoop.mapred.output.compression.codec", "true");
|
||||||
sparkConf.set("spark.hadoop.mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
|
sparkConf.set("spark.hadoop.mapred.output.compression.codec",
|
||||||
|
"org.apache.hadoop.io.compress.GzipCodec");
|
||||||
sparkConf.set("spark.hadoop.mapred.output.compression.type", "BLOCK");
|
sparkConf.set("spark.hadoop.mapred.output.compression.type", "BLOCK");
|
||||||
|
|
||||||
sparkConf = HoodieWriteClient.registerClasses(sparkConf);
|
sparkConf = HoodieWriteClient.registerClasses(sparkConf);
|
||||||
|
|||||||
@@ -21,6 +21,6 @@
|
|||||||
xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans.xsd
|
xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans.xsd
|
||||||
http://www.springframework.org/schema/context http://www.springframework.org/schema/context/spring-context.xsd">
|
http://www.springframework.org/schema/context http://www.springframework.org/schema/context/spring-context.xsd">
|
||||||
|
|
||||||
<context:component-scan base-package="com.uber.hoodie.cli" />
|
<context:component-scan base-package="com.uber.hoodie.cli"/>
|
||||||
|
|
||||||
</beans>
|
</beans>
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ import scala.collection.mutable._
|
|||||||
/**
|
/**
|
||||||
* Spark job to de-duplicate data present in a partition path
|
* Spark job to de-duplicate data present in a partition path
|
||||||
*/
|
*/
|
||||||
class DedupeSparkJob (basePath: String,
|
class DedupeSparkJob(basePath: String,
|
||||||
duplicatedPartitionPath: String,
|
duplicatedPartitionPath: String,
|
||||||
repairOutputPath: String,
|
repairOutputPath: String,
|
||||||
sqlContext: SQLContext,
|
sqlContext: SQLContext,
|
||||||
@@ -50,8 +50,9 @@ class DedupeSparkJob (basePath: String,
|
|||||||
* @param tblName
|
* @param tblName
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
def getDupeKeyDF(tblName: String) : DataFrame = {
|
def getDupeKeyDF(tblName: String): DataFrame = {
|
||||||
val dupeSql = s"""
|
val dupeSql =
|
||||||
|
s"""
|
||||||
select `${HoodieRecord.RECORD_KEY_METADATA_FIELD}` as dupe_key,
|
select `${HoodieRecord.RECORD_KEY_METADATA_FIELD}` as dupe_key,
|
||||||
count(*) as dupe_cnt
|
count(*) as dupe_cnt
|
||||||
from ${tblName}
|
from ${tblName}
|
||||||
@@ -69,7 +70,7 @@ class DedupeSparkJob (basePath: String,
|
|||||||
*
|
*
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
private def planDuplicateFix() : HashMap[String, HashSet[String]] = {
|
private def planDuplicateFix(): HashMap[String, HashSet[String]] = {
|
||||||
|
|
||||||
val tmpTableName = s"htbl_${System.currentTimeMillis()}"
|
val tmpTableName = s"htbl_${System.currentTimeMillis()}"
|
||||||
val dedupeTblName = s"${tmpTableName}_dupeKeys"
|
val dedupeTblName = s"${tmpTableName}_dupeKeys"
|
||||||
@@ -78,17 +79,18 @@ class DedupeSparkJob (basePath: String,
|
|||||||
|
|
||||||
val allFiles = fs.listStatus(new org.apache.hadoop.fs.Path(s"${basePath}/${duplicatedPartitionPath}"))
|
val allFiles = fs.listStatus(new org.apache.hadoop.fs.Path(s"${basePath}/${duplicatedPartitionPath}"))
|
||||||
val fsView = new HoodieTableFileSystemView(metadata, metadata.getActiveTimeline.getCommitTimeline.filterCompletedInstants(), allFiles)
|
val fsView = new HoodieTableFileSystemView(metadata, metadata.getActiveTimeline.getCommitTimeline.filterCompletedInstants(), allFiles)
|
||||||
val latestFiles:java.util.List[HoodieDataFile] = fsView.getLatestDataFiles().collect(Collectors.toList[HoodieDataFile]())
|
val latestFiles: java.util.List[HoodieDataFile] = fsView.getLatestDataFiles().collect(Collectors.toList[HoodieDataFile]())
|
||||||
val filteredStatuses = latestFiles.map(f => f.getPath)
|
val filteredStatuses = latestFiles.map(f => f.getPath)
|
||||||
LOG.info(s" List of files under partition: ${} => ${filteredStatuses.mkString(" ")}")
|
LOG.info(s" List of files under partition: ${} => ${filteredStatuses.mkString(" ")}")
|
||||||
|
|
||||||
val df = sqlContext.parquetFile(filteredStatuses:_*)
|
val df = sqlContext.parquetFile(filteredStatuses: _*)
|
||||||
df.registerTempTable(tmpTableName)
|
df.registerTempTable(tmpTableName)
|
||||||
val dupeKeyDF = getDupeKeyDF(tmpTableName)
|
val dupeKeyDF = getDupeKeyDF(tmpTableName)
|
||||||
dupeKeyDF.registerTempTable(dedupeTblName)
|
dupeKeyDF.registerTempTable(dedupeTblName)
|
||||||
|
|
||||||
// Obtain necessary satellite information for duplicate rows
|
// Obtain necessary satellite information for duplicate rows
|
||||||
val dupeDataSql = s"""
|
val dupeDataSql =
|
||||||
|
s"""
|
||||||
SELECT `_hoodie_record_key`, `_hoodie_partition_path`, `_hoodie_file_name`, `_hoodie_commit_time`
|
SELECT `_hoodie_record_key`, `_hoodie_partition_path`, `_hoodie_file_name`, `_hoodie_commit_time`
|
||||||
FROM ${tmpTableName} h
|
FROM ${tmpTableName} h
|
||||||
JOIN ${dedupeTblName} d
|
JOIN ${dedupeTblName} d
|
||||||
@@ -111,9 +113,9 @@ class DedupeSparkJob (basePath: String,
|
|||||||
|
|
||||||
rows.foreach(r => {
|
rows.foreach(r => {
|
||||||
val c = r(3).asInstanceOf[String].toLong
|
val c = r(3).asInstanceOf[String].toLong
|
||||||
if (c != maxCommit){
|
if (c != maxCommit) {
|
||||||
val f = r(2).asInstanceOf[String].split("_")(0)
|
val f = r(2).asInstanceOf[String].split("_")(0)
|
||||||
if (!fileToDeleteKeyMap.contains(f)){
|
if (!fileToDeleteKeyMap.contains(f)) {
|
||||||
fileToDeleteKeyMap(f) = HashSet[String]()
|
fileToDeleteKeyMap(f) = HashSet[String]()
|
||||||
}
|
}
|
||||||
fileToDeleteKeyMap(f).add(key)
|
fileToDeleteKeyMap(f).add(key)
|
||||||
@@ -130,28 +132,30 @@ class DedupeSparkJob (basePath: String,
|
|||||||
val allFiles = fs.listStatus(new Path(s"${basePath}/${duplicatedPartitionPath}"))
|
val allFiles = fs.listStatus(new Path(s"${basePath}/${duplicatedPartitionPath}"))
|
||||||
val fsView = new HoodieTableFileSystemView(metadata, metadata.getActiveTimeline.getCommitTimeline.filterCompletedInstants(), allFiles)
|
val fsView = new HoodieTableFileSystemView(metadata, metadata.getActiveTimeline.getCommitTimeline.filterCompletedInstants(), allFiles)
|
||||||
|
|
||||||
val latestFiles:java.util.List[HoodieDataFile] = fsView.getLatestDataFiles().collect(Collectors.toList[HoodieDataFile]())
|
val latestFiles: java.util.List[HoodieDataFile] = fsView.getLatestDataFiles().collect(Collectors.toList[HoodieDataFile]())
|
||||||
|
|
||||||
val fileNameToPathMap = latestFiles.map(f => (f.getFileId, new Path(f.getPath))).toMap
|
val fileNameToPathMap = latestFiles.map(f => (f.getFileId, new Path(f.getPath))).toMap
|
||||||
val dupeFixPlan = planDuplicateFix()
|
val dupeFixPlan = planDuplicateFix()
|
||||||
|
|
||||||
// 1. Copy all latest files into the temp fix path
|
// 1. Copy all latest files into the temp fix path
|
||||||
fileNameToPathMap.foreach{ case(fileName, filePath) => {
|
fileNameToPathMap.foreach { case (fileName, filePath) => {
|
||||||
val badSuffix = if (dupeFixPlan.contains(fileName)) ".bad" else ""
|
val badSuffix = if (dupeFixPlan.contains(fileName)) ".bad" else ""
|
||||||
val dstPath = new Path(s"${repairOutputPath}/${filePath.getName}${badSuffix}")
|
val dstPath = new Path(s"${repairOutputPath}/${filePath.getName}${badSuffix}")
|
||||||
LOG.info(s"Copying from ${filePath} to ${dstPath}")
|
LOG.info(s"Copying from ${filePath} to ${dstPath}")
|
||||||
FileUtil.copy(fs, filePath, fs, dstPath, false, true, fs.getConf)
|
FileUtil.copy(fs, filePath, fs, dstPath, false, true, fs.getConf)
|
||||||
}}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// 2. Remove duplicates from the bad files
|
// 2. Remove duplicates from the bad files
|
||||||
dupeFixPlan.foreach{case(fileName, keysToSkip) => {
|
dupeFixPlan.foreach { case (fileName, keysToSkip) => {
|
||||||
val commitTime = FSUtils.getCommitTime(fileNameToPathMap(fileName).getName)
|
val commitTime = FSUtils.getCommitTime(fileNameToPathMap(fileName).getName)
|
||||||
val badFilePath = new Path(s"${repairOutputPath}/${fileNameToPathMap(fileName).getName}.bad")
|
val badFilePath = new Path(s"${repairOutputPath}/${fileNameToPathMap(fileName).getName}.bad")
|
||||||
val newFilePath = new Path(s"${repairOutputPath}/${fileNameToPathMap(fileName).getName}")
|
val newFilePath = new Path(s"${repairOutputPath}/${fileNameToPathMap(fileName).getName}")
|
||||||
LOG.info(" Skipping and writing new file for : " + fileName)
|
LOG.info(" Skipping and writing new file for : " + fileName)
|
||||||
SparkHelpers.skipKeysAndWriteNewFile(commitTime, fs, badFilePath, newFilePath, dupeFixPlan(fileName))
|
SparkHelpers.skipKeysAndWriteNewFile(commitTime, fs, badFilePath, newFilePath, dupeFixPlan(fileName))
|
||||||
fs.delete(badFilePath, false)
|
fs.delete(badFilePath, false)
|
||||||
}}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// 3. Check that there are no duplicates anymore.
|
// 3. Check that there are no duplicates anymore.
|
||||||
val df = sqlContext.read.parquet(s"${repairOutputPath}/*.parquet")
|
val df = sqlContext.read.parquet(s"${repairOutputPath}/*.parquet")
|
||||||
@@ -186,6 +190,7 @@ class DedupeSparkJob (basePath: String,
|
|||||||
LOG.info(s"[FOR REAL!!!] Copying from ${srcPath} to ${dstPath}")
|
LOG.info(s"[FOR REAL!!!] Copying from ${srcPath} to ${dstPath}")
|
||||||
FileUtil.copy(fs, srcPath, fs, dstPath, false, true, fs.getConf)
|
FileUtil.copy(fs, srcPath, fs, dstPath, false, true, fs.getConf)
|
||||||
}
|
}
|
||||||
}}
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,9 +17,9 @@
|
|||||||
package com.uber.hoodie.cli
|
package com.uber.hoodie.cli
|
||||||
|
|
||||||
import com.uber.hoodie.avro.HoodieAvroWriteSupport
|
import com.uber.hoodie.avro.HoodieAvroWriteSupport
|
||||||
import com.uber.hoodie.common.{BloomFilter, HoodieJsonPayload}
|
|
||||||
import com.uber.hoodie.common.model.HoodieRecord
|
import com.uber.hoodie.common.model.HoodieRecord
|
||||||
import com.uber.hoodie.common.util.ParquetUtils
|
import com.uber.hoodie.common.util.ParquetUtils
|
||||||
|
import com.uber.hoodie.common.{BloomFilter, HoodieJsonPayload}
|
||||||
import com.uber.hoodie.config.{HoodieIndexConfig, HoodieStorageConfig}
|
import com.uber.hoodie.config.{HoodieIndexConfig, HoodieStorageConfig}
|
||||||
import com.uber.hoodie.io.storage.{HoodieParquetConfig, HoodieParquetWriter}
|
import com.uber.hoodie.io.storage.{HoodieParquetConfig, HoodieParquetWriter}
|
||||||
import org.apache.avro.Schema
|
import org.apache.avro.Schema
|
||||||
@@ -107,7 +107,7 @@ class SparkHelper(sqlContext: SQLContext, fs: FileSystem) {
|
|||||||
* @param file
|
* @param file
|
||||||
* @param sqlContext
|
* @param sqlContext
|
||||||
*/
|
*/
|
||||||
def getKeyCount(file: String, sqlContext: org.apache.spark.sql.SQLContext) ={
|
def getKeyCount(file: String, sqlContext: org.apache.spark.sql.SQLContext) = {
|
||||||
println(getRowKeyDF(file).collect().size)
|
println(getRowKeyDF(file).collect().size)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -122,7 +122,7 @@ class SparkHelper(sqlContext: SQLContext, fs: FileSystem) {
|
|||||||
* @param file
|
* @param file
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
def fileKeysAgainstBF(conf: Configuration, sqlContext: SQLContext, file: String) : Boolean = {
|
def fileKeysAgainstBF(conf: Configuration, sqlContext: SQLContext, file: String): Boolean = {
|
||||||
val bfStr = SparkHelpers.getBloomFilter(file, conf)
|
val bfStr = SparkHelpers.getBloomFilter(file, conf)
|
||||||
val bf = new com.uber.hoodie.common.BloomFilter(bfStr)
|
val bf = new com.uber.hoodie.common.BloomFilter(bfStr)
|
||||||
val foundCount = sqlContext.parquetFile(file)
|
val foundCount = sqlContext.parquetFile(file)
|
||||||
@@ -134,7 +134,7 @@ class SparkHelper(sqlContext: SQLContext, fs: FileSystem) {
|
|||||||
totalCount == foundCount
|
totalCount == foundCount
|
||||||
}
|
}
|
||||||
|
|
||||||
def getDistinctKeyDF(paths: List[String]) : DataFrame = {
|
def getDistinctKeyDF(paths: List[String]): DataFrame = {
|
||||||
sqlContext.read.parquet(paths:_*).select(s"`${HoodieRecord.RECORD_KEY_METADATA_FIELD}`").distinct()
|
sqlContext.read.parquet(paths: _*).select(s"`${HoodieRecord.RECORD_KEY_METADATA_FIELD}`").distinct()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -15,7 +15,9 @@
|
|||||||
~ limitations under the License.
|
~ limitations under the License.
|
||||||
-->
|
-->
|
||||||
|
|
||||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>hoodie</artifactId>
|
<artifactId>hoodie</artifactId>
|
||||||
<groupId>com.uber.hoodie</groupId>
|
<groupId>com.uber.hoodie</groupId>
|
||||||
|
|||||||
@@ -17,25 +17,19 @@
|
|||||||
package com.uber.hoodie;
|
package com.uber.hoodie;
|
||||||
|
|
||||||
import com.google.common.base.Optional;
|
import com.google.common.base.Optional;
|
||||||
|
|
||||||
import com.uber.hoodie.common.model.HoodieCommitMetadata;
|
|
||||||
import com.uber.hoodie.common.model.HoodieDataFile;
|
|
||||||
import com.uber.hoodie.common.model.HoodieKey;
|
import com.uber.hoodie.common.model.HoodieKey;
|
||||||
import com.uber.hoodie.common.model.HoodieRecord;
|
import com.uber.hoodie.common.model.HoodieRecord;
|
||||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||||
import com.uber.hoodie.common.table.TableFileSystemView;
|
|
||||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
|
||||||
import com.uber.hoodie.common.table.view.HoodieTableFileSystemView;
|
|
||||||
import com.uber.hoodie.common.util.FSUtils;
|
import com.uber.hoodie.common.util.FSUtils;
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.exception.HoodieException;
|
|
||||||
import com.uber.hoodie.index.bloom.HoodieBloomIndex;
|
import com.uber.hoodie.index.bloom.HoodieBloomIndex;
|
||||||
|
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
import org.apache.hadoop.fs.Path;
|
|
||||||
import org.apache.log4j.LogManager;
|
import org.apache.log4j.LogManager;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
@@ -46,21 +40,10 @@ import org.apache.spark.sql.Dataset;
|
|||||||
import org.apache.spark.sql.Row;
|
import org.apache.spark.sql.Row;
|
||||||
import org.apache.spark.sql.SQLContext;
|
import org.apache.spark.sql.SQLContext;
|
||||||
import org.apache.spark.sql.types.StructType;
|
import org.apache.spark.sql.types.StructType;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Set;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Provides an RDD based API for accessing/filtering Hoodie tables, based on keys.
|
* Provides an RDD based API for accessing/filtering Hoodie tables, based on keys.
|
||||||
*
|
|
||||||
*/
|
*/
|
||||||
public class HoodieReadClient implements Serializable {
|
public class HoodieReadClient implements Serializable {
|
||||||
|
|
||||||
@@ -70,8 +53,8 @@ public class HoodieReadClient implements Serializable {
|
|||||||
|
|
||||||
private transient final FileSystem fs;
|
private transient final FileSystem fs;
|
||||||
/**
|
/**
|
||||||
* TODO: We need to persist the index type into hoodie.properties and be able to access the
|
* TODO: We need to persist the index type into hoodie.properties and be able to access the index
|
||||||
* index just with a simple basepath pointing to the dataset. Until, then just always assume a
|
* just with a simple basepath pointing to the dataset. Until, then just always assume a
|
||||||
* BloomIndex
|
* BloomIndex
|
||||||
*/
|
*/
|
||||||
private transient final HoodieBloomIndex index;
|
private transient final HoodieBloomIndex index;
|
||||||
@@ -117,7 +100,8 @@ public class HoodieReadClient implements Serializable {
|
|||||||
|
|
||||||
private void assertSqlContext() {
|
private void assertSqlContext() {
|
||||||
if (!sqlContextOpt.isPresent()) {
|
if (!sqlContextOpt.isPresent()) {
|
||||||
throw new IllegalStateException("SQLContext must be set, when performing dataframe operations");
|
throw new IllegalStateException(
|
||||||
|
"SQLContext must be set, when performing dataframe operations");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -158,10 +142,10 @@ public class HoodieReadClient implements Serializable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Checks if the given [Keys] exists in the hoodie table and returns [Key,
|
* Checks if the given [Keys] exists in the hoodie table and returns [Key, Optional[FullFilePath]]
|
||||||
* Optional[FullFilePath]] If the optional FullFilePath value is not present, then the key is
|
* If the optional FullFilePath value is not present, then the key is not found. If the
|
||||||
* not found. If the FullFilePath value is present, it is the path component (without scheme) of
|
* FullFilePath value is present, it is the path component (without scheme) of the URI underlying
|
||||||
* the URI underlying file
|
* file
|
||||||
*/
|
*/
|
||||||
public JavaPairRDD<HoodieKey, Optional<String>> checkExists(JavaRDD<HoodieKey> hoodieKeys) {
|
public JavaPairRDD<HoodieKey, Optional<String>> checkExists(JavaRDD<HoodieKey> hoodieKeys) {
|
||||||
return index.fetchRecordLocation(hoodieKeys, hoodieTable);
|
return index.fetchRecordLocation(hoodieKeys, hoodieTable);
|
||||||
|
|||||||
@@ -50,10 +50,21 @@ import com.uber.hoodie.func.BulkInsertMapFunction;
|
|||||||
import com.uber.hoodie.index.HoodieIndex;
|
import com.uber.hoodie.index.HoodieIndex;
|
||||||
import com.uber.hoodie.io.HoodieCommitArchiveLog;
|
import com.uber.hoodie.io.HoodieCommitArchiveLog;
|
||||||
import com.uber.hoodie.metrics.HoodieMetrics;
|
import com.uber.hoodie.metrics.HoodieMetrics;
|
||||||
import com.uber.hoodie.table.UserDefinedBulkInsertPartitioner;
|
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
|
import com.uber.hoodie.table.UserDefinedBulkInsertPartitioner;
|
||||||
import com.uber.hoodie.table.WorkloadProfile;
|
import com.uber.hoodie.table.WorkloadProfile;
|
||||||
import com.uber.hoodie.table.WorkloadStat;
|
import com.uber.hoodie.table.WorkloadStat;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.text.ParseException;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.Date;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
import org.apache.log4j.LogManager;
|
import org.apache.log4j.LogManager;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
@@ -66,25 +77,12 @@ import org.apache.spark.storage.StorageLevel;
|
|||||||
import scala.Option;
|
import scala.Option;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.nio.charset.StandardCharsets;
|
|
||||||
import java.text.ParseException;
|
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.Date;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Optional;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Hoodie Write Client helps you build datasets on HDFS [insert()] and then
|
* Hoodie Write Client helps you build datasets on HDFS [insert()] and then perform efficient
|
||||||
* perform efficient mutations on a HDFS dataset [upsert()]
|
* mutations on a HDFS dataset [upsert()]
|
||||||
*
|
|
||||||
* Note that, at any given time, there can only be one Spark job performing
|
|
||||||
* these operatons on a Hoodie dataset.
|
|
||||||
*
|
*
|
||||||
|
* Note that, at any given time, there can only be one Spark job performing these operatons on a
|
||||||
|
* Hoodie dataset.
|
||||||
*/
|
*/
|
||||||
public class HoodieWriteClient<T extends HoodieRecordPayload> implements Serializable {
|
public class HoodieWriteClient<T extends HoodieRecordPayload> implements Serializable {
|
||||||
|
|
||||||
@@ -102,7 +100,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
* @param clientConfig
|
* @param clientConfig
|
||||||
* @throws Exception
|
* @throws Exception
|
||||||
*/
|
*/
|
||||||
public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig) throws Exception {
|
public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig)
|
||||||
|
throws Exception {
|
||||||
this(jsc, clientConfig, false);
|
this(jsc, clientConfig, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -111,7 +110,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
* @param clientConfig
|
* @param clientConfig
|
||||||
* @param rollbackInFlight
|
* @param rollbackInFlight
|
||||||
*/
|
*/
|
||||||
public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig, boolean rollbackInFlight) {
|
public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig,
|
||||||
|
boolean rollbackInFlight) {
|
||||||
this.fs = FSUtils.getFs();
|
this.fs = FSUtils.getFs();
|
||||||
this.jsc = jsc;
|
this.jsc = jsc;
|
||||||
this.config = clientConfig;
|
this.config = clientConfig;
|
||||||
@@ -170,8 +170,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
* Inserts the given HoodieRecords, into the table. This API is intended to be used for normal
|
* Inserts the given HoodieRecords, into the table. This API is intended to be used for normal
|
||||||
* writes.
|
* writes.
|
||||||
*
|
*
|
||||||
* This implementation skips the index check and is able to leverage benefits such as
|
* This implementation skips the index check and is able to leverage benefits such as small file
|
||||||
* small file handling/blocking alignment, as with upsert(), by profiling the workload
|
* handling/blocking alignment, as with upsert(), by profiling the workload
|
||||||
*
|
*
|
||||||
* @param records HoodieRecords to insert
|
* @param records HoodieRecords to insert
|
||||||
* @param commitTime Commit Time handle
|
* @param commitTime Commit Time handle
|
||||||
@@ -210,7 +210,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
* @param commitTime Commit Time handle
|
* @param commitTime Commit Time handle
|
||||||
* @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts
|
* @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts
|
||||||
*/
|
*/
|
||||||
public JavaRDD<WriteStatus> bulkInsert(JavaRDD<HoodieRecord<T>> records, final String commitTime) {
|
public JavaRDD<WriteStatus> bulkInsert(JavaRDD<HoodieRecord<T>> records,
|
||||||
|
final String commitTime) {
|
||||||
return bulkInsert(records, commitTime, Option.empty());
|
return bulkInsert(records, commitTime, Option.empty());
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -221,16 +222,18 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
*
|
*
|
||||||
* This implementation uses sortBy (which does range partitioning based on reservoir sampling) and
|
* This implementation uses sortBy (which does range partitioning based on reservoir sampling) and
|
||||||
* attempts to control the numbers of files with less memory compared to the {@link
|
* attempts to control the numbers of files with less memory compared to the {@link
|
||||||
* HoodieWriteClient#insert(JavaRDD, String)}. Optionally it allows users to specify their own partitioner. If
|
* HoodieWriteClient#insert(JavaRDD, String)}. Optionally it allows users to specify their own
|
||||||
* specified then it will be used for repartitioning records. See {@link UserDefinedBulkInsertPartitioner}.
|
* partitioner. If specified then it will be used for repartitioning records. See {@link
|
||||||
|
* UserDefinedBulkInsertPartitioner}.
|
||||||
*
|
*
|
||||||
* @param records HoodieRecords to insert
|
* @param records HoodieRecords to insert
|
||||||
* @param commitTime Commit Time handle
|
* @param commitTime Commit Time handle
|
||||||
* @param bulkInsertPartitioner If specified then it will be used to partition input records before they are
|
* @param bulkInsertPartitioner If specified then it will be used to partition input records
|
||||||
* inserted into hoodie.
|
* before they are inserted into hoodie.
|
||||||
* @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts
|
* @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts
|
||||||
*/
|
*/
|
||||||
public JavaRDD<WriteStatus> bulkInsert(JavaRDD<HoodieRecord<T>> records, final String commitTime,
|
public JavaRDD<WriteStatus> bulkInsert(JavaRDD<HoodieRecord<T>> records,
|
||||||
|
final String commitTime,
|
||||||
Option<UserDefinedBulkInsertPartitioner> bulkInsertPartitioner) {
|
Option<UserDefinedBulkInsertPartitioner> bulkInsertPartitioner) {
|
||||||
writeContext = metrics.getCommitCtx();
|
writeContext = metrics.getCommitCtx();
|
||||||
// Create a Hoodie table which encapsulated the commits and files visible
|
// Create a Hoodie table which encapsulated the commits and files visible
|
||||||
@@ -240,7 +243,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
try {
|
try {
|
||||||
// De-dupe/merge if needed
|
// De-dupe/merge if needed
|
||||||
JavaRDD<HoodieRecord<T>> dedupedRecords =
|
JavaRDD<HoodieRecord<T>> dedupedRecords =
|
||||||
combineOnCondition(config.shouldCombineBeforeInsert(), records, config.getInsertShuffleParallelism());
|
combineOnCondition(config.shouldCombineBeforeInsert(), records,
|
||||||
|
config.getInsertShuffleParallelism());
|
||||||
|
|
||||||
final JavaRDD<HoodieRecord<T>> repartitionedRecords;
|
final JavaRDD<HoodieRecord<T>> repartitionedRecords;
|
||||||
if (bulkInsertPartitioner.isDefined()) {
|
if (bulkInsertPartitioner.isDefined()) {
|
||||||
@@ -259,7 +263,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
}, true, config.getBulkInsertShuffleParallelism());
|
}, true, config.getBulkInsertShuffleParallelism());
|
||||||
}
|
}
|
||||||
JavaRDD<WriteStatus> writeStatusRDD = repartitionedRecords
|
JavaRDD<WriteStatus> writeStatusRDD = repartitionedRecords
|
||||||
.mapPartitionsWithIndex(new BulkInsertMapFunction<T>(commitTime, config, table), true)
|
.mapPartitionsWithIndex(new BulkInsertMapFunction<T>(commitTime, config, table),
|
||||||
|
true)
|
||||||
.flatMap(writeStatuses -> writeStatuses.iterator());
|
.flatMap(writeStatuses -> writeStatuses.iterator());
|
||||||
|
|
||||||
return updateIndexAndCommitIfNeeded(writeStatusRDD, table, commitTime);
|
return updateIndexAndCommitIfNeeded(writeStatusRDD, table, commitTime);
|
||||||
@@ -267,12 +272,13 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
if (e instanceof HoodieInsertException) {
|
if (e instanceof HoodieInsertException) {
|
||||||
throw e;
|
throw e;
|
||||||
}
|
}
|
||||||
throw new HoodieInsertException("Failed to bulk insert for commit time " + commitTime, e);
|
throw new HoodieInsertException("Failed to bulk insert for commit time " + commitTime,
|
||||||
|
e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void commitOnAutoCommit(String commitTime, JavaRDD<WriteStatus> resultRDD) {
|
private void commitOnAutoCommit(String commitTime, JavaRDD<WriteStatus> resultRDD) {
|
||||||
if(config.shouldAutoCommit()) {
|
if (config.shouldAutoCommit()) {
|
||||||
logger.info("Auto commit enabled: Committing " + commitTime);
|
logger.info("Auto commit enabled: Committing " + commitTime);
|
||||||
boolean commitResult = commit(commitTime, resultRDD);
|
boolean commitResult = commit(commitTime, resultRDD);
|
||||||
if (!commitResult) {
|
if (!commitResult) {
|
||||||
@@ -286,24 +292,22 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
private JavaRDD<HoodieRecord<T>> combineOnCondition(boolean condition,
|
private JavaRDD<HoodieRecord<T>> combineOnCondition(boolean condition,
|
||||||
JavaRDD<HoodieRecord<T>> records,
|
JavaRDD<HoodieRecord<T>> records,
|
||||||
int parallelism) {
|
int parallelism) {
|
||||||
if(condition) {
|
if (condition) {
|
||||||
return deduplicateRecords(records, parallelism);
|
return deduplicateRecords(records, parallelism);
|
||||||
}
|
}
|
||||||
return records;
|
return records;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
* Save the workload profile in an intermediate file (here re-using commit files) This is useful
|
||||||
* Save the workload profile in an intermediate file (here re-using commit files)
|
* when performing rollback for MOR datasets. Only updates are recorded in the workload profile
|
||||||
* This is useful when performing rollback for MOR datasets. Only updates are recorded
|
* metadata since updates to log blocks are unknown across batches Inserts (which are new parquet
|
||||||
* in the workload profile metadata since updates to log blocks are unknown across batches
|
* files) are rolled back based on commit time. // TODO : Create a new WorkloadProfile metadata
|
||||||
* Inserts (which are new parquet files) are rolled back based on commit time.
|
* file instead of using HoodieCommitMetadata
|
||||||
* // TODO : Create a new WorkloadProfile metadata file instead of using HoodieCommitMetadata
|
|
||||||
* @param profile
|
|
||||||
* @param commitTime
|
|
||||||
* @throws HoodieCommitException
|
|
||||||
*/
|
*/
|
||||||
private void saveWorkloadProfileMetadataToInflight(WorkloadProfile profile, HoodieTable<T> table, String commitTime) throws HoodieCommitException {
|
private void saveWorkloadProfileMetadataToInflight(WorkloadProfile profile,
|
||||||
|
HoodieTable<T> table,
|
||||||
|
String commitTime) throws HoodieCommitException {
|
||||||
try {
|
try {
|
||||||
HoodieCommitMetadata metadata = new HoodieCommitMetadata();
|
HoodieCommitMetadata metadata = new HoodieCommitMetadata();
|
||||||
profile.getPartitionPaths().stream().forEach(path -> {
|
profile.getPartitionPaths().stream().forEach(path -> {
|
||||||
@@ -320,8 +324,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
Optional<HoodieInstant> instant = activeTimeline.filterInflights().lastInstant();
|
Optional<HoodieInstant> instant = activeTimeline.filterInflights().lastInstant();
|
||||||
activeTimeline.saveToInflight(instant.get(),
|
activeTimeline.saveToInflight(instant.get(),
|
||||||
Optional.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
|
Optional.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
|
||||||
} catch(IOException io) {
|
} catch (IOException io) {
|
||||||
throw new HoodieCommitException("Failed to commit " + commitTime + " unable to save inflight metadata ", io);
|
throw new HoodieCommitException(
|
||||||
|
"Failed to commit " + commitTime + " unable to save inflight metadata ", io);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -358,7 +363,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
return updateIndexAndCommitIfNeeded(writeStatusRDD, hoodieTable, commitTime);
|
return updateIndexAndCommitIfNeeded(writeStatusRDD, hoodieTable, commitTime);
|
||||||
}
|
}
|
||||||
|
|
||||||
private Partitioner getPartitioner(HoodieTable table, boolean isUpsert, WorkloadProfile profile) {
|
private Partitioner getPartitioner(HoodieTable table, boolean isUpsert,
|
||||||
|
WorkloadProfile profile) {
|
||||||
if (isUpsert) {
|
if (isUpsert) {
|
||||||
return table.getUpsertPartitioner(profile);
|
return table.getUpsertPartitioner(profile);
|
||||||
} else {
|
} else {
|
||||||
@@ -366,7 +372,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private JavaRDD<WriteStatus> updateIndexAndCommitIfNeeded(JavaRDD<WriteStatus> writeStatusRDD, HoodieTable<T> table, String commitTime) {
|
private JavaRDD<WriteStatus> updateIndexAndCommitIfNeeded(JavaRDD<WriteStatus> writeStatusRDD,
|
||||||
|
HoodieTable<T> table, String commitTime) {
|
||||||
// Update the index back
|
// Update the index back
|
||||||
JavaRDD<WriteStatus> statuses = index.updateLocation(writeStatusRDD, table);
|
JavaRDD<WriteStatus> statuses = index.updateLocation(writeStatusRDD, table);
|
||||||
// Trigger the insert and collect statuses
|
// Trigger the insert and collect statuses
|
||||||
@@ -375,10 +382,13 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
return statuses;
|
return statuses;
|
||||||
}
|
}
|
||||||
|
|
||||||
private JavaRDD<HoodieRecord<T>> partition(JavaRDD<HoodieRecord<T>> dedupedRecords, Partitioner partitioner) {
|
private JavaRDD<HoodieRecord<T>> partition(JavaRDD<HoodieRecord<T>> dedupedRecords,
|
||||||
|
Partitioner partitioner) {
|
||||||
return dedupedRecords
|
return dedupedRecords
|
||||||
.mapToPair(record ->
|
.mapToPair(record ->
|
||||||
new Tuple2<>(new Tuple2<>(record.getKey(), Option.apply(record.getCurrentLocation())), record))
|
new Tuple2<>(
|
||||||
|
new Tuple2<>(record.getKey(), Option.apply(record.getCurrentLocation())),
|
||||||
|
record))
|
||||||
.partitionBy(partitioner)
|
.partitionBy(partitioner)
|
||||||
.map(tuple -> tuple._2());
|
.map(tuple -> tuple._2());
|
||||||
}
|
}
|
||||||
@@ -438,7 +448,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
|
|
||||||
// We cannot have unbounded commit files. Archive commits if we have to archive
|
// We cannot have unbounded commit files. Archive commits if we have to archive
|
||||||
archiveLog.archiveIfRequired();
|
archiveLog.archiveIfRequired();
|
||||||
if(config.isAutoClean()) {
|
if (config.isAutoClean()) {
|
||||||
// Call clean to cleanup if there is anything to cleanup after the commit,
|
// Call clean to cleanup if there is anything to cleanup after the commit,
|
||||||
logger.info("Auto cleaning is enabled. Running cleaner now");
|
logger.info("Auto cleaning is enabled. Running cleaner now");
|
||||||
clean(commitTime);
|
clean(commitTime);
|
||||||
@@ -465,12 +475,12 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Savepoint a specific commit. Latest version of data files as of the passed in commitTime
|
* Savepoint a specific commit. Latest version of data files as of the passed in commitTime will
|
||||||
* will be referenced in the savepoint and will never be cleaned. The savepointed commit
|
* be referenced in the savepoint and will never be cleaned. The savepointed commit will never be
|
||||||
* will never be rolled back or archived.
|
* rolled back or archived.
|
||||||
*
|
*
|
||||||
* This gives an option to rollback the state to the savepoint anytime.
|
* This gives an option to rollback the state to the savepoint anytime. Savepoint needs to be
|
||||||
* Savepoint needs to be manually created and deleted.
|
* manually created and deleted.
|
||||||
*
|
*
|
||||||
* Savepoint should be on a commit that could not have been cleaned.
|
* Savepoint should be on a commit that could not have been cleaned.
|
||||||
*
|
*
|
||||||
@@ -491,12 +501,12 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Savepoint a specific commit. Latest version of data files as of the passed in commitTime
|
* Savepoint a specific commit. Latest version of data files as of the passed in commitTime will
|
||||||
* will be referenced in the savepoint and will never be cleaned. The savepointed commit
|
* be referenced in the savepoint and will never be cleaned. The savepointed commit will never be
|
||||||
* will never be rolled back or archived.
|
* rolled back or archived.
|
||||||
*
|
*
|
||||||
* This gives an option to rollback the state to the savepoint anytime.
|
* This gives an option to rollback the state to the savepoint anytime. Savepoint needs to be
|
||||||
* Savepoint needs to be manually created and deleted.
|
* manually created and deleted.
|
||||||
*
|
*
|
||||||
* Savepoint should be on a commit that could not have been cleaned.
|
* Savepoint should be on a commit that could not have been cleaned.
|
||||||
*
|
*
|
||||||
@@ -510,9 +520,11 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
|
.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
|
||||||
Optional<HoodieInstant> cleanInstant = table.getCompletedCleanTimeline().lastInstant();
|
Optional<HoodieInstant> cleanInstant = table.getCompletedCleanTimeline().lastInstant();
|
||||||
|
|
||||||
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
|
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION,
|
||||||
if(!table.getCompletedCommitTimeline().containsInstant(commitInstant)) {
|
commitTime);
|
||||||
throw new HoodieSavepointException("Could not savepoint non-existing commit " + commitInstant);
|
if (!table.getCompletedCommitTimeline().containsInstant(commitInstant)) {
|
||||||
|
throw new HoodieSavepointException(
|
||||||
|
"Could not savepoint non-existing commit " + commitInstant);
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@@ -534,7 +546,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
+ lastCommitRetained);
|
+ lastCommitRetained);
|
||||||
|
|
||||||
Map<String, List<String>> latestFilesMap = jsc.parallelize(
|
Map<String, List<String>> latestFilesMap = jsc.parallelize(
|
||||||
FSUtils.getAllPartitionPaths(fs, table.getMetaClient().getBasePath(), config.shouldAssumeDatePartitioning()))
|
FSUtils.getAllPartitionPaths(fs, table.getMetaClient().getBasePath(),
|
||||||
|
config.shouldAssumeDatePartitioning()))
|
||||||
.mapToPair((PairFunction<String, String, List<String>>) partitionPath -> {
|
.mapToPair((PairFunction<String, String, List<String>>) partitionPath -> {
|
||||||
// Scan all partitions files with this commit time
|
// Scan all partitions files with this commit time
|
||||||
logger.info("Collecting latest files in partition path " + partitionPath);
|
logger.info("Collecting latest files in partition path " + partitionPath);
|
||||||
@@ -559,8 +572,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Delete a savepoint that was created. Once the savepoint is deleted, the commit can be rolled back
|
* Delete a savepoint that was created. Once the savepoint is deleted, the commit can be
|
||||||
* and cleaner may clean up data files.
|
* rolled back and cleaner may clean up data files.
|
||||||
*
|
*
|
||||||
* @param savepointTime - delete the savepoint
|
* @param savepointTime - delete the savepoint
|
||||||
* @return true if the savepoint was deleted successfully
|
* @return true if the savepoint was deleted successfully
|
||||||
@@ -586,9 +599,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Rollback the state to the savepoint.
|
* Rollback the state to the savepoint. WARNING: This rolls back recent commits and deletes data
|
||||||
* WARNING: This rolls back recent commits and deletes data files. Queries accessing the files
|
* files. Queries accessing the files will mostly fail. This should be done during a downtime.
|
||||||
* will mostly fail. This should be done during a downtime.
|
|
||||||
*
|
*
|
||||||
* @param savepointTime - savepoint time to rollback to
|
* @param savepointTime - savepoint time to rollback to
|
||||||
* @return true if the savepoint was rolled back to successfully
|
* @return true if the savepoint was rolled back to successfully
|
||||||
@@ -616,7 +628,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
|
|
||||||
// Make sure the rollback was successful
|
// Make sure the rollback was successful
|
||||||
Optional<HoodieInstant> lastInstant =
|
Optional<HoodieInstant> lastInstant =
|
||||||
activeTimeline.reload().getCommitsAndCompactionsTimeline().filterCompletedInstants().lastInstant();
|
activeTimeline.reload().getCommitsAndCompactionsTimeline().filterCompletedInstants()
|
||||||
|
.lastInstant();
|
||||||
Preconditions.checkArgument(lastInstant.isPresent());
|
Preconditions.checkArgument(lastInstant.isPresent());
|
||||||
Preconditions.checkArgument(lastInstant.get().getTimestamp().equals(savepointTime),
|
Preconditions.checkArgument(lastInstant.get().getTimestamp().equals(savepointTime),
|
||||||
savepointTime + "is not the last commit after rolling back " + commitsToRollback
|
savepointTime + "is not the last commit after rolling back " + commitsToRollback
|
||||||
@@ -625,12 +638,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Rollback the (inflight/committed) record changes with the given commit time.
|
* Rollback the (inflight/committed) record changes with the given commit time. Four steps: (1)
|
||||||
* Four steps:
|
* Atomically unpublish this commit (2) clean indexing data, (3) clean new generated parquet
|
||||||
* (1) Atomically unpublish this commit
|
* files. (4) Finally delete .commit or .inflight file,
|
||||||
* (2) clean indexing data,
|
|
||||||
* (3) clean new generated parquet files.
|
|
||||||
* (4) Finally delete .commit or .inflight file,
|
|
||||||
*/
|
*/
|
||||||
public boolean rollback(final String commitTime) throws HoodieRollbackException {
|
public boolean rollback(final String commitTime) throws HoodieRollbackException {
|
||||||
rollback(Lists.newArrayList(commitTime));
|
rollback(Lists.newArrayList(commitTime));
|
||||||
@@ -638,7 +648,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
}
|
}
|
||||||
|
|
||||||
private void rollback(List<String> commits) {
|
private void rollback(List<String> commits) {
|
||||||
if(commits.isEmpty()) {
|
if (commits.isEmpty()) {
|
||||||
logger.info("List of commits to rollback is empty");
|
logger.info("List of commits to rollback is empty");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@@ -702,7 +712,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
Optional<Long> durationInMs = Optional.empty();
|
Optional<Long> durationInMs = Optional.empty();
|
||||||
if (context != null) {
|
if (context != null) {
|
||||||
durationInMs = Optional.of(metrics.getDurationInMs(context.stop()));
|
durationInMs = Optional.of(metrics.getDurationInMs(context.stop()));
|
||||||
Long numFilesDeleted = stats.stream().mapToLong(stat -> stat.getSuccessDeleteFiles().size()).sum();
|
Long numFilesDeleted = stats.stream()
|
||||||
|
.mapToLong(stat -> stat.getSuccessDeleteFiles().size())
|
||||||
|
.sum();
|
||||||
metrics.updateRollbackMetrics(durationInMs.get(), numFilesDeleted);
|
metrics.updateRollbackMetrics(durationInMs.get(), numFilesDeleted);
|
||||||
}
|
}
|
||||||
HoodieRollbackMetadata rollbackMetadata =
|
HoodieRollbackMetadata rollbackMetadata =
|
||||||
@@ -733,9 +745,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Clean up any stale/old files/data lying around (either on file storage or index storage)
|
* Clean up any stale/old files/data lying around (either on file storage or index storage) based
|
||||||
* based on the configurations and CleaningPolicy used. (typically files that no longer can be used
|
* on the configurations and CleaningPolicy used. (typically files that no longer can be used by a
|
||||||
* by a running query can be cleaned)
|
* running query can be cleaned)
|
||||||
*/
|
*/
|
||||||
public void clean() throws HoodieIOException {
|
public void clean() throws HoodieIOException {
|
||||||
String startCleanTime = HoodieActiveTimeline.createNewCommitTime();
|
String startCleanTime = HoodieActiveTimeline.createNewCommitTime();
|
||||||
@@ -743,9 +755,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Clean up any stale/old files/data lying around (either on file storage or index storage)
|
* Clean up any stale/old files/data lying around (either on file storage or index storage) based
|
||||||
* based on the configurations and CleaningPolicy used. (typically files that no longer can be used
|
* on the configurations and CleaningPolicy used. (typically files that no longer can be used by a
|
||||||
* by a running query can be cleaned)
|
* running query can be cleaned)
|
||||||
*/
|
*/
|
||||||
private void clean(String startCleanTime) throws HoodieIOException {
|
private void clean(String startCleanTime) throws HoodieIOException {
|
||||||
try {
|
try {
|
||||||
@@ -811,14 +823,16 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
}
|
}
|
||||||
|
|
||||||
public static SparkConf registerClasses(SparkConf conf) {
|
public static SparkConf registerClasses(SparkConf conf) {
|
||||||
conf.registerKryoClasses(new Class[]{HoodieWriteConfig.class, HoodieRecord.class, HoodieKey.class});
|
conf.registerKryoClasses(
|
||||||
|
new Class[]{HoodieWriteConfig.class, HoodieRecord.class, HoodieKey.class});
|
||||||
return conf;
|
return conf;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Deduplicate Hoodie records, using the given deduplication function.
|
* Deduplicate Hoodie records, using the given deduplication function.
|
||||||
*/
|
*/
|
||||||
private JavaRDD<HoodieRecord<T>> deduplicateRecords(JavaRDD<HoodieRecord<T>> records, int parallelism) {
|
private JavaRDD<HoodieRecord<T>> deduplicateRecords(JavaRDD<HoodieRecord<T>> records,
|
||||||
|
int parallelism) {
|
||||||
return records
|
return records
|
||||||
.mapToPair(record -> new Tuple2<>(record.getKey(), record))
|
.mapToPair(record -> new Tuple2<>(record.getKey(), record))
|
||||||
.reduceByKey((rec1, rec2) -> {
|
.reduceByKey((rec1, rec2) -> {
|
||||||
@@ -833,8 +847,6 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Cleanup all inflight commits
|
* Cleanup all inflight commits
|
||||||
*
|
|
||||||
* @throws IOException
|
|
||||||
*/
|
*/
|
||||||
private void rollbackInflightCommits() {
|
private void rollbackInflightCommits() {
|
||||||
HoodieTable<T> table = HoodieTable
|
HoodieTable<T> table = HoodieTable
|
||||||
|
|||||||
@@ -19,7 +19,6 @@ package com.uber.hoodie;
|
|||||||
import com.uber.hoodie.common.model.HoodieKey;
|
import com.uber.hoodie.common.model.HoodieKey;
|
||||||
import com.uber.hoodie.common.model.HoodieRecord;
|
import com.uber.hoodie.common.model.HoodieRecord;
|
||||||
import com.uber.hoodie.common.model.HoodieWriteStat;
|
import com.uber.hoodie.common.model.HoodieWriteStat;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
@@ -50,12 +49,14 @@ public class WriteStatus implements Serializable {
|
|||||||
private long totalErrorRecords = 0;
|
private long totalErrorRecords = 0;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Mark write as success, optionally using given parameters for the purpose of calculating
|
* Mark write as success, optionally using given parameters for the purpose of calculating some
|
||||||
* some aggregate metrics. This method is not meant to cache passed arguments, since WriteStatus
|
* aggregate metrics. This method is not meant to cache passed arguments, since WriteStatus
|
||||||
* objects are collected in Spark Driver.
|
* objects are collected in Spark Driver.
|
||||||
*
|
*
|
||||||
* @param record deflated {@code HoodieRecord} containing information that uniquely identifies it.
|
* @param record deflated {@code HoodieRecord} containing information that uniquely identifies
|
||||||
* @param optionalRecordMetadata optional metadata related to data contained in {@link HoodieRecord} before deflation.
|
* it.
|
||||||
|
* @param optionalRecordMetadata optional metadata related to data contained in {@link
|
||||||
|
* HoodieRecord} before deflation.
|
||||||
*/
|
*/
|
||||||
public void markSuccess(HoodieRecord record,
|
public void markSuccess(HoodieRecord record,
|
||||||
Optional<Map<String, String>> optionalRecordMetadata) {
|
Optional<Map<String, String>> optionalRecordMetadata) {
|
||||||
@@ -64,12 +65,14 @@ public class WriteStatus implements Serializable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Mark write as failed, optionally using given parameters for the purpose of calculating
|
* Mark write as failed, optionally using given parameters for the purpose of calculating some
|
||||||
* some aggregate metrics. This method is not meant to cache passed arguments, since WriteStatus
|
* aggregate metrics. This method is not meant to cache passed arguments, since WriteStatus
|
||||||
* objects are collected in Spark Driver.
|
* objects are collected in Spark Driver.
|
||||||
*
|
*
|
||||||
* @param record deflated {@code HoodieRecord} containing information that uniquely identifies it.
|
* @param record deflated {@code HoodieRecord} containing information that uniquely identifies
|
||||||
* @param optionalRecordMetadata optional metadata related to data contained in {@link HoodieRecord} before deflation.
|
* it.
|
||||||
|
* @param optionalRecordMetadata optional metadata related to data contained in {@link
|
||||||
|
* HoodieRecord} before deflation.
|
||||||
*/
|
*/
|
||||||
public void markFailure(HoodieRecord record, Throwable t,
|
public void markFailure(HoodieRecord record, Throwable t,
|
||||||
Optional<Map<String, String>> optionalRecordMetadata) {
|
Optional<Map<String, String>> optionalRecordMetadata) {
|
||||||
@@ -139,7 +142,9 @@ public class WriteStatus implements Serializable {
|
|||||||
return totalRecords;
|
return totalRecords;
|
||||||
}
|
}
|
||||||
|
|
||||||
public long getTotalErrorRecords() { return totalErrorRecords; }
|
public long getTotalErrorRecords() {
|
||||||
|
return totalErrorRecords;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
|
|||||||
@@ -17,14 +17,15 @@
|
|||||||
package com.uber.hoodie.config;
|
package com.uber.hoodie.config;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Properties;
|
import java.util.Properties;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Default Way to load Hoodie config through a java.util.Properties
|
* Default Way to load Hoodie config through a java.util.Properties
|
||||||
*/
|
*/
|
||||||
public class DefaultHoodieConfig implements Serializable {
|
public class DefaultHoodieConfig implements Serializable {
|
||||||
|
|
||||||
protected final Properties props;
|
protected final Properties props;
|
||||||
|
|
||||||
public DefaultHoodieConfig(Properties props) {
|
public DefaultHoodieConfig(Properties props) {
|
||||||
this.props = props;
|
this.props = props;
|
||||||
}
|
}
|
||||||
@@ -40,7 +41,8 @@ public class DefaultHoodieConfig implements Serializable {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void setDefaultOnCondition(Properties props, boolean condition, DefaultHoodieConfig config) {
|
public static void setDefaultOnCondition(Properties props, boolean condition,
|
||||||
|
DefaultHoodieConfig config) {
|
||||||
if (condition) {
|
if (condition) {
|
||||||
props.putAll(config.getProps());
|
props.putAll(config.getProps());
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -19,21 +19,20 @@ package com.uber.hoodie.config;
|
|||||||
import com.google.common.base.Preconditions;
|
import com.google.common.base.Preconditions;
|
||||||
import com.uber.hoodie.common.model.HoodieAvroPayload;
|
import com.uber.hoodie.common.model.HoodieAvroPayload;
|
||||||
import com.uber.hoodie.common.model.HoodieCleaningPolicy;
|
import com.uber.hoodie.common.model.HoodieCleaningPolicy;
|
||||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
|
||||||
import com.uber.hoodie.io.compact.strategy.CompactionStrategy;
|
import com.uber.hoodie.io.compact.strategy.CompactionStrategy;
|
||||||
import com.uber.hoodie.io.compact.strategy.LogFileSizeBasedCompactionStrategy;
|
import com.uber.hoodie.io.compact.strategy.LogFileSizeBasedCompactionStrategy;
|
||||||
|
|
||||||
import javax.annotation.concurrent.Immutable;
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileReader;
|
import java.io.FileReader;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Properties;
|
import java.util.Properties;
|
||||||
|
import javax.annotation.concurrent.Immutable;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Compaction related config
|
* Compaction related config
|
||||||
*/
|
*/
|
||||||
@Immutable
|
@Immutable
|
||||||
public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
||||||
|
|
||||||
public static final String CLEANER_POLICY_PROP = "hoodie.cleaner.policy";
|
public static final String CLEANER_POLICY_PROP = "hoodie.cleaner.policy";
|
||||||
private static final String DEFAULT_CLEANER_POLICY =
|
private static final String DEFAULT_CLEANER_POLICY =
|
||||||
HoodieCleaningPolicy.KEEP_LATEST_COMMITS.name();
|
HoodieCleaningPolicy.KEEP_LATEST_COMMITS.name();
|
||||||
@@ -66,7 +65,9 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
|||||||
public static final String DEFAULT_PARQUET_SMALL_FILE_LIMIT_BYTES = String.valueOf(0);
|
public static final String DEFAULT_PARQUET_SMALL_FILE_LIMIT_BYTES = String.valueOf(0);
|
||||||
|
|
||||||
|
|
||||||
/** Configs related to specific table types **/
|
/**
|
||||||
|
* Configs related to specific table types
|
||||||
|
**/
|
||||||
// Number of inserts, that will be put each partition/bucket for writing
|
// Number of inserts, that will be put each partition/bucket for writing
|
||||||
public static final String COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE = "hoodie.copyonwrite.insert.split.size";
|
public static final String COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE = "hoodie.copyonwrite.insert.split.size";
|
||||||
// The rationale to pick the insert parallelism is the following. Writing out 100MB files,
|
// The rationale to pick the insert parallelism is the following. Writing out 100MB files,
|
||||||
@@ -82,7 +83,8 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
|||||||
// This value is used as a guesstimate for the record size, if we can't determine this from previous commits
|
// This value is used as a guesstimate for the record size, if we can't determine this from previous commits
|
||||||
public static final String COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = "hoodie.copyonwrite.record.size.estimate";
|
public static final String COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = "hoodie.copyonwrite.record.size.estimate";
|
||||||
// Used to determine how much more can be packed into a small file, before it exceeds the size limit.
|
// Used to determine how much more can be packed into a small file, before it exceeds the size limit.
|
||||||
public static final String DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = String.valueOf(1024);
|
public static final String DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = String
|
||||||
|
.valueOf(1024);
|
||||||
|
|
||||||
public static final String CLEANER_PARALLELISM = "hoodie.cleaner.parallelism";
|
public static final String CLEANER_PARALLELISM = "hoodie.cleaner.parallelism";
|
||||||
public static final String DEFAULT_CLEANER_PARALLELISM = String.valueOf(200);
|
public static final String DEFAULT_CLEANER_PARALLELISM = String.valueOf(200);
|
||||||
@@ -93,7 +95,8 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
|||||||
|
|
||||||
public static final String COMPACTION_STRATEGY_PROP = "hoodie.compaction.strategy";
|
public static final String COMPACTION_STRATEGY_PROP = "hoodie.compaction.strategy";
|
||||||
// 200GB of target IO per compaction
|
// 200GB of target IO per compaction
|
||||||
public static final String DEFAULT_COMPACTION_STRATEGY = LogFileSizeBasedCompactionStrategy.class.getName();
|
public static final String DEFAULT_COMPACTION_STRATEGY = LogFileSizeBasedCompactionStrategy.class
|
||||||
|
.getName();
|
||||||
|
|
||||||
// used to merge records written to log file
|
// used to merge records written to log file
|
||||||
public static final String DEFAULT_PAYLOAD_CLASS = HoodieAvroPayload.class.getName();
|
public static final String DEFAULT_PAYLOAD_CLASS = HoodieAvroPayload.class.getName();
|
||||||
@@ -108,6 +111,7 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public static class Builder {
|
public static class Builder {
|
||||||
|
|
||||||
private final Properties props = new Properties();
|
private final Properties props = new Properties();
|
||||||
|
|
||||||
public Builder fromFile(File propertiesFile) throws IOException {
|
public Builder fromFile(File propertiesFile) throws IOException {
|
||||||
@@ -174,12 +178,14 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public Builder autoTuneInsertSplits(boolean autoTuneInsertSplits) {
|
public Builder autoTuneInsertSplits(boolean autoTuneInsertSplits) {
|
||||||
props.setProperty(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS, String.valueOf(autoTuneInsertSplits));
|
props.setProperty(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS,
|
||||||
|
String.valueOf(autoTuneInsertSplits));
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Builder approxRecordSize(int recordSizeEstimate) {
|
public Builder approxRecordSize(int recordSizeEstimate) {
|
||||||
props.setProperty(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE, String.valueOf(recordSizeEstimate));
|
props.setProperty(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE,
|
||||||
|
String.valueOf(recordSizeEstimate));
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -199,7 +205,8 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public Builder withTargetIOPerCompactionInMB(long targetIOPerCompactionInMB) {
|
public Builder withTargetIOPerCompactionInMB(long targetIOPerCompactionInMB) {
|
||||||
props.setProperty(TARGET_IO_PER_COMPACTION_IN_MB_PROP, String.valueOf(targetIOPerCompactionInMB));
|
props.setProperty(TARGET_IO_PER_COMPACTION_IN_MB_PROP,
|
||||||
|
String.valueOf(targetIOPerCompactionInMB));
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -228,7 +235,8 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
|||||||
setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS),
|
setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS),
|
||||||
COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS, DEFAULT_COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS);
|
COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS, DEFAULT_COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS);
|
||||||
setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE),
|
setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE),
|
||||||
COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE, DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE);
|
COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE,
|
||||||
|
DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE);
|
||||||
setDefaultOnCondition(props, !props.containsKey(CLEANER_PARALLELISM),
|
setDefaultOnCondition(props, !props.containsKey(CLEANER_PARALLELISM),
|
||||||
CLEANER_PARALLELISM, DEFAULT_CLEANER_PARALLELISM);
|
CLEANER_PARALLELISM, DEFAULT_CLEANER_PARALLELISM);
|
||||||
setDefaultOnCondition(props, !props.containsKey(COMPACTION_STRATEGY_PROP),
|
setDefaultOnCondition(props, !props.containsKey(COMPACTION_STRATEGY_PROP),
|
||||||
|
|||||||
@@ -16,14 +16,12 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.config;
|
package com.uber.hoodie.config;
|
||||||
|
|
||||||
import com.google.common.base.Preconditions;
|
|
||||||
import com.uber.hoodie.index.HoodieIndex;
|
import com.uber.hoodie.index.HoodieIndex;
|
||||||
|
|
||||||
import javax.annotation.concurrent.Immutable;
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileReader;
|
import java.io.FileReader;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Properties;
|
import java.util.Properties;
|
||||||
|
import javax.annotation.concurrent.Immutable;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Indexing related config
|
* Indexing related config
|
||||||
@@ -64,6 +62,7 @@ public class HoodieIndexConfig extends DefaultHoodieConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public static class Builder {
|
public static class Builder {
|
||||||
|
|
||||||
private final Properties props = new Properties();
|
private final Properties props = new Properties();
|
||||||
|
|
||||||
public Builder fromFile(File propertiesFile) throws IOException {
|
public Builder fromFile(File propertiesFile) throws IOException {
|
||||||
|
|||||||
@@ -17,12 +17,11 @@
|
|||||||
package com.uber.hoodie.config;
|
package com.uber.hoodie.config;
|
||||||
|
|
||||||
import com.uber.hoodie.metrics.MetricsReporterType;
|
import com.uber.hoodie.metrics.MetricsReporterType;
|
||||||
|
|
||||||
import javax.annotation.concurrent.Immutable;
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileReader;
|
import java.io.FileReader;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Properties;
|
import java.util.Properties;
|
||||||
|
import javax.annotation.concurrent.Immutable;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Fetch the configurations used by the Metrics system.
|
* Fetch the configurations used by the Metrics system.
|
||||||
@@ -56,6 +55,7 @@ public class HoodieMetricsConfig extends DefaultHoodieConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public static class Builder {
|
public static class Builder {
|
||||||
|
|
||||||
private final Properties props = new Properties();
|
private final Properties props = new Properties();
|
||||||
|
|
||||||
public Builder fromFile(File propertiesFile) throws IOException {
|
public Builder fromFile(File propertiesFile) throws IOException {
|
||||||
|
|||||||
@@ -16,17 +16,18 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.config;
|
package com.uber.hoodie.config;
|
||||||
|
|
||||||
import javax.annotation.concurrent.Immutable;
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileReader;
|
import java.io.FileReader;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Properties;
|
import java.util.Properties;
|
||||||
|
import javax.annotation.concurrent.Immutable;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Storage related config
|
* Storage related config
|
||||||
*/
|
*/
|
||||||
@Immutable
|
@Immutable
|
||||||
public class HoodieStorageConfig extends DefaultHoodieConfig {
|
public class HoodieStorageConfig extends DefaultHoodieConfig {
|
||||||
|
|
||||||
public static final String PARQUET_FILE_MAX_BYTES = "hoodie.parquet.max.file.size";
|
public static final String PARQUET_FILE_MAX_BYTES = "hoodie.parquet.max.file.size";
|
||||||
public static final String DEFAULT_PARQUET_FILE_MAX_BYTES = String.valueOf(120 * 1024 * 1024);
|
public static final String DEFAULT_PARQUET_FILE_MAX_BYTES = String.valueOf(120 * 1024 * 1024);
|
||||||
public static final String PARQUET_BLOCK_SIZE_BYTES = "hoodie.parquet.block.size";
|
public static final String PARQUET_BLOCK_SIZE_BYTES = "hoodie.parquet.block.size";
|
||||||
@@ -43,6 +44,7 @@ public class HoodieStorageConfig extends DefaultHoodieConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public static class Builder {
|
public static class Builder {
|
||||||
|
|
||||||
private final Properties props = new Properties();
|
private final Properties props = new Properties();
|
||||||
|
|
||||||
public Builder fromFile(File propertiesFile) throws IOException {
|
public Builder fromFile(File propertiesFile) throws IOException {
|
||||||
|
|||||||
@@ -24,21 +24,21 @@ import com.uber.hoodie.common.util.ReflectionUtils;
|
|||||||
import com.uber.hoodie.index.HoodieIndex;
|
import com.uber.hoodie.index.HoodieIndex;
|
||||||
import com.uber.hoodie.io.compact.strategy.CompactionStrategy;
|
import com.uber.hoodie.io.compact.strategy.CompactionStrategy;
|
||||||
import com.uber.hoodie.metrics.MetricsReporterType;
|
import com.uber.hoodie.metrics.MetricsReporterType;
|
||||||
import org.apache.spark.storage.StorageLevel;
|
|
||||||
|
|
||||||
import javax.annotation.concurrent.Immutable;
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileReader;
|
import java.io.FileReader;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Properties;
|
import java.util.Properties;
|
||||||
|
import javax.annotation.concurrent.Immutable;
|
||||||
|
import org.apache.spark.storage.StorageLevel;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Class storing configs for the {@link com.uber.hoodie.HoodieWriteClient}
|
* Class storing configs for the {@link com.uber.hoodie.HoodieWriteClient}
|
||||||
*/
|
*/
|
||||||
@Immutable
|
@Immutable
|
||||||
public class HoodieWriteConfig extends DefaultHoodieConfig {
|
public class HoodieWriteConfig extends DefaultHoodieConfig {
|
||||||
|
|
||||||
private static final String BASE_PATH_PROP = "hoodie.base.path";
|
private static final String BASE_PATH_PROP = "hoodie.base.path";
|
||||||
private static final String AVRO_SCHEMA = "hoodie.avro.schema";
|
private static final String AVRO_SCHEMA = "hoodie.avro.schema";
|
||||||
public static final String TABLE_NAME = "hoodie.table.name";
|
public static final String TABLE_NAME = "hoodie.table.name";
|
||||||
@@ -141,7 +141,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public int getParquetSmallFileLimit() {
|
public int getParquetSmallFileLimit() {
|
||||||
return Integer.parseInt(props.getProperty(HoodieCompactionConfig.PARQUET_SMALL_FILE_LIMIT_BYTES));
|
return Integer
|
||||||
|
.parseInt(props.getProperty(HoodieCompactionConfig.PARQUET_SMALL_FILE_LIMIT_BYTES));
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getCopyOnWriteInsertSplitSize() {
|
public int getCopyOnWriteInsertSplitSize() {
|
||||||
@@ -177,11 +178,13 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public CompactionStrategy getCompactionStrategy() {
|
public CompactionStrategy getCompactionStrategy() {
|
||||||
return ReflectionUtils.loadClass(props.getProperty(HoodieCompactionConfig.COMPACTION_STRATEGY_PROP));
|
return ReflectionUtils
|
||||||
|
.loadClass(props.getProperty(HoodieCompactionConfig.COMPACTION_STRATEGY_PROP));
|
||||||
}
|
}
|
||||||
|
|
||||||
public Long getTargetIOPerCompactionInMB() {
|
public Long getTargetIOPerCompactionInMB() {
|
||||||
return Long.parseLong(props.getProperty(HoodieCompactionConfig.TARGET_IO_PER_COMPACTION_IN_MB_PROP));
|
return Long
|
||||||
|
.parseLong(props.getProperty(HoodieCompactionConfig.TARGET_IO_PER_COMPACTION_IN_MB_PROP));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -216,7 +219,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public boolean getBloomIndexPruneByRanges() {
|
public boolean getBloomIndexPruneByRanges() {
|
||||||
return Boolean.parseBoolean(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_PRUNE_BY_RANGES_PROP));
|
return Boolean
|
||||||
|
.parseBoolean(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_PRUNE_BY_RANGES_PROP));
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean getBloomIndexUseCaching() {
|
public boolean getBloomIndexUseCaching() {
|
||||||
@@ -271,8 +275,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
public static class Builder {
|
public static class Builder {
|
||||||
|
|
||||||
private final Properties props = new Properties();
|
private final Properties props = new Properties();
|
||||||
private boolean isIndexConfigSet = false;
|
private boolean isIndexConfigSet = false;
|
||||||
private boolean isStorageConfigSet = false;
|
private boolean isStorageConfigSet = false;
|
||||||
@@ -371,7 +375,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public Builder withAssumeDatePartitioning(boolean assumeDatePartitioning) {
|
public Builder withAssumeDatePartitioning(boolean assumeDatePartitioning) {
|
||||||
props.setProperty(HOODIE_ASSUME_DATE_PARTITIONING_PROP, String.valueOf(assumeDatePartitioning));
|
props.setProperty(HOODIE_ASSUME_DATE_PARTITIONING_PROP,
|
||||||
|
String.valueOf(assumeDatePartitioning));
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -386,7 +391,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
|||||||
Preconditions.checkArgument(config.getBasePath() != null);
|
Preconditions.checkArgument(config.getBasePath() != null);
|
||||||
setDefaultOnCondition(props, !props.containsKey(INSERT_PARALLELISM), INSERT_PARALLELISM,
|
setDefaultOnCondition(props, !props.containsKey(INSERT_PARALLELISM), INSERT_PARALLELISM,
|
||||||
DEFAULT_PARALLELISM);
|
DEFAULT_PARALLELISM);
|
||||||
setDefaultOnCondition(props, !props.containsKey(BULKINSERT_PARALLELISM), BULKINSERT_PARALLELISM,
|
setDefaultOnCondition(props, !props.containsKey(BULKINSERT_PARALLELISM),
|
||||||
|
BULKINSERT_PARALLELISM,
|
||||||
DEFAULT_PARALLELISM);
|
DEFAULT_PARALLELISM);
|
||||||
setDefaultOnCondition(props, !props.containsKey(UPSERT_PARALLELISM), UPSERT_PARALLELISM,
|
setDefaultOnCondition(props, !props.containsKey(UPSERT_PARALLELISM), UPSERT_PARALLELISM,
|
||||||
DEFAULT_PARALLELISM);
|
DEFAULT_PARALLELISM);
|
||||||
|
|||||||
@@ -17,11 +17,11 @@
|
|||||||
package com.uber.hoodie.exception;
|
package com.uber.hoodie.exception;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* <p>
|
* <p> Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a delta
|
||||||
* Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a delta commit
|
* commit </p>
|
||||||
* </p>
|
|
||||||
*/
|
*/
|
||||||
public class HoodieAppendException extends HoodieException {
|
public class HoodieAppendException extends HoodieException {
|
||||||
|
|
||||||
public HoodieAppendException(String msg, Throwable e) {
|
public HoodieAppendException(String msg, Throwable e) {
|
||||||
super(msg, e);
|
super(msg, e);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,11 +17,11 @@
|
|||||||
package com.uber.hoodie.exception;
|
package com.uber.hoodie.exception;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* <p>
|
* <p> Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a Commit
|
||||||
* Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a Commit
|
|
||||||
* </p>
|
* </p>
|
||||||
*/
|
*/
|
||||||
public class HoodieCommitException extends HoodieException {
|
public class HoodieCommitException extends HoodieException {
|
||||||
|
|
||||||
public HoodieCommitException(String msg) {
|
public HoodieCommitException(String msg) {
|
||||||
super(msg);
|
super(msg);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,6 +17,7 @@
|
|||||||
package com.uber.hoodie.exception;
|
package com.uber.hoodie.exception;
|
||||||
|
|
||||||
public class HoodieCompactionException extends HoodieException {
|
public class HoodieCompactionException extends HoodieException {
|
||||||
|
|
||||||
public HoodieCompactionException(String msg) {
|
public HoodieCompactionException(String msg) {
|
||||||
super(msg);
|
super(msg);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -18,11 +18,10 @@ package com.uber.hoodie.exception;
|
|||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* <p>
|
* <p> Exception thrown when dependent system is not available </p>
|
||||||
* Exception thrown when dependent system is not available
|
|
||||||
* </p>
|
|
||||||
*/
|
*/
|
||||||
public class HoodieDependentSystemUnavailableException extends HoodieException {
|
public class HoodieDependentSystemUnavailableException extends HoodieException {
|
||||||
|
|
||||||
public static final String HBASE = "HBASE";
|
public static final String HBASE = "HBASE";
|
||||||
|
|
||||||
public HoodieDependentSystemUnavailableException(String system, String connectURL) {
|
public HoodieDependentSystemUnavailableException(String system, String connectURL) {
|
||||||
|
|||||||
@@ -16,14 +16,12 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.exception;
|
package com.uber.hoodie.exception;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* <p>
|
* <p> Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a bulk
|
||||||
* Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a bulk insert
|
* insert </p>
|
||||||
* </p>
|
|
||||||
*/
|
*/
|
||||||
public class HoodieInsertException extends HoodieException {
|
public class HoodieInsertException extends HoodieException {
|
||||||
|
|
||||||
public HoodieInsertException(String msg, Throwable e) {
|
public HoodieInsertException(String msg, Throwable e) {
|
||||||
super(msg, e);
|
super(msg, e);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,11 +17,11 @@
|
|||||||
package com.uber.hoodie.exception;
|
package com.uber.hoodie.exception;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* <p>
|
* <p> Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a
|
||||||
* Exception thrown for any higher level errors when <code>HoodieClient</code> is doing a incremental upsert
|
* incremental upsert </p>
|
||||||
* </p>
|
|
||||||
*/
|
*/
|
||||||
public class HoodieUpsertException extends HoodieException {
|
public class HoodieUpsertException extends HoodieException {
|
||||||
|
|
||||||
public HoodieUpsertException(String msg, Throwable e) {
|
public HoodieUpsertException(String msg, Throwable e) {
|
||||||
super(msg, e);
|
super(msg, e);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -16,16 +16,14 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.func;
|
package com.uber.hoodie.func;
|
||||||
|
|
||||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
|
||||||
import com.uber.hoodie.WriteStatus;
|
import com.uber.hoodie.WriteStatus;
|
||||||
import com.uber.hoodie.common.model.HoodieRecord;
|
import com.uber.hoodie.common.model.HoodieRecord;
|
||||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||||
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
import org.apache.spark.api.java.function.Function2;
|
|
||||||
|
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import org.apache.spark.api.java.function.Function2;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -46,7 +44,8 @@ public class BulkInsertMapFunction<T extends HoodieRecordPayload>
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Iterator<List<WriteStatus>> call(Integer partition, Iterator<HoodieRecord<T>> sortedRecordItr)
|
public Iterator<List<WriteStatus>> call(Integer partition,
|
||||||
|
Iterator<HoodieRecord<T>> sortedRecordItr)
|
||||||
throws Exception {
|
throws Exception {
|
||||||
return new LazyInsertIterable<>(sortedRecordItr, config, commitTime, hoodieTable);
|
return new LazyInsertIterable<>(sortedRecordItr, config, commitTime, hoodieTable);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -16,27 +16,26 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.func;
|
package com.uber.hoodie.func;
|
||||||
|
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
|
||||||
import com.uber.hoodie.WriteStatus;
|
import com.uber.hoodie.WriteStatus;
|
||||||
import com.uber.hoodie.common.model.HoodieRecord;
|
import com.uber.hoodie.common.model.HoodieRecord;
|
||||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||||
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.io.HoodieIOHandle;
|
|
||||||
import com.uber.hoodie.io.HoodieCreateHandle;
|
import com.uber.hoodie.io.HoodieCreateHandle;
|
||||||
|
import com.uber.hoodie.io.HoodieIOHandle;
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
import org.apache.spark.TaskContext;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
import org.apache.spark.TaskContext;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Lazy Iterable, that writes a stream of HoodieRecords sorted by the partitionPath,
|
* Lazy Iterable, that writes a stream of HoodieRecords sorted by the partitionPath, into new
|
||||||
* into new files.
|
* files.
|
||||||
*/
|
*/
|
||||||
public class LazyInsertIterable<T extends HoodieRecordPayload> extends LazyIterableIterator<HoodieRecord<T>, List<WriteStatus>> {
|
public class LazyInsertIterable<T extends HoodieRecordPayload> extends
|
||||||
|
LazyIterableIterator<HoodieRecord<T>, List<WriteStatus>> {
|
||||||
|
|
||||||
private final HoodieWriteConfig hoodieConfig;
|
private final HoodieWriteConfig hoodieConfig;
|
||||||
private final String commitTime;
|
private final String commitTime;
|
||||||
@@ -53,11 +52,13 @@ public class LazyInsertIterable<T extends HoodieRecordPayload> extends LazyItera
|
|||||||
this.hoodieTable = hoodieTable;
|
this.hoodieTable = hoodieTable;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override protected void start() {
|
@Override
|
||||||
|
protected void start() {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override protected List<WriteStatus> computeNext() {
|
@Override
|
||||||
|
protected List<WriteStatus> computeNext() {
|
||||||
List<WriteStatus> statuses = new ArrayList<>();
|
List<WriteStatus> statuses = new ArrayList<>();
|
||||||
|
|
||||||
while (inputItr.hasNext()) {
|
while (inputItr.hasNext()) {
|
||||||
@@ -108,7 +109,8 @@ public class LazyInsertIterable<T extends HoodieRecordPayload> extends LazyItera
|
|||||||
return statuses;
|
return statuses;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override protected void end() {
|
@Override
|
||||||
|
protected void end() {
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -31,6 +31,7 @@ import java.util.Iterator;
|
|||||||
* responsible for calling inputIterator.next() and doing the processing in computeNext()
|
* responsible for calling inputIterator.next() and doing the processing in computeNext()
|
||||||
*/
|
*/
|
||||||
public abstract class LazyIterableIterator<I, O> implements Iterable<O>, Iterator<O> {
|
public abstract class LazyIterableIterator<I, O> implements Iterable<O>, Iterator<O> {
|
||||||
|
|
||||||
protected Iterator<I> inputItr = null;
|
protected Iterator<I> inputItr = null;
|
||||||
private boolean consumed = false;
|
private boolean consumed = false;
|
||||||
private boolean startCalled = false;
|
private boolean startCalled = false;
|
||||||
@@ -56,7 +57,6 @@ public abstract class LazyIterableIterator<I, O> implements Iterable<O>, Iterato
|
|||||||
*/
|
*/
|
||||||
protected abstract void end();
|
protected abstract void end();
|
||||||
|
|
||||||
|
|
||||||
//////////////////
|
//////////////////
|
||||||
// iterable implementation
|
// iterable implementation
|
||||||
|
|
||||||
@@ -87,8 +87,9 @@ public abstract class LazyIterableIterator<I, O> implements Iterable<O>, Iterato
|
|||||||
@Override
|
@Override
|
||||||
public Iterator<O> iterator() {
|
public Iterator<O> iterator() {
|
||||||
//check for consumed inputItr
|
//check for consumed inputItr
|
||||||
if (consumed)
|
if (consumed) {
|
||||||
throw new RuntimeException("Invalid repeated inputItr consumption.");
|
throw new RuntimeException("Invalid repeated inputItr consumption.");
|
||||||
|
}
|
||||||
|
|
||||||
//hand out self as inputItr exactly once (note: do not hand out the input
|
//hand out self as inputItr exactly once (note: do not hand out the input
|
||||||
//inputItr since it is consumed by the self inputItr implementation)
|
//inputItr since it is consumed by the self inputItr implementation)
|
||||||
|
|||||||
@@ -17,29 +17,26 @@
|
|||||||
package com.uber.hoodie.index;
|
package com.uber.hoodie.index;
|
||||||
|
|
||||||
import com.google.common.base.Optional;
|
import com.google.common.base.Optional;
|
||||||
|
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
|
||||||
import com.uber.hoodie.WriteStatus;
|
import com.uber.hoodie.WriteStatus;
|
||||||
import com.uber.hoodie.common.model.HoodieKey;
|
import com.uber.hoodie.common.model.HoodieKey;
|
||||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
|
||||||
import com.uber.hoodie.common.model.HoodieRecord;
|
import com.uber.hoodie.common.model.HoodieRecord;
|
||||||
|
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||||
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.exception.HoodieIndexException;
|
import com.uber.hoodie.exception.HoodieIndexException;
|
||||||
import com.uber.hoodie.index.bloom.HoodieBloomIndex;
|
import com.uber.hoodie.index.bloom.HoodieBloomIndex;
|
||||||
import com.uber.hoodie.index.bucketed.BucketedIndex;
|
import com.uber.hoodie.index.bucketed.BucketedIndex;
|
||||||
import com.uber.hoodie.index.hbase.HBaseIndex;
|
import com.uber.hoodie.index.hbase.HBaseIndex;
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
|
import java.io.Serializable;
|
||||||
import org.apache.spark.api.java.JavaPairRDD;
|
import org.apache.spark.api.java.JavaPairRDD;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Base class for different types of indexes to determine the mapping from uuid
|
* Base class for different types of indexes to determine the mapping from uuid
|
||||||
*
|
|
||||||
*/
|
*/
|
||||||
public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Serializable {
|
public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Serializable {
|
||||||
|
|
||||||
protected transient JavaSparkContext jsc = null;
|
protected transient JavaSparkContext jsc = null;
|
||||||
|
|
||||||
public enum IndexType {
|
public enum IndexType {
|
||||||
@@ -58,12 +55,9 @@ public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Seri
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Checks if the given [Keys] exists in the hoodie table and returns [Key, Optional[FullFilePath]]
|
* Checks if the given [Keys] exists in the hoodie table and returns [Key, Optional[FullFilePath]]
|
||||||
* If the optional FullFilePath value is not present, then the key is not found. If the FullFilePath
|
* If the optional FullFilePath value is not present, then the key is not found. If the
|
||||||
* value is present, it is the path component (without scheme) of the URI underlying file
|
* FullFilePath value is present, it is the path component (without scheme) of the URI underlying
|
||||||
*
|
* file
|
||||||
* @param hoodieKeys
|
|
||||||
* @param table
|
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
public abstract JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(
|
public abstract JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(
|
||||||
JavaRDD<HoodieKey> hoodieKeys, final HoodieTable<T> table);
|
JavaRDD<HoodieKey> hoodieKeys, final HoodieTable<T> table);
|
||||||
@@ -89,17 +83,17 @@ public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Seri
|
|||||||
public abstract boolean rollbackCommit(String commitTime);
|
public abstract boolean rollbackCommit(String commitTime);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* An index is `global` if {@link HoodieKey} to fileID mapping, does not depend on the `partitionPath`.
|
* An index is `global` if {@link HoodieKey} to fileID mapping, does not depend on the
|
||||||
* Such an implementation is able to obtain the same mapping, for two hoodie keys with same `recordKey`
|
* `partitionPath`. Such an implementation is able to obtain the same mapping, for two hoodie keys
|
||||||
* but different `partitionPath`
|
* with same `recordKey` but different `partitionPath`
|
||||||
*
|
*
|
||||||
* @return whether or not, the index implementation is global in nature
|
* @return whether or not, the index implementation is global in nature
|
||||||
*/
|
*/
|
||||||
public abstract boolean isGlobal();
|
public abstract boolean isGlobal();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This is used by storage to determine, if its safe to send inserts, straight to the log,
|
* This is used by storage to determine, if its safe to send inserts, straight to the log, i.e
|
||||||
* i.e having a {@link com.uber.hoodie.common.model.FileSlice}, with no data file.
|
* having a {@link com.uber.hoodie.common.model.FileSlice}, with no data file.
|
||||||
*
|
*
|
||||||
* @return Returns true/false depending on whether the impl has this capability
|
* @return Returns true/false depending on whether the impl has this capability
|
||||||
*/
|
*/
|
||||||
@@ -107,12 +101,8 @@ public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Seri
|
|||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
|
||||||
* An index is "implicit" with respect to storage, if just writing new data to a file slice,
|
* An index is "implicit" with respect to storage, if just writing new data to a file slice,
|
||||||
* updates the index as well. This is used by storage, to save memory footprint in
|
* updates the index as well. This is used by storage, to save memory footprint in certain cases.
|
||||||
* certain cases.
|
|
||||||
*
|
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
public abstract boolean isImplicitWithStorage();
|
public abstract boolean isImplicitWithStorage();
|
||||||
|
|
||||||
|
|||||||
@@ -17,32 +17,27 @@
|
|||||||
package com.uber.hoodie.index;
|
package com.uber.hoodie.index;
|
||||||
|
|
||||||
import com.google.common.base.Optional;
|
import com.google.common.base.Optional;
|
||||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
|
||||||
import com.uber.hoodie.WriteStatus;
|
import com.uber.hoodie.WriteStatus;
|
||||||
import com.uber.hoodie.common.model.HoodieKey;
|
import com.uber.hoodie.common.model.HoodieKey;
|
||||||
import com.uber.hoodie.common.model.HoodieRecord;
|
import com.uber.hoodie.common.model.HoodieRecord;
|
||||||
import com.uber.hoodie.common.model.HoodieRecordLocation;
|
import com.uber.hoodie.common.model.HoodieRecordLocation;
|
||||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||||
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
|
import java.util.concurrent.ConcurrentMap;
|
||||||
import org.apache.spark.api.java.JavaPairRDD;
|
import org.apache.spark.api.java.JavaPairRDD;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
import org.apache.spark.api.java.function.Function;
|
import org.apache.spark.api.java.function.Function;
|
||||||
import org.apache.spark.api.java.function.Function2;
|
import org.apache.spark.api.java.function.Function2;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
|
||||||
import java.util.concurrent.ConcurrentMap;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Hoodie Index implementation backed by an in-memory Hash map.
|
* Hoodie Index implementation backed by an in-memory Hash map. <p> ONLY USE FOR LOCAL TESTING
|
||||||
* <p>
|
|
||||||
* ONLY USE FOR LOCAL TESTING
|
|
||||||
*/
|
*/
|
||||||
public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
||||||
|
|
||||||
@@ -64,6 +59,7 @@ public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieInde
|
|||||||
*/
|
*/
|
||||||
class LocationTagFunction
|
class LocationTagFunction
|
||||||
implements Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<HoodieRecord<T>>> {
|
implements Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<HoodieRecord<T>>> {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Iterator<HoodieRecord<T>> call(Integer partitionNum,
|
public Iterator<HoodieRecord<T>> call(Integer partitionNum,
|
||||||
Iterator<HoodieRecord<T>> hoodieRecordIterator) {
|
Iterator<HoodieRecord<T>> hoodieRecordIterator) {
|
||||||
@@ -115,8 +111,6 @@ public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieInde
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Only looks up by recordKey
|
* Only looks up by recordKey
|
||||||
*
|
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public boolean isGlobal() {
|
public boolean isGlobal() {
|
||||||
@@ -125,8 +119,6 @@ public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieInde
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Mapping is available in HBase already.
|
* Mapping is available in HBase already.
|
||||||
*
|
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public boolean canIndexLogFiles() {
|
public boolean canIndexLogFiles() {
|
||||||
@@ -135,8 +127,6 @@ public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieInde
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Index needs to be explicitly updated after storage write.
|
* Index needs to be explicitly updated after storage write.
|
||||||
*
|
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public boolean isImplicitWithStorage() {
|
public boolean isImplicitWithStorage() {
|
||||||
|
|||||||
@@ -19,7 +19,6 @@
|
|||||||
package com.uber.hoodie.index.bloom;
|
package com.uber.hoodie.index.bloom;
|
||||||
|
|
||||||
import com.google.common.base.Objects;
|
import com.google.common.base.Objects;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -63,8 +62,6 @@ public class BloomIndexFileInfo implements Serializable {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Does the given key fall within the range (inclusive)
|
* Does the given key fall within the range (inclusive)
|
||||||
* @param recordKey
|
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
public boolean isKeyInRange(String recordKey) {
|
public boolean isKeyInRange(String recordKey) {
|
||||||
return minRecordKey.compareTo(recordKey) <= 0 &&
|
return minRecordKey.compareTo(recordKey) <= 0 &&
|
||||||
@@ -73,8 +70,12 @@ public class BloomIndexFileInfo implements Serializable {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean equals(Object o) {
|
public boolean equals(Object o) {
|
||||||
if (this == o) return true;
|
if (this == o) {
|
||||||
if (o == null || getClass() != o.getClass()) return false;
|
return true;
|
||||||
|
}
|
||||||
|
if (o == null || getClass() != o.getClass()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
BloomIndexFileInfo that = (BloomIndexFileInfo) o;
|
BloomIndexFileInfo that = (BloomIndexFileInfo) o;
|
||||||
return Objects.equal(that.fileName, fileName) &&
|
return Objects.equal(that.fileName, fileName) &&
|
||||||
|
|||||||
@@ -18,9 +18,12 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.index.bloom;
|
package com.uber.hoodie.index.bloom;
|
||||||
|
|
||||||
|
import static java.util.stream.Collectors.groupingBy;
|
||||||
|
import static java.util.stream.Collectors.mapping;
|
||||||
|
import static java.util.stream.Collectors.toList;
|
||||||
|
|
||||||
import com.google.common.annotations.VisibleForTesting;
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
import com.google.common.base.Optional;
|
import com.google.common.base.Optional;
|
||||||
|
|
||||||
import com.uber.hoodie.WriteStatus;
|
import com.uber.hoodie.WriteStatus;
|
||||||
import com.uber.hoodie.common.model.HoodieDataFile;
|
import com.uber.hoodie.common.model.HoodieDataFile;
|
||||||
import com.uber.hoodie.common.model.HoodieKey;
|
import com.uber.hoodie.common.model.HoodieKey;
|
||||||
@@ -34,7 +37,10 @@ import com.uber.hoodie.config.HoodieWriteConfig;
|
|||||||
import com.uber.hoodie.exception.MetadataNotFoundException;
|
import com.uber.hoodie.exception.MetadataNotFoundException;
|
||||||
import com.uber.hoodie.index.HoodieIndex;
|
import com.uber.hoodie.index.HoodieIndex;
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
import org.apache.log4j.LogManager;
|
import org.apache.log4j.LogManager;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
@@ -42,16 +48,8 @@ import org.apache.spark.api.java.JavaPairRDD;
|
|||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
import org.apache.spark.storage.StorageLevel;
|
import org.apache.spark.storage.StorageLevel;
|
||||||
|
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import static java.util.stream.Collectors.*;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Indexing mechanism based on bloom filter. Each parquet file includes its row_key bloom filter in
|
* Indexing mechanism based on bloom filter. Each parquet file includes its row_key bloom filter in
|
||||||
* its metadata.
|
* its metadata.
|
||||||
@@ -64,14 +62,16 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
private static final int SPARK_MAXIMUM_BYTES_PER_PARTITION = 1500 * 1024 * 1024;
|
private static final int SPARK_MAXIMUM_BYTES_PER_PARTITION = 1500 * 1024 * 1024;
|
||||||
// this is how much a triplet of (partitionPath, fileId, recordKey) costs.
|
// this is how much a triplet of (partitionPath, fileId, recordKey) costs.
|
||||||
private static final int BYTES_PER_PARTITION_FILE_KEY_TRIPLET = 300;
|
private static final int BYTES_PER_PARTITION_FILE_KEY_TRIPLET = 300;
|
||||||
private static int MAX_ITEMS_PER_SHUFFLE_PARTITION = SPARK_MAXIMUM_BYTES_PER_PARTITION / BYTES_PER_PARTITION_FILE_KEY_TRIPLET;
|
private static int MAX_ITEMS_PER_SHUFFLE_PARTITION =
|
||||||
|
SPARK_MAXIMUM_BYTES_PER_PARTITION / BYTES_PER_PARTITION_FILE_KEY_TRIPLET;
|
||||||
|
|
||||||
public HoodieBloomIndex(HoodieWriteConfig config, JavaSparkContext jsc) {
|
public HoodieBloomIndex(HoodieWriteConfig config, JavaSparkContext jsc) {
|
||||||
super(config, jsc);
|
super(config, jsc);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, final HoodieTable<T> hoodieTable) {
|
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD,
|
||||||
|
final HoodieTable<T> hoodieTable) {
|
||||||
|
|
||||||
// Step 0: cache the input record RDD
|
// Step 0: cache the input record RDD
|
||||||
if (config.getBloomIndexUseCaching()) {
|
if (config.getBloomIndexUseCaching()) {
|
||||||
@@ -83,7 +83,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
.mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey()));
|
.mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey()));
|
||||||
|
|
||||||
// Lookup indexes for all the partition/recordkey pair
|
// Lookup indexes for all the partition/recordkey pair
|
||||||
JavaPairRDD<String, String> rowKeyFilenamePairRDD = lookupIndex(partitionRecordKeyPairRDD, hoodieTable);
|
JavaPairRDD<String, String> rowKeyFilenamePairRDD = lookupIndex(partitionRecordKeyPairRDD,
|
||||||
|
hoodieTable);
|
||||||
|
|
||||||
// Cache the result, for subsequent stages.
|
// Cache the result, for subsequent stages.
|
||||||
if (config.getBloomIndexUseCaching()) {
|
if (config.getBloomIndexUseCaching()) {
|
||||||
@@ -96,7 +97,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
|
|
||||||
// Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys
|
// Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys
|
||||||
// Cost: 4 sec.
|
// Cost: 4 sec.
|
||||||
JavaRDD<HoodieRecord<T>> taggedRecordRDD = tagLocationBacktoRecords(rowKeyFilenamePairRDD, recordRDD);
|
JavaRDD<HoodieRecord<T>> taggedRecordRDD = tagLocationBacktoRecords(rowKeyFilenamePairRDD,
|
||||||
|
recordRDD);
|
||||||
|
|
||||||
if (config.getBloomIndexUseCaching()) {
|
if (config.getBloomIndexUseCaching()) {
|
||||||
recordRDD.unpersist(); // unpersist the input Record RDD
|
recordRDD.unpersist(); // unpersist the input Record RDD
|
||||||
@@ -135,8 +137,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Lookup the location for each record key and return the pair<record_key,location> for all
|
* Lookup the location for each record key and return the pair<record_key,location> for all record
|
||||||
* record keys already present and drop the record keys if not present
|
* keys already present and drop the record keys if not present
|
||||||
*/
|
*/
|
||||||
private JavaPairRDD<String, String> lookupIndex(
|
private JavaPairRDD<String, String> lookupIndex(
|
||||||
JavaPairRDD<String, String> partitionRecordKeyPairRDD, final HoodieTable<T> hoodieTable) {
|
JavaPairRDD<String, String> partitionRecordKeyPairRDD, final HoodieTable<T> hoodieTable) {
|
||||||
@@ -145,25 +147,27 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
List<String> affectedPartitionPathList = new ArrayList<>(recordsPerPartition.keySet());
|
List<String> affectedPartitionPathList = new ArrayList<>(recordsPerPartition.keySet());
|
||||||
|
|
||||||
// Step 2: Load all involved files as <Partition, filename> pairs
|
// Step 2: Load all involved files as <Partition, filename> pairs
|
||||||
List<Tuple2<String, BloomIndexFileInfo>> fileInfoList = loadInvolvedFiles(affectedPartitionPathList, hoodieTable);
|
List<Tuple2<String, BloomIndexFileInfo>> fileInfoList = loadInvolvedFiles(
|
||||||
|
affectedPartitionPathList, hoodieTable);
|
||||||
final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo = fileInfoList.stream()
|
final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo = fileInfoList.stream()
|
||||||
.collect(groupingBy(Tuple2::_1, mapping(Tuple2::_2, toList())));
|
.collect(groupingBy(Tuple2::_1, mapping(Tuple2::_2, toList())));
|
||||||
|
|
||||||
// Step 3: Obtain a RDD, for each incoming record, that already exists, with the file id, that contains it.
|
// Step 3: Obtain a RDD, for each incoming record, that already exists, with the file id, that contains it.
|
||||||
int parallelism = autoComputeParallelism(recordsPerPartition, partitionToFileInfo, partitionRecordKeyPairRDD);
|
int parallelism = autoComputeParallelism(recordsPerPartition, partitionToFileInfo,
|
||||||
return findMatchingFilesForRecordKeys(partitionToFileInfo, partitionRecordKeyPairRDD, parallelism);
|
partitionRecordKeyPairRDD);
|
||||||
|
return findMatchingFilesForRecordKeys(partitionToFileInfo, partitionRecordKeyPairRDD,
|
||||||
|
parallelism);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The index lookup can be skewed in three dimensions : #files, #partitions, #records
|
* The index lookup can be skewed in three dimensions : #files, #partitions, #records
|
||||||
*
|
*
|
||||||
* To be able to smoothly handle skews, we need to compute how to split each partitions into
|
* To be able to smoothly handle skews, we need to compute how to split each partitions into
|
||||||
* subpartitions. We do it here, in a way that keeps the amount of each Spark join partition to
|
* subpartitions. We do it here, in a way that keeps the amount of each Spark join partition to <
|
||||||
* < 2GB.
|
* 2GB.
|
||||||
*
|
|
||||||
* If {@link com.uber.hoodie.config.HoodieIndexConfig#BLOOM_INDEX_PARALLELISM_PROP} is specified as a NON-zero number,
|
|
||||||
* then that is used explicitly.
|
|
||||||
*
|
*
|
||||||
|
* If {@link com.uber.hoodie.config.HoodieIndexConfig#BLOOM_INDEX_PARALLELISM_PROP} is specified
|
||||||
|
* as a NON-zero number, then that is used explicitly.
|
||||||
*/
|
*/
|
||||||
private int autoComputeParallelism(final Map<String, Long> recordsPerPartition,
|
private int autoComputeParallelism(final Map<String, Long> recordsPerPartition,
|
||||||
final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo,
|
final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo,
|
||||||
@@ -172,7 +176,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
long totalComparisons = 0;
|
long totalComparisons = 0;
|
||||||
if (config.getBloomIndexPruneByRanges()) {
|
if (config.getBloomIndexPruneByRanges()) {
|
||||||
// we will just try exploding the input and then count to determine comparisons
|
// we will just try exploding the input and then count to determine comparisons
|
||||||
totalComparisons = explodeRecordRDDWithFileComparisons(partitionToFileInfo, partitionRecordKeyPairRDD).count();
|
totalComparisons = explodeRecordRDDWithFileComparisons(partitionToFileInfo,
|
||||||
|
partitionRecordKeyPairRDD).count();
|
||||||
} else {
|
} else {
|
||||||
// if not pruning by ranges, then each file in a partition needs to compared against all
|
// if not pruning by ranges, then each file in a partition needs to compared against all
|
||||||
// records for a partition.
|
// records for a partition.
|
||||||
@@ -181,30 +186,36 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
long totalFiles = 0, totalRecords = 0;
|
long totalFiles = 0, totalRecords = 0;
|
||||||
for (String partitionPath : recordsPerPartition.keySet()) {
|
for (String partitionPath : recordsPerPartition.keySet()) {
|
||||||
long numRecords = recordsPerPartition.get(partitionPath);
|
long numRecords = recordsPerPartition.get(partitionPath);
|
||||||
long numFiles = filesPerPartition.containsKey(partitionPath) ? filesPerPartition.get(partitionPath) : 1L;
|
long numFiles =
|
||||||
|
filesPerPartition.containsKey(partitionPath) ? filesPerPartition.get(partitionPath)
|
||||||
|
: 1L;
|
||||||
|
|
||||||
totalComparisons += numFiles * numRecords;
|
totalComparisons += numFiles * numRecords;
|
||||||
totalFiles += filesPerPartition.containsKey(partitionPath) ? filesPerPartition.get(partitionPath) : 0L;
|
totalFiles +=
|
||||||
|
filesPerPartition.containsKey(partitionPath) ? filesPerPartition.get(partitionPath)
|
||||||
|
: 0L;
|
||||||
totalRecords += numRecords;
|
totalRecords += numRecords;
|
||||||
}
|
}
|
||||||
logger.info("TotalRecords: " + totalRecords + ", TotalFiles: " + totalFiles + ", TotalAffectedPartitions:" + recordsPerPartition.size());
|
logger.info("TotalRecords: " + totalRecords + ", TotalFiles: " + totalFiles
|
||||||
|
+ ", TotalAffectedPartitions:" + recordsPerPartition.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
// each partition will have an item per comparison.
|
// each partition will have an item per comparison.
|
||||||
int parallelism = (int) (totalComparisons/ MAX_ITEMS_PER_SHUFFLE_PARTITION + 1);
|
int parallelism = (int) (totalComparisons / MAX_ITEMS_PER_SHUFFLE_PARTITION + 1);
|
||||||
logger.info("Auto computed parallelism :" + parallelism + ", totalComparisons: " + totalComparisons);
|
logger.info(
|
||||||
|
"Auto computed parallelism :" + parallelism + ", totalComparisons: " + totalComparisons);
|
||||||
return parallelism;
|
return parallelism;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Its crucial to pick the right parallelism.
|
* Its crucial to pick the right parallelism.
|
||||||
*
|
*
|
||||||
* totalSubPartitions : this is deemed safe limit, to be nice with Spark.
|
* totalSubPartitions : this is deemed safe limit, to be nice with Spark. inputParallelism :
|
||||||
* inputParallelism : typically number of input file splits
|
* typically number of input file splits
|
||||||
*
|
*
|
||||||
* We pick the max such that, we are always safe, but go higher if say a there are a lot of
|
* We pick the max such that, we are always safe, but go higher if say a there are a lot of input
|
||||||
* input files. (otherwise, we will fallback to number of partitions in input and end up with
|
* files. (otherwise, we will fallback to number of partitions in input and end up with slow
|
||||||
* slow performance)
|
* performance)
|
||||||
*/
|
*/
|
||||||
private int determineParallelism(int inputParallelism, int totalSubPartitions) {
|
private int determineParallelism(int inputParallelism, int totalSubPartitions) {
|
||||||
// If bloom index parallelism is set, use it to to check against the input parallelism and take the max
|
// If bloom index parallelism is set, use it to to check against the input parallelism and take the max
|
||||||
@@ -221,9 +232,11 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
* Load all involved files as <Partition, filename> pair RDD.
|
* Load all involved files as <Partition, filename> pair RDD.
|
||||||
*/
|
*/
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
List<Tuple2<String, BloomIndexFileInfo>> loadInvolvedFiles(List<String> partitions, final HoodieTable<T> hoodieTable) {
|
List<Tuple2<String, BloomIndexFileInfo>> loadInvolvedFiles(List<String> partitions,
|
||||||
|
final HoodieTable<T> hoodieTable) {
|
||||||
// Obtain the latest data files from all the partitions.
|
// Obtain the latest data files from all the partitions.
|
||||||
List<Tuple2<String, HoodieDataFile>> dataFilesList = jsc.parallelize(partitions, Math.max(partitions.size(), 1))
|
List<Tuple2<String, HoodieDataFile>> dataFilesList = jsc
|
||||||
|
.parallelize(partitions, Math.max(partitions.size(), 1))
|
||||||
.flatMapToPair(partitionPath -> {
|
.flatMapToPair(partitionPath -> {
|
||||||
java.util.Optional<HoodieInstant> latestCommitTime =
|
java.util.Optional<HoodieInstant> latestCommitTime =
|
||||||
hoodieTable.getCommitTimeline().filterCompletedInstants().lastInstant();
|
hoodieTable.getCommitTimeline().filterCompletedInstants().lastInstant();
|
||||||
@@ -243,8 +256,10 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
return jsc.parallelize(dataFilesList, Math.max(dataFilesList.size(), 1))
|
return jsc.parallelize(dataFilesList, Math.max(dataFilesList.size(), 1))
|
||||||
.mapToPair(ft -> {
|
.mapToPair(ft -> {
|
||||||
try {
|
try {
|
||||||
String[] minMaxKeys = ParquetUtils.readMinMaxRecordKeys(ft._2().getFileStatus().getPath());
|
String[] minMaxKeys = ParquetUtils
|
||||||
return new Tuple2<>(ft._1(), new BloomIndexFileInfo(ft._2().getFileName(), minMaxKeys[0], minMaxKeys[1]));
|
.readMinMaxRecordKeys(ft._2().getFileStatus().getPath());
|
||||||
|
return new Tuple2<>(ft._1(),
|
||||||
|
new BloomIndexFileInfo(ft._2().getFileName(), minMaxKeys[0], minMaxKeys[1]));
|
||||||
} catch (MetadataNotFoundException me) {
|
} catch (MetadataNotFoundException me) {
|
||||||
logger.warn("Unable to find range metadata in file :" + ft._2());
|
logger.warn("Unable to find range metadata in file :" + ft._2());
|
||||||
return new Tuple2<>(ft._1(), new BloomIndexFileInfo(ft._2().getFileName()));
|
return new Tuple2<>(ft._1(), new BloomIndexFileInfo(ft._2().getFileName()));
|
||||||
@@ -266,8 +281,6 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* This is not global, since we depend on the partitionPath to do the lookup
|
* This is not global, since we depend on the partitionPath to do the lookup
|
||||||
*
|
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public boolean isGlobal() {
|
public boolean isGlobal() {
|
||||||
@@ -276,8 +289,6 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* No indexes into log files yet.
|
* No indexes into log files yet.
|
||||||
*
|
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public boolean canIndexLogFiles() {
|
public boolean canIndexLogFiles() {
|
||||||
@@ -286,8 +297,6 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Bloom filters are stored, into the same data files.
|
* Bloom filters are stored, into the same data files.
|
||||||
*
|
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public boolean isImplicitWithStorage() {
|
public boolean isImplicitWithStorage() {
|
||||||
@@ -295,12 +304,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* if we dont have key ranges, then also we need to compare against the file. no other choice
|
* if we dont have key ranges, then also we need to compare against the file. no other choice if
|
||||||
* if we do, then only compare the file if the record key falls in range.
|
* we do, then only compare the file if the record key falls in range.
|
||||||
|
|
||||||
* @param indexInfo
|
|
||||||
* @param recordKey
|
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
private boolean shouldCompareWithFile(BloomIndexFileInfo indexInfo, String recordKey) {
|
private boolean shouldCompareWithFile(BloomIndexFileInfo indexInfo, String recordKey) {
|
||||||
return !indexInfo.hasKeyRanges() || indexInfo.isKeyInRange(recordKey);
|
return !indexInfo.hasKeyRanges() || indexInfo.isKeyInRange(recordKey);
|
||||||
@@ -308,19 +313,16 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* For each incoming record, produce N output records, 1 each for each file against which the record's key
|
* For each incoming record, produce N output records, 1 each for each file against which the
|
||||||
* needs to be checked. For datasets, where the keys have a definite insert order (e.g: timestamp as prefix),
|
* record's key needs to be checked. For datasets, where the keys have a definite insert order
|
||||||
* the number of files to be compared gets cut down a lot from range pruning.
|
* (e.g: timestamp as prefix), the number of files to be compared gets cut down a lot from range
|
||||||
*
|
* pruning.
|
||||||
*
|
|
||||||
* @param partitionToFileIndexInfo
|
|
||||||
* @param partitionRecordKeyPairRDD
|
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
// sub-partition to ensure the records can be looked up against files & also prune file<=>record comparisons based on recordKey
|
// sub-partition to ensure the records can be looked up against files & also prune file<=>record comparisons based on recordKey
|
||||||
// ranges in the index info.
|
// ranges in the index info.
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
JavaPairRDD<String, Tuple2<String, HoodieKey>> explodeRecordRDDWithFileComparisons(final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
|
JavaPairRDD<String, Tuple2<String, HoodieKey>> explodeRecordRDDWithFileComparisons(
|
||||||
|
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
|
||||||
JavaPairRDD<String, String> partitionRecordKeyPairRDD) {
|
JavaPairRDD<String, String> partitionRecordKeyPairRDD) {
|
||||||
return partitionRecordKeyPairRDD
|
return partitionRecordKeyPairRDD
|
||||||
.map(partitionRecordKeyPair -> {
|
.map(partitionRecordKeyPair -> {
|
||||||
@@ -329,13 +331,15 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
|
|
||||||
List<BloomIndexFileInfo> indexInfos = partitionToFileIndexInfo.get(partitionPath);
|
List<BloomIndexFileInfo> indexInfos = partitionToFileIndexInfo.get(partitionPath);
|
||||||
List<Tuple2<String, Tuple2<String, HoodieKey>>> recordComparisons = new ArrayList<>();
|
List<Tuple2<String, Tuple2<String, HoodieKey>>> recordComparisons = new ArrayList<>();
|
||||||
if (indexInfos != null) { // could be null, if there are no files in a given partition yet.
|
if (indexInfos
|
||||||
|
!= null) { // could be null, if there are no files in a given partition yet.
|
||||||
// for each candidate file in partition, that needs to be compared.
|
// for each candidate file in partition, that needs to be compared.
|
||||||
for (BloomIndexFileInfo indexInfo : indexInfos) {
|
for (BloomIndexFileInfo indexInfo : indexInfos) {
|
||||||
if (shouldCompareWithFile(indexInfo, recordKey)) {
|
if (shouldCompareWithFile(indexInfo, recordKey)) {
|
||||||
recordComparisons.add(
|
recordComparisons.add(
|
||||||
new Tuple2<>(String.format("%s#%s", indexInfo.getFileName(), recordKey),
|
new Tuple2<>(String.format("%s#%s", indexInfo.getFileName(), recordKey),
|
||||||
new Tuple2<>(indexInfo.getFileName(), new HoodieKey(recordKey, partitionPath))));
|
new Tuple2<>(indexInfo.getFileName(),
|
||||||
|
new HoodieKey(recordKey, partitionPath))));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -347,22 +351,23 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
/**
|
/**
|
||||||
* Find out <RowKey, filename> pair. All workload grouped by file-level.
|
* Find out <RowKey, filename> pair. All workload grouped by file-level.
|
||||||
*
|
*
|
||||||
* Join PairRDD(PartitionPath, RecordKey) and PairRDD(PartitionPath, File) & then repartition
|
* Join PairRDD(PartitionPath, RecordKey) and PairRDD(PartitionPath, File) & then repartition such
|
||||||
* such that each RDD partition is a file, then for each file, we do
|
* that each RDD partition is a file, then for each file, we do (1) load bloom filter, (2) load
|
||||||
* (1) load bloom filter,
|
* rowKeys, (3) Tag rowKey
|
||||||
* (2) load rowKeys,
|
|
||||||
* (3) Tag rowKey
|
|
||||||
*
|
*
|
||||||
* Make sure the parallelism is atleast the groupby parallelism for tagging location
|
* Make sure the parallelism is atleast the groupby parallelism for tagging location
|
||||||
*/
|
*/
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
JavaPairRDD<String, String> findMatchingFilesForRecordKeys(final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
|
JavaPairRDD<String, String> findMatchingFilesForRecordKeys(
|
||||||
|
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
|
||||||
JavaPairRDD<String, String> partitionRecordKeyPairRDD,
|
JavaPairRDD<String, String> partitionRecordKeyPairRDD,
|
||||||
int totalSubpartitions) {
|
int totalSubpartitions) {
|
||||||
|
|
||||||
int joinParallelism = determineParallelism(partitionRecordKeyPairRDD.partitions().size(), totalSubpartitions);
|
int joinParallelism = determineParallelism(partitionRecordKeyPairRDD.partitions().size(),
|
||||||
|
totalSubpartitions);
|
||||||
|
|
||||||
JavaPairRDD<String, Tuple2<String, HoodieKey>> fileSortedTripletRDD = explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD)
|
JavaPairRDD<String, Tuple2<String, HoodieKey>> fileSortedTripletRDD = explodeRecordRDDWithFileComparisons(
|
||||||
|
partitionToFileIndexInfo, partitionRecordKeyPairRDD)
|
||||||
// sort further based on filename, such that all checking for the file can happen within a single partition, on-the-fly
|
// sort further based on filename, such that all checking for the file can happen within a single partition, on-the-fly
|
||||||
.sortByKey(true, joinParallelism);
|
.sortByKey(true, joinParallelism);
|
||||||
|
|
||||||
@@ -382,7 +387,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
/**
|
/**
|
||||||
* Tag the <rowKey, filename> back to the original HoodieRecord RDD.
|
* Tag the <rowKey, filename> back to the original HoodieRecord RDD.
|
||||||
*/
|
*/
|
||||||
private JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(JavaPairRDD<String, String> rowKeyFilenamePairRDD,
|
private JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(
|
||||||
|
JavaPairRDD<String, String> rowKeyFilenamePairRDD,
|
||||||
JavaRDD<HoodieRecord<T>> recordRDD) {
|
JavaRDD<HoodieRecord<T>> recordRDD) {
|
||||||
JavaPairRDD<String, HoodieRecord<T>> rowKeyRecordPairRDD = recordRDD
|
JavaPairRDD<String, HoodieRecord<T>> rowKeyRecordPairRDD = recordRDD
|
||||||
.mapToPair(record -> new Tuple2<>(record.getRecordKey(), record));
|
.mapToPair(record -> new Tuple2<>(record.getRecordKey(), record));
|
||||||
@@ -404,7 +410,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD, HoodieTable<T> hoodieTable) {
|
public JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD,
|
||||||
|
HoodieTable<T> hoodieTable) {
|
||||||
return writeStatusRDD;
|
return writeStatusRDD;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -24,24 +24,22 @@ import com.uber.hoodie.common.util.ParquetUtils;
|
|||||||
import com.uber.hoodie.exception.HoodieException;
|
import com.uber.hoodie.exception.HoodieException;
|
||||||
import com.uber.hoodie.exception.HoodieIndexException;
|
import com.uber.hoodie.exception.HoodieIndexException;
|
||||||
import com.uber.hoodie.func.LazyIterableIterator;
|
import com.uber.hoodie.func.LazyIterableIterator;
|
||||||
|
|
||||||
import org.apache.hadoop.fs.Path;
|
|
||||||
import org.apache.log4j.LogManager;
|
|
||||||
import org.apache.log4j.Logger;
|
|
||||||
import org.apache.spark.api.java.function.Function2;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.apache.log4j.LogManager;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import org.apache.spark.api.java.function.Function2;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Function performing actual checking of RDD parition containing (fileId, hoodieKeys) against the
|
* Function performing actual checking of RDD parition containing (fileId, hoodieKeys) against the
|
||||||
* actual files
|
* actual files
|
||||||
*/
|
*/
|
||||||
public class HoodieBloomIndexCheckFunction implements Function2<Integer, Iterator<Tuple2<String, Tuple2<String, HoodieKey>>>, Iterator<List<IndexLookupResult>>> {
|
public class HoodieBloomIndexCheckFunction implements
|
||||||
|
Function2<Integer, Iterator<Tuple2<String, Tuple2<String, HoodieKey>>>, Iterator<List<IndexLookupResult>>> {
|
||||||
|
|
||||||
private static Logger logger = LogManager.getLogger(HoodieBloomIndexCheckFunction.class);
|
private static Logger logger = LogManager.getLogger(HoodieBloomIndexCheckFunction.class);
|
||||||
|
|
||||||
@@ -54,7 +52,8 @@ public class HoodieBloomIndexCheckFunction implements Function2<Integer, Iterato
|
|||||||
/**
|
/**
|
||||||
* Given a list of row keys and one file, return only row keys existing in that file.
|
* Given a list of row keys and one file, return only row keys existing in that file.
|
||||||
*/
|
*/
|
||||||
public static List<String> checkCandidatesAgainstFile(List<String> candidateRecordKeys, Path filePath) throws HoodieIndexException {
|
public static List<String> checkCandidatesAgainstFile(List<String> candidateRecordKeys,
|
||||||
|
Path filePath) throws HoodieIndexException {
|
||||||
List<String> foundRecordKeys = new ArrayList<>();
|
List<String> foundRecordKeys = new ArrayList<>();
|
||||||
try {
|
try {
|
||||||
// Load all rowKeys from the file, to double-confirm
|
// Load all rowKeys from the file, to double-confirm
|
||||||
@@ -69,18 +68,20 @@ public class HoodieBloomIndexCheckFunction implements Function2<Integer, Iterato
|
|||||||
foundRecordKeys.add(rowKey);
|
foundRecordKeys.add(rowKey);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
logger.info("After checking with row keys, we have " + foundRecordKeys.size() + " results, for file " + filePath + " => " + foundRecordKeys);
|
logger.info("After checking with row keys, we have " + foundRecordKeys.size()
|
||||||
|
+ " results, for file " + filePath + " => " + foundRecordKeys);
|
||||||
if (logger.isDebugEnabled()) {
|
if (logger.isDebugEnabled()) {
|
||||||
logger.debug("Keys matching for file " + filePath + " => " + foundRecordKeys);
|
logger.debug("Keys matching for file " + filePath + " => " + foundRecordKeys);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (Exception e){
|
} catch (Exception e) {
|
||||||
throw new HoodieIndexException("Error checking candidate keys against file.", e);
|
throw new HoodieIndexException("Error checking candidate keys against file.", e);
|
||||||
}
|
}
|
||||||
return foundRecordKeys;
|
return foundRecordKeys;
|
||||||
}
|
}
|
||||||
|
|
||||||
class LazyKeyCheckIterator extends LazyIterableIterator<Tuple2<String, Tuple2<String, HoodieKey>>, List<IndexLookupResult>> {
|
class LazyKeyCheckIterator extends
|
||||||
|
LazyIterableIterator<Tuple2<String, Tuple2<String, HoodieKey>>, List<IndexLookupResult>> {
|
||||||
|
|
||||||
private List<String> candidateRecordKeys;
|
private List<String> candidateRecordKeys;
|
||||||
|
|
||||||
@@ -90,7 +91,8 @@ public class HoodieBloomIndexCheckFunction implements Function2<Integer, Iterato
|
|||||||
|
|
||||||
private String currentParitionPath;
|
private String currentParitionPath;
|
||||||
|
|
||||||
LazyKeyCheckIterator(Iterator<Tuple2<String, Tuple2<String, HoodieKey>>> fileParitionRecordKeyTripletItr) {
|
LazyKeyCheckIterator(
|
||||||
|
Iterator<Tuple2<String, Tuple2<String, HoodieKey>>> fileParitionRecordKeyTripletItr) {
|
||||||
super(fileParitionRecordKeyTripletItr);
|
super(fileParitionRecordKeyTripletItr);
|
||||||
currentFile = null;
|
currentFile = null;
|
||||||
candidateRecordKeys = new ArrayList<>();
|
candidateRecordKeys = new ArrayList<>();
|
||||||
@@ -144,11 +146,15 @@ public class HoodieBloomIndexCheckFunction implements Function2<Integer, Iterato
|
|||||||
} else {
|
} else {
|
||||||
// do the actual checking of file & break out
|
// do the actual checking of file & break out
|
||||||
Path filePath = new Path(basePath + "/" + currentParitionPath + "/" + currentFile);
|
Path filePath = new Path(basePath + "/" + currentParitionPath + "/" + currentFile);
|
||||||
logger.info("#1 After bloom filter, the candidate row keys is reduced to " + candidateRecordKeys.size() + " for " + filePath);
|
logger.info(
|
||||||
|
"#1 After bloom filter, the candidate row keys is reduced to " + candidateRecordKeys
|
||||||
|
.size() + " for " + filePath);
|
||||||
if (logger.isDebugEnabled()) {
|
if (logger.isDebugEnabled()) {
|
||||||
logger.debug("#The candidate row keys for " + filePath + " => " + candidateRecordKeys);
|
logger
|
||||||
|
.debug("#The candidate row keys for " + filePath + " => " + candidateRecordKeys);
|
||||||
}
|
}
|
||||||
ret.add(new IndexLookupResult(currentFile, checkCandidatesAgainstFile(candidateRecordKeys, filePath)));
|
ret.add(new IndexLookupResult(currentFile,
|
||||||
|
checkCandidatesAgainstFile(candidateRecordKeys, filePath)));
|
||||||
|
|
||||||
initState(fileName, partitionPath);
|
initState(fileName, partitionPath);
|
||||||
if (bloomFilter.mightContain(recordKey)) {
|
if (bloomFilter.mightContain(recordKey)) {
|
||||||
@@ -164,11 +170,14 @@ public class HoodieBloomIndexCheckFunction implements Function2<Integer, Iterato
|
|||||||
// handle case, where we ran out of input, finish pending work, update return val
|
// handle case, where we ran out of input, finish pending work, update return val
|
||||||
if (!inputItr.hasNext()) {
|
if (!inputItr.hasNext()) {
|
||||||
Path filePath = new Path(basePath + "/" + currentParitionPath + "/" + currentFile);
|
Path filePath = new Path(basePath + "/" + currentParitionPath + "/" + currentFile);
|
||||||
logger.info("#2 After bloom filter, the candidate row keys is reduced to " + candidateRecordKeys.size() + " for " + filePath);
|
logger.info(
|
||||||
|
"#2 After bloom filter, the candidate row keys is reduced to " + candidateRecordKeys
|
||||||
|
.size() + " for " + filePath);
|
||||||
if (logger.isDebugEnabled()) {
|
if (logger.isDebugEnabled()) {
|
||||||
logger.debug("#The candidate row keys for " + filePath + " => " + candidateRecordKeys);
|
logger.debug("#The candidate row keys for " + filePath + " => " + candidateRecordKeys);
|
||||||
}
|
}
|
||||||
ret.add(new IndexLookupResult(currentFile, checkCandidatesAgainstFile(candidateRecordKeys, filePath)));
|
ret.add(new IndexLookupResult(currentFile,
|
||||||
|
checkCandidatesAgainstFile(candidateRecordKeys, filePath)));
|
||||||
}
|
}
|
||||||
|
|
||||||
} catch (Throwable e) {
|
} catch (Throwable e) {
|
||||||
@@ -189,7 +198,8 @@ public class HoodieBloomIndexCheckFunction implements Function2<Integer, Iterato
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Iterator<List<IndexLookupResult>> call(Integer partition,
|
public Iterator<List<IndexLookupResult>> call(Integer partition,
|
||||||
Iterator<Tuple2<String, Tuple2<String, HoodieKey>>> fileParitionRecordKeyTripletItr) throws Exception {
|
Iterator<Tuple2<String, Tuple2<String, HoodieKey>>> fileParitionRecordKeyTripletItr)
|
||||||
|
throws Exception {
|
||||||
return new LazyKeyCheckIterator(fileParitionRecordKeyTripletItr);
|
return new LazyKeyCheckIterator(fileParitionRecordKeyTripletItr);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -19,7 +19,6 @@
|
|||||||
package com.uber.hoodie.index.bucketed;
|
package com.uber.hoodie.index.bucketed;
|
||||||
|
|
||||||
import com.google.common.base.Optional;
|
import com.google.common.base.Optional;
|
||||||
|
|
||||||
import com.uber.hoodie.WriteStatus;
|
import com.uber.hoodie.WriteStatus;
|
||||||
import com.uber.hoodie.common.model.HoodieKey;
|
import com.uber.hoodie.common.model.HoodieKey;
|
||||||
import com.uber.hoodie.common.model.HoodieRecord;
|
import com.uber.hoodie.common.model.HoodieRecord;
|
||||||
@@ -29,29 +28,22 @@ import com.uber.hoodie.config.HoodieWriteConfig;
|
|||||||
import com.uber.hoodie.exception.HoodieIndexException;
|
import com.uber.hoodie.exception.HoodieIndexException;
|
||||||
import com.uber.hoodie.index.HoodieIndex;
|
import com.uber.hoodie.index.HoodieIndex;
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
|
|
||||||
import org.apache.log4j.LogManager;
|
import org.apache.log4j.LogManager;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.apache.spark.api.java.JavaPairRDD;
|
import org.apache.spark.api.java.JavaPairRDD;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* An `stateless` index implementation that will using a deterministic mapping function to
|
* An `stateless` index implementation that will using a deterministic mapping function to determine
|
||||||
* determine the fileID for a given record.
|
* the fileID for a given record.
|
||||||
*
|
|
||||||
* Pros:
|
|
||||||
* - Fast
|
|
||||||
*
|
|
||||||
* Cons :
|
|
||||||
* - Need to tune the number of buckets per partition path manually (FIXME: Need to autotune this)
|
|
||||||
* - Could increase write amplification on copy-on-write storage since inserts always rewrite files
|
|
||||||
* - Not global.
|
|
||||||
*
|
|
||||||
*
|
*
|
||||||
|
* Pros: - Fast
|
||||||
*
|
*
|
||||||
|
* Cons : - Need to tune the number of buckets per partition path manually (FIXME: Need to autotune
|
||||||
|
* this) - Could increase write amplification on copy-on-write storage since inserts always rewrite
|
||||||
|
* files - Not global.
|
||||||
*/
|
*/
|
||||||
public class BucketedIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
public class BucketedIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
||||||
|
|
||||||
@@ -66,12 +58,14 @@ public class BucketedIndex<T extends HoodieRecordPayload> extends HoodieIndex<T>
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(JavaRDD<HoodieKey> hoodieKeys, HoodieTable<T> table) {
|
public JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(JavaRDD<HoodieKey> hoodieKeys,
|
||||||
|
HoodieTable<T> table) {
|
||||||
return hoodieKeys.mapToPair(hk -> new Tuple2<>(hk, Optional.of(getBucket(hk.getRecordKey()))));
|
return hoodieKeys.mapToPair(hk -> new Tuple2<>(hk, Optional.of(getBucket(hk.getRecordKey()))));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, HoodieTable<T> hoodieTable) throws HoodieIndexException {
|
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD,
|
||||||
|
HoodieTable<T> hoodieTable) throws HoodieIndexException {
|
||||||
return recordRDD.map(record -> {
|
return recordRDD.map(record -> {
|
||||||
String bucket = getBucket(record.getRecordKey());
|
String bucket = getBucket(record.getRecordKey());
|
||||||
//HACK(vc) a non-existent commit is provided here.
|
//HACK(vc) a non-existent commit is provided here.
|
||||||
@@ -81,7 +75,8 @@ public class BucketedIndex<T extends HoodieRecordPayload> extends HoodieIndex<T>
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD, HoodieTable<T> hoodieTable) throws HoodieIndexException {
|
public JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD,
|
||||||
|
HoodieTable<T> hoodieTable) throws HoodieIndexException {
|
||||||
return writeStatusRDD;
|
return writeStatusRDD;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -93,8 +88,6 @@ public class BucketedIndex<T extends HoodieRecordPayload> extends HoodieIndex<T>
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Bucketing is still done within each partition.
|
* Bucketing is still done within each partition.
|
||||||
*
|
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public boolean isGlobal() {
|
public boolean isGlobal() {
|
||||||
@@ -102,10 +95,8 @@ public class BucketedIndex<T extends HoodieRecordPayload> extends HoodieIndex<T>
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Since indexing is just a deterministic hash, we can identify file group correctly even without an index
|
* Since indexing is just a deterministic hash, we can identify file group correctly even without
|
||||||
* on the actual log file.
|
* an index on the actual log file.
|
||||||
*
|
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public boolean canIndexLogFiles() {
|
public boolean canIndexLogFiles() {
|
||||||
@@ -114,8 +105,6 @@ public class BucketedIndex<T extends HoodieRecordPayload> extends HoodieIndex<T>
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Indexing is just a hash function.
|
* Indexing is just a hash function.
|
||||||
*
|
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public boolean isImplicitWithStorage() {
|
public boolean isImplicitWithStorage() {
|
||||||
|
|||||||
@@ -19,24 +19,33 @@
|
|||||||
package com.uber.hoodie.index.hbase;
|
package com.uber.hoodie.index.hbase;
|
||||||
|
|
||||||
import com.google.common.base.Optional;
|
import com.google.common.base.Optional;
|
||||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
|
||||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
|
||||||
import com.uber.hoodie.WriteStatus;
|
import com.uber.hoodie.WriteStatus;
|
||||||
import com.uber.hoodie.common.model.HoodieKey;
|
import com.uber.hoodie.common.model.HoodieKey;
|
||||||
|
import com.uber.hoodie.common.model.HoodieRecord;
|
||||||
import com.uber.hoodie.common.model.HoodieRecordLocation;
|
import com.uber.hoodie.common.model.HoodieRecordLocation;
|
||||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||||
import com.uber.hoodie.common.model.HoodieRecord;
|
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||||
|
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||||
import com.uber.hoodie.config.HoodieIndexConfig;
|
import com.uber.hoodie.config.HoodieIndexConfig;
|
||||||
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.exception.HoodieDependentSystemUnavailableException;
|
import com.uber.hoodie.exception.HoodieDependentSystemUnavailableException;
|
||||||
import com.uber.hoodie.exception.HoodieIndexException;
|
import com.uber.hoodie.exception.HoodieIndexException;
|
||||||
import com.uber.hoodie.index.HoodieIndex;
|
import com.uber.hoodie.index.HoodieIndex;
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.List;
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.hbase.HBaseConfiguration;
|
import org.apache.hadoop.hbase.HBaseConfiguration;
|
||||||
import org.apache.hadoop.hbase.TableName;
|
import org.apache.hadoop.hbase.TableName;
|
||||||
import org.apache.hadoop.hbase.client.*;
|
import org.apache.hadoop.hbase.client.Connection;
|
||||||
|
import org.apache.hadoop.hbase.client.ConnectionFactory;
|
||||||
|
import org.apache.hadoop.hbase.client.Delete;
|
||||||
|
import org.apache.hadoop.hbase.client.Get;
|
||||||
|
import org.apache.hadoop.hbase.client.HTable;
|
||||||
|
import org.apache.hadoop.hbase.client.Put;
|
||||||
|
import org.apache.hadoop.hbase.client.Result;
|
||||||
import org.apache.hadoop.hbase.util.Bytes;
|
import org.apache.hadoop.hbase.util.Bytes;
|
||||||
import org.apache.log4j.LogManager;
|
import org.apache.log4j.LogManager;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
@@ -45,15 +54,11 @@ import org.apache.spark.api.java.JavaRDD;
|
|||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
import org.apache.spark.api.java.function.Function2;
|
import org.apache.spark.api.java.function.Function2;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Hoodie Index implementation backed by HBase
|
* Hoodie Index implementation backed by HBase
|
||||||
*/
|
*/
|
||||||
public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
||||||
|
|
||||||
private final static byte[] SYSTEM_COLUMN_FAMILY = Bytes.toBytes("_s");
|
private final static byte[] SYSTEM_COLUMN_FAMILY = Bytes.toBytes("_s");
|
||||||
private final static byte[] COMMIT_TS_COLUMN = Bytes.toBytes("commit_ts");
|
private final static byte[] COMMIT_TS_COLUMN = Bytes.toBytes("commit_ts");
|
||||||
private final static byte[] FILE_NAME_COLUMN = Bytes.toBytes("file_name");
|
private final static byte[] FILE_NAME_COLUMN = Bytes.toBytes("file_name");
|
||||||
@@ -144,9 +149,7 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
|||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
throw new HoodieIndexException(
|
throw new HoodieIndexException(
|
||||||
"Failed to Tag indexed locations because of exception with HBase Client", e);
|
"Failed to Tag indexed locations because of exception with HBase Client", e);
|
||||||
}
|
} finally {
|
||||||
|
|
||||||
finally {
|
|
||||||
if (hTable != null) {
|
if (hTable != null) {
|
||||||
try {
|
try {
|
||||||
hTable.close();
|
hTable.close();
|
||||||
@@ -161,11 +164,14 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, HoodieTable<T> hoodieTable) {
|
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD,
|
||||||
|
HoodieTable<T> hoodieTable) {
|
||||||
return recordRDD.mapPartitionsWithIndex(this.new LocationTagFunction(hoodieTable), true);
|
return recordRDD.mapPartitionsWithIndex(this.new LocationTagFunction(hoodieTable), true);
|
||||||
}
|
}
|
||||||
|
|
||||||
class UpdateLocationTask implements Function2<Integer, Iterator<WriteStatus>, Iterator<WriteStatus>> {
|
class UpdateLocationTask implements
|
||||||
|
Function2<Integer, Iterator<WriteStatus>, Iterator<WriteStatus>> {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Iterator<WriteStatus> call(Integer partition, Iterator<WriteStatus> statusIterator) {
|
public Iterator<WriteStatus> call(Integer partition, Iterator<WriteStatus> statusIterator) {
|
||||||
|
|
||||||
@@ -187,7 +193,7 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
|||||||
for (HoodieRecord rec : writeStatus.getWrittenRecords()) {
|
for (HoodieRecord rec : writeStatus.getWrittenRecords()) {
|
||||||
if (!writeStatus.isErrored(rec.getKey())) {
|
if (!writeStatus.isErrored(rec.getKey())) {
|
||||||
java.util.Optional<HoodieRecordLocation> loc = rec.getNewLocation();
|
java.util.Optional<HoodieRecordLocation> loc = rec.getNewLocation();
|
||||||
if(loc.isPresent()) {
|
if (loc.isPresent()) {
|
||||||
Put put = new Put(Bytes.toBytes(rec.getRecordKey()));
|
Put put = new Put(Bytes.toBytes(rec.getRecordKey()));
|
||||||
put.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN,
|
put.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN,
|
||||||
Bytes.toBytes(loc.get().getCommitTime()));
|
Bytes.toBytes(loc.get().getCommitTime()));
|
||||||
@@ -244,8 +250,6 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Only looks up by recordKey
|
* Only looks up by recordKey
|
||||||
*
|
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public boolean isGlobal() {
|
public boolean isGlobal() {
|
||||||
@@ -254,8 +258,6 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Mapping is available in HBase already.
|
* Mapping is available in HBase already.
|
||||||
*
|
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public boolean canIndexLogFiles() {
|
public boolean canIndexLogFiles() {
|
||||||
@@ -264,8 +266,6 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Index needs to be explicitly updated after storage write.
|
* Index needs to be explicitly updated after storage write.
|
||||||
*
|
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public boolean isImplicitWithStorage() {
|
public boolean isImplicitWithStorage() {
|
||||||
|
|||||||
@@ -36,13 +36,6 @@ import com.uber.hoodie.config.HoodieWriteConfig;
|
|||||||
import com.uber.hoodie.exception.HoodieAppendException;
|
import com.uber.hoodie.exception.HoodieAppendException;
|
||||||
import com.uber.hoodie.exception.HoodieUpsertException;
|
import com.uber.hoodie.exception.HoodieUpsertException;
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
import org.apache.avro.generic.GenericRecord;
|
|
||||||
import org.apache.avro.generic.IndexedRecord;
|
|
||||||
import org.apache.hadoop.fs.Path;
|
|
||||||
import org.apache.log4j.LogManager;
|
|
||||||
import org.apache.log4j.Logger;
|
|
||||||
import org.apache.spark.TaskContext;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
@@ -50,13 +43,18 @@ import java.util.List;
|
|||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.concurrent.atomic.AtomicLong;
|
import java.util.concurrent.atomic.AtomicLong;
|
||||||
|
import org.apache.avro.generic.GenericRecord;
|
||||||
|
import org.apache.avro.generic.IndexedRecord;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.apache.log4j.LogManager;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import org.apache.spark.TaskContext;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* IO Operation to append data onto an existing file.
|
* IO Operation to append data onto an existing file.
|
||||||
*
|
|
||||||
* @param <T>
|
|
||||||
*/
|
*/
|
||||||
public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOHandle<T> {
|
public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOHandle<T> {
|
||||||
|
|
||||||
private static Logger logger = LogManager.getLogger(HoodieMergeHandle.class);
|
private static Logger logger = LogManager.getLogger(HoodieMergeHandle.class);
|
||||||
private static AtomicLong recordIndex = new AtomicLong(1);
|
private static AtomicLong recordIndex = new AtomicLong(1);
|
||||||
|
|
||||||
@@ -133,7 +131,7 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOH
|
|||||||
try {
|
try {
|
||||||
Optional<IndexedRecord> avroRecord = hoodieRecord.getData().getInsertValue(schema);
|
Optional<IndexedRecord> avroRecord = hoodieRecord.getData().getInsertValue(schema);
|
||||||
|
|
||||||
if(avroRecord.isPresent()) {
|
if (avroRecord.isPresent()) {
|
||||||
String seqId = HoodieRecord.generateSequenceId(commitTime, TaskContext.getPartitionId(),
|
String seqId = HoodieRecord.generateSequenceId(commitTime, TaskContext.getPartitionId(),
|
||||||
recordIndex.getAndIncrement());
|
recordIndex.getAndIncrement());
|
||||||
HoodieAvroUtils
|
HoodieAvroUtils
|
||||||
@@ -164,18 +162,19 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOH
|
|||||||
metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME, commitTime);
|
metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME, commitTime);
|
||||||
records.stream().forEach(record -> {
|
records.stream().forEach(record -> {
|
||||||
Optional<IndexedRecord> indexedRecord = getIndexedRecord(record);
|
Optional<IndexedRecord> indexedRecord = getIndexedRecord(record);
|
||||||
if(indexedRecord.isPresent()) {
|
if (indexedRecord.isPresent()) {
|
||||||
recordList.add(indexedRecord.get());
|
recordList.add(indexedRecord.get());
|
||||||
} else {
|
} else {
|
||||||
keysToDelete.add(record.getRecordKey());
|
keysToDelete.add(record.getRecordKey());
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
try {
|
try {
|
||||||
if(recordList.size() > 0) {
|
if (recordList.size() > 0) {
|
||||||
writer = writer.appendBlock(new HoodieAvroDataBlock(recordList, schema, metadata));
|
writer = writer.appendBlock(new HoodieAvroDataBlock(recordList, schema, metadata));
|
||||||
}
|
}
|
||||||
if(keysToDelete.size() > 0) {
|
if (keysToDelete.size() > 0) {
|
||||||
writer = writer.appendBlock(new HoodieDeleteBlock(keysToDelete.stream().toArray(String[]::new), metadata));
|
writer = writer.appendBlock(
|
||||||
|
new HoodieDeleteBlock(keysToDelete.stream().toArray(String[]::new), metadata));
|
||||||
}
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
throw new HoodieAppendException(
|
throw new HoodieAppendException(
|
||||||
|
|||||||
@@ -27,27 +27,24 @@ import com.uber.hoodie.common.table.TableFileSystemView;
|
|||||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
|
||||||
import org.apache.log4j.LogManager;
|
|
||||||
import org.apache.log4j.Logger;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
import org.apache.log4j.LogManager;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Cleaner is responsible for garbage collecting older files in a given partition path, such that
|
* Cleaner is responsible for garbage collecting older files in a given partition path, such that
|
||||||
* <p>
|
* <p> 1) It provides sufficient time for existing queries running on older versions, to finish <p>
|
||||||
* 1) It provides sufficient time for existing queries running on older versions, to finish
|
* 2) It bounds the growth of the files in the file system <p> TODO: Should all cleaning be done
|
||||||
* <p>
|
* based on {@link com.uber.hoodie.common.model.HoodieCommitMetadata}
|
||||||
* 2) It bounds the growth of the files in the file system
|
|
||||||
* <p>
|
|
||||||
* TODO: Should all cleaning be done based on {@link com.uber.hoodie.common.model.HoodieCommitMetadata}
|
|
||||||
*/
|
*/
|
||||||
public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
|
public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
|
||||||
|
|
||||||
private static Logger logger = LogManager.getLogger(HoodieCleanHelper.class);
|
private static Logger logger = LogManager.getLogger(HoodieCleanHelper.class);
|
||||||
|
|
||||||
private final TableFileSystemView fileSystemView;
|
private final TableFileSystemView fileSystemView;
|
||||||
@@ -66,13 +63,9 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
|
|||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Selects the older versions of files for cleaning, such that it bounds the number of versions of each file.
|
* Selects the older versions of files for cleaning, such that it bounds the number of versions of
|
||||||
* This policy is useful, if you are simply interested in querying the table, and you don't want too many
|
* each file. This policy is useful, if you are simply interested in querying the table, and you
|
||||||
* versions for a single file (i.e run it with versionsRetained = 1)
|
* don't want too many versions for a single file (i.e run it with versionsRetained = 1)
|
||||||
*
|
|
||||||
* @param partitionPath
|
|
||||||
* @return
|
|
||||||
* @throws IOException
|
|
||||||
*/
|
*/
|
||||||
private List<String> getFilesToCleanKeepingLatestVersions(String partitionPath)
|
private List<String> getFilesToCleanKeepingLatestVersions(String partitionPath)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
@@ -93,7 +86,7 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
|
|||||||
// Skip this most recent version
|
// Skip this most recent version
|
||||||
FileSlice nextSlice = fileSliceIterator.next();
|
FileSlice nextSlice = fileSliceIterator.next();
|
||||||
HoodieDataFile dataFile = nextSlice.getDataFile().get();
|
HoodieDataFile dataFile = nextSlice.getDataFile().get();
|
||||||
if(savepointedFiles.contains(dataFile.getFileName())) {
|
if (savepointedFiles.contains(dataFile.getFileName())) {
|
||||||
// do not clean up a savepoint data file
|
// do not clean up a savepoint data file
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -118,22 +111,15 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
|
|||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Selects the versions for file for cleaning, such that it
|
* Selects the versions for file for cleaning, such that it <p> - Leaves the latest version of the
|
||||||
* <p>
|
* file untouched - For older versions, - It leaves all the commits untouched which has occured in
|
||||||
* - Leaves the latest version of the file untouched
|
* last <code>config.getCleanerCommitsRetained()</code> commits - It leaves ONE commit before this
|
||||||
* - For older versions,
|
* window. We assume that the max(query execution time) == commit_batch_time *
|
||||||
* - It leaves all the commits untouched which has occured in last <code>config.getCleanerCommitsRetained()</code> commits
|
* config.getCleanerCommitsRetained(). This is 12 hours by default. This is essential to leave the
|
||||||
* - It leaves ONE commit before this window. We assume that the max(query execution time) == commit_batch_time * config.getCleanerCommitsRetained(). This is 12 hours by default.
|
* file used by the query thats running for the max time. <p> This provides the effect of having
|
||||||
* This is essential to leave the file used by the query thats running for the max time.
|
* lookback into all changes that happened in the last X commits. (eg: if you retain 24 commits,
|
||||||
* <p>
|
* and commit batch time is 30 mins, then you have 12 hrs of lookback) <p> This policy is the
|
||||||
* This provides the effect of having lookback into all changes that happened in the last X
|
* default.
|
||||||
* commits. (eg: if you retain 24 commits, and commit batch time is 30 mins, then you have 12 hrs of lookback)
|
|
||||||
* <p>
|
|
||||||
* This policy is the default.
|
|
||||||
*
|
|
||||||
* @param partitionPath
|
|
||||||
* @return
|
|
||||||
* @throws IOException
|
|
||||||
*/
|
*/
|
||||||
private List<String> getFilesToCleanKeepingLatestCommits(String partitionPath)
|
private List<String> getFilesToCleanKeepingLatestCommits(String partitionPath)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
@@ -164,7 +150,7 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
|
|||||||
for (FileSlice aSlice : fileSliceList) {
|
for (FileSlice aSlice : fileSliceList) {
|
||||||
HoodieDataFile aFile = aSlice.getDataFile().get();
|
HoodieDataFile aFile = aSlice.getDataFile().get();
|
||||||
String fileCommitTime = aFile.getCommitTime();
|
String fileCommitTime = aFile.getCommitTime();
|
||||||
if(savepointedFiles.contains(aFile.getFileName())) {
|
if (savepointedFiles.contains(aFile.getFileName())) {
|
||||||
// do not clean up a savepoint data file
|
// do not clean up a savepoint data file
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -39,6 +39,12 @@ import com.uber.hoodie.exception.HoodieCommitException;
|
|||||||
import com.uber.hoodie.exception.HoodieException;
|
import com.uber.hoodie.exception.HoodieException;
|
||||||
import com.uber.hoodie.exception.HoodieIOException;
|
import com.uber.hoodie.exception.HoodieIOException;
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.stream.Stream;
|
||||||
import org.apache.avro.Schema;
|
import org.apache.avro.Schema;
|
||||||
import org.apache.avro.generic.IndexedRecord;
|
import org.apache.avro.generic.IndexedRecord;
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
@@ -46,17 +52,11 @@ import org.apache.hadoop.fs.Path;
|
|||||||
import org.apache.log4j.LogManager;
|
import org.apache.log4j.LogManager;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Optional;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
import java.util.stream.Stream;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Archiver to bound the growth of <action>.commit files
|
* Archiver to bound the growth of <action>.commit files
|
||||||
*/
|
*/
|
||||||
public class HoodieCommitArchiveLog {
|
public class HoodieCommitArchiveLog {
|
||||||
|
|
||||||
private static Logger log = LogManager.getLogger(HoodieCommitArchiveLog.class);
|
private static Logger log = LogManager.getLogger(HoodieCommitArchiveLog.class);
|
||||||
|
|
||||||
private final Path archiveFilePath;
|
private final Path archiveFilePath;
|
||||||
@@ -73,7 +73,7 @@ public class HoodieCommitArchiveLog {
|
|||||||
|
|
||||||
private HoodieLogFormat.Writer openWriter() {
|
private HoodieLogFormat.Writer openWriter() {
|
||||||
try {
|
try {
|
||||||
if(this.writer == null) {
|
if (this.writer == null) {
|
||||||
return HoodieLogFormat.newWriterBuilder()
|
return HoodieLogFormat.newWriterBuilder()
|
||||||
.onParentPath(archiveFilePath.getParent())
|
.onParentPath(archiveFilePath.getParent())
|
||||||
.withFileId(archiveFilePath.getName())
|
.withFileId(archiveFilePath.getName())
|
||||||
@@ -83,17 +83,17 @@ public class HoodieCommitArchiveLog {
|
|||||||
} else {
|
} else {
|
||||||
return this.writer;
|
return this.writer;
|
||||||
}
|
}
|
||||||
} catch(InterruptedException | IOException e) {
|
} catch (InterruptedException | IOException e) {
|
||||||
throw new HoodieException("Unable to initialize HoodieLogFormat writer", e);
|
throw new HoodieException("Unable to initialize HoodieLogFormat writer", e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void close() {
|
private void close() {
|
||||||
try {
|
try {
|
||||||
if(this.writer != null) {
|
if (this.writer != null) {
|
||||||
this.writer.close();
|
this.writer.close();
|
||||||
}
|
}
|
||||||
} catch(IOException e) {
|
} catch (IOException e) {
|
||||||
throw new HoodieException("Unable to close HoodieLogFormat writer", e);
|
throw new HoodieException("Unable to close HoodieLogFormat writer", e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -125,10 +125,12 @@ public class HoodieCommitArchiveLog {
|
|||||||
int maxCommitsToKeep = config.getMaxCommitsToKeep();
|
int maxCommitsToKeep = config.getMaxCommitsToKeep();
|
||||||
int minCommitsToKeep = config.getMinCommitsToKeep();
|
int minCommitsToKeep = config.getMinCommitsToKeep();
|
||||||
|
|
||||||
HoodieTable table = HoodieTable.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
|
HoodieTable table = HoodieTable
|
||||||
|
.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
|
||||||
|
|
||||||
// GroupBy each action and limit each action timeline to maxCommitsToKeep
|
// GroupBy each action and limit each action timeline to maxCommitsToKeep
|
||||||
HoodieTimeline cleanAndRollbackTimeline = table.getActiveTimeline().getTimelineOfActions(Sets.newHashSet(HoodieTimeline.CLEAN_ACTION,
|
HoodieTimeline cleanAndRollbackTimeline = table.getActiveTimeline()
|
||||||
|
.getTimelineOfActions(Sets.newHashSet(HoodieTimeline.CLEAN_ACTION,
|
||||||
HoodieTimeline.ROLLBACK_ACTION));
|
HoodieTimeline.ROLLBACK_ACTION));
|
||||||
Stream<HoodieInstant> instants = cleanAndRollbackTimeline.getInstants()
|
Stream<HoodieInstant> instants = cleanAndRollbackTimeline.getInstants()
|
||||||
.collect(Collectors.groupingBy(s -> s.getAction()))
|
.collect(Collectors.groupingBy(s -> s.getAction()))
|
||||||
@@ -198,7 +200,7 @@ public class HoodieCommitArchiveLog {
|
|||||||
}
|
}
|
||||||
HoodieAvroDataBlock block = new HoodieAvroDataBlock(records, wrapperSchema);
|
HoodieAvroDataBlock block = new HoodieAvroDataBlock(records, wrapperSchema);
|
||||||
this.writer = writer.appendBlock(block);
|
this.writer = writer.appendBlock(block);
|
||||||
} catch(Exception e) {
|
} catch (Exception e) {
|
||||||
throw new HoodieCommitException("Failed to archive commits", e);
|
throw new HoodieCommitException("Failed to archive commits", e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -207,40 +209,48 @@ public class HoodieCommitArchiveLog {
|
|||||||
return archiveFilePath;
|
return archiveFilePath;
|
||||||
}
|
}
|
||||||
|
|
||||||
private IndexedRecord convertToAvroRecord(HoodieTimeline commitTimeline, HoodieInstant hoodieInstant) throws IOException {
|
private IndexedRecord convertToAvroRecord(HoodieTimeline commitTimeline,
|
||||||
|
HoodieInstant hoodieInstant) throws IOException {
|
||||||
HoodieArchivedMetaEntry archivedMetaWrapper = new HoodieArchivedMetaEntry();
|
HoodieArchivedMetaEntry archivedMetaWrapper = new HoodieArchivedMetaEntry();
|
||||||
archivedMetaWrapper.setCommitTime(hoodieInstant.getTimestamp());
|
archivedMetaWrapper.setCommitTime(hoodieInstant.getTimestamp());
|
||||||
switch(hoodieInstant.getAction()) {
|
switch (hoodieInstant.getAction()) {
|
||||||
case HoodieTimeline.CLEAN_ACTION:{
|
case HoodieTimeline.CLEAN_ACTION: {
|
||||||
archivedMetaWrapper.setHoodieCleanMetadata(AvroUtils.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(), HoodieCleanMetadata.class));
|
archivedMetaWrapper.setHoodieCleanMetadata(AvroUtils
|
||||||
|
.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(),
|
||||||
|
HoodieCleanMetadata.class));
|
||||||
archivedMetaWrapper.setActionType(ActionType.clean.name());
|
archivedMetaWrapper.setActionType(ActionType.clean.name());
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case HoodieTimeline.COMMIT_ACTION:{
|
case HoodieTimeline.COMMIT_ACTION: {
|
||||||
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
|
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
|
||||||
.fromBytes(commitTimeline.getInstantDetails(hoodieInstant).get());
|
.fromBytes(commitTimeline.getInstantDetails(hoodieInstant).get());
|
||||||
archivedMetaWrapper.setHoodieCommitMetadata(commitMetadataConverter(commitMetadata));
|
archivedMetaWrapper.setHoodieCommitMetadata(commitMetadataConverter(commitMetadata));
|
||||||
archivedMetaWrapper.setActionType(ActionType.commit.name());
|
archivedMetaWrapper.setActionType(ActionType.commit.name());
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case HoodieTimeline.COMPACTION_ACTION:{
|
case HoodieTimeline.COMPACTION_ACTION: {
|
||||||
com.uber.hoodie.common.model.HoodieCompactionMetadata compactionMetadata = com.uber.hoodie.common.model.HoodieCompactionMetadata
|
com.uber.hoodie.common.model.HoodieCompactionMetadata compactionMetadata = com.uber.hoodie.common.model.HoodieCompactionMetadata
|
||||||
.fromBytes(commitTimeline.getInstantDetails(hoodieInstant).get());
|
.fromBytes(commitTimeline.getInstantDetails(hoodieInstant).get());
|
||||||
archivedMetaWrapper.setHoodieCompactionMetadata(compactionMetadataConverter(compactionMetadata));
|
archivedMetaWrapper
|
||||||
|
.setHoodieCompactionMetadata(compactionMetadataConverter(compactionMetadata));
|
||||||
archivedMetaWrapper.setActionType(ActionType.compaction.name());
|
archivedMetaWrapper.setActionType(ActionType.compaction.name());
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case HoodieTimeline.ROLLBACK_ACTION:{
|
case HoodieTimeline.ROLLBACK_ACTION: {
|
||||||
archivedMetaWrapper.setHoodieRollbackMetadata(AvroUtils.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(), HoodieRollbackMetadata.class));
|
archivedMetaWrapper.setHoodieRollbackMetadata(AvroUtils
|
||||||
|
.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(),
|
||||||
|
HoodieRollbackMetadata.class));
|
||||||
archivedMetaWrapper.setActionType(ActionType.rollback.name());
|
archivedMetaWrapper.setActionType(ActionType.rollback.name());
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case HoodieTimeline.SAVEPOINT_ACTION:{
|
case HoodieTimeline.SAVEPOINT_ACTION: {
|
||||||
archivedMetaWrapper.setHoodieSavePointMetadata(AvroUtils.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(), HoodieSavepointMetadata.class));
|
archivedMetaWrapper.setHoodieSavePointMetadata(AvroUtils
|
||||||
|
.deserializeAvroMetadata(commitTimeline.getInstantDetails(hoodieInstant).get(),
|
||||||
|
HoodieSavepointMetadata.class));
|
||||||
archivedMetaWrapper.setActionType(ActionType.savepoint.name());
|
archivedMetaWrapper.setActionType(ActionType.savepoint.name());
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case HoodieTimeline.DELTA_COMMIT_ACTION:{
|
case HoodieTimeline.DELTA_COMMIT_ACTION: {
|
||||||
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
|
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
|
||||||
.fromBytes(commitTimeline.getInstantDetails(hoodieInstant).get());
|
.fromBytes(commitTimeline.getInstantDetails(hoodieInstant).get());
|
||||||
archivedMetaWrapper.setHoodieCommitMetadata(commitMetadataConverter(commitMetadata));
|
archivedMetaWrapper.setHoodieCommitMetadata(commitMetadataConverter(commitMetadata));
|
||||||
@@ -251,19 +261,23 @@ public class HoodieCommitArchiveLog {
|
|||||||
return archivedMetaWrapper;
|
return archivedMetaWrapper;
|
||||||
}
|
}
|
||||||
|
|
||||||
private com.uber.hoodie.avro.model.HoodieCommitMetadata commitMetadataConverter(HoodieCommitMetadata hoodieCommitMetadata) {
|
private com.uber.hoodie.avro.model.HoodieCommitMetadata commitMetadataConverter(
|
||||||
|
HoodieCommitMetadata hoodieCommitMetadata) {
|
||||||
ObjectMapper mapper = new ObjectMapper();
|
ObjectMapper mapper = new ObjectMapper();
|
||||||
//Need this to ignore other public get() methods
|
//Need this to ignore other public get() methods
|
||||||
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||||
com.uber.hoodie.avro.model.HoodieCommitMetadata avroMetaData =
|
com.uber.hoodie.avro.model.HoodieCommitMetadata avroMetaData =
|
||||||
mapper.convertValue(hoodieCommitMetadata, com.uber.hoodie.avro.model.HoodieCommitMetadata.class);
|
mapper.convertValue(hoodieCommitMetadata,
|
||||||
|
com.uber.hoodie.avro.model.HoodieCommitMetadata.class);
|
||||||
return avroMetaData;
|
return avroMetaData;
|
||||||
}
|
}
|
||||||
|
|
||||||
private com.uber.hoodie.avro.model.HoodieCompactionMetadata compactionMetadataConverter(HoodieCompactionMetadata hoodieCompactionMetadata) {
|
private com.uber.hoodie.avro.model.HoodieCompactionMetadata compactionMetadataConverter(
|
||||||
|
HoodieCompactionMetadata hoodieCompactionMetadata) {
|
||||||
ObjectMapper mapper = new ObjectMapper();
|
ObjectMapper mapper = new ObjectMapper();
|
||||||
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||||
com.uber.hoodie.avro.model.HoodieCompactionMetadata avroMetaData = mapper.convertValue(hoodieCompactionMetadata,
|
com.uber.hoodie.avro.model.HoodieCompactionMetadata avroMetaData = mapper
|
||||||
|
.convertValue(hoodieCompactionMetadata,
|
||||||
com.uber.hoodie.avro.model.HoodieCompactionMetadata.class);
|
com.uber.hoodie.avro.model.HoodieCompactionMetadata.class);
|
||||||
return avroMetaData;
|
return avroMetaData;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -29,17 +29,17 @@ import com.uber.hoodie.exception.HoodieInsertException;
|
|||||||
import com.uber.hoodie.io.storage.HoodieStorageWriter;
|
import com.uber.hoodie.io.storage.HoodieStorageWriter;
|
||||||
import com.uber.hoodie.io.storage.HoodieStorageWriterFactory;
|
import com.uber.hoodie.io.storage.HoodieStorageWriterFactory;
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.UUID;
|
||||||
import org.apache.avro.generic.IndexedRecord;
|
import org.apache.avro.generic.IndexedRecord;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
import org.apache.log4j.LogManager;
|
import org.apache.log4j.LogManager;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.apache.spark.TaskContext;
|
import org.apache.spark.TaskContext;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.Optional;
|
|
||||||
import java.util.UUID;
|
|
||||||
|
|
||||||
public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOHandle<T> {
|
public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOHandle<T> {
|
||||||
|
|
||||||
private static Logger logger = LogManager.getLogger(HoodieCreateHandle.class);
|
private static Logger logger = LogManager.getLogger(HoodieCreateHandle.class);
|
||||||
|
|
||||||
private final WriteStatus status;
|
private final WriteStatus status;
|
||||||
@@ -63,7 +63,8 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOH
|
|||||||
new Path(config.getBasePath(), partitionPath));
|
new Path(config.getBasePath(), partitionPath));
|
||||||
partitionMetadata.trySave(TaskContext.getPartitionId());
|
partitionMetadata.trySave(TaskContext.getPartitionId());
|
||||||
this.storageWriter =
|
this.storageWriter =
|
||||||
HoodieStorageWriterFactory.getStorageWriter(commitTime, path, hoodieTable, config, schema);
|
HoodieStorageWriterFactory
|
||||||
|
.getStorageWriter(commitTime, path, hoodieTable, config, schema);
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
throw new HoodieInsertException(
|
throw new HoodieInsertException(
|
||||||
"Failed to initialize HoodieStorageWriter for path " + path, e);
|
"Failed to initialize HoodieStorageWriter for path " + path, e);
|
||||||
@@ -74,10 +75,8 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOH
|
|||||||
/**
|
/**
|
||||||
* Determines whether we can accept the incoming records, into the current file, depending on
|
* Determines whether we can accept the incoming records, into the current file, depending on
|
||||||
*
|
*
|
||||||
* - Whether it belongs to the same partitionPath as existing records
|
* - Whether it belongs to the same partitionPath as existing records - Whether the current file
|
||||||
* - Whether the current file written bytes lt max file size
|
* written bytes lt max file size
|
||||||
*
|
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
public boolean canWrite(HoodieRecord record) {
|
public boolean canWrite(HoodieRecord record) {
|
||||||
return storageWriter.canWrite() && record.getPartitionPath()
|
return storageWriter.canWrite() && record.getPartitionPath()
|
||||||
@@ -86,15 +85,13 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOH
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Perform the actual writing of the given record into the backing file.
|
* Perform the actual writing of the given record into the backing file.
|
||||||
*
|
|
||||||
* @param record
|
|
||||||
*/
|
*/
|
||||||
public void write(HoodieRecord record) {
|
public void write(HoodieRecord record) {
|
||||||
Optional recordMetadata = record.getData().getMetadata();
|
Optional recordMetadata = record.getData().getMetadata();
|
||||||
try {
|
try {
|
||||||
Optional<IndexedRecord> avroRecord = record.getData().getInsertValue(schema);
|
Optional<IndexedRecord> avroRecord = record.getData().getInsertValue(schema);
|
||||||
|
|
||||||
if(avroRecord.isPresent()) {
|
if (avroRecord.isPresent()) {
|
||||||
storageWriter.writeAvroWithMetadata(avroRecord.get(), record);
|
storageWriter.writeAvroWithMetadata(avroRecord.get(), record);
|
||||||
// update the new location of record, so we know where to find it next
|
// update the new location of record, so we know where to find it next
|
||||||
record.setNewLocation(new HoodieRecordLocation(commitTime, status.getFileId()));
|
record.setNewLocation(new HoodieRecordLocation(commitTime, status.getFileId()));
|
||||||
@@ -114,8 +111,6 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOH
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Performs actions to durably, persist the current changes and returns a WriteStatus object
|
* Performs actions to durably, persist the current changes and returns a WriteStatus object
|
||||||
*
|
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
public WriteStatus close() {
|
public WriteStatus close() {
|
||||||
logger.info(
|
logger.info(
|
||||||
|
|||||||
@@ -24,6 +24,7 @@ import com.uber.hoodie.common.util.HoodieAvroUtils;
|
|||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.exception.HoodieIOException;
|
import com.uber.hoodie.exception.HoodieIOException;
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
|
import java.io.IOException;
|
||||||
import org.apache.avro.Schema;
|
import org.apache.avro.Schema;
|
||||||
import org.apache.hadoop.fs.FileStatus;
|
import org.apache.hadoop.fs.FileStatus;
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
@@ -31,9 +32,8 @@ import org.apache.hadoop.fs.Path;
|
|||||||
import org.apache.log4j.LogManager;
|
import org.apache.log4j.LogManager;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
public abstract class HoodieIOHandle<T extends HoodieRecordPayload> {
|
public abstract class HoodieIOHandle<T extends HoodieRecordPayload> {
|
||||||
|
|
||||||
private static Logger logger = LogManager.getLogger(HoodieIOHandle.class);
|
private static Logger logger = LogManager.getLogger(HoodieIOHandle.class);
|
||||||
protected final String commitTime;
|
protected final String commitTime;
|
||||||
protected final HoodieWriteConfig config;
|
protected final HoodieWriteConfig config;
|
||||||
|
|||||||
@@ -16,19 +16,23 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.io;
|
package com.uber.hoodie.io;
|
||||||
|
|
||||||
import com.uber.hoodie.common.model.HoodiePartitionMetadata;
|
|
||||||
import com.uber.hoodie.common.util.ReflectionUtils;
|
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
|
||||||
import com.uber.hoodie.WriteStatus;
|
import com.uber.hoodie.WriteStatus;
|
||||||
|
import com.uber.hoodie.common.model.HoodiePartitionMetadata;
|
||||||
import com.uber.hoodie.common.model.HoodieRecord;
|
import com.uber.hoodie.common.model.HoodieRecord;
|
||||||
import com.uber.hoodie.common.model.HoodieRecordLocation;
|
import com.uber.hoodie.common.model.HoodieRecordLocation;
|
||||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||||
import com.uber.hoodie.common.model.HoodieWriteStat;
|
import com.uber.hoodie.common.model.HoodieWriteStat;
|
||||||
import com.uber.hoodie.common.util.FSUtils;
|
import com.uber.hoodie.common.util.FSUtils;
|
||||||
|
import com.uber.hoodie.common.util.ReflectionUtils;
|
||||||
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.exception.HoodieUpsertException;
|
import com.uber.hoodie.exception.HoodieUpsertException;
|
||||||
import com.uber.hoodie.io.storage.HoodieStorageWriter;
|
import com.uber.hoodie.io.storage.HoodieStorageWriter;
|
||||||
import com.uber.hoodie.io.storage.HoodieStorageWriterFactory;
|
import com.uber.hoodie.io.storage.HoodieStorageWriterFactory;
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.Optional;
|
||||||
import org.apache.avro.generic.GenericRecord;
|
import org.apache.avro.generic.GenericRecord;
|
||||||
import org.apache.avro.generic.IndexedRecord;
|
import org.apache.avro.generic.IndexedRecord;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
@@ -36,13 +40,9 @@ import org.apache.log4j.LogManager;
|
|||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.apache.spark.TaskContext;
|
import org.apache.spark.TaskContext;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.Optional;
|
|
||||||
|
|
||||||
@SuppressWarnings("Duplicates")
|
@SuppressWarnings("Duplicates")
|
||||||
public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHandle<T> {
|
public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHandle<T> {
|
||||||
|
|
||||||
private static Logger logger = LogManager.getLogger(HoodieMergeHandle.class);
|
private static Logger logger = LogManager.getLogger(HoodieMergeHandle.class);
|
||||||
|
|
||||||
private WriteStatus writeStatus;
|
private WriteStatus writeStatus;
|
||||||
@@ -94,7 +94,7 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
|
|||||||
oldFilePath = new Path(
|
oldFilePath = new Path(
|
||||||
config.getBasePath() + "/" + record.getPartitionPath() + "/"
|
config.getBasePath() + "/" + record.getPartitionPath() + "/"
|
||||||
+ latestValidFilePath);
|
+ latestValidFilePath);
|
||||||
String relativePath = new Path( record.getPartitionPath() + "/" + FSUtils
|
String relativePath = new Path(record.getPartitionPath() + "/" + FSUtils
|
||||||
.makeDataFileName(commitTime, TaskContext.getPartitionId(), fileId)).toString();
|
.makeDataFileName(commitTime, TaskContext.getPartitionId(), fileId)).toString();
|
||||||
newFilePath = new Path(config.getBasePath(), relativePath);
|
newFilePath = new Path(config.getBasePath(), relativePath);
|
||||||
|
|
||||||
@@ -129,10 +129,11 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private boolean writeUpdateRecord(HoodieRecord<T> hoodieRecord, Optional<IndexedRecord> indexedRecord) {
|
private boolean writeUpdateRecord(HoodieRecord<T> hoodieRecord,
|
||||||
|
Optional<IndexedRecord> indexedRecord) {
|
||||||
Optional recordMetadata = hoodieRecord.getData().getMetadata();
|
Optional recordMetadata = hoodieRecord.getData().getMetadata();
|
||||||
try {
|
try {
|
||||||
if(indexedRecord.isPresent()) {
|
if (indexedRecord.isPresent()) {
|
||||||
storageWriter.writeAvroWithMetadata(indexedRecord.get(), hoodieRecord);
|
storageWriter.writeAvroWithMetadata(indexedRecord.get(), hoodieRecord);
|
||||||
recordsWritten++;
|
recordsWritten++;
|
||||||
updatedRecordsWritten++;
|
updatedRecordsWritten++;
|
||||||
@@ -144,14 +145,15 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
|
|||||||
writeStatus.markSuccess(hoodieRecord, recordMetadata);
|
writeStatus.markSuccess(hoodieRecord, recordMetadata);
|
||||||
return true;
|
return true;
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
logger.error("Error writing record "+ hoodieRecord, e);
|
logger.error("Error writing record " + hoodieRecord, e);
|
||||||
writeStatus.markFailure(hoodieRecord, e, recordMetadata);
|
writeStatus.markFailure(hoodieRecord, e, recordMetadata);
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Go through an old record. Here if we detect a newer version shows up, we write the new one to the file.
|
* Go through an old record. Here if we detect a newer version shows up, we write the new one to
|
||||||
|
* the file.
|
||||||
*/
|
*/
|
||||||
public void write(GenericRecord oldRecord) {
|
public void write(GenericRecord oldRecord) {
|
||||||
String key = oldRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
|
String key = oldRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
|
||||||
@@ -159,7 +161,8 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
|
|||||||
boolean copyOldRecord = true;
|
boolean copyOldRecord = true;
|
||||||
if (keyToNewRecords.containsKey(key)) {
|
if (keyToNewRecords.containsKey(key)) {
|
||||||
try {
|
try {
|
||||||
Optional<IndexedRecord> combinedAvroRecord = hoodieRecord.getData().combineAndGetUpdateValue(oldRecord, schema);
|
Optional<IndexedRecord> combinedAvroRecord = hoodieRecord.getData()
|
||||||
|
.combineAndGetUpdateValue(oldRecord, schema);
|
||||||
if (writeUpdateRecord(hoodieRecord, combinedAvroRecord)) {
|
if (writeUpdateRecord(hoodieRecord, combinedAvroRecord)) {
|
||||||
/* ONLY WHEN
|
/* ONLY WHEN
|
||||||
* 1) we have an update for this key AND
|
* 1) we have an update for this key AND
|
||||||
@@ -171,7 +174,8 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
|
|||||||
}
|
}
|
||||||
keyToNewRecords.remove(key);
|
keyToNewRecords.remove(key);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
throw new HoodieUpsertException("Failed to combine/merge new record with old value in storage, for new record {"
|
throw new HoodieUpsertException(
|
||||||
|
"Failed to combine/merge new record with old value in storage, for new record {"
|
||||||
+ keyToNewRecords.get(key) + "}, old value {" + oldRecord + "}", e);
|
+ keyToNewRecords.get(key) + "}, old value {" + oldRecord + "}", e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -193,7 +197,7 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
|
|||||||
+ getOldFilePath() + " to new file " + newFilePath, e);
|
+ getOldFilePath() + " to new file " + newFilePath, e);
|
||||||
throw new HoodieUpsertException(errMsg, e);
|
throw new HoodieUpsertException(errMsg, e);
|
||||||
}
|
}
|
||||||
recordsWritten ++;
|
recordsWritten++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -18,7 +18,6 @@ package com.uber.hoodie.io.compact;
|
|||||||
|
|
||||||
import com.uber.hoodie.common.model.HoodieDataFile;
|
import com.uber.hoodie.common.model.HoodieDataFile;
|
||||||
import com.uber.hoodie.common.model.HoodieLogFile;
|
import com.uber.hoodie.common.model.HoodieLogFile;
|
||||||
|
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.io.compact.strategy.CompactionStrategy;
|
import com.uber.hoodie.io.compact.strategy.CompactionStrategy;
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
@@ -27,8 +26,8 @@ import java.util.Map;
|
|||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Encapsulates all the needed information about a compaction
|
* Encapsulates all the needed information about a compaction and make a decision whether this
|
||||||
* and make a decision whether this compaction is effective or not
|
* compaction is effective or not
|
||||||
*
|
*
|
||||||
* @see CompactionStrategy
|
* @see CompactionStrategy
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -22,18 +22,17 @@ import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
|
|||||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.Date;
|
import java.util.Date;
|
||||||
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A HoodieCompactor runs compaction on a hoodie table
|
* A HoodieCompactor runs compaction on a hoodie table
|
||||||
*/
|
*/
|
||||||
public interface HoodieCompactor extends Serializable {
|
public interface HoodieCompactor extends Serializable {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Compact the delta files with the data files
|
* Compact the delta files with the data files
|
||||||
* @throws Exception
|
|
||||||
*/
|
*/
|
||||||
HoodieCompactionMetadata compact(JavaSparkContext jsc, final HoodieWriteConfig config,
|
HoodieCompactionMetadata compact(JavaSparkContext jsc, final HoodieWriteConfig config,
|
||||||
HoodieTable hoodieTable) throws Exception;
|
HoodieTable hoodieTable) throws Exception;
|
||||||
|
|||||||
@@ -16,14 +16,14 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.io.compact;
|
package com.uber.hoodie.io.compact;
|
||||||
|
|
||||||
|
import static java.util.stream.Collectors.toList;
|
||||||
|
|
||||||
import com.google.common.base.Preconditions;
|
import com.google.common.base.Preconditions;
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
import com.uber.hoodie.WriteStatus;
|
import com.uber.hoodie.WriteStatus;
|
||||||
import com.uber.hoodie.common.model.CompactionWriteStat;
|
import com.uber.hoodie.common.model.CompactionWriteStat;
|
||||||
import com.uber.hoodie.common.model.HoodieAvroPayload;
|
|
||||||
import com.uber.hoodie.common.model.HoodieCompactionMetadata;
|
import com.uber.hoodie.common.model.HoodieCompactionMetadata;
|
||||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
|
||||||
import com.uber.hoodie.common.model.HoodieTableType;
|
import com.uber.hoodie.common.model.HoodieTableType;
|
||||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||||
@@ -36,7 +36,12 @@ import com.uber.hoodie.config.HoodieWriteConfig;
|
|||||||
import com.uber.hoodie.exception.HoodieCompactionException;
|
import com.uber.hoodie.exception.HoodieCompactionException;
|
||||||
import com.uber.hoodie.table.HoodieCopyOnWriteTable;
|
import com.uber.hoodie.table.HoodieCopyOnWriteTable;
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Optional;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.StreamSupport;
|
import java.util.stream.StreamSupport;
|
||||||
import org.apache.avro.Schema;
|
import org.apache.avro.Schema;
|
||||||
@@ -46,18 +51,10 @@ import org.apache.log4j.Logger;
|
|||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
import org.apache.spark.api.java.function.FlatMapFunction;
|
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.charset.StandardCharsets;
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Optional;
|
|
||||||
|
|
||||||
import static java.util.stream.Collectors.*;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* HoodieRealtimeTableCompactor compacts a hoodie table with merge on read storage.
|
* HoodieRealtimeTableCompactor compacts a hoodie table with merge on read storage. Computes all
|
||||||
* Computes all possible compactions, passes it through a CompactionFilter and executes
|
* possible compactions, passes it through a CompactionFilter and executes all the compactions and
|
||||||
* all the compactions and writes a new version of base files and make a normal commit
|
* writes a new version of base files and make a normal commit
|
||||||
*
|
*
|
||||||
* @see HoodieCompactor
|
* @see HoodieCompactor
|
||||||
*/
|
*/
|
||||||
@@ -80,7 +77,8 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
|
|||||||
String compactionCommit = startCompactionCommit(hoodieTable);
|
String compactionCommit = startCompactionCommit(hoodieTable);
|
||||||
log.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommit);
|
log.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommit);
|
||||||
List<String> partitionPaths =
|
List<String> partitionPaths =
|
||||||
FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(), config.shouldAssumeDatePartitioning());
|
FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(),
|
||||||
|
config.shouldAssumeDatePartitioning());
|
||||||
|
|
||||||
log.info("Compaction looking for files to compact in " + partitionPaths + " partitions");
|
log.info("Compaction looking for files to compact in " + partitionPaths + " partitions");
|
||||||
List<CompactionOperation> operations =
|
List<CompactionOperation> operations =
|
||||||
@@ -156,7 +154,8 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
|
|||||||
HoodieTimeline.DELTA_COMMIT_ACTION))
|
HoodieTimeline.DELTA_COMMIT_ACTION))
|
||||||
.filterCompletedInstants().lastInstant().get().getTimestamp();
|
.filterCompletedInstants().lastInstant().get().getTimestamp();
|
||||||
|
|
||||||
HoodieCompactedLogRecordScanner scanner = new HoodieCompactedLogRecordScanner(fs, metaClient.getBasePath(),
|
HoodieCompactedLogRecordScanner scanner = new HoodieCompactedLogRecordScanner(fs,
|
||||||
|
metaClient.getBasePath(),
|
||||||
operation.getDeltaFilePaths(), readerSchema, maxInstantTime);
|
operation.getDeltaFilePaths(), readerSchema, maxInstantTime);
|
||||||
if (!scanner.iterator().hasNext()) {
|
if (!scanner.iterator().hasNext()) {
|
||||||
return Lists.newArrayList();
|
return Lists.newArrayList();
|
||||||
|
|||||||
@@ -28,8 +28,8 @@ import java.util.Map;
|
|||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* CompactionStrategy which looks at total IO to be done for the compaction (read + write)
|
* CompactionStrategy which looks at total IO to be done for the compaction (read + write) and
|
||||||
* and limits the list of compactions to be under a configured limit on the IO
|
* limits the list of compactions to be under a configured limit on the IO
|
||||||
*
|
*
|
||||||
* @see CompactionStrategy
|
* @see CompactionStrategy
|
||||||
*/
|
*/
|
||||||
@@ -62,7 +62,8 @@ public class BoundedIOCompactionStrategy implements CompactionStrategy {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<CompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig, List<CompactionOperation> operations) {
|
public List<CompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig,
|
||||||
|
List<CompactionOperation> operations) {
|
||||||
// Iterate through the operations in order and accept operations as long as we are within the IO limit
|
// Iterate through the operations in order and accept operations as long as we are within the IO limit
|
||||||
// Preserves the original ordering of compactions
|
// Preserves the original ordering of compactions
|
||||||
List<CompactionOperation> finalOperations = Lists.newArrayList();
|
List<CompactionOperation> finalOperations = Lists.newArrayList();
|
||||||
|
|||||||
@@ -25,12 +25,12 @@ import java.util.List;
|
|||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Strategy for compaction. Pluggable implementation of define how compaction should be done.
|
* Strategy for compaction. Pluggable implementation of define how compaction should be done. The
|
||||||
* The implementations of this interface can capture the relevant metrics to order and filter
|
* implementations of this interface can capture the relevant metrics to order and filter the final
|
||||||
* the final list of compaction operation to run in a single compaction.
|
* list of compaction operation to run in a single compaction.
|
||||||
*
|
*
|
||||||
* Implementation of CompactionStrategy cannot hold any state.
|
* Implementation of CompactionStrategy cannot hold any state. Difference instantiations can be
|
||||||
* Difference instantiations can be passed in every time
|
* passed in every time
|
||||||
*
|
*
|
||||||
* @see com.uber.hoodie.io.compact.HoodieRealtimeTableCompactor
|
* @see com.uber.hoodie.io.compact.HoodieRealtimeTableCompactor
|
||||||
* @see CompactionOperation
|
* @see CompactionOperation
|
||||||
@@ -38,8 +38,8 @@ import java.util.Map;
|
|||||||
public interface CompactionStrategy extends Serializable {
|
public interface CompactionStrategy extends Serializable {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Callback hook when a CompactionOperation is created. Individual strategies can
|
* Callback hook when a CompactionOperation is created. Individual strategies can capture the
|
||||||
* capture the metrics they need to decide on the priority.
|
* metrics they need to decide on the priority.
|
||||||
*
|
*
|
||||||
* @param dataFile - Base file to compact
|
* @param dataFile - Base file to compact
|
||||||
* @param partitionPath - Partition path
|
* @param partitionPath - Partition path
|
||||||
@@ -50,8 +50,8 @@ public interface CompactionStrategy extends Serializable {
|
|||||||
List<HoodieLogFile> logFiles);
|
List<HoodieLogFile> logFiles);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Order and Filter the list of compactions. Use the metrics captured with the
|
* Order and Filter the list of compactions. Use the metrics captured with the captureMetrics to
|
||||||
* captureMetrics to order and filter out compactions
|
* order and filter out compactions
|
||||||
*
|
*
|
||||||
* @param writeConfig - HoodieWriteConfig - config for this compaction is passed in
|
* @param writeConfig - HoodieWriteConfig - config for this compaction is passed in
|
||||||
* @param operations - list of compactions collected
|
* @param operations - list of compactions collected
|
||||||
|
|||||||
@@ -27,8 +27,8 @@ import java.util.Optional;
|
|||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* LogFileSizeBasedCompactionStrategy orders the compactions based on the total log files size
|
* LogFileSizeBasedCompactionStrategy orders the compactions based on the total log files size and
|
||||||
* and limits the compactions within a configured IO bound
|
* limits the compactions within a configured IO bound
|
||||||
*
|
*
|
||||||
* @see BoundedIOCompactionStrategy
|
* @see BoundedIOCompactionStrategy
|
||||||
* @see CompactionStrategy
|
* @see CompactionStrategy
|
||||||
|
|||||||
@@ -25,9 +25,9 @@ import java.util.List;
|
|||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* UnBoundedCompactionStrategy will not change ordering or filter any compaction.
|
* UnBoundedCompactionStrategy will not change ordering or filter any compaction. It is a
|
||||||
* It is a pass-through and will compact all the base files which has a log file.
|
* pass-through and will compact all the base files which has a log file. This usually means
|
||||||
* This usually means no-intelligence on compaction.
|
* no-intelligence on compaction.
|
||||||
*
|
*
|
||||||
* @see CompactionStrategy
|
* @see CompactionStrategy
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -17,11 +17,11 @@
|
|||||||
package com.uber.hoodie.io.storage;
|
package com.uber.hoodie.io.storage;
|
||||||
|
|
||||||
import com.uber.hoodie.avro.HoodieAvroWriteSupport;
|
import com.uber.hoodie.avro.HoodieAvroWriteSupport;
|
||||||
import org.apache.avro.Schema;
|
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
|
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
|
||||||
|
|
||||||
public class HoodieParquetConfig {
|
public class HoodieParquetConfig {
|
||||||
|
|
||||||
private HoodieAvroWriteSupport writeSupport;
|
private HoodieAvroWriteSupport writeSupport;
|
||||||
private CompressionCodecName compressionCodecName;
|
private CompressionCodecName compressionCodecName;
|
||||||
private int blockSize;
|
private int blockSize;
|
||||||
|
|||||||
@@ -20,6 +20,8 @@ import com.uber.hoodie.avro.HoodieAvroWriteSupport;
|
|||||||
import com.uber.hoodie.common.model.HoodieRecord;
|
import com.uber.hoodie.common.model.HoodieRecord;
|
||||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||||
import com.uber.hoodie.common.util.HoodieAvroUtils;
|
import com.uber.hoodie.common.util.HoodieAvroUtils;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.concurrent.atomic.AtomicLong;
|
||||||
import org.apache.avro.Schema;
|
import org.apache.avro.Schema;
|
||||||
import org.apache.avro.generic.GenericRecord;
|
import org.apache.avro.generic.GenericRecord;
|
||||||
import org.apache.avro.generic.IndexedRecord;
|
import org.apache.avro.generic.IndexedRecord;
|
||||||
@@ -30,17 +32,13 @@ import org.apache.parquet.hadoop.ParquetFileWriter;
|
|||||||
import org.apache.parquet.hadoop.ParquetWriter;
|
import org.apache.parquet.hadoop.ParquetWriter;
|
||||||
import org.apache.spark.TaskContext;
|
import org.apache.spark.TaskContext;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.concurrent.atomic.AtomicLong;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* HoodieParquetWriter extends the ParquetWriter to help limit the size of underlying file.
|
* HoodieParquetWriter extends the ParquetWriter to help limit the size of underlying file. Provides
|
||||||
* Provides a way to check if the current file can take more records with the <code>canWrite()</code>
|
* a way to check if the current file can take more records with the <code>canWrite()</code>
|
||||||
*
|
|
||||||
* @param <T>
|
|
||||||
*/
|
*/
|
||||||
public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends IndexedRecord>
|
public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends IndexedRecord>
|
||||||
extends ParquetWriter<IndexedRecord> implements HoodieStorageWriter<R> {
|
extends ParquetWriter<IndexedRecord> implements HoodieStorageWriter<R> {
|
||||||
|
|
||||||
private static double STREAM_COMPRESSION_RATIO = 0.1;
|
private static double STREAM_COMPRESSION_RATIO = 0.1;
|
||||||
private static AtomicLong recordIndex = new AtomicLong(1);
|
private static AtomicLong recordIndex = new AtomicLong(1);
|
||||||
|
|
||||||
@@ -101,7 +99,8 @@ public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends Indexe
|
|||||||
return fs.getBytesWritten(file) < maxFileSize;
|
return fs.getBytesWritten(file) < maxFileSize;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public void writeAvro(String key, IndexedRecord object) throws IOException {
|
@Override
|
||||||
|
public void writeAvro(String key, IndexedRecord object) throws IOException {
|
||||||
super.write(object);
|
super.write(object);
|
||||||
writeSupport.add(key);
|
writeSupport.add(key);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,13 +17,16 @@
|
|||||||
package com.uber.hoodie.io.storage;
|
package com.uber.hoodie.io.storage;
|
||||||
|
|
||||||
import com.uber.hoodie.common.model.HoodieRecord;
|
import com.uber.hoodie.common.model.HoodieRecord;
|
||||||
|
import java.io.IOException;
|
||||||
import org.apache.avro.generic.IndexedRecord;
|
import org.apache.avro.generic.IndexedRecord;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
public interface HoodieStorageWriter<R extends IndexedRecord> {
|
public interface HoodieStorageWriter<R extends IndexedRecord> {
|
||||||
|
|
||||||
void writeAvroWithMetadata(R newRecord, HoodieRecord record) throws IOException;
|
void writeAvroWithMetadata(R newRecord, HoodieRecord record) throws IOException;
|
||||||
|
|
||||||
boolean canWrite();
|
boolean canWrite();
|
||||||
|
|
||||||
void close() throws IOException;
|
void close() throws IOException;
|
||||||
|
|
||||||
void writeAvro(String key, R oldRecord) throws IOException;
|
void writeAvro(String key, R oldRecord) throws IOException;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -16,24 +16,24 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.io.storage;
|
package com.uber.hoodie.io.storage;
|
||||||
|
|
||||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
|
||||||
import com.uber.hoodie.avro.HoodieAvroWriteSupport;
|
import com.uber.hoodie.avro.HoodieAvroWriteSupport;
|
||||||
import com.uber.hoodie.common.BloomFilter;
|
import com.uber.hoodie.common.BloomFilter;
|
||||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||||
import com.uber.hoodie.common.util.FSUtils;
|
import com.uber.hoodie.common.util.FSUtils;
|
||||||
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
|
import java.io.IOException;
|
||||||
import org.apache.avro.Schema;
|
import org.apache.avro.Schema;
|
||||||
import org.apache.avro.generic.IndexedRecord;
|
import org.apache.avro.generic.IndexedRecord;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
import org.apache.parquet.avro.AvroSchemaConverter;
|
import org.apache.parquet.avro.AvroSchemaConverter;
|
||||||
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
|
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
public class HoodieStorageWriterFactory {
|
public class HoodieStorageWriterFactory {
|
||||||
|
|
||||||
public static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieStorageWriter<R> getStorageWriter(
|
public static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieStorageWriter<R> getStorageWriter(
|
||||||
String commitTime, Path path, HoodieTable<T> hoodieTable, HoodieWriteConfig config, Schema schema)
|
String commitTime, Path path, HoodieTable<T> hoodieTable, HoodieWriteConfig config,
|
||||||
|
Schema schema)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
//TODO - based on the metadata choose the implementation of HoodieStorageWriter
|
//TODO - based on the metadata choose the implementation of HoodieStorageWriter
|
||||||
// Currently only parquet is supported
|
// Currently only parquet is supported
|
||||||
|
|||||||
@@ -16,17 +16,6 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.io.storage;
|
package com.uber.hoodie.io.storage;
|
||||||
|
|
||||||
import org.apache.hadoop.conf.Configuration;
|
|
||||||
import org.apache.hadoop.fs.*;
|
|
||||||
import org.apache.hadoop.fs.permission.AclEntry;
|
|
||||||
import org.apache.hadoop.fs.permission.AclStatus;
|
|
||||||
import org.apache.hadoop.fs.permission.FsAction;
|
|
||||||
import org.apache.hadoop.fs.permission.FsPermission;
|
|
||||||
import org.apache.hadoop.security.AccessControlException;
|
|
||||||
import org.apache.hadoop.security.Credentials;
|
|
||||||
import org.apache.hadoop.security.token.Token;
|
|
||||||
import org.apache.hadoop.util.Progressable;
|
|
||||||
|
|
||||||
import java.io.FileNotFoundException;
|
import java.io.FileNotFoundException;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.net.URI;
|
import java.net.URI;
|
||||||
@@ -38,13 +27,41 @@ import java.util.Map;
|
|||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
import java.util.concurrent.ConcurrentMap;
|
import java.util.concurrent.ConcurrentMap;
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.fs.BlockLocation;
|
||||||
|
import org.apache.hadoop.fs.ContentSummary;
|
||||||
|
import org.apache.hadoop.fs.CreateFlag;
|
||||||
|
import org.apache.hadoop.fs.FSDataInputStream;
|
||||||
|
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||||
|
import org.apache.hadoop.fs.FileAlreadyExistsException;
|
||||||
|
import org.apache.hadoop.fs.FileChecksum;
|
||||||
|
import org.apache.hadoop.fs.FileStatus;
|
||||||
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
import org.apache.hadoop.fs.FsServerDefaults;
|
||||||
|
import org.apache.hadoop.fs.FsStatus;
|
||||||
|
import org.apache.hadoop.fs.LocatedFileStatus;
|
||||||
|
import org.apache.hadoop.fs.Options;
|
||||||
|
import org.apache.hadoop.fs.ParentNotDirectoryException;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.apache.hadoop.fs.PathFilter;
|
||||||
|
import org.apache.hadoop.fs.RemoteIterator;
|
||||||
|
import org.apache.hadoop.fs.UnsupportedFileSystemException;
|
||||||
|
import org.apache.hadoop.fs.XAttrSetFlag;
|
||||||
|
import org.apache.hadoop.fs.permission.AclEntry;
|
||||||
|
import org.apache.hadoop.fs.permission.AclStatus;
|
||||||
|
import org.apache.hadoop.fs.permission.FsAction;
|
||||||
|
import org.apache.hadoop.fs.permission.FsPermission;
|
||||||
|
import org.apache.hadoop.security.AccessControlException;
|
||||||
|
import org.apache.hadoop.security.Credentials;
|
||||||
|
import org.apache.hadoop.security.token.Token;
|
||||||
|
import org.apache.hadoop.util.Progressable;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* HoodieWrapperFileSystem wraps the default file system.
|
* HoodieWrapperFileSystem wraps the default file system. It holds state about the open streams in
|
||||||
* It holds state about the open streams in the file system to support getting the
|
* the file system to support getting the written size to each of the open streams.
|
||||||
* written size to each of the open streams.
|
|
||||||
*/
|
*/
|
||||||
public class HoodieWrapperFileSystem extends FileSystem {
|
public class HoodieWrapperFileSystem extends FileSystem {
|
||||||
|
|
||||||
private static final Set<String> SUPPORT_SCHEMES;
|
private static final Set<String> SUPPORT_SCHEMES;
|
||||||
public static final String HOODIE_SCHEME_PREFIX = "hoodie-";
|
public static final String HOODIE_SCHEME_PREFIX = "hoodie-";
|
||||||
|
|
||||||
@@ -65,7 +82,8 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
|||||||
private FileSystem fileSystem;
|
private FileSystem fileSystem;
|
||||||
private URI uri;
|
private URI uri;
|
||||||
|
|
||||||
@Override public void initialize(URI uri, Configuration conf) throws IOException {
|
@Override
|
||||||
|
public void initialize(URI uri, Configuration conf) throws IOException {
|
||||||
// Get the default filesystem to decorate
|
// Get the default filesystem to decorate
|
||||||
fileSystem = FileSystem.get(conf);
|
fileSystem = FileSystem.get(conf);
|
||||||
// Do not need to explicitly initialize the default filesystem, its done already in the above FileSystem.get
|
// Do not need to explicitly initialize the default filesystem, its done already in the above FileSystem.get
|
||||||
@@ -74,15 +92,18 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
|||||||
this.uri = uri;
|
this.uri = uri;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public URI getUri() {
|
@Override
|
||||||
|
public URI getUri() {
|
||||||
return uri;
|
return uri;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public FSDataInputStream open(Path f, int bufferSize) throws IOException {
|
@Override
|
||||||
|
public FSDataInputStream open(Path f, int bufferSize) throws IOException {
|
||||||
return fileSystem.open(convertToDefaultPath(f), bufferSize);
|
return fileSystem.open(convertToDefaultPath(f), bufferSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public FSDataOutputStream create(Path f, FsPermission permission, boolean overwrite,
|
@Override
|
||||||
|
public FSDataOutputStream create(Path f, FsPermission permission, boolean overwrite,
|
||||||
int bufferSize, short replication, long blockSize, Progressable progress)
|
int bufferSize, short replication, long blockSize, Progressable progress)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
final Path translatedPath = convertToDefaultPath(f);
|
final Path translatedPath = convertToDefaultPath(f);
|
||||||
@@ -99,7 +120,8 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
|||||||
|
|
||||||
SizeAwareFSDataOutputStream os =
|
SizeAwareFSDataOutputStream os =
|
||||||
new SizeAwareFSDataOutputStream(fsDataOutputStream, new Runnable() {
|
new SizeAwareFSDataOutputStream(fsDataOutputStream, new Runnable() {
|
||||||
@Override public void run() {
|
@Override
|
||||||
|
public void run() {
|
||||||
openStreams.remove(path.getName());
|
openStreams.remove(path.getName());
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@@ -107,33 +129,40 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
|||||||
return os;
|
return os;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public FSDataOutputStream create(Path f, boolean overwrite) throws IOException {
|
@Override
|
||||||
|
public FSDataOutputStream create(Path f, boolean overwrite) throws IOException {
|
||||||
return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f), overwrite));
|
return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f), overwrite));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public FSDataOutputStream create(Path f) throws IOException {
|
@Override
|
||||||
|
public FSDataOutputStream create(Path f) throws IOException {
|
||||||
return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f)));
|
return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f)));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public FSDataOutputStream create(Path f, Progressable progress) throws IOException {
|
@Override
|
||||||
|
public FSDataOutputStream create(Path f, Progressable progress) throws IOException {
|
||||||
return fileSystem.create(convertToDefaultPath(f), progress);
|
return fileSystem.create(convertToDefaultPath(f), progress);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public FSDataOutputStream create(Path f, short replication) throws IOException {
|
@Override
|
||||||
|
public FSDataOutputStream create(Path f, short replication) throws IOException {
|
||||||
return fileSystem.create(convertToDefaultPath(f), replication);
|
return fileSystem.create(convertToDefaultPath(f), replication);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public FSDataOutputStream create(Path f, short replication, Progressable progress)
|
@Override
|
||||||
|
public FSDataOutputStream create(Path f, short replication, Progressable progress)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
return fileSystem.create(convertToDefaultPath(f), replication, progress);
|
return fileSystem.create(convertToDefaultPath(f), replication, progress);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize)
|
@Override
|
||||||
|
public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
return fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize);
|
return fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize,
|
@Override
|
||||||
|
public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize,
|
||||||
Progressable progress) throws IOException {
|
Progressable progress) throws IOException {
|
||||||
return fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize, progress);
|
return fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize, progress);
|
||||||
}
|
}
|
||||||
@@ -173,91 +202,112 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override public FSDataOutputStream append(Path f, int bufferSize, Progressable progress)
|
@Override
|
||||||
|
public FSDataOutputStream append(Path f, int bufferSize, Progressable progress)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
return fileSystem.append(convertToDefaultPath(f), bufferSize, progress);
|
return fileSystem.append(convertToDefaultPath(f), bufferSize, progress);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public boolean rename(Path src, Path dst) throws IOException {
|
@Override
|
||||||
|
public boolean rename(Path src, Path dst) throws IOException {
|
||||||
return fileSystem.rename(convertToDefaultPath(src), convertToDefaultPath(dst));
|
return fileSystem.rename(convertToDefaultPath(src), convertToDefaultPath(dst));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public boolean delete(Path f, boolean recursive) throws IOException {
|
@Override
|
||||||
|
public boolean delete(Path f, boolean recursive) throws IOException {
|
||||||
return fileSystem.delete(convertToDefaultPath(f), recursive);
|
return fileSystem.delete(convertToDefaultPath(f), recursive);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public FileStatus[] listStatus(Path f) throws FileNotFoundException, IOException {
|
@Override
|
||||||
|
public FileStatus[] listStatus(Path f) throws FileNotFoundException, IOException {
|
||||||
return fileSystem.listStatus(convertToDefaultPath(f));
|
return fileSystem.listStatus(convertToDefaultPath(f));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public void setWorkingDirectory(Path new_dir) {
|
@Override
|
||||||
|
public void setWorkingDirectory(Path new_dir) {
|
||||||
fileSystem.setWorkingDirectory(convertToDefaultPath(new_dir));
|
fileSystem.setWorkingDirectory(convertToDefaultPath(new_dir));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public Path getWorkingDirectory() {
|
@Override
|
||||||
|
public Path getWorkingDirectory() {
|
||||||
return convertToHoodiePath(fileSystem.getWorkingDirectory());
|
return convertToHoodiePath(fileSystem.getWorkingDirectory());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public boolean mkdirs(Path f, FsPermission permission) throws IOException {
|
@Override
|
||||||
|
public boolean mkdirs(Path f, FsPermission permission) throws IOException {
|
||||||
return fileSystem.mkdirs(convertToDefaultPath(f), permission);
|
return fileSystem.mkdirs(convertToDefaultPath(f), permission);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public FileStatus getFileStatus(Path f) throws IOException {
|
@Override
|
||||||
|
public FileStatus getFileStatus(Path f) throws IOException {
|
||||||
return fileSystem.getFileStatus(convertToDefaultPath(f));
|
return fileSystem.getFileStatus(convertToDefaultPath(f));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public String getScheme() {
|
@Override
|
||||||
|
public String getScheme() {
|
||||||
return uri.getScheme();
|
return uri.getScheme();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public String getCanonicalServiceName() {
|
@Override
|
||||||
|
public String getCanonicalServiceName() {
|
||||||
return fileSystem.getCanonicalServiceName();
|
return fileSystem.getCanonicalServiceName();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public String getName() {
|
@Override
|
||||||
|
public String getName() {
|
||||||
return fileSystem.getName();
|
return fileSystem.getName();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public Path makeQualified(Path path) {
|
@Override
|
||||||
|
public Path makeQualified(Path path) {
|
||||||
return convertToHoodiePath(fileSystem.makeQualified(convertToDefaultPath(path)));
|
return convertToHoodiePath(fileSystem.makeQualified(convertToDefaultPath(path)));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public Token<?> getDelegationToken(String renewer) throws IOException {
|
@Override
|
||||||
|
public Token<?> getDelegationToken(String renewer) throws IOException {
|
||||||
return fileSystem.getDelegationToken(renewer);
|
return fileSystem.getDelegationToken(renewer);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public Token<?>[] addDelegationTokens(String renewer, Credentials credentials)
|
@Override
|
||||||
|
public Token<?>[] addDelegationTokens(String renewer, Credentials credentials)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
return fileSystem.addDelegationTokens(renewer, credentials);
|
return fileSystem.addDelegationTokens(renewer, credentials);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public FileSystem[] getChildFileSystems() {
|
@Override
|
||||||
|
public FileSystem[] getChildFileSystems() {
|
||||||
return fileSystem.getChildFileSystems();
|
return fileSystem.getChildFileSystems();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public BlockLocation[] getFileBlockLocations(FileStatus file, long start, long len)
|
@Override
|
||||||
|
public BlockLocation[] getFileBlockLocations(FileStatus file, long start, long len)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
return fileSystem.getFileBlockLocations(file, start, len);
|
return fileSystem.getFileBlockLocations(file, start, len);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public BlockLocation[] getFileBlockLocations(Path p, long start, long len)
|
@Override
|
||||||
|
public BlockLocation[] getFileBlockLocations(Path p, long start, long len)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
return fileSystem.getFileBlockLocations(convertToDefaultPath(p), start, len);
|
return fileSystem.getFileBlockLocations(convertToDefaultPath(p), start, len);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public FsServerDefaults getServerDefaults() throws IOException {
|
@Override
|
||||||
|
public FsServerDefaults getServerDefaults() throws IOException {
|
||||||
return fileSystem.getServerDefaults();
|
return fileSystem.getServerDefaults();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public FsServerDefaults getServerDefaults(Path p) throws IOException {
|
@Override
|
||||||
|
public FsServerDefaults getServerDefaults(Path p) throws IOException {
|
||||||
return fileSystem.getServerDefaults(convertToDefaultPath(p));
|
return fileSystem.getServerDefaults(convertToDefaultPath(p));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public Path resolvePath(Path p) throws IOException {
|
@Override
|
||||||
|
public Path resolvePath(Path p) throws IOException {
|
||||||
return convertToHoodiePath(fileSystem.resolvePath(convertToDefaultPath(p)));
|
return convertToHoodiePath(fileSystem.resolvePath(convertToDefaultPath(p)));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public FSDataInputStream open(Path f) throws IOException {
|
@Override
|
||||||
|
public FSDataInputStream open(Path f) throws IOException {
|
||||||
return fileSystem.open(convertToDefaultPath(f));
|
return fileSystem.open(convertToDefaultPath(f));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -278,7 +328,8 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
|||||||
replication, blockSize, progress);
|
replication, blockSize, progress);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public FSDataOutputStream createNonRecursive(Path f, FsPermission permission,
|
@Override
|
||||||
|
public FSDataOutputStream createNonRecursive(Path f, FsPermission permission,
|
||||||
EnumSet<CreateFlag> flags, int bufferSize, short replication, long blockSize,
|
EnumSet<CreateFlag> flags, int bufferSize, short replication, long blockSize,
|
||||||
Progressable progress) throws IOException {
|
Progressable progress) throws IOException {
|
||||||
return fileSystem
|
return fileSystem
|
||||||
@@ -286,122 +337,150 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
|||||||
blockSize, progress);
|
blockSize, progress);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public boolean createNewFile(Path f) throws IOException {
|
@Override
|
||||||
|
public boolean createNewFile(Path f) throws IOException {
|
||||||
return fileSystem.createNewFile(convertToDefaultPath(f));
|
return fileSystem.createNewFile(convertToDefaultPath(f));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public FSDataOutputStream append(Path f) throws IOException {
|
@Override
|
||||||
|
public FSDataOutputStream append(Path f) throws IOException {
|
||||||
return fileSystem.append(convertToDefaultPath(f));
|
return fileSystem.append(convertToDefaultPath(f));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public FSDataOutputStream append(Path f, int bufferSize) throws IOException {
|
@Override
|
||||||
|
public FSDataOutputStream append(Path f, int bufferSize) throws IOException {
|
||||||
return fileSystem.append(convertToDefaultPath(f), bufferSize);
|
return fileSystem.append(convertToDefaultPath(f), bufferSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public void concat(Path trg, Path[] psrcs) throws IOException {
|
@Override
|
||||||
|
public void concat(Path trg, Path[] psrcs) throws IOException {
|
||||||
Path[] psrcsNew = convertDefaults(psrcs);
|
Path[] psrcsNew = convertDefaults(psrcs);
|
||||||
fileSystem.concat(convertToDefaultPath(trg), psrcsNew);
|
fileSystem.concat(convertToDefaultPath(trg), psrcsNew);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public short getReplication(Path src) throws IOException {
|
@Override
|
||||||
|
public short getReplication(Path src) throws IOException {
|
||||||
return fileSystem.getReplication(convertToDefaultPath(src));
|
return fileSystem.getReplication(convertToDefaultPath(src));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public boolean setReplication(Path src, short replication) throws IOException {
|
@Override
|
||||||
|
public boolean setReplication(Path src, short replication) throws IOException {
|
||||||
return fileSystem.setReplication(convertToDefaultPath(src), replication);
|
return fileSystem.setReplication(convertToDefaultPath(src), replication);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public boolean delete(Path f) throws IOException {
|
@Override
|
||||||
|
public boolean delete(Path f) throws IOException {
|
||||||
return fileSystem.delete(convertToDefaultPath(f));
|
return fileSystem.delete(convertToDefaultPath(f));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public boolean deleteOnExit(Path f) throws IOException {
|
@Override
|
||||||
|
public boolean deleteOnExit(Path f) throws IOException {
|
||||||
return fileSystem.deleteOnExit(convertToDefaultPath(f));
|
return fileSystem.deleteOnExit(convertToDefaultPath(f));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public boolean cancelDeleteOnExit(Path f) {
|
@Override
|
||||||
|
public boolean cancelDeleteOnExit(Path f) {
|
||||||
return fileSystem.cancelDeleteOnExit(convertToDefaultPath(f));
|
return fileSystem.cancelDeleteOnExit(convertToDefaultPath(f));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public boolean exists(Path f) throws IOException {
|
@Override
|
||||||
|
public boolean exists(Path f) throws IOException {
|
||||||
return fileSystem.exists(convertToDefaultPath(f));
|
return fileSystem.exists(convertToDefaultPath(f));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public boolean isDirectory(Path f) throws IOException {
|
@Override
|
||||||
|
public boolean isDirectory(Path f) throws IOException {
|
||||||
return fileSystem.isDirectory(convertToDefaultPath(f));
|
return fileSystem.isDirectory(convertToDefaultPath(f));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public boolean isFile(Path f) throws IOException {
|
@Override
|
||||||
|
public boolean isFile(Path f) throws IOException {
|
||||||
return fileSystem.isFile(convertToDefaultPath(f));
|
return fileSystem.isFile(convertToDefaultPath(f));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public long getLength(Path f) throws IOException {
|
@Override
|
||||||
|
public long getLength(Path f) throws IOException {
|
||||||
return fileSystem.getLength(convertToDefaultPath(f));
|
return fileSystem.getLength(convertToDefaultPath(f));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public ContentSummary getContentSummary(Path f) throws IOException {
|
@Override
|
||||||
|
public ContentSummary getContentSummary(Path f) throws IOException {
|
||||||
return fileSystem.getContentSummary(convertToDefaultPath(f));
|
return fileSystem.getContentSummary(convertToDefaultPath(f));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public RemoteIterator<Path> listCorruptFileBlocks(Path path) throws IOException {
|
@Override
|
||||||
|
public RemoteIterator<Path> listCorruptFileBlocks(Path path) throws IOException {
|
||||||
return fileSystem.listCorruptFileBlocks(convertToDefaultPath(path));
|
return fileSystem.listCorruptFileBlocks(convertToDefaultPath(path));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public FileStatus[] listStatus(Path f, PathFilter filter)
|
@Override
|
||||||
|
public FileStatus[] listStatus(Path f, PathFilter filter)
|
||||||
throws FileNotFoundException, IOException {
|
throws FileNotFoundException, IOException {
|
||||||
return fileSystem.listStatus(convertToDefaultPath(f), filter);
|
return fileSystem.listStatus(convertToDefaultPath(f), filter);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public FileStatus[] listStatus(Path[] files)
|
@Override
|
||||||
|
public FileStatus[] listStatus(Path[] files)
|
||||||
throws FileNotFoundException, IOException {
|
throws FileNotFoundException, IOException {
|
||||||
return fileSystem.listStatus(convertDefaults(files));
|
return fileSystem.listStatus(convertDefaults(files));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public FileStatus[] listStatus(Path[] files, PathFilter filter)
|
@Override
|
||||||
|
public FileStatus[] listStatus(Path[] files, PathFilter filter)
|
||||||
throws FileNotFoundException, IOException {
|
throws FileNotFoundException, IOException {
|
||||||
return fileSystem.listStatus(convertDefaults(files), filter);
|
return fileSystem.listStatus(convertDefaults(files), filter);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public FileStatus[] globStatus(Path pathPattern) throws IOException {
|
@Override
|
||||||
|
public FileStatus[] globStatus(Path pathPattern) throws IOException {
|
||||||
return fileSystem.globStatus(convertToDefaultPath(pathPattern));
|
return fileSystem.globStatus(convertToDefaultPath(pathPattern));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public FileStatus[] globStatus(Path pathPattern, PathFilter filter)
|
@Override
|
||||||
|
public FileStatus[] globStatus(Path pathPattern, PathFilter filter)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
return fileSystem.globStatus(convertToDefaultPath(pathPattern), filter);
|
return fileSystem.globStatus(convertToDefaultPath(pathPattern), filter);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public RemoteIterator<LocatedFileStatus> listLocatedStatus(Path f)
|
@Override
|
||||||
|
public RemoteIterator<LocatedFileStatus> listLocatedStatus(Path f)
|
||||||
throws FileNotFoundException, IOException {
|
throws FileNotFoundException, IOException {
|
||||||
return fileSystem.listLocatedStatus(convertToDefaultPath(f));
|
return fileSystem.listLocatedStatus(convertToDefaultPath(f));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public RemoteIterator<LocatedFileStatus> listFiles(Path f, boolean recursive)
|
@Override
|
||||||
|
public RemoteIterator<LocatedFileStatus> listFiles(Path f, boolean recursive)
|
||||||
throws FileNotFoundException, IOException {
|
throws FileNotFoundException, IOException {
|
||||||
return fileSystem.listFiles(convertToDefaultPath(f), recursive);
|
return fileSystem.listFiles(convertToDefaultPath(f), recursive);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public Path getHomeDirectory() {
|
@Override
|
||||||
|
public Path getHomeDirectory() {
|
||||||
return convertToHoodiePath(fileSystem.getHomeDirectory());
|
return convertToHoodiePath(fileSystem.getHomeDirectory());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public boolean mkdirs(Path f) throws IOException {
|
@Override
|
||||||
|
public boolean mkdirs(Path f) throws IOException {
|
||||||
return fileSystem.mkdirs(convertToDefaultPath(f));
|
return fileSystem.mkdirs(convertToDefaultPath(f));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public void copyFromLocalFile(Path src, Path dst) throws IOException {
|
@Override
|
||||||
|
public void copyFromLocalFile(Path src, Path dst) throws IOException {
|
||||||
fileSystem.copyFromLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst));
|
fileSystem.copyFromLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public void moveFromLocalFile(Path[] srcs, Path dst) throws IOException {
|
@Override
|
||||||
|
public void moveFromLocalFile(Path[] srcs, Path dst) throws IOException {
|
||||||
fileSystem.moveFromLocalFile(convertDefaults(srcs), convertToDefaultPath(dst));
|
fileSystem.moveFromLocalFile(convertDefaults(srcs), convertToDefaultPath(dst));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public void moveFromLocalFile(Path src, Path dst) throws IOException {
|
@Override
|
||||||
|
public void moveFromLocalFile(Path src, Path dst) throws IOException {
|
||||||
fileSystem.moveFromLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst));
|
fileSystem.moveFromLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws IOException {
|
@Override
|
||||||
|
public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws IOException {
|
||||||
fileSystem.copyFromLocalFile(delSrc, convertToDefaultPath(src), convertToDefaultPath(dst));
|
fileSystem.copyFromLocalFile(delSrc, convertToDefaultPath(src), convertToDefaultPath(dst));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -412,21 +491,25 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
|||||||
.copyFromLocalFile(delSrc, overwrite, convertDefaults(srcs), convertToDefaultPath(dst));
|
.copyFromLocalFile(delSrc, overwrite, convertDefaults(srcs), convertToDefaultPath(dst));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public void copyFromLocalFile(boolean delSrc, boolean overwrite, Path src, Path dst)
|
@Override
|
||||||
|
public void copyFromLocalFile(boolean delSrc, boolean overwrite, Path src, Path dst)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
fileSystem.copyFromLocalFile(delSrc, overwrite, convertToDefaultPath(src),
|
fileSystem.copyFromLocalFile(delSrc, overwrite, convertToDefaultPath(src),
|
||||||
convertToDefaultPath(dst));
|
convertToDefaultPath(dst));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public void copyToLocalFile(Path src, Path dst) throws IOException {
|
@Override
|
||||||
|
public void copyToLocalFile(Path src, Path dst) throws IOException {
|
||||||
fileSystem.copyToLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst));
|
fileSystem.copyToLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public void moveToLocalFile(Path src, Path dst) throws IOException {
|
@Override
|
||||||
|
public void moveToLocalFile(Path src, Path dst) throws IOException {
|
||||||
fileSystem.moveToLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst));
|
fileSystem.moveToLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public void copyToLocalFile(boolean delSrc, Path src, Path dst) throws IOException {
|
@Override
|
||||||
|
public void copyToLocalFile(boolean delSrc, Path src, Path dst) throws IOException {
|
||||||
fileSystem.copyToLocalFile(delSrc, convertToDefaultPath(src), convertToDefaultPath(dst));
|
fileSystem.copyToLocalFile(delSrc, convertToDefaultPath(src), convertToDefaultPath(dst));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -437,193 +520,237 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
|||||||
useRawLocalFileSystem);
|
useRawLocalFileSystem);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile)
|
@Override
|
||||||
|
public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
return convertToHoodiePath(fileSystem.startLocalOutput(convertToDefaultPath(fsOutputFile),
|
return convertToHoodiePath(fileSystem.startLocalOutput(convertToDefaultPath(fsOutputFile),
|
||||||
convertToDefaultPath(tmpLocalFile)));
|
convertToDefaultPath(tmpLocalFile)));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile)
|
@Override
|
||||||
|
public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
fileSystem.completeLocalOutput(convertToDefaultPath(fsOutputFile),
|
fileSystem.completeLocalOutput(convertToDefaultPath(fsOutputFile),
|
||||||
convertToDefaultPath(tmpLocalFile));
|
convertToDefaultPath(tmpLocalFile));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public void close() throws IOException {
|
@Override
|
||||||
|
public void close() throws IOException {
|
||||||
fileSystem.close();
|
fileSystem.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public long getUsed() throws IOException {
|
@Override
|
||||||
|
public long getUsed() throws IOException {
|
||||||
return fileSystem.getUsed();
|
return fileSystem.getUsed();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public long getBlockSize(Path f) throws IOException {
|
@Override
|
||||||
|
public long getBlockSize(Path f) throws IOException {
|
||||||
return fileSystem.getBlockSize(convertToDefaultPath(f));
|
return fileSystem.getBlockSize(convertToDefaultPath(f));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public long getDefaultBlockSize() {
|
@Override
|
||||||
|
public long getDefaultBlockSize() {
|
||||||
return fileSystem.getDefaultBlockSize();
|
return fileSystem.getDefaultBlockSize();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public long getDefaultBlockSize(Path f) {
|
@Override
|
||||||
|
public long getDefaultBlockSize(Path f) {
|
||||||
return fileSystem.getDefaultBlockSize(convertToDefaultPath(f));
|
return fileSystem.getDefaultBlockSize(convertToDefaultPath(f));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public short getDefaultReplication() {
|
@Override
|
||||||
|
public short getDefaultReplication() {
|
||||||
return fileSystem.getDefaultReplication();
|
return fileSystem.getDefaultReplication();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public short getDefaultReplication(Path path) {
|
@Override
|
||||||
|
public short getDefaultReplication(Path path) {
|
||||||
return fileSystem.getDefaultReplication(convertToDefaultPath(path));
|
return fileSystem.getDefaultReplication(convertToDefaultPath(path));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public void access(Path path, FsAction mode)
|
@Override
|
||||||
|
public void access(Path path, FsAction mode)
|
||||||
throws AccessControlException, FileNotFoundException, IOException {
|
throws AccessControlException, FileNotFoundException, IOException {
|
||||||
fileSystem.access(convertToDefaultPath(path), mode);
|
fileSystem.access(convertToDefaultPath(path), mode);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public void createSymlink(Path target, Path link, boolean createParent)
|
@Override
|
||||||
|
public void createSymlink(Path target, Path link, boolean createParent)
|
||||||
throws AccessControlException, FileAlreadyExistsException, FileNotFoundException,
|
throws AccessControlException, FileAlreadyExistsException, FileNotFoundException,
|
||||||
ParentNotDirectoryException, UnsupportedFileSystemException, IOException {
|
ParentNotDirectoryException, UnsupportedFileSystemException, IOException {
|
||||||
fileSystem
|
fileSystem
|
||||||
.createSymlink(convertToDefaultPath(target), convertToDefaultPath(link), createParent);
|
.createSymlink(convertToDefaultPath(target), convertToDefaultPath(link), createParent);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public FileStatus getFileLinkStatus(Path f)
|
@Override
|
||||||
|
public FileStatus getFileLinkStatus(Path f)
|
||||||
throws AccessControlException, FileNotFoundException, UnsupportedFileSystemException,
|
throws AccessControlException, FileNotFoundException, UnsupportedFileSystemException,
|
||||||
IOException {
|
IOException {
|
||||||
return fileSystem.getFileLinkStatus(convertToDefaultPath(f));
|
return fileSystem.getFileLinkStatus(convertToDefaultPath(f));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public boolean supportsSymlinks() {
|
@Override
|
||||||
|
public boolean supportsSymlinks() {
|
||||||
return fileSystem.supportsSymlinks();
|
return fileSystem.supportsSymlinks();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public Path getLinkTarget(Path f) throws IOException {
|
@Override
|
||||||
|
public Path getLinkTarget(Path f) throws IOException {
|
||||||
return convertToHoodiePath(fileSystem.getLinkTarget(convertToDefaultPath(f)));
|
return convertToHoodiePath(fileSystem.getLinkTarget(convertToDefaultPath(f)));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public FileChecksum getFileChecksum(Path f) throws IOException {
|
@Override
|
||||||
|
public FileChecksum getFileChecksum(Path f) throws IOException {
|
||||||
return fileSystem.getFileChecksum(convertToDefaultPath(f));
|
return fileSystem.getFileChecksum(convertToDefaultPath(f));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public FileChecksum getFileChecksum(Path f, long length) throws IOException {
|
@Override
|
||||||
|
public FileChecksum getFileChecksum(Path f, long length) throws IOException {
|
||||||
return fileSystem.getFileChecksum(convertToDefaultPath(f), length);
|
return fileSystem.getFileChecksum(convertToDefaultPath(f), length);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public void setVerifyChecksum(boolean verifyChecksum) {
|
@Override
|
||||||
|
public void setVerifyChecksum(boolean verifyChecksum) {
|
||||||
fileSystem.setVerifyChecksum(verifyChecksum);
|
fileSystem.setVerifyChecksum(verifyChecksum);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public void setWriteChecksum(boolean writeChecksum) {
|
@Override
|
||||||
|
public void setWriteChecksum(boolean writeChecksum) {
|
||||||
fileSystem.setWriteChecksum(writeChecksum);
|
fileSystem.setWriteChecksum(writeChecksum);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public FsStatus getStatus() throws IOException {
|
@Override
|
||||||
|
public FsStatus getStatus() throws IOException {
|
||||||
return fileSystem.getStatus();
|
return fileSystem.getStatus();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public FsStatus getStatus(Path p) throws IOException {
|
@Override
|
||||||
|
public FsStatus getStatus(Path p) throws IOException {
|
||||||
return fileSystem.getStatus(convertToDefaultPath(p));
|
return fileSystem.getStatus(convertToDefaultPath(p));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public void setPermission(Path p, FsPermission permission) throws IOException {
|
@Override
|
||||||
|
public void setPermission(Path p, FsPermission permission) throws IOException {
|
||||||
fileSystem.setPermission(convertToDefaultPath(p), permission);
|
fileSystem.setPermission(convertToDefaultPath(p), permission);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public void setOwner(Path p, String username, String groupname) throws IOException {
|
@Override
|
||||||
|
public void setOwner(Path p, String username, String groupname) throws IOException {
|
||||||
fileSystem.setOwner(convertToDefaultPath(p), username, groupname);
|
fileSystem.setOwner(convertToDefaultPath(p), username, groupname);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public void setTimes(Path p, long mtime, long atime) throws IOException {
|
@Override
|
||||||
|
public void setTimes(Path p, long mtime, long atime) throws IOException {
|
||||||
fileSystem.setTimes(convertToDefaultPath(p), mtime, atime);
|
fileSystem.setTimes(convertToDefaultPath(p), mtime, atime);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public Path createSnapshot(Path path, String snapshotName) throws IOException {
|
@Override
|
||||||
|
public Path createSnapshot(Path path, String snapshotName) throws IOException {
|
||||||
return convertToHoodiePath(
|
return convertToHoodiePath(
|
||||||
fileSystem.createSnapshot(convertToDefaultPath(path), snapshotName));
|
fileSystem.createSnapshot(convertToDefaultPath(path), snapshotName));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public void renameSnapshot(Path path, String snapshotOldName, String snapshotNewName)
|
@Override
|
||||||
|
public void renameSnapshot(Path path, String snapshotOldName, String snapshotNewName)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
fileSystem.renameSnapshot(convertToDefaultPath(path), snapshotOldName, snapshotNewName);
|
fileSystem.renameSnapshot(convertToDefaultPath(path), snapshotOldName, snapshotNewName);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public void deleteSnapshot(Path path, String snapshotName) throws IOException {
|
@Override
|
||||||
|
public void deleteSnapshot(Path path, String snapshotName) throws IOException {
|
||||||
fileSystem.deleteSnapshot(convertToDefaultPath(path), snapshotName);
|
fileSystem.deleteSnapshot(convertToDefaultPath(path), snapshotName);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public void modifyAclEntries(Path path, List<AclEntry> aclSpec) throws IOException {
|
@Override
|
||||||
|
public void modifyAclEntries(Path path, List<AclEntry> aclSpec) throws IOException {
|
||||||
fileSystem.modifyAclEntries(convertToDefaultPath(path), aclSpec);
|
fileSystem.modifyAclEntries(convertToDefaultPath(path), aclSpec);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public void removeAclEntries(Path path, List<AclEntry> aclSpec) throws IOException {
|
@Override
|
||||||
|
public void removeAclEntries(Path path, List<AclEntry> aclSpec) throws IOException {
|
||||||
fileSystem.removeAclEntries(convertToDefaultPath(path), aclSpec);
|
fileSystem.removeAclEntries(convertToDefaultPath(path), aclSpec);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public void removeDefaultAcl(Path path) throws IOException {
|
@Override
|
||||||
|
public void removeDefaultAcl(Path path) throws IOException {
|
||||||
fileSystem.removeDefaultAcl(convertToDefaultPath(path));
|
fileSystem.removeDefaultAcl(convertToDefaultPath(path));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public void removeAcl(Path path) throws IOException {
|
@Override
|
||||||
|
public void removeAcl(Path path) throws IOException {
|
||||||
fileSystem.removeAcl(convertToDefaultPath(path));
|
fileSystem.removeAcl(convertToDefaultPath(path));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public void setAcl(Path path, List<AclEntry> aclSpec) throws IOException {
|
@Override
|
||||||
|
public void setAcl(Path path, List<AclEntry> aclSpec) throws IOException {
|
||||||
fileSystem.setAcl(convertToDefaultPath(path), aclSpec);
|
fileSystem.setAcl(convertToDefaultPath(path), aclSpec);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public AclStatus getAclStatus(Path path) throws IOException {
|
@Override
|
||||||
|
public AclStatus getAclStatus(Path path) throws IOException {
|
||||||
return fileSystem.getAclStatus(convertToDefaultPath(path));
|
return fileSystem.getAclStatus(convertToDefaultPath(path));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public void setXAttr(Path path, String name, byte[] value) throws IOException {
|
@Override
|
||||||
|
public void setXAttr(Path path, String name, byte[] value) throws IOException {
|
||||||
fileSystem.setXAttr(convertToDefaultPath(path), name, value);
|
fileSystem.setXAttr(convertToDefaultPath(path), name, value);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public void setXAttr(Path path, String name, byte[] value, EnumSet<XAttrSetFlag> flag)
|
@Override
|
||||||
|
public void setXAttr(Path path, String name, byte[] value, EnumSet<XAttrSetFlag> flag)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
fileSystem.setXAttr(convertToDefaultPath(path), name, value, flag);
|
fileSystem.setXAttr(convertToDefaultPath(path), name, value, flag);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public byte[] getXAttr(Path path, String name) throws IOException {
|
@Override
|
||||||
|
public byte[] getXAttr(Path path, String name) throws IOException {
|
||||||
return fileSystem.getXAttr(convertToDefaultPath(path), name);
|
return fileSystem.getXAttr(convertToDefaultPath(path), name);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public Map<String, byte[]> getXAttrs(Path path) throws IOException {
|
@Override
|
||||||
|
public Map<String, byte[]> getXAttrs(Path path) throws IOException {
|
||||||
return fileSystem.getXAttrs(convertToDefaultPath(path));
|
return fileSystem.getXAttrs(convertToDefaultPath(path));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public Map<String, byte[]> getXAttrs(Path path, List<String> names)
|
@Override
|
||||||
|
public Map<String, byte[]> getXAttrs(Path path, List<String> names)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
return fileSystem.getXAttrs(convertToDefaultPath(path), names);
|
return fileSystem.getXAttrs(convertToDefaultPath(path), names);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public List<String> listXAttrs(Path path) throws IOException {
|
@Override
|
||||||
|
public List<String> listXAttrs(Path path) throws IOException {
|
||||||
return fileSystem.listXAttrs(convertToDefaultPath(path));
|
return fileSystem.listXAttrs(convertToDefaultPath(path));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public void removeXAttr(Path path, String name) throws IOException {
|
@Override
|
||||||
|
public void removeXAttr(Path path, String name) throws IOException {
|
||||||
fileSystem.removeXAttr(convertToDefaultPath(path), name);
|
fileSystem.removeXAttr(convertToDefaultPath(path), name);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public void setConf(Configuration conf) {
|
@Override
|
||||||
|
public void setConf(Configuration conf) {
|
||||||
// ignore this. we will set conf on init
|
// ignore this. we will set conf on init
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public Configuration getConf() {
|
@Override
|
||||||
|
public Configuration getConf() {
|
||||||
return fileSystem.getConf();
|
return fileSystem.getConf();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public int hashCode() {
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
return fileSystem.hashCode();
|
return fileSystem.hashCode();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public boolean equals(Object obj) {
|
@Override
|
||||||
|
public boolean equals(Object obj) {
|
||||||
return fileSystem.equals(obj);
|
return fileSystem.equals(obj);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public String toString() {
|
@Override
|
||||||
|
public String toString() {
|
||||||
return fileSystem.toString();
|
return fileSystem.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -16,16 +16,16 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.io.storage;
|
package com.uber.hoodie.io.storage;
|
||||||
|
|
||||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.concurrent.atomic.AtomicLong;
|
import java.util.concurrent.atomic.AtomicLong;
|
||||||
|
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Wrapper over <code>FSDataOutputStream</code> to keep track of the size of the written bytes.
|
* Wrapper over <code>FSDataOutputStream</code> to keep track of the size of the written bytes. This
|
||||||
* This gives a cheap way to check on the underlying file size.
|
* gives a cheap way to check on the underlying file size.
|
||||||
*/
|
*/
|
||||||
public class SizeAwareFSDataOutputStream extends FSDataOutputStream {
|
public class SizeAwareFSDataOutputStream extends FSDataOutputStream {
|
||||||
|
|
||||||
// A callback to call when the output stream is closed.
|
// A callback to call when the output stream is closed.
|
||||||
private final Runnable closeCallback;
|
private final Runnable closeCallback;
|
||||||
// Keep track of the bytes written
|
// Keep track of the bytes written
|
||||||
@@ -37,17 +37,20 @@ public class SizeAwareFSDataOutputStream extends FSDataOutputStream {
|
|||||||
this.closeCallback = closeCallback;
|
this.closeCallback = closeCallback;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public synchronized void write(byte[] b, int off, int len) throws IOException {
|
@Override
|
||||||
|
public synchronized void write(byte[] b, int off, int len) throws IOException {
|
||||||
bytesWritten.addAndGet(len);
|
bytesWritten.addAndGet(len);
|
||||||
super.write(b, off, len);
|
super.write(b, off, len);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public void write(byte[] b) throws IOException {
|
@Override
|
||||||
|
public void write(byte[] b) throws IOException {
|
||||||
bytesWritten.addAndGet(b.length);
|
bytesWritten.addAndGet(b.length);
|
||||||
super.write(b);
|
super.write(b);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public void close() throws IOException {
|
@Override
|
||||||
|
public void close() throws IOException {
|
||||||
super.close();
|
super.close();
|
||||||
closeCallback.run();
|
closeCallback.run();
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -22,7 +22,6 @@ import com.codahale.metrics.Timer;
|
|||||||
import com.google.common.annotations.VisibleForTesting;
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
import com.uber.hoodie.common.model.HoodieCommitMetadata;
|
import com.uber.hoodie.common.model.HoodieCommitMetadata;
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
|
|
||||||
import org.apache.log4j.LogManager;
|
import org.apache.log4j.LogManager;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
|
|
||||||
@@ -30,6 +29,7 @@ import org.apache.log4j.Logger;
|
|||||||
* Wrapper for metrics-related operations.
|
* Wrapper for metrics-related operations.
|
||||||
*/
|
*/
|
||||||
public class HoodieMetrics {
|
public class HoodieMetrics {
|
||||||
|
|
||||||
private HoodieWriteConfig config = null;
|
private HoodieWriteConfig config = null;
|
||||||
private String tableName = null;
|
private String tableName = null;
|
||||||
private static Logger logger = LogManager.getLogger(HoodieMetrics.class);
|
private static Logger logger = LogManager.getLogger(HoodieMetrics.class);
|
||||||
@@ -77,7 +77,8 @@ public class HoodieMetrics {
|
|||||||
return commitTimer == null ? null : commitTimer.time();
|
return commitTimer == null ? null : commitTimer.time();
|
||||||
}
|
}
|
||||||
|
|
||||||
public void updateCommitMetrics(long commitEpochTimeInMs, long durationInMs, HoodieCommitMetadata metadata) {
|
public void updateCommitMetrics(long commitEpochTimeInMs, long durationInMs,
|
||||||
|
HoodieCommitMetadata metadata) {
|
||||||
if (config.isMetricsOn()) {
|
if (config.isMetricsOn()) {
|
||||||
long totalPartitionsWritten = metadata.fetchTotalPartitionsWritten();
|
long totalPartitionsWritten = metadata.fetchTotalPartitionsWritten();
|
||||||
long totalFilesInsert = metadata.fetchTotalFilesInsert();
|
long totalFilesInsert = metadata.fetchTotalFilesInsert();
|
||||||
@@ -91,8 +92,10 @@ public class HoodieMetrics {
|
|||||||
registerGauge(getMetricsName("commit", "totalFilesInsert"), totalFilesInsert);
|
registerGauge(getMetricsName("commit", "totalFilesInsert"), totalFilesInsert);
|
||||||
registerGauge(getMetricsName("commit", "totalFilesUpdate"), totalFilesUpdate);
|
registerGauge(getMetricsName("commit", "totalFilesUpdate"), totalFilesUpdate);
|
||||||
registerGauge(getMetricsName("commit", "totalRecordsWritten"), totalRecordsWritten);
|
registerGauge(getMetricsName("commit", "totalRecordsWritten"), totalRecordsWritten);
|
||||||
registerGauge(getMetricsName("commit", "totalUpdateRecordsWritten"), totalUpdateRecordsWritten);
|
registerGauge(getMetricsName("commit", "totalUpdateRecordsWritten"),
|
||||||
registerGauge(getMetricsName("commit", "totalInsertRecordsWritten"), totalInsertRecordsWritten);
|
totalUpdateRecordsWritten);
|
||||||
|
registerGauge(getMetricsName("commit", "totalInsertRecordsWritten"),
|
||||||
|
totalInsertRecordsWritten);
|
||||||
registerGauge(getMetricsName("commit", "totalBytesWritten"), totalBytesWritten);
|
registerGauge(getMetricsName("commit", "totalBytesWritten"), totalBytesWritten);
|
||||||
registerGauge(getMetricsName("commit", "commitTime"), commitEpochTimeInMs);
|
registerGauge(getMetricsName("commit", "commitTime"), commitEpochTimeInMs);
|
||||||
}
|
}
|
||||||
@@ -139,8 +142,7 @@ public class HoodieMetrics {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* By default, the timer context returns duration with nano seconds.
|
* By default, the timer context returns duration with nano seconds. Convert it to millisecond.
|
||||||
* Convert it to millisecond.
|
|
||||||
*/
|
*/
|
||||||
public long getDurationInMs(long ctxDuration) {
|
public long getDurationInMs(long ctxDuration) {
|
||||||
return ctxDuration / 1000000;
|
return ctxDuration / 1000000;
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ import java.io.Closeable;
|
|||||||
* Used for testing.
|
* Used for testing.
|
||||||
*/
|
*/
|
||||||
public class InMemoryMetricsReporter extends MetricsReporter {
|
public class InMemoryMetricsReporter extends MetricsReporter {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void start() {
|
public void start() {
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -19,16 +19,15 @@ package com.uber.hoodie.metrics;
|
|||||||
import com.codahale.metrics.MetricRegistry;
|
import com.codahale.metrics.MetricRegistry;
|
||||||
import com.google.common.io.Closeables;
|
import com.google.common.io.Closeables;
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.config.HoodieMetricsConfig;
|
|
||||||
import com.uber.hoodie.exception.HoodieException;
|
import com.uber.hoodie.exception.HoodieException;
|
||||||
import org.apache.commons.configuration.ConfigurationException;
|
|
||||||
|
|
||||||
import java.io.Closeable;
|
import java.io.Closeable;
|
||||||
|
import org.apache.commons.configuration.ConfigurationException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This is the main class of the metrics system.
|
* This is the main class of the metrics system.
|
||||||
*/
|
*/
|
||||||
public class Metrics {
|
public class Metrics {
|
||||||
|
|
||||||
private static volatile boolean initialized = false;
|
private static volatile boolean initialized = false;
|
||||||
private static Metrics metrics = null;
|
private static Metrics metrics = null;
|
||||||
private final MetricRegistry registry;
|
private final MetricRegistry registry;
|
||||||
|
|||||||
@@ -21,19 +21,18 @@ import com.codahale.metrics.MetricRegistry;
|
|||||||
import com.codahale.metrics.graphite.Graphite;
|
import com.codahale.metrics.graphite.Graphite;
|
||||||
import com.codahale.metrics.graphite.GraphiteReporter;
|
import com.codahale.metrics.graphite.GraphiteReporter;
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
|
|
||||||
import org.apache.log4j.LogManager;
|
|
||||||
import org.apache.log4j.Logger;
|
|
||||||
|
|
||||||
import java.io.Closeable;
|
import java.io.Closeable;
|
||||||
import java.net.InetSocketAddress;
|
import java.net.InetSocketAddress;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
|
import org.apache.log4j.LogManager;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Implementation of Graphite reporter, which connects to the Graphite server,
|
* Implementation of Graphite reporter, which connects to the Graphite server, and sends metrics to
|
||||||
* and sends metrics to that server.
|
* that server.
|
||||||
*/
|
*/
|
||||||
public class MetricsGraphiteReporter extends MetricsReporter {
|
public class MetricsGraphiteReporter extends MetricsReporter {
|
||||||
|
|
||||||
private final MetricRegistry registry;
|
private final MetricRegistry registry;
|
||||||
private final GraphiteReporter graphiteReporter;
|
private final GraphiteReporter graphiteReporter;
|
||||||
private final HoodieWriteConfig config;
|
private final HoodieWriteConfig config;
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ import java.io.Closeable;
|
|||||||
* Interface for implementing a Reporter.
|
* Interface for implementing a Reporter.
|
||||||
*/
|
*/
|
||||||
public abstract class MetricsReporter {
|
public abstract class MetricsReporter {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Push out metrics at scheduled intervals
|
* Push out metrics at scheduled intervals
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -18,7 +18,6 @@ package com.uber.hoodie.metrics;
|
|||||||
|
|
||||||
import com.codahale.metrics.MetricRegistry;
|
import com.codahale.metrics.MetricRegistry;
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
|
|
||||||
import org.apache.log4j.LogManager;
|
import org.apache.log4j.LogManager;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
|
|
||||||
@@ -26,6 +25,7 @@ import org.apache.log4j.Logger;
|
|||||||
* Factory class for creating MetricsReporter.
|
* Factory class for creating MetricsReporter.
|
||||||
*/
|
*/
|
||||||
public class MetricsReporterFactory {
|
public class MetricsReporterFactory {
|
||||||
|
|
||||||
private static Logger logger = LogManager.getLogger(MetricsReporterFactory.class);
|
private static Logger logger = LogManager.getLogger(MetricsReporterFactory.class);
|
||||||
|
|
||||||
public static MetricsReporter createReporter(HoodieWriteConfig config,
|
public static MetricsReporter createReporter(HoodieWriteConfig config,
|
||||||
|
|||||||
@@ -17,8 +17,8 @@
|
|||||||
package com.uber.hoodie.metrics;
|
package com.uber.hoodie.metrics;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Types of the reporter. Right now we only support Graphite.
|
* Types of the reporter. Right now we only support Graphite. We can include JMX and CSV in the
|
||||||
* We can include JMX and CSV in the future.
|
* future.
|
||||||
*/
|
*/
|
||||||
public enum MetricsReporterType {
|
public enum MetricsReporterType {
|
||||||
GRAPHITE,
|
GRAPHITE,
|
||||||
|
|||||||
@@ -70,28 +70,16 @@ import org.apache.spark.api.java.function.PairFlatMapFunction;
|
|||||||
import scala.Option;
|
import scala.Option;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Optional;
|
|
||||||
import java.util.Set;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Implementation of a very heavily read-optimized Hoodie Table where
|
* Implementation of a very heavily read-optimized Hoodie Table where
|
||||||
*
|
*
|
||||||
* INSERTS - Produce new files, block aligned to desired size (or)
|
* INSERTS - Produce new files, block aligned to desired size (or) Merge with the smallest existing
|
||||||
* Merge with the smallest existing file, to expand it
|
* file, to expand it
|
||||||
*
|
*
|
||||||
* UPDATES - Produce a new version of the file, just replacing the updated records with new values
|
* UPDATES - Produce a new version of the file, just replacing the updated records with new values
|
||||||
*
|
|
||||||
*/
|
*/
|
||||||
public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends HoodieTable<T> {
|
public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends HoodieTable<T> {
|
||||||
|
|
||||||
public HoodieCopyOnWriteTable(HoodieWriteConfig config, HoodieTableMetaClient metaClient) {
|
public HoodieCopyOnWriteTable(HoodieWriteConfig config, HoodieTableMetaClient metaClient) {
|
||||||
super(config, metaClient);
|
super(config, metaClient);
|
||||||
}
|
}
|
||||||
@@ -107,6 +95,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
* Helper class for a small file's location and its actual size on disk
|
* Helper class for a small file's location and its actual size on disk
|
||||||
*/
|
*/
|
||||||
class SmallFile implements Serializable {
|
class SmallFile implements Serializable {
|
||||||
|
|
||||||
HoodieRecordLocation location;
|
HoodieRecordLocation location;
|
||||||
long sizeBytes;
|
long sizeBytes;
|
||||||
|
|
||||||
@@ -121,11 +110,11 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Helper class for an insert bucket along with the weight [0.0, 1.0]
|
* Helper class for an insert bucket along with the weight [0.0, 1.0] that defines the amount of
|
||||||
* that defines the amount of incoming inserts that should be allocated to
|
* incoming inserts that should be allocated to the bucket
|
||||||
* the bucket
|
|
||||||
*/
|
*/
|
||||||
class InsertBucket implements Serializable {
|
class InsertBucket implements Serializable {
|
||||||
|
|
||||||
int bucketNumber;
|
int bucketNumber;
|
||||||
// fraction of total inserts, that should go into this bucket
|
// fraction of total inserts, that should go into this bucket
|
||||||
double weight;
|
double weight;
|
||||||
@@ -144,6 +133,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
* Helper class for a bucket's type (INSERT and UPDATE) and its file location
|
* Helper class for a bucket's type (INSERT and UPDATE) and its file location
|
||||||
*/
|
*/
|
||||||
class BucketInfo implements Serializable {
|
class BucketInfo implements Serializable {
|
||||||
|
|
||||||
BucketType bucketType;
|
BucketType bucketType;
|
||||||
String fileLoc;
|
String fileLoc;
|
||||||
|
|
||||||
@@ -164,8 +154,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
class UpsertPartitioner extends Partitioner {
|
class UpsertPartitioner extends Partitioner {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Total number of RDD partitions is determined by total buckets we want to
|
* Total number of RDD partitions is determined by total buckets we want to pack the incoming
|
||||||
* pack the incoming workload into
|
* workload into
|
||||||
*/
|
*/
|
||||||
private int totalBuckets = 0;
|
private int totalBuckets = 0;
|
||||||
|
|
||||||
@@ -181,8 +171,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Helps us pack inserts into 1 or more buckets depending on number of
|
* Helps us pack inserts into 1 or more buckets depending on number of incoming records.
|
||||||
* incoming records.
|
|
||||||
*/
|
*/
|
||||||
private HashMap<String, List<InsertBucket>> partitionPathToInsertBuckets;
|
private HashMap<String, List<InsertBucket>> partitionPathToInsertBuckets;
|
||||||
|
|
||||||
@@ -236,24 +225,28 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
if (pStat.getNumInserts() > 0) {
|
if (pStat.getNumInserts() > 0) {
|
||||||
|
|
||||||
List<SmallFile> smallFiles = getSmallFiles(partitionPath);
|
List<SmallFile> smallFiles = getSmallFiles(partitionPath);
|
||||||
logger.info("For partitionPath : "+ partitionPath + " Small Files => " + smallFiles);
|
logger.info("For partitionPath : " + partitionPath + " Small Files => " + smallFiles);
|
||||||
|
|
||||||
long totalUnassignedInserts = pStat.getNumInserts();
|
long totalUnassignedInserts = pStat.getNumInserts();
|
||||||
List<Integer> bucketNumbers = new ArrayList<>();
|
List<Integer> bucketNumbers = new ArrayList<>();
|
||||||
List<Long> recordsPerBucket = new ArrayList<>();
|
List<Long> recordsPerBucket = new ArrayList<>();
|
||||||
|
|
||||||
// first try packing this into one of the smallFiles
|
// first try packing this into one of the smallFiles
|
||||||
for (SmallFile smallFile: smallFiles) {
|
for (SmallFile smallFile : smallFiles) {
|
||||||
long recordsToAppend = Math.min((config.getParquetMaxFileSize() - smallFile.sizeBytes)/ averageRecordSize, totalUnassignedInserts);
|
long recordsToAppend = Math
|
||||||
if (recordsToAppend > 0 && totalUnassignedInserts > 0){
|
.min((config.getParquetMaxFileSize() - smallFile.sizeBytes) / averageRecordSize,
|
||||||
|
totalUnassignedInserts);
|
||||||
|
if (recordsToAppend > 0 && totalUnassignedInserts > 0) {
|
||||||
// create a new bucket or re-use an existing bucket
|
// create a new bucket or re-use an existing bucket
|
||||||
int bucket;
|
int bucket;
|
||||||
if (updateLocationToBucket.containsKey(smallFile.location.getFileId())) {
|
if (updateLocationToBucket.containsKey(smallFile.location.getFileId())) {
|
||||||
bucket = updateLocationToBucket.get(smallFile.location.getFileId());
|
bucket = updateLocationToBucket.get(smallFile.location.getFileId());
|
||||||
logger.info("Assigning " + recordsToAppend + " inserts to existing update bucket "+ bucket);
|
logger.info("Assigning " + recordsToAppend + " inserts to existing update bucket "
|
||||||
|
+ bucket);
|
||||||
} else {
|
} else {
|
||||||
bucket = addUpdateBucket(smallFile.location.getFileId());
|
bucket = addUpdateBucket(smallFile.location.getFileId());
|
||||||
logger.info("Assigning " + recordsToAppend + " inserts to new update bucket "+ bucket);
|
logger.info(
|
||||||
|
"Assigning " + recordsToAppend + " inserts to new update bucket " + bucket);
|
||||||
}
|
}
|
||||||
bucketNumbers.add(bucket);
|
bucketNumbers.add(bucket);
|
||||||
recordsPerBucket.add(recordsToAppend);
|
recordsPerBucket.add(recordsToAppend);
|
||||||
@@ -265,16 +258,17 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
if (totalUnassignedInserts > 0) {
|
if (totalUnassignedInserts > 0) {
|
||||||
long insertRecordsPerBucket = config.getCopyOnWriteInsertSplitSize();
|
long insertRecordsPerBucket = config.getCopyOnWriteInsertSplitSize();
|
||||||
if (config.shouldAutoTuneInsertSplits()) {
|
if (config.shouldAutoTuneInsertSplits()) {
|
||||||
insertRecordsPerBucket = config.getParquetMaxFileSize()/averageRecordSize;
|
insertRecordsPerBucket = config.getParquetMaxFileSize() / averageRecordSize;
|
||||||
}
|
}
|
||||||
|
|
||||||
int insertBuckets = (int) Math.max(totalUnassignedInserts / insertRecordsPerBucket, 1L);
|
int insertBuckets = (int) Math.max(totalUnassignedInserts / insertRecordsPerBucket, 1L);
|
||||||
logger.info("After small file assignment: unassignedInserts => " + totalUnassignedInserts
|
logger
|
||||||
|
.info("After small file assignment: unassignedInserts => " + totalUnassignedInserts
|
||||||
+ ", totalInsertBuckets => " + insertBuckets
|
+ ", totalInsertBuckets => " + insertBuckets
|
||||||
+ ", recordsPerBucket => " + insertRecordsPerBucket);
|
+ ", recordsPerBucket => " + insertRecordsPerBucket);
|
||||||
for (int b = 0; b < insertBuckets; b++) {
|
for (int b = 0; b < insertBuckets; b++) {
|
||||||
bucketNumbers.add(totalBuckets);
|
bucketNumbers.add(totalBuckets);
|
||||||
recordsPerBucket.add(totalUnassignedInserts/insertBuckets);
|
recordsPerBucket.add(totalUnassignedInserts / insertBuckets);
|
||||||
BucketInfo bucketInfo = new BucketInfo();
|
BucketInfo bucketInfo = new BucketInfo();
|
||||||
bucketInfo.bucketType = BucketType.INSERT;
|
bucketInfo.bucketType = BucketType.INSERT;
|
||||||
bucketInfoMap.put(totalBuckets, bucketInfo);
|
bucketInfoMap.put(totalBuckets, bucketInfo);
|
||||||
@@ -287,10 +281,11 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
for (int i = 0; i < bucketNumbers.size(); i++) {
|
for (int i = 0; i < bucketNumbers.size(); i++) {
|
||||||
InsertBucket bkt = new InsertBucket();
|
InsertBucket bkt = new InsertBucket();
|
||||||
bkt.bucketNumber = bucketNumbers.get(i);
|
bkt.bucketNumber = bucketNumbers.get(i);
|
||||||
bkt.weight = (1.0 * recordsPerBucket.get(i))/pStat.getNumInserts();
|
bkt.weight = (1.0 * recordsPerBucket.get(i)) / pStat.getNumInserts();
|
||||||
insertBuckets.add(bkt);
|
insertBuckets.add(bkt);
|
||||||
}
|
}
|
||||||
logger.info("Total insert buckets for partition path "+ partitionPath + " => " + insertBuckets);
|
logger.info(
|
||||||
|
"Total insert buckets for partition path " + partitionPath + " => " + insertBuckets);
|
||||||
partitionPathToInsertBuckets.put(partitionPath, insertBuckets);
|
partitionPathToInsertBuckets.put(partitionPath, insertBuckets);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -299,9 +294,6 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a list of small files in the given partition path
|
* Returns a list of small files in the given partition path
|
||||||
*
|
|
||||||
* @param partitionPath
|
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
private List<SmallFile> getSmallFiles(String partitionPath) {
|
private List<SmallFile> getSmallFiles(String partitionPath) {
|
||||||
List<SmallFile> smallFileLocations = new ArrayList<>();
|
List<SmallFile> smallFileLocations = new ArrayList<>();
|
||||||
@@ -330,10 +322,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Obtains the average record size based on records written during last commit.
|
* Obtains the average record size based on records written during last commit. Used for
|
||||||
* Used for estimating how many records pack into one file.
|
* estimating how many records pack into one file.
|
||||||
*
|
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
private long averageBytesPerRecord() {
|
private long averageBytesPerRecord() {
|
||||||
long avgSize = 0L;
|
long avgSize = 0L;
|
||||||
@@ -375,13 +365,15 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
HoodieRecordLocation location = keyLocation._2().get();
|
HoodieRecordLocation location = keyLocation._2().get();
|
||||||
return updateLocationToBucket.get(location.getFileId());
|
return updateLocationToBucket.get(location.getFileId());
|
||||||
} else {
|
} else {
|
||||||
List<InsertBucket> targetBuckets = partitionPathToInsertBuckets.get(keyLocation._1().getPartitionPath());
|
List<InsertBucket> targetBuckets = partitionPathToInsertBuckets
|
||||||
|
.get(keyLocation._1().getPartitionPath());
|
||||||
// pick the target bucket to use based on the weights.
|
// pick the target bucket to use based on the weights.
|
||||||
double totalWeight = 0.0;
|
double totalWeight = 0.0;
|
||||||
final long totalInserts = Math.max(1, globalStat.getNumInserts());
|
final long totalInserts = Math.max(1, globalStat.getNumInserts());
|
||||||
final long hashOfKey = Hashing.md5().hashString(keyLocation._1().getRecordKey(), StandardCharsets.UTF_8).asLong();
|
final long hashOfKey = Hashing.md5()
|
||||||
|
.hashString(keyLocation._1().getRecordKey(), StandardCharsets.UTF_8).asLong();
|
||||||
final double r = 1.0 * Math.floorMod(hashOfKey, totalInserts) / totalInserts;
|
final double r = 1.0 * Math.floorMod(hashOfKey, totalInserts) / totalInserts;
|
||||||
for (InsertBucket insertBucket: targetBuckets) {
|
for (InsertBucket insertBucket : targetBuckets) {
|
||||||
totalWeight += insertBucket.weight;
|
totalWeight += insertBucket.weight;
|
||||||
if (r <= totalWeight) {
|
if (r <= totalWeight) {
|
||||||
return insertBucket.bucketNumber;
|
return insertBucket.bucketNumber;
|
||||||
@@ -413,14 +405,14 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileLoc,
|
||||||
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileLoc, Iterator<HoodieRecord<T>> recordItr)
|
Iterator<HoodieRecord<T>> recordItr)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
// these are updates
|
// these are updates
|
||||||
HoodieMergeHandle upsertHandle = getUpdateHandle(commitTime, fileLoc, recordItr);
|
HoodieMergeHandle upsertHandle = getUpdateHandle(commitTime, fileLoc, recordItr);
|
||||||
if (upsertHandle.getOldFilePath() == null) {
|
if (upsertHandle.getOldFilePath() == null) {
|
||||||
throw new HoodieUpsertException("Error in finding the old file path at commit " +
|
throw new HoodieUpsertException("Error in finding the old file path at commit " +
|
||||||
commitTime +" at fileLoc: " + fileLoc);
|
commitTime + " at fileLoc: " + fileLoc);
|
||||||
} else {
|
} else {
|
||||||
Configuration conf = FSUtils.getFs().getConf();
|
Configuration conf = FSUtils.getFs().getConf();
|
||||||
AvroReadSupport.setAvroReadSchema(conf, upsertHandle.getSchema());
|
AvroReadSupport.setAvroReadSchema(conf, upsertHandle.getSchema());
|
||||||
@@ -448,14 +440,17 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
logger.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath()
|
logger.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath()
|
||||||
+ ", " + upsertHandle.getWriteStatus());
|
+ ", " + upsertHandle.getWriteStatus());
|
||||||
}
|
}
|
||||||
return Collections.singletonList(Collections.singletonList(upsertHandle.getWriteStatus())).iterator();
|
return Collections.singletonList(Collections.singletonList(upsertHandle.getWriteStatus()))
|
||||||
|
.iterator();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileLoc, Iterator<HoodieRecord<T>> recordItr) {
|
protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileLoc,
|
||||||
|
Iterator<HoodieRecord<T>> recordItr) {
|
||||||
return new HoodieMergeHandle<>(config, commitTime, this, recordItr, fileLoc);
|
return new HoodieMergeHandle<>(config, commitTime, this, recordItr, fileLoc);
|
||||||
}
|
}
|
||||||
|
|
||||||
public Iterator<List<WriteStatus>> handleInsert(String commitTime, Iterator<HoodieRecord<T>> recordItr) throws Exception {
|
public Iterator<List<WriteStatus>> handleInsert(String commitTime,
|
||||||
|
Iterator<HoodieRecord<T>> recordItr) throws Exception {
|
||||||
return new LazyInsertIterable<>(recordItr, config, commitTime, this);
|
return new LazyInsertIterable<>(recordItr, config, commitTime, this);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -473,7 +468,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
} else if (btype.equals(BucketType.UPDATE)) {
|
} else if (btype.equals(BucketType.UPDATE)) {
|
||||||
return handleUpdate(commitTime, binfo.fileLoc, recordItr);
|
return handleUpdate(commitTime, binfo.fileLoc, recordItr);
|
||||||
} else {
|
} else {
|
||||||
throw new HoodieUpsertException("Unknown bucketType " + btype + " for partition :" + partition);
|
throw new HoodieUpsertException(
|
||||||
|
"Unknown bucketType " + btype + " for partition :" + partition);
|
||||||
}
|
}
|
||||||
} catch (Throwable t) {
|
} catch (Throwable t) {
|
||||||
String msg = "Error upserting bucketType " + btype + " for partition :" + partition;
|
String msg = "Error upserting bucketType " + btype + " for partition :" + partition;
|
||||||
@@ -496,9 +492,9 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Performs cleaning of partition paths according to cleaning policy and returns the number
|
* Performs cleaning of partition paths according to cleaning policy and returns the number of
|
||||||
* of files cleaned. Handles skews in partitions to clean by making files to clean as the
|
* files cleaned. Handles skews in partitions to clean by making files to clean as the unit of
|
||||||
* unit of task distribution.
|
* task distribution.
|
||||||
*
|
*
|
||||||
* @throws IllegalArgumentException if unknown cleaning policy is provided
|
* @throws IllegalArgumentException if unknown cleaning policy is provided
|
||||||
*/
|
*/
|
||||||
@@ -506,7 +502,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
public List<HoodieCleanStat> clean(JavaSparkContext jsc) {
|
public List<HoodieCleanStat> clean(JavaSparkContext jsc) {
|
||||||
try {
|
try {
|
||||||
List<String> partitionsToClean =
|
List<String> partitionsToClean =
|
||||||
FSUtils.getAllPartitionPaths(getFs(), getMetaClient().getBasePath(), config.shouldAssumeDatePartitioning());
|
FSUtils.getAllPartitionPaths(getFs(), getMetaClient().getBasePath(),
|
||||||
|
config.shouldAssumeDatePartitioning());
|
||||||
logger.info("Partitions to clean up : " + partitionsToClean + ", with policy " + config
|
logger.info("Partitions to clean up : " + partitionsToClean + ", with policy " + config
|
||||||
.getCleanerPolicy());
|
.getCleanerPolicy());
|
||||||
if (partitionsToClean.isEmpty()) {
|
if (partitionsToClean.isEmpty()) {
|
||||||
@@ -520,19 +517,16 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
* Common method used for cleaning out parquet files under a partition path during rollback of a
|
||||||
* Common method used for cleaning out parquet files under a partition path during rollback of a set of commits
|
* set of commits
|
||||||
* @param partitionPath
|
|
||||||
* @param commits
|
|
||||||
* @return
|
|
||||||
* @throws IOException
|
|
||||||
*/
|
*/
|
||||||
protected Map<FileStatus, Boolean> deleteCleanedFiles(String partitionPath, List<String> commits) throws IOException {
|
protected Map<FileStatus, Boolean> deleteCleanedFiles(String partitionPath, List<String> commits)
|
||||||
|
throws IOException {
|
||||||
logger.info("Cleaning path " + partitionPath);
|
logger.info("Cleaning path " + partitionPath);
|
||||||
FileSystem fs = FSUtils.getFs();
|
FileSystem fs = FSUtils.getFs();
|
||||||
FileStatus[] toBeDeleted =
|
FileStatus[] toBeDeleted =
|
||||||
fs.listStatus(new Path(config.getBasePath(), partitionPath), path -> {
|
fs.listStatus(new Path(config.getBasePath(), partitionPath), path -> {
|
||||||
if(!path.toString().contains(".parquet")) {
|
if (!path.toString().contains(".parquet")) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
String fileCommitTime = FSUtils.getCommitTime(path.getName());
|
String fileCommitTime = FSUtils.getCommitTime(path.getName());
|
||||||
@@ -548,10 +542,12 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<HoodieRollbackStat> rollback(JavaSparkContext jsc, List<String> commits) throws IOException {
|
public List<HoodieRollbackStat> rollback(JavaSparkContext jsc, List<String> commits)
|
||||||
|
throws IOException {
|
||||||
String actionType = this.getCompactedCommitActionType();
|
String actionType = this.getCompactedCommitActionType();
|
||||||
HoodieActiveTimeline activeTimeline = this.getActiveTimeline();
|
HoodieActiveTimeline activeTimeline = this.getActiveTimeline();
|
||||||
List<String> inflights = this.getInflightCommitTimeline().getInstants().map(HoodieInstant::getTimestamp)
|
List<String> inflights = this.getInflightCommitTimeline().getInstants()
|
||||||
|
.map(HoodieInstant::getTimestamp)
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
// Atomically unpublish all the commits
|
// Atomically unpublish all the commits
|
||||||
@@ -563,7 +559,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
// delete all the data files for all these commits
|
// delete all the data files for all these commits
|
||||||
logger.info("Clean out all parquet files generated for commits: " + commits);
|
logger.info("Clean out all parquet files generated for commits: " + commits);
|
||||||
List<HoodieRollbackStat> stats = jsc.parallelize(
|
List<HoodieRollbackStat> stats = jsc.parallelize(
|
||||||
FSUtils.getAllPartitionPaths(FSUtils.getFs(), this.getMetaClient().getBasePath(), config.shouldAssumeDatePartitioning()))
|
FSUtils.getAllPartitionPaths(FSUtils.getFs(), this.getMetaClient().getBasePath(),
|
||||||
|
config.shouldAssumeDatePartitioning()))
|
||||||
.map((Function<String, HoodieRollbackStat>) partitionPath -> {
|
.map((Function<String, HoodieRollbackStat>) partitionPath -> {
|
||||||
// Scan all partitions files with this commit time
|
// Scan all partitions files with this commit time
|
||||||
Map<FileStatus, Boolean> results = deleteCleanedFiles(partitionPath, commits);
|
Map<FileStatus, Boolean> results = deleteCleanedFiles(partitionPath, commits);
|
||||||
@@ -579,6 +576,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
}
|
}
|
||||||
|
|
||||||
private static class PartitionCleanStat implements Serializable {
|
private static class PartitionCleanStat implements Serializable {
|
||||||
|
|
||||||
private final String partitionPath;
|
private final String partitionPath;
|
||||||
private final List<String> deletePathPatterns = new ArrayList<>();
|
private final List<String> deletePathPatterns = new ArrayList<>();
|
||||||
private final List<String> successDeleteFiles = new ArrayList<>();
|
private final List<String> successDeleteFiles = new ArrayList<>();
|
||||||
@@ -613,7 +611,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<HoodieCleanStat> cleanPartitionPaths(List<String> partitionsToClean, JavaSparkContext jsc) {
|
private List<HoodieCleanStat> cleanPartitionPaths(List<String> partitionsToClean,
|
||||||
|
JavaSparkContext jsc) {
|
||||||
int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism());
|
int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism());
|
||||||
logger.info("Using cleanerParallelism: " + cleanerParallelism);
|
logger.info("Using cleanerParallelism: " + cleanerParallelism);
|
||||||
List<Tuple2<String, PartitionCleanStat>> partitionCleanStats = jsc
|
List<Tuple2<String, PartitionCleanStat>> partitionCleanStats = jsc
|
||||||
@@ -621,7 +620,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
.flatMapToPair(getFilesToDeleteFunc(this, config))
|
.flatMapToPair(getFilesToDeleteFunc(this, config))
|
||||||
.repartition(cleanerParallelism) // repartition to remove skews
|
.repartition(cleanerParallelism) // repartition to remove skews
|
||||||
.mapPartitionsToPair(deleteFilesFunc(this, config))
|
.mapPartitionsToPair(deleteFilesFunc(this, config))
|
||||||
.reduceByKey( // merge partition level clean stats below
|
.reduceByKey(
|
||||||
|
// merge partition level clean stats below
|
||||||
(Function2<PartitionCleanStat, PartitionCleanStat, PartitionCleanStat>) (e1, e2) -> e1
|
(Function2<PartitionCleanStat, PartitionCleanStat, PartitionCleanStat>) (e1, e2) -> e1
|
||||||
.merge(e2))
|
.merge(e2))
|
||||||
.collect();
|
.collect();
|
||||||
|
|||||||
@@ -39,13 +39,6 @@ import com.uber.hoodie.exception.HoodieCompactionException;
|
|||||||
import com.uber.hoodie.exception.HoodieRollbackException;
|
import com.uber.hoodie.exception.HoodieRollbackException;
|
||||||
import com.uber.hoodie.io.HoodieAppendHandle;
|
import com.uber.hoodie.io.HoodieAppendHandle;
|
||||||
import com.uber.hoodie.io.compact.HoodieRealtimeTableCompactor;
|
import com.uber.hoodie.io.compact.HoodieRealtimeTableCompactor;
|
||||||
import org.apache.hadoop.fs.FileStatus;
|
|
||||||
import org.apache.hadoop.fs.Path;
|
|
||||||
import org.apache.log4j.LogManager;
|
|
||||||
import org.apache.log4j.Logger;
|
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
|
||||||
import org.apache.spark.api.java.function.Function;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.UncheckedIOException;
|
import java.io.UncheckedIOException;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
@@ -56,6 +49,12 @@ import java.util.List;
|
|||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
import org.apache.hadoop.fs.FileStatus;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.apache.log4j.LogManager;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
import org.apache.spark.api.java.function.Function;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -64,13 +63,15 @@ import java.util.stream.Collectors;
|
|||||||
* INSERTS - Same as HoodieCopyOnWriteTable - Produce new files, block aligned to desired size (or)
|
* INSERTS - Same as HoodieCopyOnWriteTable - Produce new files, block aligned to desired size (or)
|
||||||
* Merge with the smallest existing file, to expand it
|
* Merge with the smallest existing file, to expand it
|
||||||
*
|
*
|
||||||
* UPDATES - Appends the changes to a rolling log file maintained per file Id.
|
* UPDATES - Appends the changes to a rolling log file maintained per file Id. Compaction merges the
|
||||||
* Compaction merges the log file into the base file.
|
* log file into the base file.
|
||||||
*
|
*
|
||||||
* WARNING - MOR table type does not support nested rollbacks, every rollback
|
* WARNING - MOR table type does not support nested rollbacks, every rollback must be followed by an
|
||||||
* must be followed by an attempted commit action
|
* attempted commit action
|
||||||
*/
|
*/
|
||||||
public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends HoodieCopyOnWriteTable<T> {
|
public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
|
||||||
|
HoodieCopyOnWriteTable<T> {
|
||||||
|
|
||||||
private static Logger logger = LogManager.getLogger(HoodieMergeOnReadTable.class);
|
private static Logger logger = LogManager.getLogger(HoodieMergeOnReadTable.class);
|
||||||
|
|
||||||
public HoodieMergeOnReadTable(HoodieWriteConfig config,
|
public HoodieMergeOnReadTable(HoodieWriteConfig config,
|
||||||
@@ -119,15 +120,17 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<HoodieRollbackStat> rollback(JavaSparkContext jsc, List<String> commits) throws IOException {
|
public List<HoodieRollbackStat> rollback(JavaSparkContext jsc, List<String> commits)
|
||||||
|
throws IOException {
|
||||||
|
|
||||||
//At the moment, MOR table type does not support nested rollbacks
|
//At the moment, MOR table type does not support nested rollbacks
|
||||||
if(commits.size() > 1) {
|
if (commits.size() > 1) {
|
||||||
throw new UnsupportedOperationException("Nested Rollbacks are not supported");
|
throw new UnsupportedOperationException("Nested Rollbacks are not supported");
|
||||||
}
|
}
|
||||||
Map<String, HoodieInstant> commitsAndCompactions =
|
Map<String, HoodieInstant> commitsAndCompactions =
|
||||||
this.getActiveTimeline()
|
this.getActiveTimeline()
|
||||||
.getTimelineOfActions(Sets.newHashSet(HoodieActiveTimeline.COMMIT_ACTION, HoodieActiveTimeline.COMPACTION_ACTION, HoodieActiveTimeline.DELTA_COMMIT_ACTION))
|
.getTimelineOfActions(Sets.newHashSet(HoodieActiveTimeline.COMMIT_ACTION,
|
||||||
|
HoodieActiveTimeline.COMPACTION_ACTION, HoodieActiveTimeline.DELTA_COMMIT_ACTION))
|
||||||
.getInstants()
|
.getInstants()
|
||||||
.filter(i -> commits.contains(i.getTimestamp()))
|
.filter(i -> commits.contains(i.getTimestamp()))
|
||||||
.collect(Collectors.toMap(i -> i.getTimestamp(), i -> i));
|
.collect(Collectors.toMap(i -> i.getTimestamp(), i -> i));
|
||||||
@@ -149,11 +152,14 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
try {
|
try {
|
||||||
logger.info("Starting to rollback Commit/Compaction " + instant);
|
logger.info("Starting to rollback Commit/Compaction " + instant);
|
||||||
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
|
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
|
||||||
.fromBytes(this.getCommitTimeline().getInstantDetails(new HoodieInstant(true, instant.getAction(), instant.getTimestamp())).get());
|
.fromBytes(this.getCommitTimeline().getInstantDetails(
|
||||||
|
new HoodieInstant(true, instant.getAction(), instant.getTimestamp())).get());
|
||||||
|
|
||||||
stats = jsc.parallelize(commitMetadata.getPartitionToWriteStats().keySet().stream().collect(Collectors.toList()))
|
stats = jsc.parallelize(commitMetadata.getPartitionToWriteStats().keySet().stream()
|
||||||
|
.collect(Collectors.toList()))
|
||||||
.map((Function<String, HoodieRollbackStat>) partitionPath -> {
|
.map((Function<String, HoodieRollbackStat>) partitionPath -> {
|
||||||
Map<FileStatus, Boolean> results = super.deleteCleanedFiles(partitionPath, Arrays.asList(commit));
|
Map<FileStatus, Boolean> results = super
|
||||||
|
.deleteCleanedFiles(partitionPath, Arrays.asList(commit));
|
||||||
return HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
|
return HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
|
||||||
.withDeletedFileResults(results).build();
|
.withDeletedFileResults(results).build();
|
||||||
}).collect();
|
}).collect();
|
||||||
@@ -167,40 +173,55 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
logger.info("Starting to rollback delta commit " + instant);
|
logger.info("Starting to rollback delta commit " + instant);
|
||||||
|
|
||||||
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
|
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
|
||||||
.fromBytes(this.getCommitTimeline().getInstantDetails(new HoodieInstant(true, instant.getAction(), instant.getTimestamp())).get());
|
.fromBytes(this.getCommitTimeline().getInstantDetails(
|
||||||
|
new HoodieInstant(true, instant.getAction(), instant.getTimestamp())).get());
|
||||||
|
|
||||||
stats = jsc.parallelize(commitMetadata.getPartitionToWriteStats().keySet().stream().collect(Collectors.toList()))
|
stats = jsc.parallelize(commitMetadata.getPartitionToWriteStats().keySet().stream()
|
||||||
|
.collect(Collectors.toList()))
|
||||||
.map((Function<String, HoodieRollbackStat>) partitionPath -> {
|
.map((Function<String, HoodieRollbackStat>) partitionPath -> {
|
||||||
// read commit file and (either append delete blocks or delete file)
|
// read commit file and (either append delete blocks or delete file)
|
||||||
Map<FileStatus, Boolean> filesToDeletedStatus = new HashMap<>();
|
Map<FileStatus, Boolean> filesToDeletedStatus = new HashMap<>();
|
||||||
Map<FileStatus, Long> filesToNumBlocksRollback = new HashMap<>();
|
Map<FileStatus, Long> filesToNumBlocksRollback = new HashMap<>();
|
||||||
|
|
||||||
// we do not know fileIds for inserts (first inserts are parquet files), delete all parquet files for the corresponding failed commit, if present (same as COW)
|
// we do not know fileIds for inserts (first inserts are parquet files), delete all parquet files for the corresponding failed commit, if present (same as COW)
|
||||||
filesToDeletedStatus = super.deleteCleanedFiles(partitionPath, Arrays.asList(commit));
|
filesToDeletedStatus = super
|
||||||
|
.deleteCleanedFiles(partitionPath, Arrays.asList(commit));
|
||||||
|
|
||||||
// append rollback blocks for updates
|
// append rollback blocks for updates
|
||||||
commitMetadata.getPartitionToWriteStats().get(partitionPath).stream().filter(wStat -> wStat.getPrevCommit() != HoodieWriteStat.NULL_COMMIT).forEach(wStat -> {
|
commitMetadata.getPartitionToWriteStats().get(partitionPath).stream()
|
||||||
|
.filter(wStat -> wStat.getPrevCommit() != HoodieWriteStat.NULL_COMMIT)
|
||||||
|
.forEach(wStat -> {
|
||||||
HoodieLogFormat.Writer writer = null;
|
HoodieLogFormat.Writer writer = null;
|
||||||
try {
|
try {
|
||||||
writer = HoodieLogFormat.newWriterBuilder()
|
writer = HoodieLogFormat.newWriterBuilder()
|
||||||
.onParentPath(new Path(this.getMetaClient().getBasePath(), partitionPath))
|
.onParentPath(
|
||||||
|
new Path(this.getMetaClient().getBasePath(), partitionPath))
|
||||||
.withFileId(wStat.getFileId()).overBaseCommit(wStat.getPrevCommit())
|
.withFileId(wStat.getFileId()).overBaseCommit(wStat.getPrevCommit())
|
||||||
.withFs(FSUtils.getFs()).withFileExtension(HoodieLogFile.DELTA_EXTENSION).build();
|
.withFs(FSUtils.getFs())
|
||||||
|
.withFileExtension(HoodieLogFile.DELTA_EXTENSION).build();
|
||||||
Long numRollbackBlocks = 0L;
|
Long numRollbackBlocks = 0L;
|
||||||
// generate metadata
|
// generate metadata
|
||||||
Map<HoodieLogBlock.LogMetadataType, String> metadata = Maps.newHashMap();
|
Map<HoodieLogBlock.LogMetadataType, String> metadata = Maps.newHashMap();
|
||||||
metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME, metaClient.getActiveTimeline().lastInstant().get().getTimestamp());
|
metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME,
|
||||||
|
metaClient.getActiveTimeline().lastInstant().get().getTimestamp());
|
||||||
metadata.put(HoodieLogBlock.LogMetadataType.TARGET_INSTANT_TIME, commit);
|
metadata.put(HoodieLogBlock.LogMetadataType.TARGET_INSTANT_TIME, commit);
|
||||||
// if update belongs to an existing log file
|
// if update belongs to an existing log file
|
||||||
writer.appendBlock(new HoodieCommandBlock(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK, metadata));
|
writer.appendBlock(new HoodieCommandBlock(
|
||||||
|
HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK,
|
||||||
|
metadata));
|
||||||
numRollbackBlocks++;
|
numRollbackBlocks++;
|
||||||
if(wStat.getNumDeletes() > 0) {
|
if (wStat.getNumDeletes() > 0) {
|
||||||
writer.appendBlock(new HoodieCommandBlock(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK, metadata));
|
writer.appendBlock(new HoodieCommandBlock(
|
||||||
|
HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK,
|
||||||
|
metadata));
|
||||||
numRollbackBlocks++;
|
numRollbackBlocks++;
|
||||||
}
|
}
|
||||||
filesToNumBlocksRollback.put(FSUtils.getFs().getFileStatus(writer.getLogFile().getPath()), numRollbackBlocks);
|
filesToNumBlocksRollback
|
||||||
|
.put(FSUtils.getFs().getFileStatus(writer.getLogFile().getPath()),
|
||||||
|
numRollbackBlocks);
|
||||||
} catch (IOException | InterruptedException io) {
|
} catch (IOException | InterruptedException io) {
|
||||||
throw new HoodieRollbackException("Failed to rollback for commit " + commit, io);
|
throw new HoodieRollbackException(
|
||||||
|
"Failed to rollback for commit " + commit, io);
|
||||||
} finally {
|
} finally {
|
||||||
try {
|
try {
|
||||||
writer.close();
|
writer.close();
|
||||||
@@ -223,10 +244,12 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
}).flatMap(x -> x.stream()).collect(Collectors.toList());
|
}).flatMap(x -> x.stream()).collect(Collectors.toList());
|
||||||
|
|
||||||
commitsAndCompactions.entrySet().stream()
|
commitsAndCompactions.entrySet().stream()
|
||||||
.map(entry -> new HoodieInstant(true, entry.getValue().getAction(), entry.getValue().getTimestamp()))
|
.map(entry -> new HoodieInstant(true, entry.getValue().getAction(),
|
||||||
|
entry.getValue().getTimestamp()))
|
||||||
.forEach(this.getActiveTimeline()::deleteInflight);
|
.forEach(this.getActiveTimeline()::deleteInflight);
|
||||||
|
|
||||||
logger.debug("Time(in ms) taken to finish rollback " + (System.currentTimeMillis() - startTime));
|
logger
|
||||||
|
.debug("Time(in ms) taken to finish rollback " + (System.currentTimeMillis() - startTime));
|
||||||
|
|
||||||
return allRollbackStats;
|
return allRollbackStats;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -34,7 +34,6 @@ import com.uber.hoodie.common.util.AvroUtils;
|
|||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.exception.HoodieCommitException;
|
import com.uber.hoodie.exception.HoodieCommitException;
|
||||||
import com.uber.hoodie.exception.HoodieException;
|
import com.uber.hoodie.exception.HoodieException;
|
||||||
import com.uber.hoodie.exception.HoodieRollbackException;
|
|
||||||
import com.uber.hoodie.exception.HoodieSavepointException;
|
import com.uber.hoodie.exception.HoodieSavepointException;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
@@ -43,8 +42,6 @@ import java.util.List;
|
|||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import org.apache.hadoop.fs.FileStatus;
|
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
import org.apache.log4j.LogManager;
|
import org.apache.log4j.LogManager;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
@@ -55,6 +52,7 @@ import org.apache.spark.api.java.JavaSparkContext;
|
|||||||
* Abstract implementation of a HoodieTable
|
* Abstract implementation of a HoodieTable
|
||||||
*/
|
*/
|
||||||
public abstract class HoodieTable<T extends HoodieRecordPayload> implements Serializable {
|
public abstract class HoodieTable<T extends HoodieRecordPayload> implements Serializable {
|
||||||
|
|
||||||
protected final HoodieWriteConfig config;
|
protected final HoodieWriteConfig config;
|
||||||
protected final HoodieTableMetaClient metaClient;
|
protected final HoodieTableMetaClient metaClient;
|
||||||
private static Logger logger = LogManager.getLogger(HoodieTable.class);
|
private static Logger logger = LogManager.getLogger(HoodieTable.class);
|
||||||
@@ -65,27 +63,19 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Provides a partitioner to perform the upsert operation, based on the
|
* Provides a partitioner to perform the upsert operation, based on the workload profile
|
||||||
* workload profile
|
|
||||||
*
|
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
public abstract Partitioner getUpsertPartitioner(WorkloadProfile profile);
|
public abstract Partitioner getUpsertPartitioner(WorkloadProfile profile);
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Provides a partitioner to perform the insert operation, based on the workload profile
|
* Provides a partitioner to perform the insert operation, based on the workload profile
|
||||||
*
|
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
public abstract Partitioner getInsertPartitioner(WorkloadProfile profile);
|
public abstract Partitioner getInsertPartitioner(WorkloadProfile profile);
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return whether this HoodieTable implementation can benefit from workload
|
* Return whether this HoodieTable implementation can benefit from workload profiling
|
||||||
* profiling
|
|
||||||
*
|
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
public abstract boolean isWorkloadProfileNeeded();
|
public abstract boolean isWorkloadProfileNeeded();
|
||||||
|
|
||||||
@@ -103,8 +93,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the view of the file system for this table
|
* Get the view of the file system for this table
|
||||||
*
|
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
public TableFileSystemView getFileSystemView() {
|
public TableFileSystemView getFileSystemView() {
|
||||||
return new HoodieTableFileSystemView(metaClient, getCompletedCommitTimeline());
|
return new HoodieTableFileSystemView(metaClient, getCompletedCommitTimeline());
|
||||||
@@ -112,8 +100,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the read optimized view of the file system for this table
|
* Get the read optimized view of the file system for this table
|
||||||
*
|
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
public TableFileSystemView.ReadOptimizedView getROFileSystemView() {
|
public TableFileSystemView.ReadOptimizedView getROFileSystemView() {
|
||||||
return new HoodieTableFileSystemView(metaClient, getCompletedCommitTimeline());
|
return new HoodieTableFileSystemView(metaClient, getCompletedCommitTimeline());
|
||||||
@@ -121,8 +107,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the real time view of the file system for this table
|
* Get the real time view of the file system for this table
|
||||||
*
|
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
public TableFileSystemView.RealtimeView getRTFileSystemView() {
|
public TableFileSystemView.RealtimeView getRTFileSystemView() {
|
||||||
return new HoodieTableFileSystemView(metaClient, getCompletedCommitTimeline());
|
return new HoodieTableFileSystemView(metaClient, getCompletedCommitTimeline());
|
||||||
@@ -130,8 +114,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the completed (commit + compaction) view of the file system for this table
|
* Get the completed (commit + compaction) view of the file system for this table
|
||||||
*
|
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
public TableFileSystemView getCompletedFileSystemView() {
|
public TableFileSystemView getCompletedFileSystemView() {
|
||||||
return new HoodieTableFileSystemView(metaClient, getCommitTimeline());
|
return new HoodieTableFileSystemView(metaClient, getCommitTimeline());
|
||||||
@@ -139,7 +121,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Get only the completed (no-inflights) commit timeline
|
* Get only the completed (no-inflights) commit timeline
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
public HoodieTimeline getCompletedCommitTimeline() {
|
public HoodieTimeline getCompletedCommitTimeline() {
|
||||||
return getCommitTimeline().filterCompletedInstants();
|
return getCommitTimeline().filterCompletedInstants();
|
||||||
@@ -147,7 +128,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Get only the inflights (no-completed) commit timeline
|
* Get only the inflights (no-completed) commit timeline
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
public HoodieTimeline getInflightCommitTimeline() {
|
public HoodieTimeline getInflightCommitTimeline() {
|
||||||
return getCommitTimeline().filterInflights();
|
return getCommitTimeline().filterInflights();
|
||||||
@@ -156,7 +136,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Get only the completed (no-inflights) clean timeline
|
* Get only the completed (no-inflights) clean timeline
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
public HoodieTimeline getCompletedCleanTimeline() {
|
public HoodieTimeline getCompletedCleanTimeline() {
|
||||||
return getActiveTimeline().getCleanerTimeline().filterCompletedInstants();
|
return getActiveTimeline().getCleanerTimeline().filterCompletedInstants();
|
||||||
@@ -164,7 +143,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Get only the completed (no-inflights) savepoint timeline
|
* Get only the completed (no-inflights) savepoint timeline
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
public HoodieTimeline getCompletedSavepointTimeline() {
|
public HoodieTimeline getCompletedSavepointTimeline() {
|
||||||
return getActiveTimeline().getSavePointTimeline().filterCompletedInstants();
|
return getActiveTimeline().getSavePointTimeline().filterCompletedInstants();
|
||||||
@@ -172,7 +150,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the list of savepoints in this table
|
* Get the list of savepoints in this table
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
public List<String> getSavepoints() {
|
public List<String> getSavepoints() {
|
||||||
return getCompletedSavepointTimeline().getInstants().map(HoodieInstant::getTimestamp)
|
return getCompletedSavepointTimeline().getInstants().map(HoodieInstant::getTimestamp)
|
||||||
@@ -181,10 +158,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the list of data file names savepointed
|
* Get the list of data file names savepointed
|
||||||
*
|
|
||||||
* @param savepointTime
|
|
||||||
* @return
|
|
||||||
* @throws IOException
|
|
||||||
*/
|
*/
|
||||||
public Stream<String> getSavepointedDataFiles(String savepointTime) {
|
public Stream<String> getSavepointedDataFiles(String savepointTime) {
|
||||||
if (!getSavepoints().contains(savepointTime)) {
|
if (!getSavepoints().contains(savepointTime)) {
|
||||||
@@ -211,8 +184,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the commit timeline visible for this table
|
* Get the commit timeline visible for this table
|
||||||
*
|
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
public HoodieTimeline getCommitTimeline() {
|
public HoodieTimeline getCommitTimeline() {
|
||||||
switch (metaClient.getTableType()) {
|
switch (metaClient.getTableType()) {
|
||||||
@@ -223,13 +194,12 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
// Include commit action to be able to start doing a MOR over a COW dataset - no migration required
|
// Include commit action to be able to start doing a MOR over a COW dataset - no migration required
|
||||||
return getActiveTimeline().getCommitsAndCompactionsTimeline();
|
return getActiveTimeline().getCommitsAndCompactionsTimeline();
|
||||||
default:
|
default:
|
||||||
throw new HoodieException("Unsupported table type :"+ metaClient.getTableType());
|
throw new HoodieException("Unsupported table type :" + metaClient.getTableType());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get only the completed (no-inflights) compaction commit timeline
|
* Get only the completed (no-inflights) compaction commit timeline
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
public HoodieTimeline getCompletedCompactionCommitTimeline() {
|
public HoodieTimeline getCompletedCompactionCommitTimeline() {
|
||||||
return getCompactionCommitTimeline().filterCompletedInstants();
|
return getCompactionCommitTimeline().filterCompletedInstants();
|
||||||
@@ -238,8 +208,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the compacted commit timeline visible for this table
|
* Get the compacted commit timeline visible for this table
|
||||||
*
|
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
public HoodieTimeline getCompactionCommitTimeline() {
|
public HoodieTimeline getCompactionCommitTimeline() {
|
||||||
switch (metaClient.getTableType()) {
|
switch (metaClient.getTableType()) {
|
||||||
@@ -250,13 +218,12 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
return getActiveTimeline().getTimelineOfActions(
|
return getActiveTimeline().getTimelineOfActions(
|
||||||
Sets.newHashSet(HoodieActiveTimeline.COMPACTION_ACTION));
|
Sets.newHashSet(HoodieActiveTimeline.COMPACTION_ACTION));
|
||||||
default:
|
default:
|
||||||
throw new HoodieException("Unsupported table type :"+ metaClient.getTableType());
|
throw new HoodieException("Unsupported table type :" + metaClient.getTableType());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Gets the commit action type
|
* Gets the commit action type
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
public String getCommitActionType() {
|
public String getCommitActionType() {
|
||||||
switch (metaClient.getTableType()) {
|
switch (metaClient.getTableType()) {
|
||||||
@@ -271,7 +238,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Gets the action type for a compaction commit
|
* Gets the action type for a compaction commit
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
public String getCompactedCommitActionType() {
|
public String getCompactedCommitActionType() {
|
||||||
switch (metaClient.getTableType()) {
|
switch (metaClient.getTableType()) {
|
||||||
@@ -280,27 +246,18 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
case MERGE_ON_READ:
|
case MERGE_ON_READ:
|
||||||
return HoodieTimeline.COMPACTION_ACTION;
|
return HoodieTimeline.COMPACTION_ACTION;
|
||||||
}
|
}
|
||||||
throw new HoodieException("Unsupported table type :"+ metaClient.getTableType());
|
throw new HoodieException("Unsupported table type :" + metaClient.getTableType());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Perform the ultimate IO for a given upserted (RDD) partition
|
* Perform the ultimate IO for a given upserted (RDD) partition
|
||||||
*
|
|
||||||
* @param partition
|
|
||||||
* @param recordIterator
|
|
||||||
* @param partitioner
|
|
||||||
*/
|
*/
|
||||||
public abstract Iterator<List<WriteStatus>> handleUpsertPartition(String commitTime,
|
public abstract Iterator<List<WriteStatus>> handleUpsertPartition(String commitTime,
|
||||||
Integer partition, Iterator<HoodieRecord<T>> recordIterator, Partitioner partitioner);
|
Integer partition, Iterator<HoodieRecord<T>> recordIterator, Partitioner partitioner);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Perform the ultimate IO for a given inserted (RDD) partition
|
* Perform the ultimate IO for a given inserted (RDD) partition
|
||||||
*
|
|
||||||
* @param partition
|
|
||||||
* @param recordIterator
|
|
||||||
* @param partitioner
|
|
||||||
*/
|
*/
|
||||||
public abstract Iterator<List<WriteStatus>> handleInsertPartition(String commitTime,
|
public abstract Iterator<List<WriteStatus>> handleInsertPartition(String commitTime,
|
||||||
Integer partition, Iterator<HoodieRecord<T>> recordIterator, Partitioner partitioner);
|
Integer partition, Iterator<HoodieRecord<T>> recordIterator, Partitioner partitioner);
|
||||||
@@ -319,27 +276,21 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Run Compaction on the table.
|
* Run Compaction on the table. Compaction arranges the data so that it is optimized for data
|
||||||
* Compaction arranges the data so that it is optimized for data access
|
* access
|
||||||
*/
|
*/
|
||||||
public abstract Optional<HoodieCompactionMetadata> compact(JavaSparkContext jsc);
|
public abstract Optional<HoodieCompactionMetadata> compact(JavaSparkContext jsc);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Clean partition paths according to cleaning policy and returns the number
|
* Clean partition paths according to cleaning policy and returns the number of files cleaned.
|
||||||
* of files cleaned.
|
|
||||||
*/
|
*/
|
||||||
public abstract List<HoodieCleanStat> clean(JavaSparkContext jsc);
|
public abstract List<HoodieCleanStat> clean(JavaSparkContext jsc);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Rollback the (inflight/committed) record changes with the given commit time.
|
* Rollback the (inflight/committed) record changes with the given commit time. Four steps: (1)
|
||||||
* Four steps:
|
* Atomically unpublish this commit (2) clean indexing data (3) clean new generated parquet files
|
||||||
* (1) Atomically unpublish this commit
|
* / log blocks (4) Finally, delete .<action>.commit or .<action>.inflight file
|
||||||
* (2) clean indexing data
|
|
||||||
* (3) clean new generated parquet files / log blocks
|
|
||||||
* (4) Finally, delete .<action>.commit or .<action>.inflight file
|
|
||||||
* @param commits
|
|
||||||
* @return
|
|
||||||
* @throws HoodieRollbackException
|
|
||||||
*/
|
*/
|
||||||
public abstract List<HoodieRollbackStat> rollback(JavaSparkContext jsc, List<String> commits) throws IOException;
|
public abstract List<HoodieRollbackStat> rollback(JavaSparkContext jsc, List<String> commits)
|
||||||
|
throws IOException;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -20,13 +20,13 @@ import com.uber.hoodie.common.model.HoodieRecordPayload;
|
|||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Repartition input records into at least expected number of output spark partitions. It should give
|
* Repartition input records into at least expected number of output spark partitions. It should
|
||||||
* below guarantees
|
* give below guarantees - Output spark partition will have records from only one hoodie partition.
|
||||||
* - Output spark partition will have records from only one hoodie partition.
|
* - Average records per output spark partitions should be almost equal to (#inputRecords /
|
||||||
* - Average records per output spark partitions should be almost equal to (#inputRecords / #outputSparkPartitions)
|
* #outputSparkPartitions) to avoid possible skews.
|
||||||
* to avoid possible skews.
|
|
||||||
*/
|
*/
|
||||||
public interface UserDefinedBulkInsertPartitioner<T extends HoodieRecordPayload> {
|
public interface UserDefinedBulkInsertPartitioner<T extends HoodieRecordPayload> {
|
||||||
|
|
||||||
JavaRDD<HoodieRecord<T>> repartitionRecords(JavaRDD<HoodieRecord<T>> records, int outputSparkPartitions);
|
JavaRDD<HoodieRecord<T>> repartitionRecords(JavaRDD<HoodieRecord<T>> records,
|
||||||
|
int outputSparkPartitions);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -20,15 +20,11 @@ package com.uber.hoodie.table;
|
|||||||
import com.uber.hoodie.common.model.HoodieRecord;
|
import com.uber.hoodie.common.model.HoodieRecord;
|
||||||
import com.uber.hoodie.common.model.HoodieRecordLocation;
|
import com.uber.hoodie.common.model.HoodieRecordLocation;
|
||||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||||
|
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
|
||||||
import org.apache.spark.api.java.function.PairFunction;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import scala.Option;
|
import scala.Option;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
@@ -65,15 +61,18 @@ public class WorkloadProfile<T extends HoodieRecordPayload> implements Serializa
|
|||||||
|
|
||||||
Map<Tuple2<String, Option<HoodieRecordLocation>>, Long> partitionLocationCounts = taggedRecords
|
Map<Tuple2<String, Option<HoodieRecordLocation>>, Long> partitionLocationCounts = taggedRecords
|
||||||
.mapToPair(record ->
|
.mapToPair(record ->
|
||||||
new Tuple2<>(new Tuple2<>(record.getPartitionPath(), Option.apply(record.getCurrentLocation())), record))
|
new Tuple2<>(
|
||||||
|
new Tuple2<>(record.getPartitionPath(), Option.apply(record.getCurrentLocation())),
|
||||||
|
record))
|
||||||
.countByKey();
|
.countByKey();
|
||||||
|
|
||||||
for (Map.Entry<Tuple2<String, Option<HoodieRecordLocation>>, Long> e: partitionLocationCounts.entrySet()) {
|
for (Map.Entry<Tuple2<String, Option<HoodieRecordLocation>>, Long> e : partitionLocationCounts
|
||||||
|
.entrySet()) {
|
||||||
String partitionPath = e.getKey()._1();
|
String partitionPath = e.getKey()._1();
|
||||||
Long count = e.getValue();
|
Long count = e.getValue();
|
||||||
Option<HoodieRecordLocation> locOption = e.getKey()._2();
|
Option<HoodieRecordLocation> locOption = e.getKey()._2();
|
||||||
|
|
||||||
if (!partitionPathStatMap.containsKey(partitionPath)){
|
if (!partitionPathStatMap.containsKey(partitionPath)) {
|
||||||
partitionPathStatMap.put(partitionPath, new WorkloadStat());
|
partitionPathStatMap.put(partitionPath, new WorkloadStat());
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -97,7 +96,7 @@ public class WorkloadProfile<T extends HoodieRecordPayload> implements Serializa
|
|||||||
return partitionPathStatMap.keySet();
|
return partitionPathStatMap.keySet();
|
||||||
}
|
}
|
||||||
|
|
||||||
public WorkloadStat getWorkloadStat(String partitionPath){
|
public WorkloadStat getWorkloadStat(String partitionPath) {
|
||||||
return partitionPathStatMap.get(partitionPath);
|
return partitionPathStatMap.get(partitionPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -17,7 +17,6 @@
|
|||||||
package com.uber.hoodie.table;
|
package com.uber.hoodie.table;
|
||||||
|
|
||||||
import com.uber.hoodie.common.model.HoodieRecordLocation;
|
import com.uber.hoodie.common.model.HoodieRecordLocation;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
|
||||||
@@ -25,6 +24,7 @@ import java.util.HashMap;
|
|||||||
* Wraps stats about a single partition path.
|
* Wraps stats about a single partition path.
|
||||||
*/
|
*/
|
||||||
public class WorkloadStat implements Serializable {
|
public class WorkloadStat implements Serializable {
|
||||||
|
|
||||||
private long numInserts = 0L;
|
private long numInserts = 0L;
|
||||||
|
|
||||||
private long numUpdates = 0L;
|
private long numUpdates = 0L;
|
||||||
|
|||||||
@@ -13,7 +13,6 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
#
|
#
|
||||||
|
|
||||||
# Set root logger level to DEBUG and its only appender to A1.
|
# Set root logger level to DEBUG and its only appender to A1.
|
||||||
log4j.rootLogger=INFO, A1
|
log4j.rootLogger=INFO, A1
|
||||||
# A1 is set to be a ConsoleAppender.
|
# A1 is set to be a ConsoleAppender.
|
||||||
|
|||||||
@@ -22,13 +22,12 @@ import com.uber.hoodie.common.HoodieTestDataGenerator;
|
|||||||
import com.uber.hoodie.common.model.HoodieAvroPayload;
|
import com.uber.hoodie.common.model.HoodieAvroPayload;
|
||||||
import com.uber.hoodie.common.model.HoodieRecord;
|
import com.uber.hoodie.common.model.HoodieRecord;
|
||||||
import com.uber.hoodie.common.model.HoodieTableType;
|
import com.uber.hoodie.common.model.HoodieTableType;
|
||||||
import com.uber.hoodie.common.table.HoodieTableConfig;
|
|
||||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||||
import com.uber.hoodie.common.util.FSUtils;
|
import com.uber.hoodie.common.util.FSUtils;
|
||||||
import com.uber.hoodie.config.HoodieIndexConfig;
|
import com.uber.hoodie.config.HoodieIndexConfig;
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.index.HoodieIndex;
|
import com.uber.hoodie.index.HoodieIndex;
|
||||||
|
import java.util.List;
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
import org.apache.log4j.LogManager;
|
import org.apache.log4j.LogManager;
|
||||||
@@ -36,7 +35,6 @@ import org.apache.log4j.Logger;
|
|||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Driver program that uses the Hoodie client with synthetic workload, and performs basic
|
* Driver program that uses the Hoodie client with synthetic workload, and performs basic
|
||||||
@@ -44,13 +42,13 @@ import java.util.List;
|
|||||||
*/
|
*/
|
||||||
public class HoodieClientExample {
|
public class HoodieClientExample {
|
||||||
|
|
||||||
@Parameter(names={"--table-path", "-p"}, description = "path for Hoodie sample table")
|
@Parameter(names = {"--table-path", "-p"}, description = "path for Hoodie sample table")
|
||||||
private String tablePath = "file:///tmp/hoodie/sample-table";
|
private String tablePath = "file:///tmp/hoodie/sample-table";
|
||||||
|
|
||||||
@Parameter(names={"--table-name", "-n"}, description = "table name for Hoodie sample table")
|
@Parameter(names = {"--table-name", "-n"}, description = "table name for Hoodie sample table")
|
||||||
private String tableName = "hoodie_rt";
|
private String tableName = "hoodie_rt";
|
||||||
|
|
||||||
@Parameter(names={"--table-type", "-t"}, description = "One of COPY_ON_WRITE or MERGE_ON_READ")
|
@Parameter(names = {"--table-type", "-t"}, description = "One of COPY_ON_WRITE or MERGE_ON_READ")
|
||||||
private String tableType = HoodieTableType.COPY_ON_WRITE.name();
|
private String tableType = HoodieTableType.COPY_ON_WRITE.name();
|
||||||
|
|
||||||
@Parameter(names = {"--help", "-h"}, help = true)
|
@Parameter(names = {"--help", "-h"}, help = true)
|
||||||
@@ -85,7 +83,9 @@ public class HoodieClientExample {
|
|||||||
Path path = new Path(tablePath);
|
Path path = new Path(tablePath);
|
||||||
FileSystem fs = FSUtils.getFs();
|
FileSystem fs = FSUtils.getFs();
|
||||||
if (!fs.exists(path)) {
|
if (!fs.exists(path)) {
|
||||||
HoodieTableMetaClient.initTableType(fs, tablePath, HoodieTableType.valueOf(tableType), tableName, HoodieAvroPayload.class.getName());
|
HoodieTableMetaClient
|
||||||
|
.initTableType(fs, tablePath, HoodieTableType.valueOf(tableType), tableName,
|
||||||
|
HoodieAvroPayload.class.getName());
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create the write client to write some records in
|
// Create the write client to write some records in
|
||||||
|
|||||||
@@ -16,8 +16,12 @@
|
|||||||
|
|
||||||
package com.uber.hoodie;
|
package com.uber.hoodie;
|
||||||
|
|
||||||
import com.google.common.collect.Iterables;
|
import static org.junit.Assert.assertEquals;
|
||||||
|
import static org.junit.Assert.assertFalse;
|
||||||
|
import static org.junit.Assert.assertTrue;
|
||||||
|
import static org.junit.Assert.fail;
|
||||||
|
|
||||||
|
import com.google.common.collect.Iterables;
|
||||||
import com.uber.hoodie.common.HoodieCleanStat;
|
import com.uber.hoodie.common.HoodieCleanStat;
|
||||||
import com.uber.hoodie.common.HoodieClientTestUtils;
|
import com.uber.hoodie.common.HoodieClientTestUtils;
|
||||||
import com.uber.hoodie.common.HoodieTestDataGenerator;
|
import com.uber.hoodie.common.HoodieTestDataGenerator;
|
||||||
@@ -45,22 +49,6 @@ import com.uber.hoodie.config.HoodieWriteConfig;
|
|||||||
import com.uber.hoodie.exception.HoodieRollbackException;
|
import com.uber.hoodie.exception.HoodieRollbackException;
|
||||||
import com.uber.hoodie.index.HoodieIndex;
|
import com.uber.hoodie.index.HoodieIndex;
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
|
|
||||||
import org.apache.avro.generic.GenericRecord;
|
|
||||||
import org.apache.commons.io.IOUtils;
|
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
|
||||||
import org.apache.hadoop.fs.Path;
|
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
|
||||||
import org.apache.spark.scheduler.SparkListener;
|
|
||||||
import org.apache.spark.scheduler.SparkListenerTaskEnd;
|
|
||||||
import org.apache.spark.sql.SQLContext;
|
|
||||||
import org.apache.spark.util.AccumulatorV2;
|
|
||||||
import org.junit.After;
|
|
||||||
import org.junit.Before;
|
|
||||||
import org.junit.Test;
|
|
||||||
import org.junit.rules.TemporaryFolder;
|
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
@@ -76,15 +64,24 @@ import java.util.Optional;
|
|||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.TreeSet;
|
import java.util.TreeSet;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
import org.apache.avro.generic.GenericRecord;
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
import org.apache.spark.scheduler.SparkListener;
|
||||||
|
import org.apache.spark.scheduler.SparkListenerTaskEnd;
|
||||||
|
import org.apache.spark.sql.SQLContext;
|
||||||
|
import org.apache.spark.util.AccumulatorV2;
|
||||||
|
import org.junit.After;
|
||||||
|
import org.junit.Before;
|
||||||
|
import org.junit.Test;
|
||||||
|
import org.junit.rules.TemporaryFolder;
|
||||||
import scala.collection.Iterator;
|
import scala.collection.Iterator;
|
||||||
|
|
||||||
import static org.junit.Assert.assertEquals;
|
|
||||||
import static org.junit.Assert.assertFalse;
|
|
||||||
import static org.junit.Assert.assertTrue;
|
|
||||||
import static org.junit.Assert.fail;
|
|
||||||
|
|
||||||
public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||||
|
|
||||||
private transient JavaSparkContext jsc = null;
|
private transient JavaSparkContext jsc = null;
|
||||||
private transient SQLContext sqlContext;
|
private transient SQLContext sqlContext;
|
||||||
private String basePath = null;
|
private String basePath = null;
|
||||||
@@ -115,7 +112,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
private HoodieWriteConfig.Builder getConfigBuilder() {
|
private HoodieWriteConfig.Builder getConfigBuilder() {
|
||||||
return HoodieWriteConfig.newBuilder().withPath(basePath)
|
return HoodieWriteConfig.newBuilder().withPath(basePath)
|
||||||
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
|
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
|
||||||
.withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).build())
|
.withCompactionConfig(
|
||||||
|
HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).build())
|
||||||
.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build())
|
.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build())
|
||||||
.forTable("test-trip-table").withIndexConfig(
|
.forTable("test-trip-table").withIndexConfig(
|
||||||
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build());
|
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build());
|
||||||
@@ -129,9 +127,11 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private void assertPartitionMetadata(String[] partitionPaths, FileSystem fs) throws IOException {
|
private void assertPartitionMetadata(String[] partitionPaths, FileSystem fs) throws IOException {
|
||||||
for (String partitionPath: partitionPaths) {
|
for (String partitionPath : partitionPaths) {
|
||||||
assertTrue(HoodiePartitionMetadata.hasPartitionMetadata(fs, new Path(basePath, partitionPath)));
|
assertTrue(
|
||||||
HoodiePartitionMetadata pmeta = new HoodiePartitionMetadata(fs, new Path(basePath, partitionPath));
|
HoodiePartitionMetadata.hasPartitionMetadata(fs, new Path(basePath, partitionPath)));
|
||||||
|
HoodiePartitionMetadata pmeta = new HoodiePartitionMetadata(fs,
|
||||||
|
new Path(basePath, partitionPath));
|
||||||
pmeta.readFromFS();
|
pmeta.readFromFS();
|
||||||
assertEquals(3, pmeta.getPartitionDepth());
|
assertEquals(3, pmeta.getPartitionDepth());
|
||||||
}
|
}
|
||||||
@@ -140,13 +140,13 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
private void checkTaggedRecords(List<HoodieRecord> taggedRecords, String commitTime) {
|
private void checkTaggedRecords(List<HoodieRecord> taggedRecords, String commitTime) {
|
||||||
for (HoodieRecord rec : taggedRecords) {
|
for (HoodieRecord rec : taggedRecords) {
|
||||||
assertTrue("Record " + rec + " found with no location.", rec.isCurrentLocationKnown());
|
assertTrue("Record " + rec + " found with no location.", rec.isCurrentLocationKnown());
|
||||||
assertEquals("All records should have commit time "+ commitTime+", since updates were made",
|
assertEquals(
|
||||||
|
"All records should have commit time " + commitTime + ", since updates were made",
|
||||||
rec.getCurrentLocation().getCommitTime(), commitTime);
|
rec.getCurrentLocation().getCommitTime(), commitTime);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testFilterExist() throws Exception {
|
public void testFilterExist() throws Exception {
|
||||||
HoodieWriteConfig config = getConfig();
|
HoodieWriteConfig config = getConfig();
|
||||||
@@ -231,17 +231,21 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
|
|
||||||
// verify that there is a commit
|
// verify that there is a commit
|
||||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
|
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
|
||||||
HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline();
|
HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath())
|
||||||
|
.getCommitTimeline();
|
||||||
|
|
||||||
assertEquals("Expecting a single commit.", 1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
|
assertEquals("Expecting a single commit.", 1,
|
||||||
assertEquals("Latest commit should be 001", newCommitTime, timeline.lastInstant().get().getTimestamp());
|
timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
|
||||||
|
assertEquals("Latest commit should be 001", newCommitTime,
|
||||||
|
timeline.lastInstant().get().getTimestamp());
|
||||||
assertEquals("Must contain 200 records",
|
assertEquals("Must contain 200 records",
|
||||||
records.size(),
|
records.size(),
|
||||||
HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count());
|
HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count());
|
||||||
// Should have 100 records in table (check using Index), all in locations marked at commit
|
// Should have 100 records in table (check using Index), all in locations marked at commit
|
||||||
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
|
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
|
||||||
|
|
||||||
List<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table).collect();
|
List<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table)
|
||||||
|
.collect();
|
||||||
checkTaggedRecords(taggedRecords, "001");
|
checkTaggedRecords(taggedRecords, "001");
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -265,8 +269,10 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
|
|
||||||
// verify there are now 2 commits
|
// verify there are now 2 commits
|
||||||
timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline();
|
timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline();
|
||||||
assertEquals("Expecting two commits.", timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), 2);
|
assertEquals("Expecting two commits.",
|
||||||
assertEquals("Latest commit should be 004", timeline.lastInstant().get().getTimestamp(), newCommitTime);
|
timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), 2);
|
||||||
|
assertEquals("Latest commit should be 004", timeline.lastInstant().get().getTimestamp(),
|
||||||
|
newCommitTime);
|
||||||
|
|
||||||
metaClient = new HoodieTableMetaClient(fs, basePath);
|
metaClient = new HoodieTableMetaClient(fs, basePath);
|
||||||
table = HoodieTable.getHoodieTable(metaClient, getConfig());
|
table = HoodieTable.getHoodieTable(metaClient, getConfig());
|
||||||
@@ -277,21 +283,20 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
|
|
||||||
// Check the entire dataset has 100 records still
|
// Check the entire dataset has 100 records still
|
||||||
String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length];
|
String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length];
|
||||||
for (int i=0; i < fullPartitionPaths.length; i++) {
|
for (int i = 0; i < fullPartitionPaths.length; i++) {
|
||||||
fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]);
|
fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]);
|
||||||
}
|
}
|
||||||
assertEquals("Must contain 200 records",
|
assertEquals("Must contain 200 records",
|
||||||
200,
|
200,
|
||||||
HoodieClientTestUtils.read(basePath, sqlContext, fs, fullPartitionPaths).count());
|
HoodieClientTestUtils.read(basePath, sqlContext, fs, fullPartitionPaths).count());
|
||||||
|
|
||||||
|
|
||||||
// Check that the incremental consumption from time 000
|
// Check that the incremental consumption from time 000
|
||||||
assertEquals("Incremental consumption from time 002, should give all records in commit 004",
|
assertEquals("Incremental consumption from time 002, should give all records in commit 004",
|
||||||
HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
|
HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
|
||||||
HoodieClientTestUtils.readSince(basePath, sqlContext, timeline, "002").count());
|
HoodieClientTestUtils.readSince(basePath, sqlContext, timeline, "002").count());
|
||||||
assertEquals("Incremental consumption from time 001, should give all records in commit 004",
|
assertEquals("Incremental consumption from time 001, should give all records in commit 004",
|
||||||
HoodieClientTestUtils.readCommit(basePath, sqlContext,timeline, newCommitTime).count(),
|
HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
|
||||||
HoodieClientTestUtils.readSince(basePath, sqlContext,timeline, "001").count());
|
HoodieClientTestUtils.readSince(basePath, sqlContext, timeline, "001").count());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@@ -322,15 +327,19 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
|
|
||||||
// verify that there is a commit
|
// verify that there is a commit
|
||||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
|
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
|
||||||
HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline();
|
HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath())
|
||||||
assertEquals("Expecting a single commit.", 1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
|
.getCommitTimeline();
|
||||||
assertEquals("Latest commit should be 001", newCommitTime, timeline.lastInstant().get().getTimestamp());
|
assertEquals("Expecting a single commit.", 1,
|
||||||
|
timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
|
||||||
|
assertEquals("Latest commit should be 001", newCommitTime,
|
||||||
|
timeline.lastInstant().get().getTimestamp());
|
||||||
assertEquals("Must contain 200 records", fewRecordsForInsert.size(),
|
assertEquals("Must contain 200 records", fewRecordsForInsert.size(),
|
||||||
HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count());
|
HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count());
|
||||||
// Should have 100 records in table (check using Index), all in locations marked at commit
|
// Should have 100 records in table (check using Index), all in locations marked at commit
|
||||||
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
|
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
|
||||||
|
|
||||||
List<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(fewRecordsForInsert, 1), table).collect();
|
List<HoodieRecord> taggedRecords = index
|
||||||
|
.tagLocation(jsc.parallelize(fewRecordsForInsert, 1), table).collect();
|
||||||
checkTaggedRecords(taggedRecords, "001");
|
checkTaggedRecords(taggedRecords, "001");
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -339,8 +348,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
newCommitTime = "004";
|
newCommitTime = "004";
|
||||||
client.startCommitWithTime(newCommitTime);
|
client.startCommitWithTime(newCommitTime);
|
||||||
|
|
||||||
fewRecordsForDelete = records.subList(0,50);
|
fewRecordsForDelete = records.subList(0, 50);
|
||||||
List<HoodieRecord> fewRecordsForUpdate = records.subList(50,100);
|
List<HoodieRecord> fewRecordsForUpdate = records.subList(50, 100);
|
||||||
records = dataGen.generateDeletesFromExistingRecords(fewRecordsForDelete);
|
records = dataGen.generateDeletesFromExistingRecords(fewRecordsForDelete);
|
||||||
|
|
||||||
records.addAll(fewRecordsForUpdate);
|
records.addAll(fewRecordsForUpdate);
|
||||||
@@ -351,18 +360,19 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
|
|
||||||
// verify there are now 2 commits
|
// verify there are now 2 commits
|
||||||
timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline();
|
timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline();
|
||||||
assertEquals("Expecting two commits.", timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), 2);
|
assertEquals("Expecting two commits.",
|
||||||
assertEquals("Latest commit should be 004", timeline.lastInstant().get().getTimestamp(), newCommitTime);
|
timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), 2);
|
||||||
|
assertEquals("Latest commit should be 004", timeline.lastInstant().get().getTimestamp(),
|
||||||
|
newCommitTime);
|
||||||
|
|
||||||
// Check the entire dataset has 150 records(200-50) still
|
// Check the entire dataset has 150 records(200-50) still
|
||||||
String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length];
|
String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length];
|
||||||
for (int i=0; i < fullPartitionPaths.length; i++) {
|
for (int i = 0; i < fullPartitionPaths.length; i++) {
|
||||||
fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]);
|
fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]);
|
||||||
}
|
}
|
||||||
assertEquals("Must contain 150 records", 150,
|
assertEquals("Must contain 150 records", 150,
|
||||||
HoodieClientTestUtils.read(basePath, sqlContext, fs, fullPartitionPaths).count());
|
HoodieClientTestUtils.read(basePath, sqlContext, fs, fullPartitionPaths).count());
|
||||||
|
|
||||||
|
|
||||||
// Check that the incremental consumption from time 000
|
// Check that the incremental consumption from time 000
|
||||||
assertEquals("Incremental consumption from latest commit, should give 50 updated records",
|
assertEquals("Incremental consumption from latest commit, should give 50 updated records",
|
||||||
50,
|
50,
|
||||||
@@ -384,7 +394,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
.build()).build();
|
.build()).build();
|
||||||
HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
|
HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
|
||||||
FileSystem fs = FSUtils.getFs();
|
FileSystem fs = FSUtils.getFs();
|
||||||
HoodieTestDataGenerator.writePartitionMetadata(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, basePath);
|
HoodieTestDataGenerator
|
||||||
|
.writePartitionMetadata(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, basePath);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Write 1 (only inserts)
|
* Write 1 (only inserts)
|
||||||
@@ -393,7 +404,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
client.startCommitWithTime(newCommitTime);
|
client.startCommitWithTime(newCommitTime);
|
||||||
|
|
||||||
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 200);
|
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 200);
|
||||||
List<WriteStatus> statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
|
List<WriteStatus> statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime)
|
||||||
|
.collect();
|
||||||
assertNoWriteErrors(statuses);
|
assertNoWriteErrors(statuses);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -437,7 +449,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
// Verify there are no errors
|
// Verify there are no errors
|
||||||
assertNoWriteErrors(statuses);
|
assertNoWriteErrors(statuses);
|
||||||
|
|
||||||
List<String> partitionPaths = FSUtils.getAllPartitionPaths(fs, cfg.getBasePath(), getConfig().shouldAssumeDatePartitioning());
|
List<String> partitionPaths = FSUtils
|
||||||
|
.getAllPartitionPaths(fs, cfg.getBasePath(), getConfig().shouldAssumeDatePartitioning());
|
||||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
|
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
|
||||||
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
|
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
|
||||||
final TableFileSystemView.ReadOptimizedView view = table.getROFileSystemView();
|
final TableFileSystemView.ReadOptimizedView view = table.getROFileSystemView();
|
||||||
@@ -478,7 +491,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
.build()).build();
|
.build()).build();
|
||||||
HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
|
HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
|
||||||
FileSystem fs = FSUtils.getFs();
|
FileSystem fs = FSUtils.getFs();
|
||||||
HoodieTestDataGenerator.writePartitionMetadata(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, basePath);
|
HoodieTestDataGenerator
|
||||||
|
.writePartitionMetadata(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, basePath);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Write 1 (only inserts)
|
* Write 1 (only inserts)
|
||||||
@@ -515,7 +529,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
|
statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
|
||||||
// Verify there are no errors
|
// Verify there are no errors
|
||||||
assertNoWriteErrors(statuses);
|
assertNoWriteErrors(statuses);
|
||||||
List<String> partitionPaths = FSUtils.getAllPartitionPaths(fs, cfg.getBasePath(), getConfig().shouldAssumeDatePartitioning());
|
List<String> partitionPaths = FSUtils
|
||||||
|
.getAllPartitionPaths(fs, cfg.getBasePath(), getConfig().shouldAssumeDatePartitioning());
|
||||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
|
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
|
||||||
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
|
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
|
||||||
final TableFileSystemView.ReadOptimizedView view1 = table.getROFileSystemView();
|
final TableFileSystemView.ReadOptimizedView view1 = table.getROFileSystemView();
|
||||||
@@ -525,7 +540,6 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
}).collect(Collectors.toList());
|
}).collect(Collectors.toList());
|
||||||
assertEquals("The data files for commit 003 should be present", 3, dataFiles.size());
|
assertEquals("The data files for commit 003 should be present", 3, dataFiles.size());
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Write 4 (updates)
|
* Write 4 (updates)
|
||||||
*/
|
*/
|
||||||
@@ -546,7 +560,6 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
}).collect(Collectors.toList());
|
}).collect(Collectors.toList());
|
||||||
assertEquals("The data files for commit 004 should be present", 3, dataFiles.size());
|
assertEquals("The data files for commit 004 should be present", 3, dataFiles.size());
|
||||||
|
|
||||||
|
|
||||||
// rolling back to a non existent savepoint must not succeed
|
// rolling back to a non existent savepoint must not succeed
|
||||||
try {
|
try {
|
||||||
client.rollbackToSavepoint("001");
|
client.rollbackToSavepoint("001");
|
||||||
@@ -606,8 +619,10 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
|
|
||||||
// verify that there is a commit
|
// verify that there is a commit
|
||||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
|
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
|
||||||
HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline();
|
HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath())
|
||||||
assertEquals("Expecting a single commit.", 1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
|
.getCommitTimeline();
|
||||||
|
assertEquals("Expecting a single commit.", 1,
|
||||||
|
timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
|
||||||
// Should have 100 records in table (check using Index), all in locations marked at commit
|
// Should have 100 records in table (check using Index), all in locations marked at commit
|
||||||
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
|
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
|
||||||
assertFalse(table.getCompletedCommitTimeline().empty());
|
assertFalse(table.getCompletedCommitTimeline().empty());
|
||||||
@@ -617,7 +632,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
assertEquals("The clean instant should be the same as the commit instant", commitTime,
|
assertEquals("The clean instant should be the same as the commit instant", commitTime,
|
||||||
table.getCompletedCleanTimeline().getInstants().findFirst().get().getTimestamp());
|
table.getCompletedCleanTimeline().getInstants().findFirst().get().getTimestamp());
|
||||||
|
|
||||||
List<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table).collect();
|
List<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table)
|
||||||
|
.collect();
|
||||||
checkTaggedRecords(taggedRecords, newCommitTime);
|
checkTaggedRecords(taggedRecords, newCommitTime);
|
||||||
|
|
||||||
// Keep doing some writes and clean inline. Make sure we have expected number of files remaining.
|
// Keep doing some writes and clean inline. Make sure we have expected number of files remaining.
|
||||||
@@ -641,18 +657,20 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
// compute all the versions of all files, from time 0
|
// compute all the versions of all files, from time 0
|
||||||
HashMap<String, TreeSet<String>> fileIdToVersions = new HashMap<>();
|
HashMap<String, TreeSet<String>> fileIdToVersions = new HashMap<>();
|
||||||
for (HoodieInstant entry : timeline.getInstants().collect(Collectors.toList())) {
|
for (HoodieInstant entry : timeline.getInstants().collect(Collectors.toList())) {
|
||||||
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(entry).get());
|
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
|
||||||
|
.fromBytes(timeline.getInstantDetails(entry).get());
|
||||||
|
|
||||||
for (HoodieWriteStat wstat : commitMetadata.getWriteStats(partitionPath)) {
|
for (HoodieWriteStat wstat : commitMetadata.getWriteStats(partitionPath)) {
|
||||||
if (!fileIdToVersions.containsKey(wstat.getFileId())) {
|
if (!fileIdToVersions.containsKey(wstat.getFileId())) {
|
||||||
fileIdToVersions.put(wstat.getFileId(), new TreeSet<>());
|
fileIdToVersions.put(wstat.getFileId(), new TreeSet<>());
|
||||||
}
|
}
|
||||||
fileIdToVersions.get(wstat.getFileId()).add(FSUtils.getCommitTime(new Path(wstat.getPath()).getName()));
|
fileIdToVersions.get(wstat.getFileId())
|
||||||
|
.add(FSUtils.getCommitTime(new Path(wstat.getPath()).getName()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
List<HoodieFileGroup> fileGroups = fsView.getAllFileGroups(partitionPath)
|
||||||
List<HoodieFileGroup> fileGroups = fsView.getAllFileGroups(partitionPath).collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
for (HoodieFileGroup fileGroup : fileGroups) {
|
for (HoodieFileGroup fileGroup : fileGroups) {
|
||||||
// No file has no more than max versions
|
// No file has no more than max versions
|
||||||
@@ -665,7 +683,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
// Each file, has the latest N versions (i.e cleaning gets rid of older versions)
|
// Each file, has the latest N versions (i.e cleaning gets rid of older versions)
|
||||||
List<String> commitedVersions = new ArrayList<>(fileIdToVersions.get(fileId));
|
List<String> commitedVersions = new ArrayList<>(fileIdToVersions.get(fileId));
|
||||||
for (int i = 0; i < dataFiles.size(); i++) {
|
for (int i = 0; i < dataFiles.size(); i++) {
|
||||||
assertEquals("File " + fileId + " does not have latest versions on commits" + commitedVersions,
|
assertEquals(
|
||||||
|
"File " + fileId + " does not have latest versions on commits" + commitedVersions,
|
||||||
Iterables.get(dataFiles, i).getCommitTime(),
|
Iterables.get(dataFiles, i).getCommitTime(),
|
||||||
commitedVersions.get(commitedVersions.size() - 1 - i));
|
commitedVersions.get(commitedVersions.size() - 1 - i));
|
||||||
}
|
}
|
||||||
@@ -700,8 +719,10 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
|
|
||||||
// verify that there is a commit
|
// verify that there is a commit
|
||||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
|
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
|
||||||
HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline();
|
HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath())
|
||||||
assertEquals("Expecting a single commit.", 1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
|
.getCommitTimeline();
|
||||||
|
assertEquals("Expecting a single commit.", 1,
|
||||||
|
timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
|
||||||
// Should have 100 records in table (check using Index), all in locations marked at commit
|
// Should have 100 records in table (check using Index), all in locations marked at commit
|
||||||
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
|
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
|
||||||
|
|
||||||
@@ -712,7 +733,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
assertEquals("The clean instant should be the same as the commit instant", commitTime,
|
assertEquals("The clean instant should be the same as the commit instant", commitTime,
|
||||||
table.getCompletedCleanTimeline().getInstants().findFirst().get().getTimestamp());
|
table.getCompletedCleanTimeline().getInstants().findFirst().get().getTimestamp());
|
||||||
|
|
||||||
List<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table).collect();
|
List<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table)
|
||||||
|
.collect();
|
||||||
checkTaggedRecords(taggedRecords, newCommitTime);
|
checkTaggedRecords(taggedRecords, newCommitTime);
|
||||||
|
|
||||||
// Keep doing some writes and clean inline. Make sure we have expected number of files remaining.
|
// Keep doing some writes and clean inline. Make sure we have expected number of files remaining.
|
||||||
@@ -734,7 +756,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
activeTimeline.getInstants().collect(Collectors.toSet());
|
activeTimeline.getInstants().collect(Collectors.toSet());
|
||||||
if (earliestRetainedCommit.isPresent()) {
|
if (earliestRetainedCommit.isPresent()) {
|
||||||
acceptableCommits.removeAll(
|
acceptableCommits.removeAll(
|
||||||
activeTimeline.findInstantsInRange("000", earliestRetainedCommit.get().getTimestamp()).getInstants()
|
activeTimeline.findInstantsInRange("000", earliestRetainedCommit.get().getTimestamp())
|
||||||
|
.getInstants()
|
||||||
.collect(Collectors.toSet()));
|
.collect(Collectors.toSet()));
|
||||||
acceptableCommits.add(earliestRetainedCommit.get());
|
acceptableCommits.add(earliestRetainedCommit.get());
|
||||||
}
|
}
|
||||||
@@ -742,7 +765,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
TableFileSystemView fsView = table1.getFileSystemView();
|
TableFileSystemView fsView = table1.getFileSystemView();
|
||||||
// Need to ensure the following
|
// Need to ensure the following
|
||||||
for (String partitionPath : dataGen.getPartitionPaths()) {
|
for (String partitionPath : dataGen.getPartitionPaths()) {
|
||||||
List<HoodieFileGroup> fileGroups = fsView.getAllFileGroups(partitionPath).collect(Collectors.toList());
|
List<HoodieFileGroup> fileGroups = fsView.getAllFileGroups(partitionPath)
|
||||||
|
.collect(Collectors.toList());
|
||||||
for (HoodieFileGroup fileGroup : fileGroups) {
|
for (HoodieFileGroup fileGroup : fileGroups) {
|
||||||
Set<String> commitTimes = new HashSet<>();
|
Set<String> commitTimes = new HashSet<>();
|
||||||
fileGroup.getAllDataFiles().forEach(value -> {
|
fileGroup.getAllDataFiles().forEach(value -> {
|
||||||
@@ -765,10 +789,9 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
String commitTime3 = "20160506030611";
|
String commitTime3 = "20160506030611";
|
||||||
new File(basePath + "/.hoodie").mkdirs();
|
new File(basePath + "/.hoodie").mkdirs();
|
||||||
HoodieTestDataGenerator.writePartitionMetadata(FSUtils.getFs(),
|
HoodieTestDataGenerator.writePartitionMetadata(FSUtils.getFs(),
|
||||||
new String[] {"2016/05/01", "2016/05/02", "2016/05/06"},
|
new String[]{"2016/05/01", "2016/05/02", "2016/05/06"},
|
||||||
basePath);
|
basePath);
|
||||||
|
|
||||||
|
|
||||||
// Only first two have commit files
|
// Only first two have commit files
|
||||||
HoodieTestUtils.createCommitFiles(basePath, commitTime1, commitTime2);
|
HoodieTestUtils.createCommitFiles(basePath, commitTime1, commitTime2);
|
||||||
// Third one has a .inflight intermediate commit file
|
// Third one has a .inflight intermediate commit file
|
||||||
@@ -816,7 +839,6 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
client.rollback(commitTime3);
|
client.rollback(commitTime3);
|
||||||
assertFalse(HoodieTestUtils.doesInflightExist(basePath, commitTime3));
|
assertFalse(HoodieTestUtils.doesInflightExist(basePath, commitTime3));
|
||||||
|
|
||||||
|
|
||||||
// Rollback commit2
|
// Rollback commit2
|
||||||
client.rollback(commitTime2);
|
client.rollback(commitTime2);
|
||||||
assertFalse(HoodieTestUtils.doesCommitExist(basePath, commitTime2));
|
assertFalse(HoodieTestUtils.doesCommitExist(basePath, commitTime2));
|
||||||
@@ -839,7 +861,6 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime2, file22) ||
|
HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime2, file22) ||
|
||||||
HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime2, file23));
|
HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime2, file23));
|
||||||
|
|
||||||
|
|
||||||
// Let's rollback commit1, Check results
|
// Let's rollback commit1, Check results
|
||||||
client.rollback(commitTime1);
|
client.rollback(commitTime1);
|
||||||
assertFalse(HoodieTestUtils.doesCommitExist(basePath, commitTime1));
|
assertFalse(HoodieTestUtils.doesCommitExist(basePath, commitTime1));
|
||||||
@@ -858,7 +879,7 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
String commitTime3 = "20160506030611";
|
String commitTime3 = "20160506030611";
|
||||||
new File(basePath + "/.hoodie").mkdirs();
|
new File(basePath + "/.hoodie").mkdirs();
|
||||||
HoodieTestDataGenerator.writePartitionMetadata(FSUtils.getFs(),
|
HoodieTestDataGenerator.writePartitionMetadata(FSUtils.getFs(),
|
||||||
new String[] {"2016/05/01", "2016/05/02", "2016/05/06"},
|
new String[]{"2016/05/01", "2016/05/02", "2016/05/06"},
|
||||||
basePath);
|
basePath);
|
||||||
|
|
||||||
// One good commit
|
// One good commit
|
||||||
@@ -940,26 +961,29 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
final String TEST_PARTITION_PATH = "2016/09/26";
|
final String TEST_PARTITION_PATH = "2016/09/26";
|
||||||
final int INSERT_SPLIT_LIMIT = 100;
|
final int INSERT_SPLIT_LIMIT = 100;
|
||||||
// setup the small file handling params
|
// setup the small file handling params
|
||||||
HoodieWriteConfig config = getSmallInsertWriteConfig(INSERT_SPLIT_LIMIT); // hold upto 200 records max
|
HoodieWriteConfig config = getSmallInsertWriteConfig(
|
||||||
dataGen = new HoodieTestDataGenerator(new String[] {TEST_PARTITION_PATH});
|
INSERT_SPLIT_LIMIT); // hold upto 200 records max
|
||||||
|
dataGen = new HoodieTestDataGenerator(new String[]{TEST_PARTITION_PATH});
|
||||||
|
|
||||||
HoodieWriteClient client = new HoodieWriteClient(jsc, config);
|
HoodieWriteClient client = new HoodieWriteClient(jsc, config);
|
||||||
|
|
||||||
// Inserts => will write file1
|
// Inserts => will write file1
|
||||||
String commitTime1 = "001";
|
String commitTime1 = "001";
|
||||||
client.startCommitWithTime(commitTime1);
|
client.startCommitWithTime(commitTime1);
|
||||||
List<HoodieRecord> inserts1 = dataGen.generateInserts(commitTime1, INSERT_SPLIT_LIMIT); // this writes ~500kb
|
List<HoodieRecord> inserts1 = dataGen
|
||||||
|
.generateInserts(commitTime1, INSERT_SPLIT_LIMIT); // this writes ~500kb
|
||||||
Set<String> keys1 = HoodieClientTestUtils.getRecordKeys(inserts1);
|
Set<String> keys1 = HoodieClientTestUtils.getRecordKeys(inserts1);
|
||||||
|
|
||||||
JavaRDD<HoodieRecord> insertRecordsRDD1 = jsc.parallelize(inserts1, 1);
|
JavaRDD<HoodieRecord> insertRecordsRDD1 = jsc.parallelize(inserts1, 1);
|
||||||
List<WriteStatus> statuses= client.upsert(insertRecordsRDD1, commitTime1).collect();
|
List<WriteStatus> statuses = client.upsert(insertRecordsRDD1, commitTime1).collect();
|
||||||
|
|
||||||
assertNoWriteErrors(statuses);
|
assertNoWriteErrors(statuses);
|
||||||
|
|
||||||
assertEquals("Just 1 file needs to be added.", 1, statuses.size());
|
assertEquals("Just 1 file needs to be added.", 1, statuses.size());
|
||||||
String file1 = statuses.get(0).getFileId();
|
String file1 = statuses.get(0).getFileId();
|
||||||
assertEquals("file should contain 100 records",
|
assertEquals("file should contain 100 records",
|
||||||
ParquetUtils.readRowKeysFromParquet(new Path(basePath, TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime1, 0, file1))).size(),
|
ParquetUtils.readRowKeysFromParquet(new Path(basePath,
|
||||||
|
TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime1, 0, file1))).size(),
|
||||||
100);
|
100);
|
||||||
|
|
||||||
// Update + Inserts such that they just expand file1
|
// Update + Inserts such that they just expand file1
|
||||||
@@ -977,15 +1001,20 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
|
|
||||||
assertEquals("Just 1 file needs to be updated.", 1, statuses.size());
|
assertEquals("Just 1 file needs to be updated.", 1, statuses.size());
|
||||||
assertEquals("Existing file should be expanded", file1, statuses.get(0).getFileId());
|
assertEquals("Existing file should be expanded", file1, statuses.get(0).getFileId());
|
||||||
assertEquals("Existing file should be expanded", commitTime1, statuses.get(0).getStat().getPrevCommit());
|
assertEquals("Existing file should be expanded", commitTime1,
|
||||||
Path newFile = new Path(basePath, TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime2, 0, file1));
|
statuses.get(0).getStat().getPrevCommit());
|
||||||
assertEquals("file should contain 140 records", ParquetUtils.readRowKeysFromParquet(newFile).size(), 140);
|
Path newFile = new Path(basePath,
|
||||||
|
TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime2, 0, file1));
|
||||||
|
assertEquals("file should contain 140 records",
|
||||||
|
ParquetUtils.readRowKeysFromParquet(newFile).size(), 140);
|
||||||
|
|
||||||
List<GenericRecord> records = ParquetUtils.readAvroRecords(newFile);
|
List<GenericRecord> records = ParquetUtils.readAvroRecords(newFile);
|
||||||
for (GenericRecord record: records) {
|
for (GenericRecord record : records) {
|
||||||
String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
|
String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
|
||||||
assertEquals("only expect commit2", commitTime2, record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString());
|
assertEquals("only expect commit2", commitTime2,
|
||||||
assertTrue("key expected to be part of commit2", keys2.contains(recordKey) || keys1.contains(recordKey));
|
record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString());
|
||||||
|
assertTrue("key expected to be part of commit2",
|
||||||
|
keys2.contains(recordKey) || keys1.contains(recordKey));
|
||||||
}
|
}
|
||||||
|
|
||||||
// update + inserts such that file1 is updated and expanded, a new file2 is created.
|
// update + inserts such that file1 is updated and expanded, a new file2 is created.
|
||||||
@@ -1004,14 +1033,15 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||||
HoodieTable table = HoodieTable.getHoodieTable(metadata, config);
|
HoodieTable table = HoodieTable.getHoodieTable(metadata, config);
|
||||||
TableFileSystemView.ReadOptimizedView fileSystemView = table.getROFileSystemView();
|
TableFileSystemView.ReadOptimizedView fileSystemView = table.getROFileSystemView();
|
||||||
List<HoodieDataFile> files = fileSystemView.getLatestDataFilesBeforeOrOn(TEST_PARTITION_PATH, commitTime3).collect(
|
List<HoodieDataFile> files = fileSystemView
|
||||||
|
.getLatestDataFilesBeforeOrOn(TEST_PARTITION_PATH, commitTime3).collect(
|
||||||
Collectors.toList());
|
Collectors.toList());
|
||||||
int numTotalInsertsInCommit3 = 0;
|
int numTotalInsertsInCommit3 = 0;
|
||||||
for (HoodieDataFile file: files) {
|
for (HoodieDataFile file : files) {
|
||||||
if (file.getFileName().contains(file1)) {
|
if (file.getFileName().contains(file1)) {
|
||||||
assertEquals("Existing file should be expanded", commitTime3, file.getCommitTime());
|
assertEquals("Existing file should be expanded", commitTime3, file.getCommitTime());
|
||||||
records = ParquetUtils.readAvroRecords(new Path(file.getPath()));
|
records = ParquetUtils.readAvroRecords(new Path(file.getPath()));
|
||||||
for (GenericRecord record: records) {
|
for (GenericRecord record : records) {
|
||||||
String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
|
String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
|
||||||
String recordCommitTime = record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString();
|
String recordCommitTime = record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString();
|
||||||
if (recordCommitTime.equals(commitTime3)) {
|
if (recordCommitTime.equals(commitTime3)) {
|
||||||
@@ -1023,13 +1053,15 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
assertEquals("All keys added in commit 2 must be updated in commit3 correctly", 0, keys2.size());
|
assertEquals("All keys added in commit 2 must be updated in commit3 correctly", 0,
|
||||||
|
keys2.size());
|
||||||
} else {
|
} else {
|
||||||
assertEquals("New file must be written for commit 3", commitTime3, file.getCommitTime());
|
assertEquals("New file must be written for commit 3", commitTime3, file.getCommitTime());
|
||||||
records = ParquetUtils.readAvroRecords(new Path(file.getPath()));
|
records = ParquetUtils.readAvroRecords(new Path(file.getPath()));
|
||||||
for (GenericRecord record: records) {
|
for (GenericRecord record : records) {
|
||||||
String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
|
String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
|
||||||
assertEquals("only expect commit3", commitTime3, record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString());
|
assertEquals("only expect commit3", commitTime3,
|
||||||
|
record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString());
|
||||||
assertTrue("key expected to be part of commit3", keys3.contains(recordKey));
|
assertTrue("key expected to be part of commit3", keys3.contains(recordKey));
|
||||||
}
|
}
|
||||||
numTotalInsertsInCommit3 += records.size();
|
numTotalInsertsInCommit3 += records.size();
|
||||||
@@ -1044,17 +1076,19 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
final String TEST_PARTITION_PATH = "2016/09/26";
|
final String TEST_PARTITION_PATH = "2016/09/26";
|
||||||
final int INSERT_SPLIT_LIMIT = 100;
|
final int INSERT_SPLIT_LIMIT = 100;
|
||||||
// setup the small file handling params
|
// setup the small file handling params
|
||||||
HoodieWriteConfig config = getSmallInsertWriteConfig(INSERT_SPLIT_LIMIT); // hold upto 200 records max
|
HoodieWriteConfig config = getSmallInsertWriteConfig(
|
||||||
dataGen = new HoodieTestDataGenerator(new String[] {TEST_PARTITION_PATH});
|
INSERT_SPLIT_LIMIT); // hold upto 200 records max
|
||||||
|
dataGen = new HoodieTestDataGenerator(new String[]{TEST_PARTITION_PATH});
|
||||||
HoodieWriteClient client = new HoodieWriteClient(jsc, config);
|
HoodieWriteClient client = new HoodieWriteClient(jsc, config);
|
||||||
|
|
||||||
// Inserts => will write file1
|
// Inserts => will write file1
|
||||||
String commitTime1 = "001";
|
String commitTime1 = "001";
|
||||||
client.startCommitWithTime(commitTime1);
|
client.startCommitWithTime(commitTime1);
|
||||||
List<HoodieRecord> inserts1 = dataGen.generateInserts(commitTime1, INSERT_SPLIT_LIMIT); // this writes ~500kb
|
List<HoodieRecord> inserts1 = dataGen
|
||||||
|
.generateInserts(commitTime1, INSERT_SPLIT_LIMIT); // this writes ~500kb
|
||||||
Set<String> keys1 = HoodieClientTestUtils.getRecordKeys(inserts1);
|
Set<String> keys1 = HoodieClientTestUtils.getRecordKeys(inserts1);
|
||||||
JavaRDD<HoodieRecord> insertRecordsRDD1 = jsc.parallelize(inserts1, 1);
|
JavaRDD<HoodieRecord> insertRecordsRDD1 = jsc.parallelize(inserts1, 1);
|
||||||
List<WriteStatus> statuses= client.insert(insertRecordsRDD1, commitTime1).collect();
|
List<WriteStatus> statuses = client.insert(insertRecordsRDD1, commitTime1).collect();
|
||||||
|
|
||||||
assertNoWriteErrors(statuses);
|
assertNoWriteErrors(statuses);
|
||||||
assertPartitionMetadata(new String[]{TEST_PARTITION_PATH}, FSUtils.getFs());
|
assertPartitionMetadata(new String[]{TEST_PARTITION_PATH}, FSUtils.getFs());
|
||||||
@@ -1062,7 +1096,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
assertEquals("Just 1 file needs to be added.", 1, statuses.size());
|
assertEquals("Just 1 file needs to be added.", 1, statuses.size());
|
||||||
String file1 = statuses.get(0).getFileId();
|
String file1 = statuses.get(0).getFileId();
|
||||||
assertEquals("file should contain 100 records",
|
assertEquals("file should contain 100 records",
|
||||||
ParquetUtils.readRowKeysFromParquet(new Path(basePath, TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime1, 0, file1))).size(),
|
ParquetUtils.readRowKeysFromParquet(new Path(basePath,
|
||||||
|
TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime1, 0, file1))).size(),
|
||||||
100);
|
100);
|
||||||
|
|
||||||
// Second, set of Inserts should just expand file1
|
// Second, set of Inserts should just expand file1
|
||||||
@@ -1076,16 +1111,21 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
|
|
||||||
assertEquals("Just 1 file needs to be updated.", 1, statuses.size());
|
assertEquals("Just 1 file needs to be updated.", 1, statuses.size());
|
||||||
assertEquals("Existing file should be expanded", file1, statuses.get(0).getFileId());
|
assertEquals("Existing file should be expanded", file1, statuses.get(0).getFileId());
|
||||||
assertEquals("Existing file should be expanded", commitTime1, statuses.get(0).getStat().getPrevCommit());
|
assertEquals("Existing file should be expanded", commitTime1,
|
||||||
Path newFile = new Path(basePath, TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime2, 0, file1));
|
statuses.get(0).getStat().getPrevCommit());
|
||||||
assertEquals("file should contain 140 records", ParquetUtils.readRowKeysFromParquet(newFile).size(), 140);
|
Path newFile = new Path(basePath,
|
||||||
|
TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime2, 0, file1));
|
||||||
|
assertEquals("file should contain 140 records",
|
||||||
|
ParquetUtils.readRowKeysFromParquet(newFile).size(), 140);
|
||||||
|
|
||||||
List<GenericRecord> records = ParquetUtils.readAvroRecords(newFile);
|
List<GenericRecord> records = ParquetUtils.readAvroRecords(newFile);
|
||||||
for (GenericRecord record: records) {
|
for (GenericRecord record : records) {
|
||||||
String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
|
String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
|
||||||
String recCommitTime = record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString();
|
String recCommitTime = record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString();
|
||||||
assertTrue("Record expected to be part of commit 1 or commit2", commitTime1.equals(recCommitTime) || commitTime2.equals(recCommitTime));
|
assertTrue("Record expected to be part of commit 1 or commit2",
|
||||||
assertTrue("key expected to be part of commit 1 or commit2", keys2.contains(recordKey) || keys1.contains(recordKey));
|
commitTime1.equals(recCommitTime) || commitTime2.equals(recCommitTime));
|
||||||
|
assertTrue("key expected to be part of commit 1 or commit2",
|
||||||
|
keys2.contains(recordKey) || keys1.contains(recordKey));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Lots of inserts such that file1 is updated and expanded, a new file2 is created.
|
// Lots of inserts such that file1 is updated and expanded, a new file2 is created.
|
||||||
@@ -1097,7 +1137,6 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
assertNoWriteErrors(statuses);
|
assertNoWriteErrors(statuses);
|
||||||
assertEquals("2 files needs to be committed.", 2, statuses.size());
|
assertEquals("2 files needs to be committed.", 2, statuses.size());
|
||||||
|
|
||||||
|
|
||||||
FileSystem fs = FSUtils.getFs();
|
FileSystem fs = FSUtils.getFs();
|
||||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
|
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
|
||||||
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config);
|
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config);
|
||||||
@@ -1106,14 +1145,14 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
assertEquals("Total of 2 valid data files", 2, files.size());
|
assertEquals("Total of 2 valid data files", 2, files.size());
|
||||||
|
|
||||||
|
|
||||||
int totalInserts = 0;
|
int totalInserts = 0;
|
||||||
for (HoodieDataFile file: files) {
|
for (HoodieDataFile file : files) {
|
||||||
assertEquals("All files must be at commit 3", commitTime3, file.getCommitTime());
|
assertEquals("All files must be at commit 3", commitTime3, file.getCommitTime());
|
||||||
records = ParquetUtils.readAvroRecords(new Path(file.getPath()));
|
records = ParquetUtils.readAvroRecords(new Path(file.getPath()));
|
||||||
totalInserts += records.size();
|
totalInserts += records.size();
|
||||||
}
|
}
|
||||||
assertEquals("Total number of records must add up", totalInserts, inserts1.size() + inserts2.size() + insert3.size());
|
assertEquals("Total number of records must add up", totalInserts,
|
||||||
|
inserts1.size() + inserts2.size() + insert3.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@@ -1130,27 +1169,35 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
String file1P0C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "000");
|
String file1P0C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "000");
|
||||||
String file1P1C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "000");
|
String file1P1C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "000");
|
||||||
HoodieTable table = HoodieTable
|
HoodieTable table = HoodieTable
|
||||||
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config);
|
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
|
||||||
|
config);
|
||||||
|
|
||||||
List<HoodieCleanStat> hoodieCleanStatsOne = table.clean(jsc);
|
List<HoodieCleanStat> hoodieCleanStatsOne = table.clean(jsc);
|
||||||
assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsOne, partitionPaths[0]).getSuccessDeleteFiles().size());
|
assertEquals("Must not clean any files", 0,
|
||||||
assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsOne, partitionPaths[1]).getSuccessDeleteFiles().size());
|
getCleanStat(hoodieCleanStatsOne, partitionPaths[0]).getSuccessDeleteFiles().size());
|
||||||
|
assertEquals("Must not clean any files", 0,
|
||||||
|
getCleanStat(hoodieCleanStatsOne, partitionPaths[1]).getSuccessDeleteFiles().size());
|
||||||
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0));
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0));
|
||||||
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "000", file1P1C0));
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "000", file1P1C0));
|
||||||
|
|
||||||
// make next commit, with 1 insert & 1 update per partition
|
// make next commit, with 1 insert & 1 update per partition
|
||||||
HoodieTestUtils.createCommitFiles(basePath, "001");
|
HoodieTestUtils.createCommitFiles(basePath, "001");
|
||||||
table = HoodieTable
|
table = HoodieTable
|
||||||
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config);
|
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
|
||||||
|
config);
|
||||||
|
|
||||||
String file2P0C1 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "001"); // insert
|
String file2P0C1 = HoodieTestUtils
|
||||||
String file2P1C1 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "001"); // insert
|
.createNewDataFile(basePath, partitionPaths[0], "001"); // insert
|
||||||
|
String file2P1C1 = HoodieTestUtils
|
||||||
|
.createNewDataFile(basePath, partitionPaths[1], "001"); // insert
|
||||||
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "001", file1P0C0); // update
|
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "001", file1P0C0); // update
|
||||||
HoodieTestUtils.createDataFile(basePath, partitionPaths[1], "001", file1P1C0); // update
|
HoodieTestUtils.createDataFile(basePath, partitionPaths[1], "001", file1P1C0); // update
|
||||||
|
|
||||||
List<HoodieCleanStat> hoodieCleanStatsTwo = table.clean(jsc);
|
List<HoodieCleanStat> hoodieCleanStatsTwo = table.clean(jsc);
|
||||||
assertEquals("Must clean 1 file" , 1, getCleanStat(hoodieCleanStatsTwo, partitionPaths[0]).getSuccessDeleteFiles().size());
|
assertEquals("Must clean 1 file", 1,
|
||||||
assertEquals("Must clean 1 file" , 1, getCleanStat(hoodieCleanStatsTwo, partitionPaths[1]).getSuccessDeleteFiles().size());
|
getCleanStat(hoodieCleanStatsTwo, partitionPaths[0]).getSuccessDeleteFiles().size());
|
||||||
|
assertEquals("Must clean 1 file", 1,
|
||||||
|
getCleanStat(hoodieCleanStatsTwo, partitionPaths[1]).getSuccessDeleteFiles().size());
|
||||||
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1));
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1));
|
||||||
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "001", file2P1C1));
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "001", file2P1C1));
|
||||||
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0));
|
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0));
|
||||||
@@ -1159,14 +1206,16 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
// make next commit, with 2 updates to existing files, and 1 insert
|
// make next commit, with 2 updates to existing files, and 1 insert
|
||||||
HoodieTestUtils.createCommitFiles(basePath, "002");
|
HoodieTestUtils.createCommitFiles(basePath, "002");
|
||||||
table = HoodieTable
|
table = HoodieTable
|
||||||
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config);
|
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
|
||||||
|
config);
|
||||||
|
|
||||||
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file1P0C0); // update
|
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file1P0C0); // update
|
||||||
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file2P0C1); // update
|
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file2P0C1); // update
|
||||||
String file3P0C2 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "002");
|
String file3P0C2 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "002");
|
||||||
|
|
||||||
List<HoodieCleanStat> hoodieCleanStatsThree = table.clean(jsc);
|
List<HoodieCleanStat> hoodieCleanStatsThree = table.clean(jsc);
|
||||||
assertEquals("Must clean two files" , 2, getCleanStat(hoodieCleanStatsThree, partitionPaths[0]).getSuccessDeleteFiles().size());
|
assertEquals("Must clean two files", 2,
|
||||||
|
getCleanStat(hoodieCleanStatsThree, partitionPaths[0]).getSuccessDeleteFiles().size());
|
||||||
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file1P0C0));
|
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file1P0C0));
|
||||||
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1));
|
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1));
|
||||||
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "002", file3P0C2));
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "002", file3P0C2));
|
||||||
@@ -1174,7 +1223,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
// No cleaning on partially written file, with no commit.
|
// No cleaning on partially written file, with no commit.
|
||||||
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "003", file3P0C2); // update
|
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "003", file3P0C2); // update
|
||||||
List<HoodieCleanStat> hoodieCleanStatsFour = table.clean(jsc);
|
List<HoodieCleanStat> hoodieCleanStatsFour = table.clean(jsc);
|
||||||
assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsFour, partitionPaths[0]).getSuccessDeleteFiles().size());
|
assertEquals("Must not clean any files", 0,
|
||||||
|
getCleanStat(hoodieCleanStatsFour, partitionPaths[0]).getSuccessDeleteFiles().size());
|
||||||
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "002", file3P0C2));
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "002", file3P0C2));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1187,31 +1237,39 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
.withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS)
|
.withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS)
|
||||||
.retainFileVersions(1).build()).build();
|
.retainFileVersions(1).build()).build();
|
||||||
|
|
||||||
|
HoodieTableMetaClient metaClient = HoodieTestUtils
|
||||||
HoodieTableMetaClient metaClient = HoodieTestUtils.initTableType(basePath, HoodieTableType.MERGE_ON_READ);
|
.initTableType(basePath, HoodieTableType.MERGE_ON_READ);
|
||||||
|
|
||||||
// Make 3 files, one base file and 2 log files associated with base file
|
// Make 3 files, one base file and 2 log files associated with base file
|
||||||
String file1P0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "000");
|
String file1P0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "000");
|
||||||
String file2P0L0 = HoodieTestUtils.createNewLogFile(basePath, partitionPaths[0], "000", file1P0, Optional.empty());
|
String file2P0L0 = HoodieTestUtils
|
||||||
String file2P0L1 = HoodieTestUtils.createNewLogFile(basePath, partitionPaths[0], "000", file1P0, Optional.of(2));
|
.createNewLogFile(basePath, partitionPaths[0], "000", file1P0, Optional.empty());
|
||||||
|
String file2P0L1 = HoodieTestUtils
|
||||||
|
.createNewLogFile(basePath, partitionPaths[0], "000", file1P0, Optional.of(2));
|
||||||
// make 1 compaction commit
|
// make 1 compaction commit
|
||||||
HoodieTestUtils.createCompactionCommitFiles(basePath, "000");
|
HoodieTestUtils.createCompactionCommitFiles(basePath, "000");
|
||||||
|
|
||||||
// Make 4 files, one base file and 3 log files associated with base file
|
// Make 4 files, one base file and 3 log files associated with base file
|
||||||
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "001", file1P0);
|
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "001", file1P0);
|
||||||
file2P0L0 = HoodieTestUtils.createNewLogFile(basePath, partitionPaths[0], "001", file1P0, Optional.empty());
|
file2P0L0 = HoodieTestUtils
|
||||||
file2P0L0 = HoodieTestUtils.createNewLogFile(basePath, partitionPaths[0], "001", file1P0, Optional.of(2));
|
.createNewLogFile(basePath, partitionPaths[0], "001", file1P0, Optional.empty());
|
||||||
file2P0L0 = HoodieTestUtils.createNewLogFile(basePath, partitionPaths[0], "001", file1P0, Optional.of(3));
|
file2P0L0 = HoodieTestUtils
|
||||||
|
.createNewLogFile(basePath, partitionPaths[0], "001", file1P0, Optional.of(2));
|
||||||
|
file2P0L0 = HoodieTestUtils
|
||||||
|
.createNewLogFile(basePath, partitionPaths[0], "001", file1P0, Optional.of(3));
|
||||||
// make 1 compaction commit
|
// make 1 compaction commit
|
||||||
HoodieTestUtils.createCompactionCommitFiles(basePath, "001");
|
HoodieTestUtils.createCompactionCommitFiles(basePath, "001");
|
||||||
|
|
||||||
HoodieTable table = HoodieTable
|
HoodieTable table = HoodieTable
|
||||||
.getHoodieTable(metaClient, config);
|
.getHoodieTable(metaClient, config);
|
||||||
List<HoodieCleanStat> hoodieCleanStats = table.clean(jsc);
|
List<HoodieCleanStat> hoodieCleanStats = table.clean(jsc);
|
||||||
assertEquals("Must clean three files, one parquet and 2 log files" , 3, getCleanStat(hoodieCleanStats, partitionPaths[0]).getSuccessDeleteFiles().size());
|
assertEquals("Must clean three files, one parquet and 2 log files", 3,
|
||||||
|
getCleanStat(hoodieCleanStats, partitionPaths[0]).getSuccessDeleteFiles().size());
|
||||||
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0));
|
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0));
|
||||||
assertFalse(HoodieTestUtils.doesLogFileExist(basePath, partitionPaths[0], "000", file2P0L0, Optional.empty()));
|
assertFalse(HoodieTestUtils
|
||||||
assertFalse(HoodieTestUtils.doesLogFileExist(basePath, partitionPaths[0], "000", file2P0L0, Optional.of(2)));
|
.doesLogFileExist(basePath, partitionPaths[0], "000", file2P0L0, Optional.empty()));
|
||||||
|
assertFalse(HoodieTestUtils
|
||||||
|
.doesLogFileExist(basePath, partitionPaths[0], "000", file2P0L0, Optional.of(2)));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@@ -1229,27 +1287,35 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
String file1P1C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "000");
|
String file1P1C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "000");
|
||||||
|
|
||||||
HoodieTable table = HoodieTable
|
HoodieTable table = HoodieTable
|
||||||
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config);
|
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
|
||||||
|
config);
|
||||||
|
|
||||||
List<HoodieCleanStat> hoodieCleanStatsOne = table.clean(jsc);
|
List<HoodieCleanStat> hoodieCleanStatsOne = table.clean(jsc);
|
||||||
assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsOne, partitionPaths[0]).getSuccessDeleteFiles().size());
|
assertEquals("Must not clean any files", 0,
|
||||||
assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsOne, partitionPaths[1]).getSuccessDeleteFiles().size());
|
getCleanStat(hoodieCleanStatsOne, partitionPaths[0]).getSuccessDeleteFiles().size());
|
||||||
|
assertEquals("Must not clean any files", 0,
|
||||||
|
getCleanStat(hoodieCleanStatsOne, partitionPaths[1]).getSuccessDeleteFiles().size());
|
||||||
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0));
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0));
|
||||||
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "000", file1P1C0));
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "000", file1P1C0));
|
||||||
|
|
||||||
// make next commit, with 1 insert & 1 update per partition
|
// make next commit, with 1 insert & 1 update per partition
|
||||||
HoodieTestUtils.createCommitFiles(basePath, "001");
|
HoodieTestUtils.createCommitFiles(basePath, "001");
|
||||||
table = HoodieTable
|
table = HoodieTable
|
||||||
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config);
|
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
|
||||||
|
config);
|
||||||
|
|
||||||
String file2P0C1 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "001"); // insert
|
String file2P0C1 = HoodieTestUtils
|
||||||
String file2P1C1 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "001"); // insert
|
.createNewDataFile(basePath, partitionPaths[0], "001"); // insert
|
||||||
|
String file2P1C1 = HoodieTestUtils
|
||||||
|
.createNewDataFile(basePath, partitionPaths[1], "001"); // insert
|
||||||
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "001", file1P0C0); // update
|
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "001", file1P0C0); // update
|
||||||
HoodieTestUtils.createDataFile(basePath, partitionPaths[1], "001", file1P1C0); // update
|
HoodieTestUtils.createDataFile(basePath, partitionPaths[1], "001", file1P1C0); // update
|
||||||
|
|
||||||
List<HoodieCleanStat> hoodieCleanStatsTwo = table.clean(jsc);
|
List<HoodieCleanStat> hoodieCleanStatsTwo = table.clean(jsc);
|
||||||
assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsTwo, partitionPaths[0]).getSuccessDeleteFiles().size());
|
assertEquals("Must not clean any files", 0,
|
||||||
assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsTwo, partitionPaths[1]).getSuccessDeleteFiles().size());
|
getCleanStat(hoodieCleanStatsTwo, partitionPaths[0]).getSuccessDeleteFiles().size());
|
||||||
|
assertEquals("Must not clean any files", 0,
|
||||||
|
getCleanStat(hoodieCleanStatsTwo, partitionPaths[1]).getSuccessDeleteFiles().size());
|
||||||
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1));
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1));
|
||||||
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "001", file2P1C1));
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "001", file2P1C1));
|
||||||
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0));
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0));
|
||||||
@@ -1258,7 +1324,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
// make next commit, with 2 updates to existing files, and 1 insert
|
// make next commit, with 2 updates to existing files, and 1 insert
|
||||||
HoodieTestUtils.createCommitFiles(basePath, "002");
|
HoodieTestUtils.createCommitFiles(basePath, "002");
|
||||||
table = HoodieTable
|
table = HoodieTable
|
||||||
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config);
|
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
|
||||||
|
config);
|
||||||
|
|
||||||
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file1P0C0); // update
|
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file1P0C0); // update
|
||||||
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file2P0C1); // update
|
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file2P0C1); // update
|
||||||
@@ -1274,7 +1341,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
// make next commit, with 2 updates to existing files, and 1 insert
|
// make next commit, with 2 updates to existing files, and 1 insert
|
||||||
HoodieTestUtils.createCommitFiles(basePath, "003");
|
HoodieTestUtils.createCommitFiles(basePath, "003");
|
||||||
table = HoodieTable
|
table = HoodieTable
|
||||||
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config);
|
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
|
||||||
|
config);
|
||||||
|
|
||||||
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "003", file1P0C0); // update
|
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "003", file1P0C0); // update
|
||||||
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "003", file2P0C1); // update
|
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "003", file2P0C1); // update
|
||||||
@@ -1282,7 +1350,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
|
|
||||||
List<HoodieCleanStat> hoodieCleanStatsFour = table.clean(jsc);
|
List<HoodieCleanStat> hoodieCleanStatsFour = table.clean(jsc);
|
||||||
assertEquals(
|
assertEquals(
|
||||||
"Must not clean one old file", 1, getCleanStat(hoodieCleanStatsFour, partitionPaths[0]).getSuccessDeleteFiles().size());
|
"Must not clean one old file", 1,
|
||||||
|
getCleanStat(hoodieCleanStatsFour, partitionPaths[0]).getSuccessDeleteFiles().size());
|
||||||
|
|
||||||
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0));
|
assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0));
|
||||||
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file1P0C0));
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file1P0C0));
|
||||||
@@ -1295,7 +1364,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
// No cleaning on partially written file, with no commit.
|
// No cleaning on partially written file, with no commit.
|
||||||
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "004", file3P0C2); // update
|
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "004", file3P0C2); // update
|
||||||
List<HoodieCleanStat> hoodieCleanStatsFive = table.clean(jsc);
|
List<HoodieCleanStat> hoodieCleanStatsFive = table.clean(jsc);
|
||||||
assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsFive, partitionPaths[0]).getSuccessDeleteFiles().size());
|
assertEquals("Must not clean any files", 0,
|
||||||
|
getCleanStat(hoodieCleanStatsFive, partitionPaths[0]).getSuccessDeleteFiles().size());
|
||||||
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file1P0C0));
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file1P0C0));
|
||||||
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1));
|
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1));
|
||||||
}
|
}
|
||||||
@@ -1344,13 +1414,14 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
|
|
||||||
Iterator<AccumulatorV2<?, ?>> iterator = taskEnd.taskMetrics().accumulators()
|
Iterator<AccumulatorV2<?, ?>> iterator = taskEnd.taskMetrics().accumulators()
|
||||||
.iterator();
|
.iterator();
|
||||||
while(iterator.hasNext()) {
|
while (iterator.hasNext()) {
|
||||||
AccumulatorV2 accumulator = iterator.next();
|
AccumulatorV2 accumulator = iterator.next();
|
||||||
if (taskEnd.stageId() == 1 &&
|
if (taskEnd.stageId() == 1 &&
|
||||||
accumulator.isRegistered() &&
|
accumulator.isRegistered() &&
|
||||||
accumulator.name().isDefined() &&
|
accumulator.name().isDefined() &&
|
||||||
accumulator.name().get().equals("internal.metrics.shuffle.read.recordsRead")) {
|
accumulator.name().get().equals("internal.metrics.shuffle.read.recordsRead")) {
|
||||||
stageOneShuffleReadTaskRecordsCountMap.put(taskEnd.taskInfo().taskId(), (Long) accumulator.value());
|
stageOneShuffleReadTaskRecordsCountMap
|
||||||
|
.put(taskEnd.taskInfo().taskId(), (Long) accumulator.value());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1378,22 +1449,27 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
updateAllFilesInPartition(filesP2C0, partitionPaths[2], "003");
|
updateAllFilesInPartition(filesP2C0, partitionPaths[2], "003");
|
||||||
|
|
||||||
HoodieTable table = HoodieTable
|
HoodieTable table = HoodieTable
|
||||||
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config);
|
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
|
||||||
|
config);
|
||||||
List<HoodieCleanStat> hoodieCleanStats = table.clean(jsc);
|
List<HoodieCleanStat> hoodieCleanStats = table.clean(jsc);
|
||||||
|
|
||||||
assertEquals(100, getCleanStat(hoodieCleanStats, partitionPaths[0]).getSuccessDeleteFiles().size());
|
assertEquals(100,
|
||||||
assertEquals(10, getCleanStat(hoodieCleanStats, partitionPaths[1]).getSuccessDeleteFiles().size());
|
getCleanStat(hoodieCleanStats, partitionPaths[0]).getSuccessDeleteFiles().size());
|
||||||
assertEquals(10, getCleanStat(hoodieCleanStats, partitionPaths[2]).getSuccessDeleteFiles().size());
|
assertEquals(10,
|
||||||
|
getCleanStat(hoodieCleanStats, partitionPaths[1]).getSuccessDeleteFiles().size());
|
||||||
|
assertEquals(10,
|
||||||
|
getCleanStat(hoodieCleanStats, partitionPaths[2]).getSuccessDeleteFiles().size());
|
||||||
|
|
||||||
// 3 tasks are expected since the number of partitions is 3
|
// 3 tasks are expected since the number of partitions is 3
|
||||||
assertEquals(3, stageOneShuffleReadTaskRecordsCountMap.keySet().size());
|
assertEquals(3, stageOneShuffleReadTaskRecordsCountMap.keySet().size());
|
||||||
// Sum of all records processed = total number of files to clean
|
// Sum of all records processed = total number of files to clean
|
||||||
assertEquals(120, stageOneShuffleReadTaskRecordsCountMap
|
assertEquals(120, stageOneShuffleReadTaskRecordsCountMap
|
||||||
.values().stream().reduce((a,b) -> a + b).get().intValue());
|
.values().stream().reduce((a, b) -> a + b).get().intValue());
|
||||||
assertTrue("The skew in handling files to clean is not removed. "
|
assertTrue("The skew in handling files to clean is not removed. "
|
||||||
+ "Each task should handle more records than the partitionPath with least files "
|
+ "Each task should handle more records than the partitionPath with least files "
|
||||||
+ "and less records than the partitionPath with most files.",
|
+ "and less records than the partitionPath with most files.",
|
||||||
stageOneShuffleReadTaskRecordsCountMap.values().stream().filter(a -> a > 10 && a < 100).count() == 3);
|
stageOneShuffleReadTaskRecordsCountMap.values().stream().filter(a -> a > 10 && a < 100)
|
||||||
|
.count() == 3);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testCommitWritesRelativePaths() throws Exception {
|
public void testCommitWritesRelativePaths() throws Exception {
|
||||||
@@ -1454,7 +1530,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<String> createFilesInPartition(String partitionPath, String commitTime, int numFiles) throws IOException {
|
private List<String> createFilesInPartition(String partitionPath, String commitTime, int numFiles)
|
||||||
|
throws IOException {
|
||||||
List<String> files = new ArrayList<>();
|
List<String> files = new ArrayList<>();
|
||||||
for (int i = 0; i < numFiles; i++) {
|
for (int i = 0; i < numFiles; i++) {
|
||||||
files.add(HoodieTestUtils.createNewDataFile(basePath, partitionPath, commitTime));
|
files.add(HoodieTestUtils.createNewDataFile(basePath, partitionPath, commitTime));
|
||||||
|
|||||||
@@ -29,15 +29,6 @@ import com.uber.hoodie.common.table.view.HoodieTableFileSystemView;
|
|||||||
import com.uber.hoodie.common.util.FSUtils;
|
import com.uber.hoodie.common.util.FSUtils;
|
||||||
import com.uber.hoodie.exception.HoodieException;
|
import com.uber.hoodie.exception.HoodieException;
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
|
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
|
||||||
import org.apache.hadoop.fs.Path;
|
|
||||||
import org.apache.spark.sql.Dataset;
|
|
||||||
import org.apache.spark.sql.Row;
|
|
||||||
import org.apache.spark.sql.SQLContext;
|
|
||||||
|
|
||||||
import org.apache.spark.SparkConf;
|
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.RandomAccessFile;
|
import java.io.RandomAccessFile;
|
||||||
@@ -49,6 +40,12 @@ import java.util.Iterator;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.apache.spark.SparkConf;
|
||||||
|
import org.apache.spark.sql.Dataset;
|
||||||
|
import org.apache.spark.sql.Row;
|
||||||
|
import org.apache.spark.sql.SQLContext;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Utility methods to aid testing inside the HoodieClient module.
|
* Utility methods to aid testing inside the HoodieClient module.
|
||||||
@@ -66,14 +63,15 @@ public class HoodieClientTestUtils {
|
|||||||
|
|
||||||
public static Set<String> getRecordKeys(List<HoodieRecord> hoodieRecords) {
|
public static Set<String> getRecordKeys(List<HoodieRecord> hoodieRecords) {
|
||||||
Set<String> keys = new HashSet<>();
|
Set<String> keys = new HashSet<>();
|
||||||
for (HoodieRecord rec: hoodieRecords) {
|
for (HoodieRecord rec : hoodieRecords) {
|
||||||
keys.add(rec.getRecordKey());
|
keys.add(rec.getRecordKey());
|
||||||
}
|
}
|
||||||
return keys;
|
return keys;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void fakeMetaFile(String basePath, String commitTime, String suffix) throws IOException {
|
private static void fakeMetaFile(String basePath, String commitTime, String suffix)
|
||||||
String parentPath = basePath + "/"+ HoodieTableMetaClient.METAFOLDER_NAME;
|
throws IOException {
|
||||||
|
String parentPath = basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME;
|
||||||
new File(parentPath).mkdirs();
|
new File(parentPath).mkdirs();
|
||||||
new File(parentPath + "/" + commitTime + suffix).createNewFile();
|
new File(parentPath + "/" + commitTime + suffix).createNewFile();
|
||||||
}
|
}
|
||||||
@@ -87,14 +85,17 @@ public class HoodieClientTestUtils {
|
|||||||
fakeMetaFile(basePath, commitTime, HoodieTimeline.INFLIGHT_EXTENSION);
|
fakeMetaFile(basePath, commitTime, HoodieTimeline.INFLIGHT_EXTENSION);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void fakeDataFile(String basePath, String partitionPath, String commitTime, String fileId) throws Exception {
|
public static void fakeDataFile(String basePath, String partitionPath, String commitTime,
|
||||||
|
String fileId) throws Exception {
|
||||||
fakeDataFile(basePath, partitionPath, commitTime, fileId, 0);
|
fakeDataFile(basePath, partitionPath, commitTime, fileId, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void fakeDataFile(String basePath, String partitionPath, String commitTime, String fileId, long length) throws Exception {
|
public static void fakeDataFile(String basePath, String partitionPath, String commitTime,
|
||||||
|
String fileId, long length) throws Exception {
|
||||||
String parentPath = String.format("%s/%s", basePath, partitionPath);
|
String parentPath = String.format("%s/%s", basePath, partitionPath);
|
||||||
new File(parentPath).mkdirs();
|
new File(parentPath).mkdirs();
|
||||||
String path = String.format("%s/%s", parentPath, FSUtils.makeDataFileName(commitTime, 0, fileId));
|
String path = String
|
||||||
|
.format("%s/%s", parentPath, FSUtils.makeDataFileName(commitTime, 0, fileId));
|
||||||
new File(path).createNewFile();
|
new File(path).createNewFile();
|
||||||
new RandomAccessFile(path, "rw").setLength(length);
|
new RandomAccessFile(path, "rw").setLength(length);
|
||||||
}
|
}
|
||||||
@@ -129,7 +130,8 @@ public class HoodieClientTestUtils {
|
|||||||
new HoodieException("No commit exists at " + commitTime);
|
new HoodieException("No commit exists at " + commitTime);
|
||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
HashMap<String, String> paths = getLatestFileIDsToFullPath(basePath, commitTimeline, Arrays.asList(commitInstant));
|
HashMap<String, String> paths = getLatestFileIDsToFullPath(basePath, commitTimeline,
|
||||||
|
Arrays.asList(commitInstant));
|
||||||
return sqlContext.read()
|
return sqlContext.read()
|
||||||
.parquet(paths.values().toArray(new String[paths.size()]))
|
.parquet(paths.values().toArray(new String[paths.size()]))
|
||||||
.filter(String.format("%s ='%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitTime));
|
.filter(String.format("%s ='%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitTime));
|
||||||
@@ -150,12 +152,15 @@ public class HoodieClientTestUtils {
|
|||||||
.getInstants().collect(Collectors.toList());
|
.getInstants().collect(Collectors.toList());
|
||||||
try {
|
try {
|
||||||
// Go over the commit metadata, and obtain the new files that need to be read.
|
// Go over the commit metadata, and obtain the new files that need to be read.
|
||||||
HashMap<String, String> fileIdToFullPath = getLatestFileIDsToFullPath(basePath, commitTimeline, commitsToReturn);
|
HashMap<String, String> fileIdToFullPath = getLatestFileIDsToFullPath(basePath,
|
||||||
|
commitTimeline, commitsToReturn);
|
||||||
return sqlContext.read()
|
return sqlContext.read()
|
||||||
.parquet(fileIdToFullPath.values().toArray(new String[fileIdToFullPath.size()]))
|
.parquet(fileIdToFullPath.values().toArray(new String[fileIdToFullPath.size()]))
|
||||||
.filter(String.format("%s >'%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTime));
|
.filter(
|
||||||
|
String.format("%s >'%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTime));
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
throw new HoodieException("Error pulling data incrementally from commitTimestamp :" + lastCommitTime, e);
|
throw new HoodieException(
|
||||||
|
"Error pulling data incrementally from commitTimestamp :" + lastCommitTime, e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -171,7 +176,8 @@ public class HoodieClientTestUtils {
|
|||||||
HoodieTable hoodieTable = HoodieTable
|
HoodieTable hoodieTable = HoodieTable
|
||||||
.getHoodieTable(new HoodieTableMetaClient(fs, basePath, true), null);
|
.getHoodieTable(new HoodieTableMetaClient(fs, basePath, true), null);
|
||||||
for (String path : paths) {
|
for (String path : paths) {
|
||||||
TableFileSystemView.ReadOptimizedView fileSystemView = new HoodieTableFileSystemView(hoodieTable.getMetaClient(),
|
TableFileSystemView.ReadOptimizedView fileSystemView = new HoodieTableFileSystemView(
|
||||||
|
hoodieTable.getMetaClient(),
|
||||||
hoodieTable.getCompletedCommitTimeline(), fs.globStatus(new Path(path)));
|
hoodieTable.getCompletedCommitTimeline(), fs.globStatus(new Path(path)));
|
||||||
List<HoodieDataFile> latestFiles = fileSystemView.getLatestDataFiles().collect(
|
List<HoodieDataFile> latestFiles = fileSystemView.getLatestDataFiles().collect(
|
||||||
Collectors.toList());
|
Collectors.toList());
|
||||||
|
|||||||
@@ -16,9 +16,16 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.common;
|
package com.uber.hoodie.common;
|
||||||
|
|
||||||
|
import static com.uber.hoodie.common.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA;
|
||||||
|
|
||||||
import com.uber.hoodie.common.util.FSUtils;
|
import com.uber.hoodie.common.util.FSUtils;
|
||||||
import com.uber.hoodie.common.util.HoodieAvroUtils;
|
import com.uber.hoodie.common.util.HoodieAvroUtils;
|
||||||
import com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat;
|
import com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
import org.apache.avro.Schema;
|
import org.apache.avro.Schema;
|
||||||
import org.apache.avro.generic.GenericRecord;
|
import org.apache.avro.generic.GenericRecord;
|
||||||
import org.apache.avro.generic.GenericRecordBuilder;
|
import org.apache.avro.generic.GenericRecordBuilder;
|
||||||
@@ -30,20 +37,13 @@ import org.apache.hadoop.mapred.InputSplit;
|
|||||||
import org.apache.hadoop.mapred.JobConf;
|
import org.apache.hadoop.mapred.JobConf;
|
||||||
import org.apache.hadoop.mapred.RecordReader;
|
import org.apache.hadoop.mapred.RecordReader;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import static com.uber.hoodie.common.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Utility methods to aid in testing MergeOnRead (workaround for HoodieReadClient for MOR)
|
* Utility methods to aid in testing MergeOnRead (workaround for HoodieReadClient for MOR)
|
||||||
*/
|
*/
|
||||||
public class HoodieMergeOnReadTestUtils {
|
public class HoodieMergeOnReadTestUtils {
|
||||||
|
|
||||||
public static List<GenericRecord> getRecordsUsingInputFormat(List<String> inputPaths) throws IOException {
|
public static List<GenericRecord> getRecordsUsingInputFormat(List<String> inputPaths)
|
||||||
|
throws IOException {
|
||||||
JobConf jobConf = new JobConf();
|
JobConf jobConf = new JobConf();
|
||||||
Schema schema = HoodieAvroUtils.addMetadataFields(Schema.parse(TRIP_EXAMPLE_SCHEMA));
|
Schema schema = HoodieAvroUtils.addMetadataFields(Schema.parse(TRIP_EXAMPLE_SCHEMA));
|
||||||
HoodieRealtimeInputFormat inputFormat = new HoodieRealtimeInputFormat();
|
HoodieRealtimeInputFormat inputFormat = new HoodieRealtimeInputFormat();
|
||||||
@@ -75,10 +75,12 @@ public class HoodieMergeOnReadTestUtils {
|
|||||||
}).get();
|
}).get();
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void setPropsForInputFormat(HoodieRealtimeInputFormat inputFormat, JobConf jobConf, Schema schema) {
|
private static void setPropsForInputFormat(HoodieRealtimeInputFormat inputFormat, JobConf jobConf,
|
||||||
|
Schema schema) {
|
||||||
List<Schema.Field> fields = schema.getFields();
|
List<Schema.Field> fields = schema.getFields();
|
||||||
String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
|
String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
|
||||||
String postions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
|
String postions = fields.stream().map(f -> String.valueOf(f.pos()))
|
||||||
|
.collect(Collectors.joining(","));
|
||||||
Configuration conf = FSUtils.getFs().getConf();
|
Configuration conf = FSUtils.getFs().getConf();
|
||||||
jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
|
jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
|
||||||
jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);
|
jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);
|
||||||
|
|||||||
@@ -16,17 +16,21 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.common;
|
package com.uber.hoodie.common;
|
||||||
|
|
||||||
import com.uber.hoodie.avro.model.HoodieCleanMetadata;
|
|
||||||
import com.uber.hoodie.common.model.HoodieCleaningPolicy;
|
|
||||||
import com.uber.hoodie.common.model.HoodieCommitMetadata;
|
import com.uber.hoodie.common.model.HoodieCommitMetadata;
|
||||||
import com.uber.hoodie.common.model.HoodieKey;
|
import com.uber.hoodie.common.model.HoodieKey;
|
||||||
import com.uber.hoodie.common.model.HoodiePartitionMetadata;
|
import com.uber.hoodie.common.model.HoodiePartitionMetadata;
|
||||||
import com.uber.hoodie.common.model.HoodieRecord;
|
import com.uber.hoodie.common.model.HoodieRecord;
|
||||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||||
import com.uber.hoodie.common.util.AvroUtils;
|
|
||||||
import com.uber.hoodie.common.util.FSUtils;
|
import com.uber.hoodie.common.util.FSUtils;
|
||||||
import com.uber.hoodie.common.util.HoodieAvroUtils;
|
import com.uber.hoodie.common.util.HoodieAvroUtils;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.Random;
|
||||||
|
import java.util.UUID;
|
||||||
import org.apache.avro.Schema;
|
import org.apache.avro.Schema;
|
||||||
import org.apache.avro.generic.GenericData;
|
import org.apache.avro.generic.GenericData;
|
||||||
import org.apache.avro.generic.GenericRecord;
|
import org.apache.avro.generic.GenericRecord;
|
||||||
@@ -34,15 +38,6 @@ import org.apache.hadoop.fs.FSDataOutputStream;
|
|||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.charset.StandardCharsets;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Optional;
|
|
||||||
import java.util.Random;
|
|
||||||
import java.util.UUID;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Class to be used in tests to keep generating test inserts and updates against a corpus.
|
* Class to be used in tests to keep generating test inserts and updates against a corpus.
|
||||||
*
|
*
|
||||||
@@ -51,6 +46,7 @@ import java.util.UUID;
|
|||||||
public class HoodieTestDataGenerator {
|
public class HoodieTestDataGenerator {
|
||||||
|
|
||||||
static class KeyPartition {
|
static class KeyPartition {
|
||||||
|
|
||||||
HoodieKey key;
|
HoodieKey key;
|
||||||
String partitionPath;
|
String partitionPath;
|
||||||
}
|
}
|
||||||
@@ -74,14 +70,17 @@ public class HoodieTestDataGenerator {
|
|||||||
public static final String[] DEFAULT_PARTITION_PATHS = {"2016/03/15", "2015/03/16", "2015/03/17"};
|
public static final String[] DEFAULT_PARTITION_PATHS = {"2016/03/15", "2015/03/16", "2015/03/17"};
|
||||||
|
|
||||||
|
|
||||||
public static void writePartitionMetadata(FileSystem fs, String[] partitionPaths, String basePath) {
|
public static void writePartitionMetadata(FileSystem fs, String[] partitionPaths,
|
||||||
for (String partitionPath: partitionPaths) {
|
String basePath) {
|
||||||
new HoodiePartitionMetadata(fs, "000", new Path(basePath), new Path(basePath, partitionPath)).trySave(0);
|
for (String partitionPath : partitionPaths) {
|
||||||
|
new HoodiePartitionMetadata(fs, "000", new Path(basePath), new Path(basePath, partitionPath))
|
||||||
|
.trySave(0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<KeyPartition> existingKeysList = new ArrayList<>();
|
private List<KeyPartition> existingKeysList = new ArrayList<>();
|
||||||
public static Schema avroSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA));
|
public static Schema avroSchema = HoodieAvroUtils
|
||||||
|
.addMetadataFields(new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA));
|
||||||
private static Random rand = new Random(46474747);
|
private static Random rand = new Random(46474747);
|
||||||
private String[] partitionPaths = DEFAULT_PARTITION_PATHS;
|
private String[] partitionPaths = DEFAULT_PARTITION_PATHS;
|
||||||
|
|
||||||
@@ -95,8 +94,8 @@ public class HoodieTestDataGenerator {
|
|||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generates new inserts, uniformly across the partition paths above. It also updates the list
|
* Generates new inserts, uniformly across the partition paths above. It also updates the list of
|
||||||
* of existing keys.
|
* existing keys.
|
||||||
*/
|
*/
|
||||||
public List<HoodieRecord> generateInserts(String commitTime, int n) throws IOException {
|
public List<HoodieRecord> generateInserts(String commitTime, int n) throws IOException {
|
||||||
List<HoodieRecord> inserts = new ArrayList<>();
|
List<HoodieRecord> inserts = new ArrayList<>();
|
||||||
@@ -119,9 +118,10 @@ public class HoodieTestDataGenerator {
|
|||||||
return generateDeletesFromExistingRecords(inserts);
|
return generateDeletesFromExistingRecords(inserts);
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<HoodieRecord> generateDeletesFromExistingRecords(List<HoodieRecord> existingRecords) throws IOException {
|
public List<HoodieRecord> generateDeletesFromExistingRecords(List<HoodieRecord> existingRecords)
|
||||||
|
throws IOException {
|
||||||
List<HoodieRecord> deletes = new ArrayList<>();
|
List<HoodieRecord> deletes = new ArrayList<>();
|
||||||
for (HoodieRecord existingRecord: existingRecords) {
|
for (HoodieRecord existingRecord : existingRecords) {
|
||||||
HoodieRecord record = generateDeleteRecord(existingRecord);
|
HoodieRecord record = generateDeleteRecord(existingRecord);
|
||||||
deletes.add(record);
|
deletes.add(record);
|
||||||
|
|
||||||
@@ -131,14 +131,17 @@ public class HoodieTestDataGenerator {
|
|||||||
|
|
||||||
public HoodieRecord generateDeleteRecord(HoodieRecord existingRecord) throws IOException {
|
public HoodieRecord generateDeleteRecord(HoodieRecord existingRecord) throws IOException {
|
||||||
HoodieKey key = existingRecord.getKey();
|
HoodieKey key = existingRecord.getKey();
|
||||||
TestRawTripPayload payload = new TestRawTripPayload(Optional.empty(), key.getRecordKey(), key.getPartitionPath(), null, true);
|
TestRawTripPayload payload = new TestRawTripPayload(Optional.empty(), key.getRecordKey(),
|
||||||
|
key.getPartitionPath(), null, true);
|
||||||
return new HoodieRecord(key, payload);
|
return new HoodieRecord(key, payload);
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<HoodieRecord> generateUpdates(String commitTime, List<HoodieRecord> baseRecords) throws IOException {
|
public List<HoodieRecord> generateUpdates(String commitTime, List<HoodieRecord> baseRecords)
|
||||||
|
throws IOException {
|
||||||
List<HoodieRecord> updates = new ArrayList<>();
|
List<HoodieRecord> updates = new ArrayList<>();
|
||||||
for (HoodieRecord baseRecord: baseRecords) {
|
for (HoodieRecord baseRecord : baseRecords) {
|
||||||
HoodieRecord record = new HoodieRecord(baseRecord.getKey(), generateRandomValue(baseRecord.getKey(), commitTime));
|
HoodieRecord record = new HoodieRecord(baseRecord.getKey(),
|
||||||
|
generateRandomValue(baseRecord.getKey(), commitTime));
|
||||||
updates.add(record);
|
updates.add(record);
|
||||||
}
|
}
|
||||||
return updates;
|
return updates;
|
||||||
@@ -162,11 +165,13 @@ public class HoodieTestDataGenerator {
|
|||||||
* Generates a new avro record of the above schema format, retaining the key if optionally
|
* Generates a new avro record of the above schema format, retaining the key if optionally
|
||||||
* provided.
|
* provided.
|
||||||
*/
|
*/
|
||||||
public static TestRawTripPayload generateRandomValue(HoodieKey key, String commitTime) throws IOException {
|
public static TestRawTripPayload generateRandomValue(HoodieKey key, String commitTime)
|
||||||
|
throws IOException {
|
||||||
GenericRecord rec = generateGenericRecord(key.getRecordKey(), "rider-" + commitTime,
|
GenericRecord rec = generateGenericRecord(key.getRecordKey(), "rider-" + commitTime,
|
||||||
"driver-" + commitTime, 0.0);
|
"driver-" + commitTime, 0.0);
|
||||||
HoodieAvroUtils.addCommitMetadataToRecord(rec, commitTime, "-1");
|
HoodieAvroUtils.addCommitMetadataToRecord(rec, commitTime, "-1");
|
||||||
return new TestRawTripPayload(rec.toString(), key.getRecordKey(), key.getPartitionPath(), TRIP_EXAMPLE_SCHEMA);
|
return new TestRawTripPayload(rec.toString(), key.getRecordKey(), key.getPartitionPath(),
|
||||||
|
TRIP_EXAMPLE_SCHEMA);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static GenericRecord generateGenericRecord(String rowKey, String riderName,
|
public static GenericRecord generateGenericRecord(String rowKey, String riderName,
|
||||||
@@ -186,7 +191,8 @@ public class HoodieTestDataGenerator {
|
|||||||
|
|
||||||
public static void createCommitFile(String basePath, String commitTime) throws IOException {
|
public static void createCommitFile(String basePath, String commitTime) throws IOException {
|
||||||
Path commitFile =
|
Path commitFile =
|
||||||
new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline.makeCommitFileName(commitTime));
|
new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline
|
||||||
|
.makeCommitFileName(commitTime));
|
||||||
FileSystem fs = FSUtils.getFs();
|
FileSystem fs = FSUtils.getFs();
|
||||||
FSDataOutputStream os = fs.create(commitFile, true);
|
FSDataOutputStream os = fs.create(commitFile, true);
|
||||||
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
|
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
|
||||||
|
|||||||
@@ -17,31 +17,32 @@
|
|||||||
package com.uber.hoodie.common;
|
package com.uber.hoodie.common;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import com.uber.hoodie.WriteStatus;
|
import com.uber.hoodie.WriteStatus;
|
||||||
import com.uber.hoodie.avro.MercifulJsonConverter;
|
import com.uber.hoodie.avro.MercifulJsonConverter;
|
||||||
import com.uber.hoodie.common.model.HoodieRecord;
|
import com.uber.hoodie.common.model.HoodieRecord;
|
||||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
|
import java.io.ByteArrayOutputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.StringWriter;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map.Entry;
|
|
||||||
import org.apache.avro.Schema;
|
|
||||||
import org.apache.avro.generic.IndexedRecord;
|
|
||||||
import org.apache.commons.io.IOUtils;
|
|
||||||
|
|
||||||
import java.io.*;
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Map.Entry;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.zip.Deflater;
|
import java.util.zip.Deflater;
|
||||||
import java.util.zip.DeflaterOutputStream;
|
import java.util.zip.DeflaterOutputStream;
|
||||||
import java.util.zip.InflaterInputStream;
|
import java.util.zip.InflaterInputStream;
|
||||||
|
import org.apache.avro.Schema;
|
||||||
|
import org.apache.avro.generic.IndexedRecord;
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Example row change event based on some example data used by testcases. The data avro schema is
|
* Example row change event based on some example data used by testcases. The data avro schema is
|
||||||
* src/test/resources/schema1.
|
* src/test/resources/schema1.
|
||||||
*/
|
*/
|
||||||
public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayload> {
|
public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayload> {
|
||||||
|
|
||||||
private transient static final ObjectMapper mapper = new ObjectMapper();
|
private transient static final ObjectMapper mapper = new ObjectMapper();
|
||||||
private String partitionPath;
|
private String partitionPath;
|
||||||
private String rowKey;
|
private String rowKey;
|
||||||
@@ -51,7 +52,7 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
|
|||||||
|
|
||||||
public TestRawTripPayload(Optional<String> jsonData, String rowKey, String partitionPath,
|
public TestRawTripPayload(Optional<String> jsonData, String rowKey, String partitionPath,
|
||||||
String schemaStr, Boolean isDeleted) throws IOException {
|
String schemaStr, Boolean isDeleted) throws IOException {
|
||||||
if(jsonData.isPresent()) {
|
if (jsonData.isPresent()) {
|
||||||
this.jsonDataCompressed = compressData(jsonData.get());
|
this.jsonDataCompressed = compressData(jsonData.get());
|
||||||
this.dataSize = jsonData.get().length();
|
this.dataSize = jsonData.get().length();
|
||||||
}
|
}
|
||||||
@@ -61,7 +62,7 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
|
|||||||
}
|
}
|
||||||
|
|
||||||
public TestRawTripPayload(String jsonData, String rowKey, String partitionPath,
|
public TestRawTripPayload(String jsonData, String rowKey, String partitionPath,
|
||||||
String schemaStr)throws IOException {
|
String schemaStr) throws IOException {
|
||||||
this(Optional.of(jsonData), rowKey, partitionPath, schemaStr, false);
|
this(Optional.of(jsonData), rowKey, partitionPath, schemaStr, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -79,16 +80,20 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override public TestRawTripPayload preCombine(TestRawTripPayload another) {
|
@Override
|
||||||
|
public TestRawTripPayload preCombine(TestRawTripPayload another) {
|
||||||
return another;
|
return another;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public Optional<IndexedRecord> combineAndGetUpdateValue(IndexedRecord oldRec, Schema schema) throws IOException {
|
@Override
|
||||||
|
public Optional<IndexedRecord> combineAndGetUpdateValue(IndexedRecord oldRec, Schema schema)
|
||||||
|
throws IOException {
|
||||||
return this.getInsertValue(schema);
|
return this.getInsertValue(schema);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public Optional<IndexedRecord> getInsertValue(Schema schema) throws IOException {
|
@Override
|
||||||
if(isDeleted){
|
public Optional<IndexedRecord> getInsertValue(Schema schema) throws IOException {
|
||||||
|
if (isDeleted) {
|
||||||
return Optional.empty();
|
return Optional.empty();
|
||||||
} else {
|
} else {
|
||||||
MercifulJsonConverter jsonConverter = new MercifulJsonConverter(schema);
|
MercifulJsonConverter jsonConverter = new MercifulJsonConverter(schema);
|
||||||
@@ -135,16 +140,17 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A custom {@link WriteStatus} that merges passed metadata key value map
|
* A custom {@link WriteStatus} that merges passed metadata key value map to {@code
|
||||||
* to {@code WriteStatus.markSuccess()} and {@code WriteStatus.markFailure()}.
|
* WriteStatus.markSuccess()} and {@code WriteStatus.markFailure()}.
|
||||||
*/
|
*/
|
||||||
public static class MetadataMergeWriteStatus extends WriteStatus {
|
public static class MetadataMergeWriteStatus extends WriteStatus {
|
||||||
|
|
||||||
private Map<String, String> mergedMetadataMap = new HashMap<>();
|
private Map<String, String> mergedMetadataMap = new HashMap<>();
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void markSuccess(HoodieRecord record, Optional<Map<String, String>> recordMetadata) {
|
public void markSuccess(HoodieRecord record, Optional<Map<String, String>> recordMetadata) {
|
||||||
super.markSuccess(record, recordMetadata);
|
super.markSuccess(record, recordMetadata);
|
||||||
if(recordMetadata.isPresent()) {
|
if (recordMetadata.isPresent()) {
|
||||||
mergeMetadataMaps(recordMetadata.get(), mergedMetadataMap);
|
mergeMetadataMaps(recordMetadata.get(), mergedMetadataMap);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -153,25 +159,27 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
|
|||||||
public void markFailure(HoodieRecord record, Throwable t,
|
public void markFailure(HoodieRecord record, Throwable t,
|
||||||
Optional<Map<String, String>> recordMetadata) {
|
Optional<Map<String, String>> recordMetadata) {
|
||||||
super.markFailure(record, t, recordMetadata);
|
super.markFailure(record, t, recordMetadata);
|
||||||
if(recordMetadata.isPresent()) {
|
if (recordMetadata.isPresent()) {
|
||||||
mergeMetadataMaps(recordMetadata.get(), mergedMetadataMap);
|
mergeMetadataMaps(recordMetadata.get(), mergedMetadataMap);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Map<String, String> mergeMetadataForWriteStatuses(List<WriteStatus> writeStatuses) {
|
public static Map<String, String> mergeMetadataForWriteStatuses(
|
||||||
|
List<WriteStatus> writeStatuses) {
|
||||||
Map<String, String> allWriteStatusMergedMetadataMap = new HashMap<>();
|
Map<String, String> allWriteStatusMergedMetadataMap = new HashMap<>();
|
||||||
for (WriteStatus writeStatus : writeStatuses) {
|
for (WriteStatus writeStatus : writeStatuses) {
|
||||||
MetadataMergeWriteStatus.mergeMetadataMaps(
|
MetadataMergeWriteStatus.mergeMetadataMaps(
|
||||||
((MetadataMergeWriteStatus)writeStatus).getMergedMetadataMap(),
|
((MetadataMergeWriteStatus) writeStatus).getMergedMetadataMap(),
|
||||||
allWriteStatusMergedMetadataMap);
|
allWriteStatusMergedMetadataMap);
|
||||||
}
|
}
|
||||||
return allWriteStatusMergedMetadataMap;
|
return allWriteStatusMergedMetadataMap;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void mergeMetadataMaps(Map<String, String> mergeFromMap, Map<String, String> mergeToMap) {
|
private static void mergeMetadataMaps(Map<String, String> mergeFromMap,
|
||||||
|
Map<String, String> mergeToMap) {
|
||||||
for (Entry<String, String> entry : mergeFromMap.entrySet()) {
|
for (Entry<String, String> entry : mergeFromMap.entrySet()) {
|
||||||
String key = entry.getKey();
|
String key = entry.getKey();
|
||||||
if(!mergeToMap.containsKey(key)) {
|
if (!mergeToMap.containsKey(key)) {
|
||||||
mergeToMap.put(key, "0");
|
mergeToMap.put(key, "0");
|
||||||
}
|
}
|
||||||
mergeToMap
|
mergeToMap
|
||||||
|
|||||||
@@ -16,7 +16,7 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.config;
|
package com.uber.hoodie.config;
|
||||||
|
|
||||||
import static org.junit.Assert.*;
|
import static org.junit.Assert.assertEquals;
|
||||||
|
|
||||||
import com.google.common.collect.Maps;
|
import com.google.common.collect.Maps;
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig.Builder;
|
import com.uber.hoodie.config.HoodieWriteConfig.Builder;
|
||||||
@@ -29,6 +29,7 @@ import java.util.Properties;
|
|||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
public class HoodieWriteConfigTest {
|
public class HoodieWriteConfigTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testPropertyLoading() throws IOException {
|
public void testPropertyLoading() throws IOException {
|
||||||
Builder builder = HoodieWriteConfig.newBuilder().withPath("/tmp");
|
Builder builder = HoodieWriteConfig.newBuilder().withPath("/tmp");
|
||||||
@@ -46,9 +47,10 @@ public class HoodieWriteConfigTest {
|
|||||||
HoodieWriteConfig config = builder.build();
|
HoodieWriteConfig config = builder.build();
|
||||||
assertEquals(config.getMaxCommitsToKeep(), 5);
|
assertEquals(config.getMaxCommitsToKeep(), 5);
|
||||||
assertEquals(config.getMinCommitsToKeep(), 2);
|
assertEquals(config.getMinCommitsToKeep(), 2);
|
||||||
}
|
}
|
||||||
|
|
||||||
private ByteArrayOutputStream saveParamsIntoOutputStream(Map<String, String> params) throws IOException {
|
private ByteArrayOutputStream saveParamsIntoOutputStream(Map<String, String> params)
|
||||||
|
throws IOException {
|
||||||
Properties properties = new Properties();
|
Properties properties = new Properties();
|
||||||
properties.putAll(params);
|
properties.putAll(params);
|
||||||
ByteArrayOutputStream outStream = new ByteArrayOutputStream();
|
ByteArrayOutputStream outStream = new ByteArrayOutputStream();
|
||||||
|
|||||||
@@ -16,31 +16,30 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.func;
|
package com.uber.hoodie.func;
|
||||||
|
|
||||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
import static org.junit.Assert.fail;
|
||||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
|
||||||
import com.uber.hoodie.WriteStatus;
|
import com.uber.hoodie.WriteStatus;
|
||||||
import com.uber.hoodie.common.TestRawTripPayload;
|
import com.uber.hoodie.common.TestRawTripPayload;
|
||||||
import com.uber.hoodie.common.model.HoodieKey;
|
import com.uber.hoodie.common.model.HoodieKey;
|
||||||
import com.uber.hoodie.common.model.HoodieRecord;
|
import com.uber.hoodie.common.model.HoodieRecord;
|
||||||
import com.uber.hoodie.common.model.HoodieRecordLocation;
|
import com.uber.hoodie.common.model.HoodieRecordLocation;
|
||||||
import com.uber.hoodie.common.model.HoodieTestUtils;
|
import com.uber.hoodie.common.model.HoodieTestUtils;
|
||||||
|
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||||
|
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||||
import com.uber.hoodie.common.util.FSUtils;
|
import com.uber.hoodie.common.util.FSUtils;
|
||||||
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.table.HoodieCopyOnWriteTable;
|
import com.uber.hoodie.table.HoodieCopyOnWriteTable;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.List;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
import org.junit.rules.TemporaryFolder;
|
import org.junit.rules.TemporaryFolder;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import static org.junit.Assert.fail;
|
|
||||||
|
|
||||||
public class TestUpdateMapFunction {
|
public class TestUpdateMapFunction {
|
||||||
|
|
||||||
private String basePath = null;
|
private String basePath = null;
|
||||||
|
|
||||||
@Before
|
@Before
|
||||||
@@ -90,7 +89,6 @@ public class TestUpdateMapFunction {
|
|||||||
String fileId = insertResult.next().get(0).getFileId();
|
String fileId = insertResult.next().get(0).getFileId();
|
||||||
System.out.println(fileId);
|
System.out.println(fileId);
|
||||||
|
|
||||||
|
|
||||||
table = new HoodieCopyOnWriteTable(config, metadata);
|
table = new HoodieCopyOnWriteTable(config, metadata);
|
||||||
// New content with values for the newly added field
|
// New content with values for the newly added field
|
||||||
recordStr1 =
|
recordStr1 =
|
||||||
|
|||||||
@@ -16,17 +16,16 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.index;
|
package com.uber.hoodie.index;
|
||||||
|
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
import static org.junit.Assert.assertTrue;
|
||||||
|
|
||||||
import com.uber.hoodie.config.HoodieIndexConfig;
|
import com.uber.hoodie.config.HoodieIndexConfig;
|
||||||
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.index.bloom.HoodieBloomIndex;
|
import com.uber.hoodie.index.bloom.HoodieBloomIndex;
|
||||||
import com.uber.hoodie.index.hbase.HBaseIndex;
|
import com.uber.hoodie.index.hbase.HBaseIndex;
|
||||||
|
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
import static org.junit.Assert.*;
|
|
||||||
|
|
||||||
public class TestHoodieIndex {
|
public class TestHoodieIndex {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testCreateIndex() throws Exception {
|
public void testCreateIndex() throws Exception {
|
||||||
HoodieWriteConfig.Builder clientConfigBuilder = HoodieWriteConfig.newBuilder();
|
HoodieWriteConfig.Builder clientConfigBuilder = HoodieWriteConfig.newBuilder();
|
||||||
|
|||||||
@@ -18,28 +18,39 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.index.bloom;
|
package com.uber.hoodie.index.bloom;
|
||||||
|
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
|
import static org.junit.Assert.assertFalse;
|
||||||
|
import static org.junit.Assert.assertNotNull;
|
||||||
|
import static org.junit.Assert.assertNull;
|
||||||
|
import static org.junit.Assert.assertTrue;
|
||||||
|
import static org.junit.Assert.fail;
|
||||||
|
|
||||||
import com.google.common.base.Optional;
|
import com.google.common.base.Optional;
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
|
|
||||||
import com.uber.hoodie.common.HoodieClientTestUtils;
|
|
||||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
|
||||||
import com.uber.hoodie.config.HoodieIndexConfig;
|
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
|
||||||
import com.uber.hoodie.avro.HoodieAvroWriteSupport;
|
import com.uber.hoodie.avro.HoodieAvroWriteSupport;
|
||||||
import com.uber.hoodie.common.BloomFilter;
|
import com.uber.hoodie.common.BloomFilter;
|
||||||
|
import com.uber.hoodie.common.HoodieClientTestUtils;
|
||||||
import com.uber.hoodie.common.TestRawTripPayload;
|
import com.uber.hoodie.common.TestRawTripPayload;
|
||||||
import com.uber.hoodie.common.model.HoodieKey;
|
import com.uber.hoodie.common.model.HoodieKey;
|
||||||
import com.uber.hoodie.common.model.HoodieRecord;
|
import com.uber.hoodie.common.model.HoodieRecord;
|
||||||
import com.uber.hoodie.common.model.HoodieTestUtils;
|
import com.uber.hoodie.common.model.HoodieTestUtils;
|
||||||
|
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||||
import com.uber.hoodie.common.util.FSUtils;
|
import com.uber.hoodie.common.util.FSUtils;
|
||||||
import com.uber.hoodie.common.util.HoodieAvroUtils;
|
import com.uber.hoodie.common.util.HoodieAvroUtils;
|
||||||
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.index.bloom.BloomIndexFileInfo;
|
|
||||||
import com.uber.hoodie.index.bloom.HoodieBloomIndex;
|
|
||||||
import com.uber.hoodie.index.bloom.HoodieBloomIndexCheckFunction;
|
|
||||||
import com.uber.hoodie.io.storage.HoodieParquetConfig;
|
import com.uber.hoodie.io.storage.HoodieParquetConfig;
|
||||||
import com.uber.hoodie.io.storage.HoodieParquetWriter;
|
import com.uber.hoodie.io.storage.HoodieParquetWriter;
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.text.SimpleDateFormat;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Date;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.UUID;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
import org.apache.avro.Schema;
|
import org.apache.avro.Schema;
|
||||||
import org.apache.avro.generic.GenericRecord;
|
import org.apache.avro.generic.GenericRecord;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
@@ -47,11 +58,8 @@ import org.apache.hadoop.conf.Configuration;
|
|||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
import org.apache.parquet.avro.AvroSchemaConverter;
|
import org.apache.parquet.avro.AvroSchemaConverter;
|
||||||
import org.apache.parquet.avro.AvroWriteSupport;
|
|
||||||
import org.apache.parquet.hadoop.ParquetWriter;
|
import org.apache.parquet.hadoop.ParquetWriter;
|
||||||
import org.apache.parquet.hadoop.api.WriteSupport;
|
|
||||||
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
|
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
|
||||||
import org.apache.spark.SparkConf;
|
|
||||||
import org.apache.spark.api.java.JavaPairRDD;
|
import org.apache.spark.api.java.JavaPairRDD;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
@@ -59,20 +67,10 @@ import org.junit.After;
|
|||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
import org.junit.rules.TemporaryFolder;
|
import org.junit.rules.TemporaryFolder;
|
||||||
import org.mockito.Mockito;
|
|
||||||
|
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.text.SimpleDateFormat;
|
|
||||||
import java.util.*;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
import java.util.stream.Stream;
|
|
||||||
|
|
||||||
import static org.junit.Assert.*;
|
|
||||||
|
|
||||||
public class TestHoodieBloomIndex {
|
public class TestHoodieBloomIndex {
|
||||||
|
|
||||||
private JavaSparkContext jsc = null;
|
private JavaSparkContext jsc = null;
|
||||||
private String basePath = null;
|
private String basePath = null;
|
||||||
private transient final FileSystem fs;
|
private transient final FileSystem fs;
|
||||||
@@ -106,15 +104,20 @@ public class TestHoodieBloomIndex {
|
|||||||
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
|
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
|
||||||
|
|
||||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||||
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
HoodieRecord record1 = new HoodieRecord(
|
||||||
|
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
||||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
||||||
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
HoodieRecord record2 = new HoodieRecord(
|
||||||
|
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
||||||
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
||||||
HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
|
HoodieRecord record3 = new HoodieRecord(
|
||||||
|
new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
|
||||||
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
|
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
|
||||||
HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
|
HoodieRecord record4 = new HoodieRecord(
|
||||||
|
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
|
||||||
|
|
||||||
JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4));
|
JavaRDD<HoodieRecord> recordRDD = jsc
|
||||||
|
.parallelize(Arrays.asList(record1, record2, record3, record4));
|
||||||
|
|
||||||
// Load to memory
|
// Load to memory
|
||||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
|
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
|
||||||
@@ -144,20 +147,31 @@ public class TestHoodieBloomIndex {
|
|||||||
new File(basePath + "/2016/04/01").mkdirs();
|
new File(basePath + "/2016/04/01").mkdirs();
|
||||||
new File(basePath + "/2015/03/12").mkdirs();
|
new File(basePath + "/2015/03/12").mkdirs();
|
||||||
|
|
||||||
TestRawTripPayload rowChange1 = new TestRawTripPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
|
TestRawTripPayload rowChange1 = new TestRawTripPayload(
|
||||||
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
"{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
|
||||||
TestRawTripPayload rowChange2 = new TestRawTripPayload("{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
|
HoodieRecord record1 = new HoodieRecord(
|
||||||
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
||||||
TestRawTripPayload rowChange3 = new TestRawTripPayload("{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
|
TestRawTripPayload rowChange2 = new TestRawTripPayload(
|
||||||
HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
|
"{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
|
||||||
TestRawTripPayload rowChange4 = new TestRawTripPayload("{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
|
HoodieRecord record2 = new HoodieRecord(
|
||||||
HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
|
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
||||||
|
TestRawTripPayload rowChange3 = new TestRawTripPayload(
|
||||||
|
"{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
|
||||||
|
HoodieRecord record3 = new HoodieRecord(
|
||||||
|
new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
|
||||||
|
TestRawTripPayload rowChange4 = new TestRawTripPayload(
|
||||||
|
"{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
|
||||||
|
HoodieRecord record4 = new HoodieRecord(
|
||||||
|
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
|
||||||
|
|
||||||
|
writeParquetFile("2016/04/01", "2_0_20160401010101.parquet", Lists.newArrayList(), schema, null,
|
||||||
writeParquetFile("2016/04/01","2_0_20160401010101.parquet", Lists.newArrayList(), schema, null, false);
|
false);
|
||||||
writeParquetFile("2015/03/12","1_0_20150312101010.parquet", Lists.newArrayList(), schema, null, false);
|
writeParquetFile("2015/03/12", "1_0_20150312101010.parquet", Lists.newArrayList(), schema, null,
|
||||||
writeParquetFile("2015/03/12","3_0_20150312101010.parquet", Arrays.asList(record1), schema, null, false);
|
false);
|
||||||
writeParquetFile("2015/03/12","4_0_20150312101010.parquet", Arrays.asList(record2, record3, record4), schema, null, false);
|
writeParquetFile("2015/03/12", "3_0_20150312101010.parquet", Arrays.asList(record1), schema,
|
||||||
|
null, false);
|
||||||
|
writeParquetFile("2015/03/12", "4_0_20150312101010.parquet",
|
||||||
|
Arrays.asList(record2, record3, record4), schema, null, false);
|
||||||
|
|
||||||
List<String> partitions = Arrays.asList("2016/01/21", "2016/04/01", "2015/03/12");
|
List<String> partitions = Arrays.asList("2016/01/21", "2016/04/01", "2015/03/12");
|
||||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||||
@@ -185,9 +199,11 @@ public class TestHoodieBloomIndex {
|
|||||||
|
|
||||||
List<Tuple2<String, BloomIndexFileInfo>> expected = Arrays.asList(
|
List<Tuple2<String, BloomIndexFileInfo>> expected = Arrays.asList(
|
||||||
new Tuple2<>("2016/04/01", new BloomIndexFileInfo("2_0_20160401010101.parquet")),
|
new Tuple2<>("2016/04/01", new BloomIndexFileInfo("2_0_20160401010101.parquet")),
|
||||||
new Tuple2<>("2015/03/12",new BloomIndexFileInfo("1_0_20150312101010.parquet")),
|
new Tuple2<>("2015/03/12", new BloomIndexFileInfo("1_0_20150312101010.parquet")),
|
||||||
new Tuple2<>("2015/03/12",new BloomIndexFileInfo("3_0_20150312101010.parquet", "000", "000")),
|
new Tuple2<>("2015/03/12",
|
||||||
new Tuple2<>("2015/03/12",new BloomIndexFileInfo("4_0_20150312101010.parquet", "001", "003"))
|
new BloomIndexFileInfo("3_0_20150312101010.parquet", "000", "000")),
|
||||||
|
new Tuple2<>("2015/03/12",
|
||||||
|
new BloomIndexFileInfo("4_0_20150312101010.parquet", "001", "003"))
|
||||||
);
|
);
|
||||||
assertEquals(expected, filesList);
|
assertEquals(expected, filesList);
|
||||||
}
|
}
|
||||||
@@ -200,7 +216,6 @@ public class TestHoodieBloomIndex {
|
|||||||
.build();
|
.build();
|
||||||
HoodieBloomIndex index = new HoodieBloomIndex(config, jsc);
|
HoodieBloomIndex index = new HoodieBloomIndex(config, jsc);
|
||||||
|
|
||||||
|
|
||||||
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo = new HashMap<>();
|
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo = new HashMap<>();
|
||||||
partitionToFileIndexInfo.put("2017/10/22", Arrays.asList(
|
partitionToFileIndexInfo.put("2017/10/22", Arrays.asList(
|
||||||
new BloomIndexFileInfo("f1"),
|
new BloomIndexFileInfo("f1"),
|
||||||
@@ -212,14 +227,13 @@ public class TestHoodieBloomIndex {
|
|||||||
|
|
||||||
JavaPairRDD<String, String> partitionRecordKeyPairRDD = jsc
|
JavaPairRDD<String, String> partitionRecordKeyPairRDD = jsc
|
||||||
.parallelize(Arrays.asList(
|
.parallelize(Arrays.asList(
|
||||||
new Tuple2<>("2017/10/22","003"),
|
new Tuple2<>("2017/10/22", "003"),
|
||||||
new Tuple2<>("2017/10/22","002"),
|
new Tuple2<>("2017/10/22", "002"),
|
||||||
new Tuple2<>("2017/10/22","005"),
|
new Tuple2<>("2017/10/22", "005"),
|
||||||
new Tuple2<>("2017/10/22","004")
|
new Tuple2<>("2017/10/22", "004")
|
||||||
))
|
))
|
||||||
.mapToPair(t -> t);
|
.mapToPair(t -> t);
|
||||||
|
|
||||||
|
|
||||||
List<Tuple2<String, Tuple2<String, HoodieKey>>> comparisonKeyList = index
|
List<Tuple2<String, Tuple2<String, HoodieKey>>> comparisonKeyList = index
|
||||||
.explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD)
|
.explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD)
|
||||||
.collect();
|
.collect();
|
||||||
@@ -240,7 +254,8 @@ public class TestHoodieBloomIndex {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testCheckUUIDsAgainstOneFile() throws IOException, InterruptedException, ClassNotFoundException {
|
public void testCheckUUIDsAgainstOneFile()
|
||||||
|
throws IOException, InterruptedException, ClassNotFoundException {
|
||||||
|
|
||||||
// Create some records to use
|
// Create some records to use
|
||||||
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
||||||
@@ -248,19 +263,23 @@ public class TestHoodieBloomIndex {
|
|||||||
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
||||||
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":32}";
|
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":32}";
|
||||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||||
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
HoodieRecord record1 = new HoodieRecord(
|
||||||
|
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
||||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
||||||
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
HoodieRecord record2 = new HoodieRecord(
|
||||||
|
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
||||||
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
||||||
HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
|
HoodieRecord record3 = new HoodieRecord(
|
||||||
|
new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
|
||||||
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
|
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
|
||||||
HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
|
HoodieRecord record4 = new HoodieRecord(
|
||||||
|
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
|
||||||
|
|
||||||
// We write record1, record2 to a parquet file, but the bloom filter contains (record1, record2, record3).
|
// We write record1, record2 to a parquet file, but the bloom filter contains (record1, record2, record3).
|
||||||
BloomFilter filter = new BloomFilter(10000, 0.0000001);
|
BloomFilter filter = new BloomFilter(10000, 0.0000001);
|
||||||
filter.add(record3.getRecordKey());
|
filter.add(record3.getRecordKey());
|
||||||
String filename = writeParquetFile("2016/01/31", Arrays.asList(record1, record2), schema, filter, true);
|
String filename = writeParquetFile("2016/01/31", Arrays.asList(record1, record2), schema,
|
||||||
|
filter, true);
|
||||||
|
|
||||||
// The bloom filter contains 3 records
|
// The bloom filter contains 3 records
|
||||||
assertTrue(filter.mightContain(record1.getRecordKey()));
|
assertTrue(filter.mightContain(record1.getRecordKey()));
|
||||||
@@ -299,7 +318,8 @@ public class TestHoodieBloomIndex {
|
|||||||
try {
|
try {
|
||||||
bloomIndex.tagLocation(recordRDD, table);
|
bloomIndex.tagLocation(recordRDD, table);
|
||||||
} catch (IllegalArgumentException e) {
|
} catch (IllegalArgumentException e) {
|
||||||
fail("EmptyRDD should not result in IllegalArgumentException: Positive number of slices required");
|
fail(
|
||||||
|
"EmptyRDD should not result in IllegalArgumentException: Positive number of slices required");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -313,14 +333,19 @@ public class TestHoodieBloomIndex {
|
|||||||
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
||||||
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
|
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
|
||||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||||
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
HoodieRecord record1 = new HoodieRecord(
|
||||||
|
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
||||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
||||||
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
HoodieRecord record2 = new HoodieRecord(
|
||||||
|
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
||||||
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
||||||
HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
|
HoodieRecord record3 = new HoodieRecord(
|
||||||
|
new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
|
||||||
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
|
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
|
||||||
HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
|
HoodieRecord record4 = new HoodieRecord(
|
||||||
JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4));
|
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
|
||||||
|
JavaRDD<HoodieRecord> recordRDD = jsc
|
||||||
|
.parallelize(Arrays.asList(record1, record2, record3, record4));
|
||||||
|
|
||||||
// Also create the metadata and config
|
// Also create the metadata and config
|
||||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||||
@@ -390,7 +415,8 @@ public class TestHoodieBloomIndex {
|
|||||||
|
|
||||||
// Let's tag
|
// Let's tag
|
||||||
HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc);
|
HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc);
|
||||||
JavaPairRDD<HoodieKey, Optional<String>> taggedRecordRDD = bloomIndex.fetchRecordLocation(keysRDD, table);
|
JavaPairRDD<HoodieKey, Optional<String>> taggedRecordRDD = bloomIndex
|
||||||
|
.fetchRecordLocation(keysRDD, table);
|
||||||
|
|
||||||
// Should not find any files
|
// Should not find any files
|
||||||
for (Tuple2<HoodieKey, Optional<String>> record : taggedRecordRDD.collect()) {
|
for (Tuple2<HoodieKey, Optional<String>> record : taggedRecordRDD.collect()) {
|
||||||
@@ -436,9 +462,11 @@ public class TestHoodieBloomIndex {
|
|||||||
|
|
||||||
// We write record1 to a parquet file, using a bloom filter having both records
|
// We write record1 to a parquet file, using a bloom filter having both records
|
||||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||||
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
HoodieRecord record1 = new HoodieRecord(
|
||||||
|
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
||||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
||||||
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
HoodieRecord record2 = new HoodieRecord(
|
||||||
|
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
||||||
|
|
||||||
BloomFilter filter = new BloomFilter(10000, 0.0000001);
|
BloomFilter filter = new BloomFilter(10000, 0.0000001);
|
||||||
filter.add(record2.getRecordKey());
|
filter.add(record2.getRecordKey());
|
||||||
@@ -472,21 +500,22 @@ public class TestHoodieBloomIndex {
|
|||||||
String fileId = UUID.randomUUID().toString();
|
String fileId = UUID.randomUUID().toString();
|
||||||
String filename = FSUtils.makeDataFileName(commitTime, 1, fileId);
|
String filename = FSUtils.makeDataFileName(commitTime, 1, fileId);
|
||||||
|
|
||||||
|
|
||||||
return writeParquetFile(partitionPath, filename, records, schema, filter, createCommitTime);
|
return writeParquetFile(partitionPath, filename, records, schema, filter, createCommitTime);
|
||||||
}
|
}
|
||||||
|
|
||||||
private String writeParquetFile(String partitionPath, String filename, List<HoodieRecord> records, Schema schema,
|
private String writeParquetFile(String partitionPath, String filename, List<HoodieRecord> records,
|
||||||
|
Schema schema,
|
||||||
BloomFilter filter, boolean createCommitTime) throws IOException {
|
BloomFilter filter, boolean createCommitTime) throws IOException {
|
||||||
|
|
||||||
|
|
||||||
if (filter == null) {
|
if (filter == null) {
|
||||||
filter = new BloomFilter(10000, 0.0000001);
|
filter = new BloomFilter(10000, 0.0000001);
|
||||||
}
|
}
|
||||||
HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter);
|
HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(
|
||||||
|
new AvroSchemaConverter().convert(schema), schema, filter);
|
||||||
String commitTime = FSUtils.getCommitTime(filename);
|
String commitTime = FSUtils.getCommitTime(filename);
|
||||||
HoodieParquetConfig config = new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP,
|
HoodieParquetConfig config = new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP,
|
||||||
ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024, new Configuration());
|
ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024,
|
||||||
|
new Configuration());
|
||||||
HoodieParquetWriter writer = new HoodieParquetWriter(
|
HoodieParquetWriter writer = new HoodieParquetWriter(
|
||||||
commitTime,
|
commitTime,
|
||||||
new Path(basePath + "/" + partitionPath + "/" + filename),
|
new Path(basePath + "/" + partitionPath + "/" + filename),
|
||||||
@@ -496,7 +525,9 @@ public class TestHoodieBloomIndex {
|
|||||||
for (HoodieRecord record : records) {
|
for (HoodieRecord record : records) {
|
||||||
GenericRecord avroRecord = (GenericRecord) record.getData().getInsertValue(schema).get();
|
GenericRecord avroRecord = (GenericRecord) record.getData().getInsertValue(schema).get();
|
||||||
HoodieAvroUtils.addCommitMetadataToRecord(avroRecord, commitTime, "" + seqId++);
|
HoodieAvroUtils.addCommitMetadataToRecord(avroRecord, commitTime, "" + seqId++);
|
||||||
HoodieAvroUtils.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(), filename);
|
HoodieAvroUtils
|
||||||
|
.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(),
|
||||||
|
filename);
|
||||||
writer.writeAvro(record.getRecordKey(), avroRecord);
|
writer.writeAvro(record.getRecordKey(), avroRecord);
|
||||||
filter.add(record.getRecordKey());
|
filter.add(record.getRecordKey());
|
||||||
}
|
}
|
||||||
@@ -505,7 +536,9 @@ public class TestHoodieBloomIndex {
|
|||||||
if (createCommitTime) {
|
if (createCommitTime) {
|
||||||
// Also make sure the commit is valid
|
// Also make sure the commit is valid
|
||||||
new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME).mkdirs();
|
new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME).mkdirs();
|
||||||
new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitTime + ".commit").createNewFile();
|
new File(
|
||||||
|
basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitTime + ".commit")
|
||||||
|
.createNewFile();
|
||||||
}
|
}
|
||||||
return filename;
|
return filename;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -16,9 +16,11 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.io;
|
package com.uber.hoodie.io;
|
||||||
|
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
|
import static org.junit.Assert.assertTrue;
|
||||||
|
|
||||||
import com.uber.hoodie.avro.model.HoodieArchivedMetaEntry;
|
import com.uber.hoodie.avro.model.HoodieArchivedMetaEntry;
|
||||||
import com.uber.hoodie.common.HoodieTestDataGenerator;
|
import com.uber.hoodie.common.HoodieTestDataGenerator;
|
||||||
import com.uber.hoodie.common.model.HoodieArchivedLogFile;
|
|
||||||
import com.uber.hoodie.common.model.HoodieLogFile;
|
import com.uber.hoodie.common.model.HoodieLogFile;
|
||||||
import com.uber.hoodie.common.model.HoodieTestUtils;
|
import com.uber.hoodie.common.model.HoodieTestUtils;
|
||||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||||
@@ -29,6 +31,11 @@ import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
|||||||
import com.uber.hoodie.common.util.FSUtils;
|
import com.uber.hoodie.common.util.FSUtils;
|
||||||
import com.uber.hoodie.config.HoodieCompactionConfig;
|
import com.uber.hoodie.config.HoodieCompactionConfig;
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
import org.apache.avro.generic.GenericRecord;
|
import org.apache.avro.generic.GenericRecord;
|
||||||
import org.apache.avro.generic.IndexedRecord;
|
import org.apache.avro.generic.IndexedRecord;
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
@@ -37,16 +44,8 @@ import org.junit.Before;
|
|||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
import org.junit.rules.TemporaryFolder;
|
import org.junit.rules.TemporaryFolder;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import static org.junit.Assert.assertEquals;
|
|
||||||
import static org.junit.Assert.assertTrue;
|
|
||||||
|
|
||||||
public class TestHoodieCommitArchiveLog {
|
public class TestHoodieCommitArchiveLog {
|
||||||
|
|
||||||
private String basePath;
|
private String basePath;
|
||||||
private FileSystem fs;
|
private FileSystem fs;
|
||||||
|
|
||||||
@@ -97,7 +96,8 @@ public class TestHoodieCommitArchiveLog {
|
|||||||
HoodieTestUtils.createCleanFiles(basePath, "105");
|
HoodieTestUtils.createCleanFiles(basePath, "105");
|
||||||
|
|
||||||
//reload the timeline and get all the commmits before archive
|
//reload the timeline and get all the commmits before archive
|
||||||
timeline = metadata.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants();
|
timeline = metadata.getActiveTimeline().reload().getAllCommitsTimeline()
|
||||||
|
.filterCompletedInstants();
|
||||||
List<HoodieInstant> originalCommits = timeline.getInstants().collect(Collectors.toList());
|
List<HoodieInstant> originalCommits = timeline.getInstants().collect(Collectors.toList());
|
||||||
|
|
||||||
assertEquals("Loaded 6 commits and the count should match", 12, timeline.countInstants());
|
assertEquals("Loaded 6 commits and the count should match", 12, timeline.countInstants());
|
||||||
@@ -107,27 +107,30 @@ public class TestHoodieCommitArchiveLog {
|
|||||||
assertTrue(archiveLog.archiveIfRequired());
|
assertTrue(archiveLog.archiveIfRequired());
|
||||||
|
|
||||||
//reload the timeline and remove the remaining commits
|
//reload the timeline and remove the remaining commits
|
||||||
timeline = metadata.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants();
|
timeline = metadata.getActiveTimeline().reload().getAllCommitsTimeline()
|
||||||
|
.filterCompletedInstants();
|
||||||
originalCommits.removeAll(timeline.getInstants().collect(Collectors.toList()));
|
originalCommits.removeAll(timeline.getInstants().collect(Collectors.toList()));
|
||||||
|
|
||||||
//read the file
|
//read the file
|
||||||
HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(),
|
HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(),
|
||||||
new HoodieLogFile(new Path(basePath + "/.hoodie/.commits_.archive.1")), HoodieArchivedMetaEntry.getClassSchema(), false);
|
new HoodieLogFile(new Path(basePath + "/.hoodie/.commits_.archive.1")),
|
||||||
|
HoodieArchivedMetaEntry.getClassSchema(), false);
|
||||||
|
|
||||||
int archivedRecordsCount = 0;
|
int archivedRecordsCount = 0;
|
||||||
List<IndexedRecord> readRecords = new ArrayList<>();
|
List<IndexedRecord> readRecords = new ArrayList<>();
|
||||||
//read the avro blocks and validate the number of records written in each avro block
|
//read the avro blocks and validate the number of records written in each avro block
|
||||||
while(reader.hasNext()) {
|
while (reader.hasNext()) {
|
||||||
HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next();
|
HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next();
|
||||||
List<IndexedRecord> records = blk.getRecords();
|
List<IndexedRecord> records = blk.getRecords();
|
||||||
readRecords.addAll(records);
|
readRecords.addAll(records);
|
||||||
assertEquals("Archived and read records for each block are same", 8, records.size());
|
assertEquals("Archived and read records for each block are same", 8, records.size());
|
||||||
archivedRecordsCount += records.size();
|
archivedRecordsCount += records.size();
|
||||||
}
|
}
|
||||||
assertEquals("Total archived records and total read records are the same count", 8, archivedRecordsCount);
|
assertEquals("Total archived records and total read records are the same count", 8,
|
||||||
|
archivedRecordsCount);
|
||||||
|
|
||||||
//make sure the archived commits are the same as the (originalcommits - commitsleft)
|
//make sure the archived commits are the same as the (originalcommits - commitsleft)
|
||||||
List<String> readCommits = readRecords.stream().map(r -> (GenericRecord)r).map(r -> {
|
List<String> readCommits = readRecords.stream().map(r -> (GenericRecord) r).map(r -> {
|
||||||
return r.get("commitTime").toString();
|
return r.get("commitTime").toString();
|
||||||
}).collect(Collectors.toList());
|
}).collect(Collectors.toList());
|
||||||
Collections.sort(readCommits);
|
Collections.sort(readCommits);
|
||||||
@@ -158,7 +161,8 @@ public class TestHoodieCommitArchiveLog {
|
|||||||
boolean result = archiveLog.archiveIfRequired();
|
boolean result = archiveLog.archiveIfRequired();
|
||||||
assertTrue(result);
|
assertTrue(result);
|
||||||
timeline =
|
timeline =
|
||||||
metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline().filterCompletedInstants();
|
metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline()
|
||||||
|
.filterCompletedInstants();
|
||||||
assertEquals("Should not archive commits when maxCommitsToKeep is 5", 4,
|
assertEquals("Should not archive commits when maxCommitsToKeep is 5", 4,
|
||||||
timeline.countInstants());
|
timeline.countInstants());
|
||||||
}
|
}
|
||||||
@@ -184,7 +188,8 @@ public class TestHoodieCommitArchiveLog {
|
|||||||
boolean result = archiveLog.archiveIfRequired();
|
boolean result = archiveLog.archiveIfRequired();
|
||||||
assertTrue(result);
|
assertTrue(result);
|
||||||
timeline =
|
timeline =
|
||||||
metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline().filterCompletedInstants();
|
metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline()
|
||||||
|
.filterCompletedInstants();
|
||||||
assertTrue("Archived commits should always be safe",
|
assertTrue("Archived commits should always be safe",
|
||||||
timeline.containsOrBeforeTimelineStarts("100"));
|
timeline.containsOrBeforeTimelineStarts("100"));
|
||||||
assertTrue("Archived commits should always be safe",
|
assertTrue("Archived commits should always be safe",
|
||||||
@@ -217,7 +222,8 @@ public class TestHoodieCommitArchiveLog {
|
|||||||
boolean result = archiveLog.archiveIfRequired();
|
boolean result = archiveLog.archiveIfRequired();
|
||||||
assertTrue(result);
|
assertTrue(result);
|
||||||
timeline =
|
timeline =
|
||||||
metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline().filterCompletedInstants();
|
metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline()
|
||||||
|
.filterCompletedInstants();
|
||||||
assertEquals(
|
assertEquals(
|
||||||
"Since we have a savepoint at 101, we should never archive any commit after 101 (we only archive 100)",
|
"Since we have a savepoint at 101, we should never archive any commit after 101 (we only archive 100)",
|
||||||
5, timeline.countInstants());
|
5, timeline.countInstants());
|
||||||
|
|||||||
@@ -16,7 +16,9 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.io;
|
package com.uber.hoodie.io;
|
||||||
|
|
||||||
import com.uber.hoodie.HoodieReadClient;
|
import static org.junit.Assert.assertEquals;
|
||||||
|
import static org.junit.Assert.assertTrue;
|
||||||
|
|
||||||
import com.uber.hoodie.HoodieWriteClient;
|
import com.uber.hoodie.HoodieWriteClient;
|
||||||
import com.uber.hoodie.WriteStatus;
|
import com.uber.hoodie.WriteStatus;
|
||||||
import com.uber.hoodie.common.HoodieClientTestUtils;
|
import com.uber.hoodie.common.HoodieClientTestUtils;
|
||||||
@@ -34,13 +36,16 @@ import com.uber.hoodie.config.HoodieCompactionConfig;
|
|||||||
import com.uber.hoodie.config.HoodieIndexConfig;
|
import com.uber.hoodie.config.HoodieIndexConfig;
|
||||||
import com.uber.hoodie.config.HoodieStorageConfig;
|
import com.uber.hoodie.config.HoodieStorageConfig;
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.index.bloom.HoodieBloomIndex;
|
|
||||||
import com.uber.hoodie.index.HoodieIndex;
|
import com.uber.hoodie.index.HoodieIndex;
|
||||||
|
import com.uber.hoodie.index.bloom.HoodieBloomIndex;
|
||||||
import com.uber.hoodie.io.compact.HoodieCompactor;
|
import com.uber.hoodie.io.compact.HoodieCompactor;
|
||||||
import com.uber.hoodie.io.compact.HoodieRealtimeTableCompactor;
|
import com.uber.hoodie.io.compact.HoodieRealtimeTableCompactor;
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
import org.apache.spark.SparkConf;
|
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
import org.junit.After;
|
import org.junit.After;
|
||||||
@@ -48,15 +53,8 @@ import org.junit.Before;
|
|||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
import org.junit.rules.TemporaryFolder;
|
import org.junit.rules.TemporaryFolder;
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import static org.junit.Assert.assertEquals;
|
|
||||||
import static org.junit.Assert.assertTrue;
|
|
||||||
|
|
||||||
public class TestHoodieCompactor {
|
public class TestHoodieCompactor {
|
||||||
|
|
||||||
private transient JavaSparkContext jsc = null;
|
private transient JavaSparkContext jsc = null;
|
||||||
private String basePath = null;
|
private String basePath = null;
|
||||||
private HoodieCompactor compactor;
|
private HoodieCompactor compactor;
|
||||||
@@ -194,7 +192,7 @@ public class TestHoodieCompactor {
|
|||||||
List<FileSlice> groupedLogFiles = table.getRTFileSystemView()
|
List<FileSlice> groupedLogFiles = table.getRTFileSystemView()
|
||||||
.getLatestFileSlices(partitionPath)
|
.getLatestFileSlices(partitionPath)
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
for (FileSlice slice: groupedLogFiles) {
|
for (FileSlice slice : groupedLogFiles) {
|
||||||
assertTrue(
|
assertTrue(
|
||||||
"After compaction there should be no log files visiable on a Realtime view",
|
"After compaction there should be no log files visiable on a Realtime view",
|
||||||
slice.getLogFiles().collect(Collectors.toList()).isEmpty());
|
slice.getLogFiles().collect(Collectors.toList()).isEmpty());
|
||||||
|
|||||||
@@ -17,12 +17,10 @@
|
|||||||
package com.uber.hoodie.io.strategy;
|
package com.uber.hoodie.io.strategy;
|
||||||
|
|
||||||
import static org.junit.Assert.assertEquals;
|
import static org.junit.Assert.assertEquals;
|
||||||
import static org.junit.Assert.assertThat;
|
|
||||||
import static org.junit.Assert.assertTrue;
|
import static org.junit.Assert.assertTrue;
|
||||||
|
|
||||||
import com.beust.jcommander.internal.Lists;
|
import com.beust.jcommander.internal.Lists;
|
||||||
import com.google.common.collect.Maps;
|
import com.google.common.collect.Maps;
|
||||||
|
|
||||||
import com.uber.hoodie.config.HoodieCompactionConfig;
|
import com.uber.hoodie.config.HoodieCompactionConfig;
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.io.compact.CompactionOperation;
|
import com.uber.hoodie.io.compact.CompactionOperation;
|
||||||
|
|||||||
@@ -17,9 +17,7 @@
|
|||||||
package com.uber.hoodie.io.strategy;
|
package com.uber.hoodie.io.strategy;
|
||||||
|
|
||||||
import com.uber.hoodie.common.model.HoodieDataFile;
|
import com.uber.hoodie.common.model.HoodieDataFile;
|
||||||
import com.uber.hoodie.common.util.FSUtils;
|
|
||||||
import java.util.UUID;
|
import java.util.UUID;
|
||||||
import org.apache.hadoop.fs.FileStatus;
|
|
||||||
|
|
||||||
public class TestHoodieDataFile extends HoodieDataFile {
|
public class TestHoodieDataFile extends HoodieDataFile {
|
||||||
|
|
||||||
|
|||||||
@@ -18,7 +18,6 @@ package com.uber.hoodie.io.strategy;
|
|||||||
|
|
||||||
import com.uber.hoodie.common.model.HoodieLogFile;
|
import com.uber.hoodie.common.model.HoodieLogFile;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
|
|
||||||
public class TestHoodieLogFile extends HoodieLogFile {
|
public class TestHoodieLogFile extends HoodieLogFile {
|
||||||
|
|||||||
@@ -16,17 +16,17 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.metrics;
|
package com.uber.hoodie.metrics;
|
||||||
|
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
|
||||||
|
|
||||||
import org.apache.commons.configuration.ConfigurationException;
|
|
||||||
import org.junit.Before;
|
|
||||||
import org.junit.Test;
|
|
||||||
|
|
||||||
import static org.junit.Assert.assertTrue;
|
import static org.junit.Assert.assertTrue;
|
||||||
import static org.mockito.Mockito.mock;
|
import static org.mockito.Mockito.mock;
|
||||||
import static org.mockito.Mockito.when;
|
import static org.mockito.Mockito.when;
|
||||||
|
|
||||||
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
|
import org.apache.commons.configuration.ConfigurationException;
|
||||||
|
import org.junit.Before;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
public class TestHoodieMetrics {
|
public class TestHoodieMetrics {
|
||||||
|
|
||||||
private HoodieMetrics metrics = null;
|
private HoodieMetrics metrics = null;
|
||||||
|
|
||||||
@Before
|
@Before
|
||||||
@@ -40,6 +40,7 @@ public class TestHoodieMetrics {
|
|||||||
@Test
|
@Test
|
||||||
public void testRegisterGauge() {
|
public void testRegisterGauge() {
|
||||||
metrics.registerGauge("metric1", 123L);
|
metrics.registerGauge("metric1", 123L);
|
||||||
assertTrue(Metrics.getInstance().getRegistry().getGauges().get("metric1").getValue().toString().equals("123"));
|
assertTrue(Metrics.getInstance().getRegistry().getGauges().get("metric1").getValue().toString()
|
||||||
|
.equals("123"));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -16,26 +16,37 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.table;
|
package com.uber.hoodie.table;
|
||||||
|
|
||||||
import com.uber.hoodie.common.TestRawTripPayload.MetadataMergeWriteStatus;
|
import static org.junit.Assert.assertEquals;
|
||||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
import static org.junit.Assert.assertFalse;
|
||||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
import static org.junit.Assert.assertTrue;
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
import static org.mockito.Mockito.mock;
|
||||||
|
import static org.mockito.Mockito.when;
|
||||||
|
|
||||||
import com.uber.hoodie.WriteStatus;
|
import com.uber.hoodie.WriteStatus;
|
||||||
import com.uber.hoodie.common.BloomFilter;
|
import com.uber.hoodie.common.BloomFilter;
|
||||||
import com.uber.hoodie.common.HoodieClientTestUtils;
|
import com.uber.hoodie.common.HoodieClientTestUtils;
|
||||||
import com.uber.hoodie.common.HoodieTestDataGenerator;
|
import com.uber.hoodie.common.HoodieTestDataGenerator;
|
||||||
import com.uber.hoodie.common.TestRawTripPayload;
|
import com.uber.hoodie.common.TestRawTripPayload;
|
||||||
|
import com.uber.hoodie.common.TestRawTripPayload.MetadataMergeWriteStatus;
|
||||||
import com.uber.hoodie.common.model.HoodieKey;
|
import com.uber.hoodie.common.model.HoodieKey;
|
||||||
import com.uber.hoodie.common.model.HoodieRecord;
|
import com.uber.hoodie.common.model.HoodieRecord;
|
||||||
import com.uber.hoodie.common.model.HoodieRecordLocation;
|
import com.uber.hoodie.common.model.HoodieRecordLocation;
|
||||||
import com.uber.hoodie.common.model.HoodieTestUtils;
|
import com.uber.hoodie.common.model.HoodieTestUtils;
|
||||||
|
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||||
|
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||||
import com.uber.hoodie.common.util.FSUtils;
|
import com.uber.hoodie.common.util.FSUtils;
|
||||||
import com.uber.hoodie.common.util.ParquetUtils;
|
import com.uber.hoodie.common.util.ParquetUtils;
|
||||||
|
|
||||||
import com.uber.hoodie.config.HoodieCompactionConfig;
|
import com.uber.hoodie.config.HoodieCompactionConfig;
|
||||||
import com.uber.hoodie.io.HoodieCreateHandle;
|
|
||||||
import com.uber.hoodie.config.HoodieStorageConfig;
|
import com.uber.hoodie.config.HoodieStorageConfig;
|
||||||
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
|
import com.uber.hoodie.io.HoodieCreateHandle;
|
||||||
|
import java.io.File;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.UUID;
|
||||||
import org.apache.avro.generic.GenericRecord;
|
import org.apache.avro.generic.GenericRecord;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
@@ -47,22 +58,11 @@ import org.junit.After;
|
|||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
import org.junit.rules.TemporaryFolder;
|
import org.junit.rules.TemporaryFolder;
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.UUID;
|
|
||||||
|
|
||||||
import scala.Option;
|
import scala.Option;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
import static org.junit.Assert.*;
|
|
||||||
import static org.mockito.Mockito.mock;
|
|
||||||
import static org.mockito.Mockito.when;
|
|
||||||
|
|
||||||
public class TestCopyOnWriteTable {
|
public class TestCopyOnWriteTable {
|
||||||
|
|
||||||
private String basePath = null;
|
private String basePath = null;
|
||||||
private transient JavaSparkContext jsc = null;
|
private transient JavaSparkContext jsc = null;
|
||||||
|
|
||||||
@@ -104,7 +104,8 @@ public class TestCopyOnWriteTable {
|
|||||||
|
|
||||||
private HoodieWriteConfig.Builder makeHoodieClientConfigBuilder() throws Exception {
|
private HoodieWriteConfig.Builder makeHoodieClientConfigBuilder() throws Exception {
|
||||||
// Prepare the AvroParquetIO
|
// Prepare the AvroParquetIO
|
||||||
String schemaStr = IOUtils.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8");
|
String schemaStr = IOUtils
|
||||||
|
.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8");
|
||||||
return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(schemaStr);
|
return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(schemaStr);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -127,11 +128,17 @@ public class TestCopyOnWriteTable {
|
|||||||
|
|
||||||
List<HoodieRecord> records = new ArrayList<>();
|
List<HoodieRecord> records = new ArrayList<>();
|
||||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||||
records.add(new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1));
|
records.add(
|
||||||
|
new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
|
||||||
|
rowChange1));
|
||||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
||||||
records.add(new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2));
|
records.add(
|
||||||
|
new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
|
||||||
|
rowChange2));
|
||||||
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
||||||
records.add(new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));
|
records.add(
|
||||||
|
new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
|
||||||
|
rowChange3));
|
||||||
|
|
||||||
// Insert new records
|
// Insert new records
|
||||||
HoodieClientTestUtils.collectStatuses(table.handleInsert(firstCommitTime, records.iterator()));
|
HoodieClientTestUtils.collectStatuses(table.handleInsert(firstCommitTime, records.iterator()));
|
||||||
@@ -159,7 +166,7 @@ public class TestCopyOnWriteTable {
|
|||||||
List<GenericRecord> fileRecords = ParquetUtils.readAvroRecords(parquetFilePath);
|
List<GenericRecord> fileRecords = ParquetUtils.readAvroRecords(parquetFilePath);
|
||||||
GenericRecord newRecord;
|
GenericRecord newRecord;
|
||||||
int index = 0;
|
int index = 0;
|
||||||
for (GenericRecord record: fileRecords) {
|
for (GenericRecord record : fileRecords) {
|
||||||
assertTrue(record.get("_row_key").toString().equals(records.get(index).getRecordKey()));
|
assertTrue(record.get("_row_key").toString().equals(records.get(index).getRecordKey()));
|
||||||
index++;
|
index++;
|
||||||
}
|
}
|
||||||
@@ -167,11 +174,15 @@ public class TestCopyOnWriteTable {
|
|||||||
// We update the 1st record & add a new record
|
// We update the 1st record & add a new record
|
||||||
String updateRecordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
String updateRecordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
||||||
TestRawTripPayload updateRowChanges1 = new TestRawTripPayload(updateRecordStr1);
|
TestRawTripPayload updateRowChanges1 = new TestRawTripPayload(updateRecordStr1);
|
||||||
HoodieRecord updatedRecord1 = new HoodieRecord(new HoodieKey(updateRowChanges1.getRowKey(), updateRowChanges1.getPartitionPath()), updateRowChanges1);
|
HoodieRecord updatedRecord1 = new HoodieRecord(
|
||||||
updatedRecord1.setCurrentLocation(new HoodieRecordLocation(null, FSUtils.getFileId(parquetFile.getName())));
|
new HoodieKey(updateRowChanges1.getRowKey(), updateRowChanges1.getPartitionPath()),
|
||||||
|
updateRowChanges1);
|
||||||
|
updatedRecord1.setCurrentLocation(
|
||||||
|
new HoodieRecordLocation(null, FSUtils.getFileId(parquetFile.getName())));
|
||||||
|
|
||||||
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
|
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
|
||||||
HoodieRecord insertedRecord1 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
|
HoodieRecord insertedRecord1 = new HoodieRecord(
|
||||||
|
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
|
||||||
|
|
||||||
List<HoodieRecord> updatedRecords = Arrays.asList(updatedRecord1, insertedRecord1);
|
List<HoodieRecord> updatedRecords = Arrays.asList(updatedRecord1, insertedRecord1);
|
||||||
|
|
||||||
@@ -179,7 +190,9 @@ public class TestCopyOnWriteTable {
|
|||||||
String newCommitTime = HoodieTestUtils.makeNewCommitTime();
|
String newCommitTime = HoodieTestUtils.makeNewCommitTime();
|
||||||
metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
|
metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
|
||||||
table = new HoodieCopyOnWriteTable(config, metadata);
|
table = new HoodieCopyOnWriteTable(config, metadata);
|
||||||
Iterator<List<WriteStatus>> iter = table.handleUpdate(newCommitTime, updatedRecord1.getCurrentLocation().getFileId(), updatedRecords.iterator());
|
Iterator<List<WriteStatus>> iter = table
|
||||||
|
.handleUpdate(newCommitTime, updatedRecord1.getCurrentLocation().getFileId(),
|
||||||
|
updatedRecords.iterator());
|
||||||
|
|
||||||
// Check the updated file
|
// Check the updated file
|
||||||
File updatedParquetFile = null;
|
File updatedParquetFile = null;
|
||||||
@@ -197,7 +210,8 @@ public class TestCopyOnWriteTable {
|
|||||||
assertTrue(updatedParquetFile != null);
|
assertTrue(updatedParquetFile != null);
|
||||||
// Check whether the record has been updated
|
// Check whether the record has been updated
|
||||||
Path updatedParquetFilePath = new Path(updatedParquetFile.getAbsolutePath());
|
Path updatedParquetFilePath = new Path(updatedParquetFile.getAbsolutePath());
|
||||||
BloomFilter updatedFilter = ParquetUtils.readBloomFilterFromParquetMetadata(updatedParquetFilePath);
|
BloomFilter updatedFilter = ParquetUtils
|
||||||
|
.readBloomFilterFromParquetMetadata(updatedParquetFilePath);
|
||||||
for (HoodieRecord record : records) {
|
for (HoodieRecord record : records) {
|
||||||
// No change to the _row_key
|
// No change to the _row_key
|
||||||
assertTrue(updatedFilter.mightContain(record.getRecordKey()));
|
assertTrue(updatedFilter.mightContain(record.getRecordKey()));
|
||||||
@@ -206,7 +220,8 @@ public class TestCopyOnWriteTable {
|
|||||||
assertTrue(updatedFilter.mightContain(insertedRecord1.getRecordKey()));
|
assertTrue(updatedFilter.mightContain(insertedRecord1.getRecordKey()));
|
||||||
records.add(insertedRecord1);// add this so it can further check below
|
records.add(insertedRecord1);// add this so it can further check below
|
||||||
|
|
||||||
ParquetReader updatedReader = ParquetReader.builder(new AvroReadSupport<>(), updatedParquetFilePath).build();
|
ParquetReader updatedReader = ParquetReader
|
||||||
|
.builder(new AvroReadSupport<>(), updatedParquetFilePath).build();
|
||||||
index = 0;
|
index = 0;
|
||||||
while ((newRecord = (GenericRecord) updatedReader.read()) != null) {
|
while ((newRecord = (GenericRecord) updatedReader.read()) != null) {
|
||||||
assertTrue(newRecord.get("_row_key").toString().equals(records.get(index).getRecordKey()));
|
assertTrue(newRecord.get("_row_key").toString().equals(records.get(index).getRecordKey()));
|
||||||
@@ -243,7 +258,8 @@ public class TestCopyOnWriteTable {
|
|||||||
@Test
|
@Test
|
||||||
public void testMetadataAggregateFromWriteStatus() throws Exception {
|
public void testMetadataAggregateFromWriteStatus() throws Exception {
|
||||||
// Prepare the AvroParquetIO
|
// Prepare the AvroParquetIO
|
||||||
HoodieWriteConfig config = makeHoodieClientConfigBuilder().withWriteStatusClass(MetadataMergeWriteStatus.class).build();
|
HoodieWriteConfig config = makeHoodieClientConfigBuilder()
|
||||||
|
.withWriteStatusClass(MetadataMergeWriteStatus.class).build();
|
||||||
String firstCommitTime = HoodieTestUtils.makeNewCommitTime();
|
String firstCommitTime = HoodieTestUtils.makeNewCommitTime();
|
||||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
|
HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
|
||||||
|
|
||||||
@@ -256,11 +272,17 @@ public class TestCopyOnWriteTable {
|
|||||||
|
|
||||||
List<HoodieRecord> records = new ArrayList<>();
|
List<HoodieRecord> records = new ArrayList<>();
|
||||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||||
records.add(new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1));
|
records.add(
|
||||||
|
new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
|
||||||
|
rowChange1));
|
||||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
||||||
records.add(new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2));
|
records.add(
|
||||||
|
new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
|
||||||
|
rowChange2));
|
||||||
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
||||||
records.add(new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));
|
records.add(
|
||||||
|
new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
|
||||||
|
rowChange3));
|
||||||
|
|
||||||
// Insert new records
|
// Insert new records
|
||||||
List<WriteStatus> writeStatuses = HoodieClientTestUtils
|
List<WriteStatus> writeStatuses = HoodieClientTestUtils
|
||||||
@@ -286,7 +308,8 @@ public class TestCopyOnWriteTable {
|
|||||||
records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z"));
|
records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z"));
|
||||||
|
|
||||||
// Simulate crash after first file
|
// Simulate crash after first file
|
||||||
List<WriteStatus> statuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator()));
|
List<WriteStatus> statuses = HoodieClientTestUtils
|
||||||
|
.collectStatuses(table.handleInsert(commitTime, records.iterator()));
|
||||||
WriteStatus status = statuses.get(0);
|
WriteStatus status = statuses.get(0);
|
||||||
Path partialFile = new Path(String.format("%s/%s/%s",
|
Path partialFile = new Path(String.format("%s/%s/%s",
|
||||||
basePath,
|
basePath,
|
||||||
@@ -299,7 +322,8 @@ public class TestCopyOnWriteTable {
|
|||||||
records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z");
|
records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z");
|
||||||
records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z"));
|
records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z"));
|
||||||
|
|
||||||
statuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator()));
|
statuses = HoodieClientTestUtils
|
||||||
|
.collectStatuses(table.handleInsert(commitTime, records.iterator()));
|
||||||
status = statuses.get(0);
|
status = statuses.get(0);
|
||||||
|
|
||||||
Path retriedFIle = new Path(String.format("%s/%s/%s",
|
Path retriedFIle = new Path(String.format("%s/%s/%s",
|
||||||
@@ -312,7 +336,8 @@ public class TestCopyOnWriteTable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test public void testInsertRecords() throws Exception {
|
@Test
|
||||||
|
public void testInsertRecords() throws Exception {
|
||||||
HoodieWriteConfig config = makeHoodieClientConfig();
|
HoodieWriteConfig config = makeHoodieClientConfig();
|
||||||
String commitTime = HoodieTestUtils.makeNewCommitTime();
|
String commitTime = HoodieTestUtils.makeNewCommitTime();
|
||||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
|
HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
|
||||||
@@ -324,8 +349,8 @@ public class TestCopyOnWriteTable {
|
|||||||
records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z"));
|
records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z"));
|
||||||
|
|
||||||
// Insert new records
|
// Insert new records
|
||||||
List<WriteStatus> returnedStatuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator()));
|
List<WriteStatus> returnedStatuses = HoodieClientTestUtils
|
||||||
|
.collectStatuses(table.handleInsert(commitTime, records.iterator()));
|
||||||
|
|
||||||
// TODO: check the actual files and make sure 11 records, total were written.
|
// TODO: check the actual files and make sure 11 records, total were written.
|
||||||
assertEquals(2, returnedStatuses.size());
|
assertEquals(2, returnedStatuses.size());
|
||||||
@@ -343,7 +368,8 @@ public class TestCopyOnWriteTable {
|
|||||||
records.addAll(newHoodieRecords(1, "2016-02-02T03:16:41.415Z"));
|
records.addAll(newHoodieRecords(1, "2016-02-02T03:16:41.415Z"));
|
||||||
|
|
||||||
// Insert new records
|
// Insert new records
|
||||||
returnedStatuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator()));
|
returnedStatuses = HoodieClientTestUtils
|
||||||
|
.collectStatuses(table.handleInsert(commitTime, records.iterator()));
|
||||||
|
|
||||||
assertEquals(3, returnedStatuses.size());
|
assertEquals(3, returnedStatuses.size());
|
||||||
assertEquals("2016/01/31", returnedStatuses.get(0).getPartitionPath());
|
assertEquals("2016/01/31", returnedStatuses.get(0).getPartitionPath());
|
||||||
@@ -357,7 +383,8 @@ public class TestCopyOnWriteTable {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test public void testFileSizeUpsertRecords() throws Exception {
|
@Test
|
||||||
|
public void testFileSizeUpsertRecords() throws Exception {
|
||||||
HoodieWriteConfig config = makeHoodieClientConfigBuilder().withStorageConfig(
|
HoodieWriteConfig config = makeHoodieClientConfigBuilder().withStorageConfig(
|
||||||
HoodieStorageConfig.newBuilder().limitFileSize(64 * 1024).parquetBlockSize(64 * 1024)
|
HoodieStorageConfig.newBuilder().limitFileSize(64 * 1024).parquetBlockSize(64 * 1024)
|
||||||
.parquetPageSize(64 * 1024).build()).build();
|
.parquetPageSize(64 * 1024).build()).build();
|
||||||
@@ -368,9 +395,11 @@ public class TestCopyOnWriteTable {
|
|||||||
List<HoodieRecord> records = new ArrayList<>();
|
List<HoodieRecord> records = new ArrayList<>();
|
||||||
// Approx 1150 records are written for block size of 64KB
|
// Approx 1150 records are written for block size of 64KB
|
||||||
for (int i = 0; i < 2000; i++) {
|
for (int i = 0; i < 2000; i++) {
|
||||||
String recordStr = "{\"_row_key\":\"" + UUID.randomUUID().toString() + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":" + i + "}";
|
String recordStr = "{\"_row_key\":\"" + UUID.randomUUID().toString()
|
||||||
|
+ "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":" + i + "}";
|
||||||
TestRawTripPayload rowChange = new TestRawTripPayload(recordStr);
|
TestRawTripPayload rowChange = new TestRawTripPayload(recordStr);
|
||||||
records.add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()),
|
records
|
||||||
|
.add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()),
|
||||||
rowChange));
|
rowChange));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -380,7 +409,8 @@ public class TestCopyOnWriteTable {
|
|||||||
// Check the updated file
|
// Check the updated file
|
||||||
int counts = 0;
|
int counts = 0;
|
||||||
for (File file : new File(basePath + "/2016/01/31").listFiles()) {
|
for (File file : new File(basePath + "/2016/01/31").listFiles()) {
|
||||||
if (file.getName().endsWith(".parquet") && FSUtils.getCommitTime(file.getName()).equals(commitTime)) {
|
if (file.getName().endsWith(".parquet") && FSUtils.getCommitTime(file.getName())
|
||||||
|
.equals(commitTime)) {
|
||||||
System.out.println(file.getName() + "-" + file.length());
|
System.out.println(file.getName() + "-" + file.length());
|
||||||
counts++;
|
counts++;
|
||||||
}
|
}
|
||||||
@@ -391,7 +421,6 @@ public class TestCopyOnWriteTable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
private List<HoodieCopyOnWriteTable.InsertBucket> testUpsertPartitioner(int smallFileSize,
|
private List<HoodieCopyOnWriteTable.InsertBucket> testUpsertPartitioner(int smallFileSize,
|
||||||
int numInserts,
|
int numInserts,
|
||||||
int numUpdates,
|
int numUpdates,
|
||||||
@@ -400,8 +429,10 @@ public class TestCopyOnWriteTable {
|
|||||||
final String TEST_PARTITION_PATH = "2016/09/26";
|
final String TEST_PARTITION_PATH = "2016/09/26";
|
||||||
HoodieWriteConfig config = makeHoodieClientConfigBuilder()
|
HoodieWriteConfig config = makeHoodieClientConfigBuilder()
|
||||||
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
|
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
|
||||||
.compactionSmallFileSize(smallFileSize).insertSplitSize(100).autoTuneInsertSplits(autoSplitInserts).build())
|
.compactionSmallFileSize(smallFileSize).insertSplitSize(100)
|
||||||
.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1000 * 1024).build()).build();
|
.autoTuneInsertSplits(autoSplitInserts).build())
|
||||||
|
.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1000 * 1024).build())
|
||||||
|
.build();
|
||||||
|
|
||||||
HoodieClientTestUtils.fakeCommitFile(basePath, "001");
|
HoodieClientTestUtils.fakeCommitFile(basePath, "001");
|
||||||
HoodieClientTestUtils.fakeDataFile(basePath, TEST_PARTITION_PATH, "001", "file1", fileSize);
|
HoodieClientTestUtils.fakeDataFile(basePath, TEST_PARTITION_PATH, "001", "file1", fileSize);
|
||||||
@@ -409,10 +440,11 @@ public class TestCopyOnWriteTable {
|
|||||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
|
HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
|
||||||
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
|
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
|
||||||
|
|
||||||
HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[]{TEST_PARTITION_PATH});
|
HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(
|
||||||
|
new String[]{TEST_PARTITION_PATH});
|
||||||
List<HoodieRecord> insertRecords = dataGenerator.generateInserts("001", numInserts);
|
List<HoodieRecord> insertRecords = dataGenerator.generateInserts("001", numInserts);
|
||||||
List<HoodieRecord> updateRecords = dataGenerator.generateUpdates("001", numUpdates);
|
List<HoodieRecord> updateRecords = dataGenerator.generateUpdates("001", numUpdates);
|
||||||
for (HoodieRecord updateRec: updateRecords) {
|
for (HoodieRecord updateRec : updateRecords) {
|
||||||
updateRec.setCurrentLocation(new HoodieRecordLocation("001", "file1"));
|
updateRec.setCurrentLocation(new HoodieRecordLocation("001", "file1"));
|
||||||
}
|
}
|
||||||
List<HoodieRecord> records = new ArrayList<>();
|
List<HoodieRecord> records = new ArrayList<>();
|
||||||
@@ -430,7 +462,8 @@ public class TestCopyOnWriteTable {
|
|||||||
assertEquals("Bucket 2 is INSERT", HoodieCopyOnWriteTable.BucketType.INSERT,
|
assertEquals("Bucket 2 is INSERT", HoodieCopyOnWriteTable.BucketType.INSERT,
|
||||||
partitioner.getBucketInfo(2).bucketType);
|
partitioner.getBucketInfo(2).bucketType);
|
||||||
assertEquals("Update record should have gone to the 1 update partiton", 0,
|
assertEquals("Update record should have gone to the 1 update partiton", 0,
|
||||||
partitioner.getPartition(new Tuple2<>(updateRecords.get(0).getKey(), Option.apply(updateRecords.get(0).getCurrentLocation()))));
|
partitioner.getPartition(new Tuple2<>(updateRecords.get(0).getKey(),
|
||||||
|
Option.apply(updateRecords.get(0).getCurrentLocation()))));
|
||||||
return partitioner.getInsertBuckets(TEST_PARTITION_PATH);
|
return partitioner.getInsertBuckets(TEST_PARTITION_PATH);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -438,7 +471,8 @@ public class TestCopyOnWriteTable {
|
|||||||
@Test
|
@Test
|
||||||
public void testUpsertPartitioner() throws Exception {
|
public void testUpsertPartitioner() throws Exception {
|
||||||
// Inserts + Updates... Check all updates go together & inserts subsplit
|
// Inserts + Updates... Check all updates go together & inserts subsplit
|
||||||
List<HoodieCopyOnWriteTable.InsertBucket> insertBuckets = testUpsertPartitioner(0, 200, 100, 1024, false);
|
List<HoodieCopyOnWriteTable.InsertBucket> insertBuckets = testUpsertPartitioner(0, 200, 100,
|
||||||
|
1024, false);
|
||||||
assertEquals("Total of 2 insert buckets", 2, insertBuckets.size());
|
assertEquals("Total of 2 insert buckets", 2, insertBuckets.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -446,16 +480,21 @@ public class TestCopyOnWriteTable {
|
|||||||
@Test
|
@Test
|
||||||
public void testUpsertPartitionerWithSmallInsertHandling() throws Exception {
|
public void testUpsertPartitionerWithSmallInsertHandling() throws Exception {
|
||||||
// Inserts + Updates .. Check updates go together & inserts subsplit, after expanding smallest file
|
// Inserts + Updates .. Check updates go together & inserts subsplit, after expanding smallest file
|
||||||
List<HoodieCopyOnWriteTable.InsertBucket> insertBuckets = testUpsertPartitioner(1000 * 1024, 400, 100, 800 * 1024, false);
|
List<HoodieCopyOnWriteTable.InsertBucket> insertBuckets = testUpsertPartitioner(1000 * 1024,
|
||||||
|
400, 100, 800 * 1024, false);
|
||||||
assertEquals("Total of 3 insert buckets", 3, insertBuckets.size());
|
assertEquals("Total of 3 insert buckets", 3, insertBuckets.size());
|
||||||
assertEquals("First insert bucket must be same as update bucket", 0, insertBuckets.get(0).bucketNumber);
|
assertEquals("First insert bucket must be same as update bucket", 0,
|
||||||
assertEquals("First insert bucket should have weight 0.5", 0.5, insertBuckets.get(0).weight, 0.01);
|
insertBuckets.get(0).bucketNumber);
|
||||||
|
assertEquals("First insert bucket should have weight 0.5", 0.5, insertBuckets.get(0).weight,
|
||||||
|
0.01);
|
||||||
|
|
||||||
// Now with insert split size auto tuned
|
// Now with insert split size auto tuned
|
||||||
insertBuckets = testUpsertPartitioner(1000 * 1024, 2400, 100, 800 * 1024, true);
|
insertBuckets = testUpsertPartitioner(1000 * 1024, 2400, 100, 800 * 1024, true);
|
||||||
assertEquals("Total of 3 insert buckets", 3, insertBuckets.size());
|
assertEquals("Total of 3 insert buckets", 3, insertBuckets.size());
|
||||||
assertEquals("First insert bucket must be same as update bucket", 0, insertBuckets.get(0).bucketNumber);
|
assertEquals("First insert bucket must be same as update bucket", 0,
|
||||||
assertEquals("First insert bucket should have weight 0.5", 200.0/2400, insertBuckets.get(0).weight, 0.01);
|
insertBuckets.get(0).bucketNumber);
|
||||||
|
assertEquals("First insert bucket should have weight 0.5", 200.0 / 2400,
|
||||||
|
insertBuckets.get(0).weight, 0.01);
|
||||||
}
|
}
|
||||||
|
|
||||||
@After
|
@After
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user