
Reformatting code per Google Code Style all over

Vinoth Chandar
2017-11-12 22:54:56 -08:00
committed by vinoth chandar
parent 5a62480a92
commit e45679f5e2
254 changed files with 21580 additions and 21108 deletions
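The change is mechanical: code across the repository is rewritten to Google Java Style, mainly two-space indentation, annotations on their own lines, and long declarations wrapped near 100 columns. A minimal before/after sketch of the kind of edit applied throughout, modeled on the HoodieSplashScreen class further down (the class body is trimmed here for illustration):

// Before: annotations and the declaration crammed onto one line
@Component @Order(Ordered.HIGHEST_PRECEDENCE) public class HoodieSplashScreen
extends DefaultBannerProvider {
  public String getBanner() { return screen; }
}

// After: one annotation per line, 2-space indent, one statement per line
@Component
@Order(Ordered.HIGHEST_PRECEDENCE)
public class HoodieSplashScreen extends DefaultBannerProvider {

  public String getBanner() {
    return screen;
  }
}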


@@ -17,38 +17,38 @@
package com.uber.hoodie.cli;

import com.uber.hoodie.common.table.HoodieTableMetaClient;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

public class HoodieCLI {

  public static Configuration conf;
  public static FileSystem fs;
  public static CLIState state = CLIState.INIT;
  public static HoodieTableMetaClient tableMetadata;
  public static HoodieTableMetaClient syncTableMetadata;

  public enum CLIState {
    INIT, DATASET, SYNC
  }

  public static boolean initConf() {
    if (HoodieCLI.conf == null) {
      HoodieCLI.conf = new Configuration();
      return true;
    }
    return false;
  }

  public static void initFS(boolean force) throws IOException {
    if (fs == null || force) {
      fs = FileSystem.get(conf);
    }
  }

  public static void setTableMetadata(HoodieTableMetaClient tableMetadata) {
    HoodieCLI.tableMetadata = tableMetadata;
  }
}


@@ -25,13 +25,13 @@ import org.springframework.stereotype.Component;
@Order(Ordered.HIGHEST_PRECEDENCE)
public class HoodieHistoryFileNameProvider extends DefaultHistoryFileNameProvider {
  public String getHistoryFileName() {
    return "hoodie-cmd.log";
  }

  @Override
  public String getProviderName() {
    return "Hoodie file name provider";
  }
}


@@ -17,18 +17,17 @@
package com.uber.hoodie.cli;
import dnl.utils.text.table.TextTable;
import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
import java.nio.charset.Charset;
public class HoodiePrintHelper {
  public static String print(String[] header, String[][] rows) {
    TextTable textTable = new TextTable(header, rows);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    PrintStream ps = new PrintStream(baos);
    textTable.printTable(ps, 4);
    return new String(baos.toByteArray(), Charset.forName("utf-8"));
  }
}


@@ -16,7 +16,6 @@
package com.uber.hoodie.cli;
import com.uber.hoodie.common.table.HoodieTableConfig;
import org.springframework.core.Ordered;
import org.springframework.core.annotation.Order;
import org.springframework.shell.plugin.support.DefaultPromptProvider;
@@ -26,27 +25,27 @@ import org.springframework.stereotype.Component;
@Order(Ordered.HIGHEST_PRECEDENCE)
public class HoodiePrompt extends DefaultPromptProvider {
  @Override
  public String getPrompt() {
    if (HoodieCLI.tableMetadata != null) {
      String tableName = HoodieCLI.tableMetadata.getTableConfig().getTableName();
      switch (HoodieCLI.state) {
        case INIT:
          return "hoodie->";
        case DATASET:
          return "hoodie:" + tableName + "->";
        case SYNC:
          return "hoodie:" + tableName + " <==> "
              + HoodieCLI.syncTableMetadata.getTableConfig().getTableName() + "->";
      }
      return "hoodie:" + tableName + "->";
    }
    return "hoodie->";
  }

  @Override
  public String getProviderName() {
    return "Hoodie provider";
  }
}


@@ -22,34 +22,39 @@ import org.springframework.shell.plugin.support.DefaultBannerProvider;
import org.springframework.shell.support.util.OsUtils;
import org.springframework.stereotype.Component;
@Component
@Order(Ordered.HIGHEST_PRECEDENCE)
public class HoodieSplashScreen extends DefaultBannerProvider {

  private static String screen =
      "============================================" + OsUtils.LINE_SEPARATOR +
      "* *" + OsUtils.LINE_SEPARATOR +
      "* _ _ _ _ *" + OsUtils.LINE_SEPARATOR +
      "* | | | | | (_) *" + OsUtils.LINE_SEPARATOR +
      "* | |__| | ___ ___ __| |_ ___ *" + OsUtils.LINE_SEPARATOR +
      "* | __ |/ _ \\ / _ \\ / _` | |/ _ \\ *" + OsUtils.LINE_SEPARATOR +
      "* | | | | (_) | (_) | (_| | | __/ *" + OsUtils.LINE_SEPARATOR +
      "* |_| |_|\\___/ \\___/ \\__,_|_|\\___| *" + OsUtils.LINE_SEPARATOR +
      "* *" + OsUtils.LINE_SEPARATOR +
      "============================================" + OsUtils.LINE_SEPARATOR;

  public String getBanner() {
    return screen;
  }

  public String getVersion() {
    return "1.0";
  }

  public String getWelcomeMessage() {
    return "Welcome to Hoodie CLI. Please type help if you are looking for help. ";
  }

  @Override
  public String getProviderName() {
    return "Hoodie Banner";
  }
}


@@ -16,18 +16,16 @@
package com.uber.hoodie.cli;
import java.io.IOException;
import org.springframework.shell.Bootstrap;
public class Main {
  /**
   * Main class that delegates to Spring Shell's Bootstrap class in order to simplify debugging
   * inside an IDE
   */
  public static void main(String[] args) throws IOException {
    Bootstrap.main(args);
  }
}


@@ -24,6 +24,10 @@ import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.log.HoodieLogFormat;
import com.uber.hoodie.common.table.log.block.HoodieAvroDataBlock;
import com.uber.hoodie.common.util.FSUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.FileStatus;
@@ -34,90 +38,90 @@ import org.springframework.shell.core.annotation.CliCommand;
import org.springframework.shell.core.annotation.CliOption;
import org.springframework.stereotype.Component;
@Component
public class ArchivedCommitsCommand implements CommandMarker {
  @CliAvailabilityIndicator({"show archived commits"})
  public boolean isShowArchivedCommitAvailable() {
    return HoodieCLI.tableMetadata != null;
  }

  @CliCommand(value = "show archived commits", help = "Read commits from archived files and show details")
  public String showCommits(
      @CliOption(key = {
          "limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10")
      final Integer limit) throws IOException {
    System.out
        .println("===============> Showing only " + limit + " archived commits <===============");
    FileStatus[] fsStatuses = FSUtils.getFs().globStatus(
        new Path(HoodieCLI.tableMetadata.getBasePath() + "/.hoodie/.commits_.archive*"));
    List<String[]> allCommits = new ArrayList<>();
    for (FileStatus fs : fsStatuses) {
      //read the archived file
      HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(),
          new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema(), false);
      List<IndexedRecord> readRecords = new ArrayList<>();
      //read the avro blocks
      while (reader.hasNext()) {
        HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next();
        List<IndexedRecord> records = blk.getRecords();
        readRecords.addAll(records);
      }
      List<String[]> readCommits = readRecords.stream().map(r -> (GenericRecord) r)
          .map(r -> readCommit(r)).limit(limit).collect(Collectors.toList());
      allCommits.addAll(readCommits);
    }
    return HoodiePrintHelper.print(
        new String[]{"CommitTime", "CommitType", "CommitDetails"},
        allCommits.toArray(new String[allCommits.size()][]));
  }

  private String[] readCommit(GenericRecord record) {
    List<String> commitDetails = new ArrayList<>();
    try {
      switch (record.get("actionType").toString()) {
        case HoodieTimeline.CLEAN_ACTION: {
          commitDetails.add(record.get("commitTime").toString());
          commitDetails.add(record.get("actionType").toString());
          commitDetails.add(record.get("hoodieCleanMetadata").toString());
          break;
        }
        case HoodieTimeline.COMMIT_ACTION: {
          commitDetails.add(record.get("commitTime").toString());
          commitDetails.add(record.get("actionType").toString());
          commitDetails.add(record.get("hoodieCommitMetadata").toString());
          break;
        }
        case HoodieTimeline.COMPACTION_ACTION: {
          commitDetails.add(record.get("commitTime").toString());
          commitDetails.add(record.get("actionType").toString());
          commitDetails.add(record.get("hoodieCompactionMetadata").toString());
          break;
        }
        case HoodieTimeline.DELTA_COMMIT_ACTION: {
          commitDetails.add(record.get("commitTime").toString());
          commitDetails.add(record.get("actionType").toString());
          commitDetails.add(record.get("hoodieCommitMetadata").toString());
          break;
        }
        case HoodieTimeline.ROLLBACK_ACTION: {
          commitDetails.add(record.get("commitTime").toString());
          commitDetails.add(record.get("actionType").toString());
          commitDetails.add(record.get("hoodieRollbackMetadata").toString());
          break;
        }
        case HoodieTimeline.SAVEPOINT_ACTION: {
          commitDetails.add(record.get("commitTime").toString());
          commitDetails.add(record.get("actionType").toString());
          commitDetails.add(record.get("hoodieSavePointMetadata").toString());
          break;
        }
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
    return commitDetails.toArray(new String[commitDetails.size()]);
  }
}


@@ -24,89 +24,90 @@ import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.util.AvroUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.springframework.shell.core.CommandMarker;
import org.springframework.shell.core.annotation.CliAvailabilityIndicator;
import org.springframework.shell.core.annotation.CliCommand;
import org.springframework.shell.core.annotation.CliOption;
import org.springframework.stereotype.Component;
@Component
public class CleansCommand implements CommandMarker {
  @CliAvailabilityIndicator({"cleans show"})
  public boolean isShowAvailable() {
    return HoodieCLI.tableMetadata != null;
  }

  @CliAvailabilityIndicator({"cleans refresh"})
  public boolean isRefreshAvailable() {
    return HoodieCLI.tableMetadata != null;
  }

  @CliAvailabilityIndicator({"clean showpartitions"})
  public boolean isCommitShowAvailable() {
    return HoodieCLI.tableMetadata != null;
  }

  @CliCommand(value = "cleans show", help = "Show the cleans")
  public String showCleans() throws IOException {
    HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
    HoodieTimeline timeline = activeTimeline.getCleanerTimeline().filterCompletedInstants();
    List<HoodieInstant> cleans = timeline.getInstants().collect(Collectors.toList());
    String[][] rows = new String[cleans.size()][];
    Collections.reverse(cleans);
    for (int i = 0; i < cleans.size(); i++) {
      HoodieInstant clean = cleans.get(i);
      HoodieCleanMetadata cleanMetadata =
          AvroUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(clean).get());
      rows[i] = new String[]{clean.getTimestamp(), cleanMetadata.getEarliestCommitToRetain(),
          String.valueOf(cleanMetadata.getTotalFilesDeleted()),
          String.valueOf(cleanMetadata.getTimeTakenInMillis())};
    }
    return HoodiePrintHelper.print(
        new String[]{"CleanTime", "EarliestCommandRetained", "Total Files Deleted",
            "Total Time Taken"}, rows);
  }

  @CliCommand(value = "cleans refresh", help = "Refresh the commits")
  public String refreshCleans() throws IOException {
    HoodieTableMetaClient metadata =
        new HoodieTableMetaClient(HoodieCLI.fs, HoodieCLI.tableMetadata.getBasePath());
    HoodieCLI.setTableMetadata(metadata);
    return "Metadata for table " + metadata.getTableConfig().getTableName() + " refreshed.";
  }

  @CliCommand(value = "clean showpartitions", help = "Show partition level details of a clean")
  public String showCleanPartitions(
      @CliOption(key = {"clean"}, help = "clean to show")
      final String commitTime) throws Exception {
    HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
    HoodieTimeline timeline = activeTimeline.getCleanerTimeline().filterCompletedInstants();
    HoodieInstant cleanInstant =
        new HoodieInstant(false, HoodieTimeline.CLEAN_ACTION, commitTime);
    if (!timeline.containsInstant(cleanInstant)) {
      return "Clean " + commitTime + " not found in metadata " + timeline;
    }
    HoodieCleanMetadata cleanMetadata =
        AvroUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(cleanInstant).get());
    List<String[]> rows = new ArrayList<>();
    for (Map.Entry<String, HoodieCleanPartitionMetadata> entry : cleanMetadata
        .getPartitionMetadata().entrySet()) {
      String path = entry.getKey();
      HoodieCleanPartitionMetadata stats = entry.getValue();
      String policy = stats.getPolicy();
      String totalSuccessDeletedFiles = String.valueOf(stats.getSuccessDeleteFiles().size());
      String totalFailedDeletedFiles = String.valueOf(stats.getFailedDeleteFiles().size());
      rows.add(new String[]{path, policy, totalSuccessDeletedFiles, totalFailedDeletedFiles});
    }
    return HoodiePrintHelper.print(
        new String[]{"Partition Path", "Cleaning policy", "Total Files Successfully Deleted",
            "Total Failed Deletions"}, rows.toArray(new String[rows.size()][]));
  }
}


@@ -27,7 +27,12 @@ import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.util.NumericUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.spark.launcher.SparkLauncher;
import org.springframework.shell.core.CommandMarker;
import org.springframework.shell.core.annotation.CliAvailabilityIndicator;
@@ -35,228 +40,236 @@ import org.springframework.shell.core.annotation.CliCommand;
import org.springframework.shell.core.annotation.CliOption;
import org.springframework.stereotype.Component;
@Component
public class CommitsCommand implements CommandMarker {
  @CliAvailabilityIndicator({"commits show"})
  public boolean isShowAvailable() {
    return HoodieCLI.tableMetadata != null;
  }

  @CliAvailabilityIndicator({"commits refresh"})
  public boolean isRefreshAvailable() {
    return HoodieCLI.tableMetadata != null;
  }

  @CliAvailabilityIndicator({"commit rollback"})
  public boolean isRollbackAvailable() {
    return HoodieCLI.tableMetadata != null;
  }

  @CliAvailabilityIndicator({"commit show"})
  public boolean isCommitShowAvailable() {
    return HoodieCLI.tableMetadata != null;
  }

  @CliCommand(value = "commits show", help = "Show the commits")
  public String showCommits(
      @CliOption(key = {
          "limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10")
      final Integer limit) throws IOException {
    HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
    HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline()
        .filterCompletedInstants();
    List<HoodieInstant> commits = timeline.getInstants().collect(Collectors.toList());
    String[][] rows = new String[commits.size()][];
    Collections.reverse(commits);
    for (int i = 0; i < commits.size(); i++) {
      HoodieInstant commit = commits.get(i);
      HoodieCommitMetadata commitMetadata =
          HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(commit).get());
      rows[i] = new String[]{commit.getTimestamp(),
          NumericUtils.humanReadableByteCount(commitMetadata.fetchTotalBytesWritten()),
          String.valueOf(commitMetadata.fetchTotalFilesInsert()),
          String.valueOf(commitMetadata.fetchTotalFilesUpdated()),
          String.valueOf(commitMetadata.fetchTotalPartitionsWritten()),
          String.valueOf(commitMetadata.fetchTotalRecordsWritten()),
          String.valueOf(commitMetadata.fetchTotalUpdateRecordsWritten()),
          String.valueOf(commitMetadata.fetchTotalWriteErrors())};
    }
    return HoodiePrintHelper.print(
        new String[]{"CommitTime", "Total Written (B)", "Total Files Added",
            "Total Files Updated", "Total Partitions Written", "Total Records Written",
            "Total Update Records Written", "Total Errors"}, rows);
  }

  @CliCommand(value = "commits refresh", help = "Refresh the commits")
  public String refreshCommits() throws IOException {
    HoodieTableMetaClient metadata =
        new HoodieTableMetaClient(HoodieCLI.fs, HoodieCLI.tableMetadata.getBasePath());
    HoodieCLI.setTableMetadata(metadata);
    return "Metadata for table " + metadata.getTableConfig().getTableName() + " refreshed.";
  }

  @CliCommand(value = "commit rollback", help = "Rollback a commit")
  public String rollbackCommit(
      @CliOption(key = {"commit"}, help = "Commit to rollback")
      final String commitTime,
      @CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path")
      final String sparkPropertiesPath) throws Exception {
    HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
    HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline()
        .filterCompletedInstants();
    HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION,
        commitTime);
    if (!timeline.containsInstant(commitInstant)) {
      return "Commit " + commitTime + " not found in Commits " + timeline;
    }
    SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
    sparkLauncher.addAppArgs(SparkMain.SparkCommand.ROLLBACK.toString(),
        commitTime,
        HoodieCLI.tableMetadata.getBasePath());
    Process process = sparkLauncher.launch();
    InputStreamConsumer.captureOutput(process);
    int exitCode = process.waitFor();
    // Refresh the current
    refreshCommits();
    if (exitCode != 0) {
      return "Commit " + commitTime + " failed to roll back";
    }
    return "Commit " + commitTime + " rolled back";
  }

  @CliCommand(value = "commit showpartitions", help = "Show partition level details of a commit")
  public String showCommitPartitions(
      @CliOption(key = {"commit"}, help = "Commit to show")
      final String commitTime) throws Exception {
    HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
    HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline()
        .filterCompletedInstants();
    HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION,
        commitTime);
    if (!timeline.containsInstant(commitInstant)) {
      return "Commit " + commitTime + " not found in Commits " + timeline;
    }
    HoodieCommitMetadata meta =
        HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(commitInstant).get());
    List<String[]> rows = new ArrayList<String[]>();
    for (Map.Entry<String, List<HoodieWriteStat>> entry : meta.getPartitionToWriteStats()
        .entrySet()) {
      String path = entry.getKey();
      List<HoodieWriteStat> stats = entry.getValue();
      long totalFilesAdded = 0;
      long totalFilesUpdated = 0;
      long totalRecordsUpdated = 0;
      long totalRecordsInserted = 0;
      long totalBytesWritten = 0;
      long totalWriteErrors = 0;
      for (HoodieWriteStat stat : stats) {
        if (stat.getPrevCommit().equals(HoodieWriteStat.NULL_COMMIT)) {
          totalFilesAdded += 1;
          totalRecordsInserted += stat.getNumWrites();
        } else {
          totalFilesUpdated += 1;
          totalRecordsUpdated += stat.getNumUpdateWrites();
        }
        totalBytesWritten += stat.getTotalWriteBytes();
        totalWriteErrors += stat.getTotalWriteErrors();
      }
      rows.add(new String[]{path, String.valueOf(totalFilesAdded),
          String.valueOf(totalFilesUpdated), String.valueOf(totalRecordsInserted),
          String.valueOf(totalRecordsUpdated),
          NumericUtils.humanReadableByteCount(totalBytesWritten),
          String.valueOf(totalWriteErrors)});
    }
    return HoodiePrintHelper.print(
        new String[]{"Partition Path", "Total Files Added", "Total Files Updated",
            "Total Records Inserted", "Total Records Updated", "Total Bytes Written",
            "Total Errors"}, rows.toArray(new String[rows.size()][]));
  }

  @CliCommand(value = "commit showfiles", help = "Show file level details of a commit")
  public String showCommitFiles(
      @CliOption(key = {"commit"}, help = "Commit to show")
      final String commitTime) throws Exception {
    HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
    HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline()
        .filterCompletedInstants();
    HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION,
        commitTime);
    if (!timeline.containsInstant(commitInstant)) {
      return "Commit " + commitTime + " not found in Commits " + timeline;
    }
    HoodieCommitMetadata meta =
        HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(commitInstant).get());
    List<String[]> rows = new ArrayList<String[]>();
    for (Map.Entry<String, List<HoodieWriteStat>> entry : meta.getPartitionToWriteStats()
        .entrySet()) {
      String path = entry.getKey();
      List<HoodieWriteStat> stats = entry.getValue();
      for (HoodieWriteStat stat : stats) {
        rows.add(new String[]{path, stat.getFileId(), stat.getPrevCommit(),
            String.valueOf(stat.getNumUpdateWrites()), String.valueOf(stat.getNumWrites()),
            String.valueOf(stat.getTotalWriteBytes()),
            String.valueOf(stat.getTotalWriteErrors())});
      }
    }
    return HoodiePrintHelper.print(
        new String[]{"Partition Path", "File ID", "Previous Commit", "Total Records Updated",
            "Total Records Written", "Total Bytes Written", "Total Errors"},
        rows.toArray(new String[rows.size()][]));
  }

  @CliAvailabilityIndicator({"commits compare"})
  public boolean isCompareCommitsAvailable() {
    return HoodieCLI.tableMetadata != null;
  }

  @CliCommand(value = "commits compare", help = "Compare commits with another Hoodie dataset")
  public String compareCommits(
      @CliOption(key = {"path"}, help = "Path of the dataset to compare to")
      final String path) throws Exception {
    HoodieTableMetaClient target = new HoodieTableMetaClient(HoodieCLI.fs, path);
    HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsAndCompactionsTimeline()
        .filterCompletedInstants();
    HoodieTableMetaClient source = HoodieCLI.tableMetadata;
    HoodieTimeline sourceTimeline = source.getActiveTimeline().getCommitsAndCompactionsTimeline()
        .filterCompletedInstants();
    String targetLatestCommit =
        targetTimeline.getInstants().iterator().hasNext() ? "0"
            : targetTimeline.lastInstant().get().getTimestamp();
    String sourceLatestCommit =
        sourceTimeline.getInstants().iterator().hasNext() ? "0"
            : sourceTimeline.lastInstant().get().getTimestamp();

    if (sourceLatestCommit != null && HoodieTimeline
        .compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) {
      // source is behind the target
      List<String> commitsToCatchup =
          targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE)
              .getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList());
      return "Source " + source.getTableConfig().getTableName() + " is behind by "
          + commitsToCatchup.size() + " commits. Commits to catch up - " + commitsToCatchup;
    } else {
      List<String> commitsToCatchup =
          sourceTimeline.findInstantsAfter(targetLatestCommit, Integer.MAX_VALUE)
              .getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList());
      return "Source " + source.getTableConfig().getTableName() + " is ahead by "
          + commitsToCatchup.size() + " commits. Commits to catch up - " + commitsToCatchup;
    }
  }

  @CliAvailabilityIndicator({"commits sync"})
  public boolean isSyncCommitsAvailable() {
    return HoodieCLI.tableMetadata != null;
  }

  @CliCommand(value = "commits sync", help = "Compare commits with another Hoodie dataset")
  public String syncCommits(
      @CliOption(key = {"path"}, help = "Path of the dataset to compare to")
      final String path) throws Exception {
    HoodieCLI.syncTableMetadata = new HoodieTableMetaClient(HoodieCLI.fs, path);
    HoodieCLI.state = HoodieCLI.CLIState.SYNC;
    return "Load sync state between " + HoodieCLI.tableMetadata.getTableConfig().getTableName()
        + " and " + HoodieCLI.syncTableMetadata.getTableConfig().getTableName();
  }
}


@@ -18,24 +18,24 @@ package com.uber.hoodie.cli.commands;
import com.uber.hoodie.cli.HoodieCLI;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import java.io.IOException;
import org.springframework.shell.core.CommandMarker;
import org.springframework.shell.core.annotation.CliCommand;
import org.springframework.shell.core.annotation.CliOption;
import org.springframework.stereotype.Component;
@Component
public class DatasetsCommand implements CommandMarker {
@CliCommand(value = "connect", help = "Connect to a hoodie dataset")
public String connect(
@CliOption(key = {"path"}, mandatory = true, help = "Base Path of the dataset")
final String path) throws IOException {
boolean initialized = HoodieCLI.initConf();
HoodieCLI.initFS(initialized);
HoodieCLI.setTableMetadata(new HoodieTableMetaClient(HoodieCLI.fs, path));
HoodieCLI.state = HoodieCLI.CLIState.DATASET;
return "Metadata for table " + HoodieCLI.tableMetadata.getTableConfig().getTableName()
+ " loaded";
}
@CliCommand(value = "connect", help = "Connect to a hoodie dataset")
public String connect(
@CliOption(key = {"path"}, mandatory = true, help = "Base Path of the dataset")
final String path) throws IOException {
boolean initialized = HoodieCLI.initConf();
HoodieCLI.initFS(initialized);
HoodieCLI.setTableMetadata(new HoodieTableMetaClient(HoodieCLI.fs, path));
HoodieCLI.state = HoodieCLI.CLIState.DATASET;
return "Metadata for table " + HoodieCLI.tableMetadata.getTableConfig().getTableName()
+ " loaded";
}
}
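Putting the pieces above together: connect initializes the Hadoop Configuration and FileSystem through HoodieCLI, loads the table metadata, and moves the HoodiePrompt into the dataset state, after which the other commands become available. A rough interactive session (table name and path are made up for illustration):

hoodie-> connect --path /tmp/hoodie/sample_table
Metadata for table sample_table loaded
hoodie:sample_table-> commits show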


@@ -33,58 +33,59 @@ import org.springframework.stereotype.Component;
@Component
public class HDFSParquetImportCommand implements CommandMarker {
  private static Logger log = LogManager.getLogger(HDFSParquetImportCommand.class);

  @CliCommand(value = "hdfsparquetimport", help = "Imports hdfs dataset to a hoodie dataset")
  public String convert(
      @CliOption(key = "srcPath", mandatory = true, help = "Base path for the input dataset")
      final String srcPath,
      @CliOption(key = "srcType", mandatory = true, help = "Source type for the input dataset")
      final String srcType,
      @CliOption(key = "targetPath", mandatory = true, help = "Base path for the target hoodie dataset")
      final String targetPath,
      @CliOption(key = "tableName", mandatory = true, help = "Table name")
      final String tableName,
      @CliOption(key = "tableType", mandatory = true, help = "Table type")
      final String tableType,
      @CliOption(key = "rowKeyField", mandatory = true, help = "Row key field name")
      final String rowKeyField,
      @CliOption(key = "partitionPathField", mandatory = true, help = "Partition path field name")
      final String partitionPathField,
      @CliOption(key = {"parallelism"}, mandatory = true, help = "Parallelism for hoodie insert")
      final String parallelism,
      @CliOption(key = "schemaFilePath", mandatory = true, help = "Path for Avro schema file")
      final String schemaFilePath,
      @CliOption(key = "format", mandatory = true, help = "Format for the input data")
      final String format,
      @CliOption(key = "sparkMemory", mandatory = true, help = "Spark executor memory")
      final String sparkMemory,
      @CliOption(key = "retry", mandatory = true, help = "Number of retries")
      final String retry)
      throws Exception {

    validate(format, srcType);

    boolean initialized = HoodieCLI.initConf();
    HoodieCLI.initFS(initialized);
    String sparkPropertiesPath = Utils
        .getDefaultPropertiesFile(
            scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
    SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);

    sparkLauncher.addAppArgs(SparkCommand.IMPORT.toString(), srcPath, targetPath, tableName,
        tableType, rowKeyField, partitionPathField, parallelism, schemaFilePath, sparkMemory,
        retry);
    Process process = sparkLauncher.launch();
    InputStreamConsumer.captureOutput(process);
    int exitCode = process.waitFor();
    if (exitCode != 0) {
      return "Failed to import dataset to hoodie format";
    }
    return "Dataset imported to hoodie format";
  }

  private void validate(String format, String srcType) {
    (new HDFSParquetImporter.FormatValidator()).validate("format", format);
    (new HDFSParquetImporter.SourceTypeValidator()).validate("srcType", srcType);
  }
}


@@ -16,105 +16,109 @@
package com.uber.hoodie.cli.commands;
import com.uber.hoodie.cli.HoodieCLI;
import com.uber.hoodie.cli.utils.CommitUtil;
import com.uber.hoodie.cli.utils.HiveUtil;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import java.util.List;
import java.util.stream.Collectors;
import org.springframework.shell.core.CommandMarker;
import org.springframework.shell.core.annotation.CliAvailabilityIndicator;
import org.springframework.shell.core.annotation.CliCommand;
import org.springframework.shell.core.annotation.CliOption;
import org.springframework.stereotype.Component;
@Component
public class HoodieSyncCommand implements CommandMarker {
  @CliAvailabilityIndicator({"sync validate"})
  public boolean isSyncVerificationAvailable() {
    return HoodieCLI.tableMetadata != null && HoodieCLI.syncTableMetadata != null;
  }

  @CliCommand(value = "sync validate", help = "Validate the sync by counting the number of records")
  public String validateSync(
      @CliOption(key = {"mode"}, unspecifiedDefaultValue = "complete", help = "Check mode")
      final String mode,
      @CliOption(key = {
          "sourceDb"}, unspecifiedDefaultValue = "rawdata", help = "source database")
      final String srcDb,
      @CliOption(key = {
          "targetDb"}, unspecifiedDefaultValue = "dwh_hoodie", help = "target database")
      final String tgtDb,
      @CliOption(key = {
          "partitionCount"}, unspecifiedDefaultValue = "5", help = "total number of recent partitions to validate")
      final int partitionCount,
      @CliOption(key = {
          "hiveServerUrl"}, mandatory = true, help = "hiveServerURL to connect to")
      final String hiveServerUrl,
      @CliOption(key = {
          "hiveUser"}, mandatory = false, unspecifiedDefaultValue = "", help = "hive username to connect to")
      final String hiveUser,
      @CliOption(key = {
          "hivePass"}, mandatory = true, unspecifiedDefaultValue = "", help = "hive password to connect to")
      final String hivePass) throws Exception {
    HoodieTableMetaClient target = HoodieCLI.syncTableMetadata;
    HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsAndCompactionsTimeline();
    HoodieTableMetaClient source = HoodieCLI.tableMetadata;
    HoodieTimeline sourceTimeline = source.getActiveTimeline().getCommitsAndCompactionsTimeline();
    long sourceCount = 0;
    long targetCount = 0;
    if ("complete".equals(mode)) {
      sourceCount = HiveUtil.countRecords(hiveServerUrl, source, srcDb, hiveUser, hivePass);
      targetCount = HiveUtil.countRecords(hiveServerUrl, target, tgtDb, hiveUser, hivePass);
    } else if ("latestPartitions".equals(mode)) {
      sourceCount = HiveUtil
          .countRecords(hiveServerUrl, source, srcDb, partitionCount, hiveUser, hivePass);
      targetCount = HiveUtil
          .countRecords(hiveServerUrl, target, tgtDb, partitionCount, hiveUser, hivePass);
    }

    String targetLatestCommit =
        targetTimeline.getInstants().iterator().hasNext() ? "0"
            : targetTimeline.lastInstant().get().getTimestamp();
    String sourceLatestCommit =
        sourceTimeline.getInstants().iterator().hasNext() ? "0"
            : sourceTimeline.lastInstant().get().getTimestamp();

    if (sourceLatestCommit != null && HoodieTimeline
        .compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) {
      // source is behind the target
      List<HoodieInstant> commitsToCatchup =
          targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE).getInstants()
              .collect(Collectors.toList());
      if (commitsToCatchup.isEmpty()) {
        return "Count difference now is (count(" + target.getTableConfig().getTableName()
            + ") - count(" + source.getTableConfig().getTableName() + ") == " + (targetCount
            - sourceCount);
      } else {
        long newInserts = CommitUtil.countNewRecords(target,
            commitsToCatchup.stream().map(HoodieInstant::getTimestamp)
                .collect(Collectors.toList()));
        return "Count difference now is (count(" + target.getTableConfig().getTableName()
            + ") - count(" + source.getTableConfig().getTableName() + ") == " + (targetCount
            - sourceCount) + ". Catch up count is " + newInserts;
      }
    } else {
      List<HoodieInstant> commitsToCatchup =
          sourceTimeline.findInstantsAfter(targetLatestCommit, Integer.MAX_VALUE).getInstants()
              .collect(Collectors.toList());
      if (commitsToCatchup.isEmpty()) {
        return "Count difference now is (count(" + source.getTableConfig().getTableName()
            + ") - count(" + target.getTableConfig().getTableName() + ") == " + (sourceCount
            - targetCount);
      } else {
        long newInserts = CommitUtil.countNewRecords(source,
            commitsToCatchup.stream().map(HoodieInstant::getTimestamp)
                .collect(Collectors.toList()));
        return "Count difference now is (count(" + source.getTableConfig().getTableName()
            + ") - count(" + target.getTableConfig().getTableName() + ") == " + (sourceCount
            - targetCount) + ". Catch up count is " + newInserts;
      }
    }
  }
}

View File

@@ -22,7 +22,8 @@ import com.uber.hoodie.cli.utils.InputStreamConsumer;
import com.uber.hoodie.cli.utils.SparkUtil;
import com.uber.hoodie.common.model.HoodiePartitionMetadata;
import com.uber.hoodie.common.util.FSUtils;
import java.io.IOException;
import java.util.List;
import org.apache.hadoop.fs.Path;
import org.apache.spark.launcher.SparkLauncher;
import org.springframework.shell.core.CommandMarker;
@@ -31,80 +32,80 @@ import org.springframework.shell.core.annotation.CliCommand;
import org.springframework.shell.core.annotation.CliOption;
import org.springframework.stereotype.Component;
@Component
public class RepairsCommand implements CommandMarker {
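  // Repair commands for the currently loaded dataset: launch a Spark job to de-duplicate a
  // partition, and backfill partition metadata files that are missing.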
@CliAvailabilityIndicator({"repair deduplicate"})
public boolean isRepairDeduplicateAvailable() {
return HoodieCLI.tableMetadata != null;
@CliAvailabilityIndicator({"repair deduplicate"})
public boolean isRepairDeduplicateAvailable() {
return HoodieCLI.tableMetadata != null;
}
@CliAvailabilityIndicator({"repair addpartitionmeta"})
public boolean isRepairAddPartitionMetaAvailable() {
return HoodieCLI.tableMetadata != null;
}
@CliCommand(value = "repair deduplicate", help = "De-duplicate a partition path contains duplicates & produce repaired files to replace with")
public String deduplicate(
@CliOption(key = {
"duplicatedPartitionPath"}, help = "Partition Path containing the duplicates", mandatory = true)
final String duplicatedPartitionPath,
@CliOption(key = {
"repairedOutputPath"}, help = "Location to place the repaired files", mandatory = true)
final String repairedOutputPath,
@CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path", mandatory = true)
final String sparkPropertiesPath) throws Exception {
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
sparkLauncher
.addAppArgs(SparkMain.SparkCommand.DEDUPLICATE.toString(), duplicatedPartitionPath,
repairedOutputPath, HoodieCLI.tableMetadata.getBasePath());
Process process = sparkLauncher.launch();
InputStreamConsumer.captureOutput(process);
int exitCode = process.waitFor();
if (exitCode != 0) {
return "Deduplicated files placed in: " + repairedOutputPath;
}
return "Deduplication failed ";
}
@CliAvailabilityIndicator({"repair addpartitionmeta"})
public boolean isRepairAddPartitionMetaAvailable() {
return HoodieCLI.tableMetadata != null;
}
@CliCommand(value = "repair deduplicate", help = "De-duplicate a partition path contains duplicates & produce repaired files to replace with")
public String deduplicate(
@CliOption(key = {
"duplicatedPartitionPath"}, help = "Partition Path containing the duplicates", mandatory = true)
final String duplicatedPartitionPath,
@CliOption(key = {"repairedOutputPath"}, help = "Location to place the repaired files", mandatory = true)
final String repairedOutputPath,
@CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path", mandatory = true)
final String sparkPropertiesPath) throws Exception {
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
sparkLauncher
.addAppArgs(SparkMain.SparkCommand.DEDUPLICATE.toString(), duplicatedPartitionPath,
repairedOutputPath, HoodieCLI.tableMetadata.getBasePath());
Process process = sparkLauncher.launch();
InputStreamConsumer.captureOutput(process);
int exitCode = process.waitFor();
@CliCommand(value = "repair addpartitionmeta", help = "Add partition metadata to a dataset, if not present")
public String addPartitionMeta(
@CliOption(key = {"dryrun"},
help = "Should we actually add or just print what would be done",
unspecifiedDefaultValue = "true")
final boolean dryRun) throws IOException {
if (exitCode != 0) {
return "Deduplicated files placed in: " + repairedOutputPath;
String latestCommit = HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline()
.lastInstant().get().getTimestamp();
List<String> partitionPaths = FSUtils.getAllFoldersThreeLevelsDown(HoodieCLI.fs,
HoodieCLI.tableMetadata.getBasePath());
Path basePath = new Path(HoodieCLI.tableMetadata.getBasePath());
String[][] rows = new String[partitionPaths.size() + 1][];
int ind = 0;
for (String partition : partitionPaths) {
Path partitionPath = new Path(basePath, partition);
String[] row = new String[3];
row[0] = partition;
row[1] = "Yes";
row[2] = "None";
if (!HoodiePartitionMetadata.hasPartitionMetadata(HoodieCLI.fs, partitionPath)) {
row[1] = "No";
if (!dryRun) {
HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(
HoodieCLI.fs,
latestCommit,
basePath,
partitionPath);
partitionMetadata.trySave(0);
}
return "Deduplication failed ";
}
rows[ind++] = row;
}
@CliCommand(value = "repair addpartitionmeta", help = "Add partition metadata to a dataset, if not present")
public String addPartitionMeta(
@CliOption(key = {"dryrun"},
help = "Should we actually add or just print what would be done",
unspecifiedDefaultValue = "true")
final boolean dryRun) throws IOException {
String latestCommit = HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline().lastInstant().get().getTimestamp();
List<String> partitionPaths = FSUtils.getAllFoldersThreeLevelsDown(HoodieCLI.fs,
HoodieCLI.tableMetadata.getBasePath());
Path basePath = new Path(HoodieCLI.tableMetadata.getBasePath());
String[][] rows = new String[partitionPaths.size() + 1][];
int ind = 0;
for (String partition: partitionPaths) {
Path partitionPath = new Path(basePath, partition);
String[] row = new String[3];
row[0] = partition; row[1] = "Yes"; row[2] = "None";
if (!HoodiePartitionMetadata.hasPartitionMetadata(HoodieCLI.fs, partitionPath)) {
row[1] = "No";
if (!dryRun) {
HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(
HoodieCLI.fs,
latestCommit,
basePath,
partitionPath);
partitionMetadata.trySave(0);
}
}
rows[ind++] = row;
}
return HoodiePrintHelper.print(
new String[] {"Partition Path", "Metadata Present?", "Action"}, rows);
}
return HoodiePrintHelper.print(
new String[]{"Partition Path", "Metadata Present?", "Action"}, rows);
}
}

View File

@@ -27,6 +27,10 @@ import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.config.HoodieIndexConfig;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.index.HoodieIndex;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.launcher.SparkLauncher;
import org.springframework.shell.core.CommandMarker;
@@ -35,122 +39,118 @@ import org.springframework.shell.core.annotation.CliCommand;
import org.springframework.shell.core.annotation.CliOption;
import org.springframework.stereotype.Component;
@Component
public class SavepointsCommand implements CommandMarker {
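  // Savepoint commands for the currently loaded dataset: list savepoints, create one on a commit,
  // roll the table back to one (via a Spark job), and refresh the cached table metadata.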
@CliAvailabilityIndicator({"savepoints show"})
public boolean isShowAvailable() {
return HoodieCLI.tableMetadata != null;
@CliAvailabilityIndicator({"savepoints show"})
public boolean isShowAvailable() {
return HoodieCLI.tableMetadata != null;
}
@CliAvailabilityIndicator({"savepoints refresh"})
public boolean isRefreshAvailable() {
return HoodieCLI.tableMetadata != null;
}
@CliAvailabilityIndicator({"savepoint create"})
public boolean isCreateSavepointAvailable() {
return HoodieCLI.tableMetadata != null;
}
@CliAvailabilityIndicator({"savepoint rollback"})
public boolean isRollbackToSavepointAvailable() {
return HoodieCLI.tableMetadata != null && !HoodieCLI.tableMetadata.getActiveTimeline()
.getSavePointTimeline().filterCompletedInstants().empty();
}
@CliCommand(value = "savepoints show", help = "Show the savepoints")
public String showSavepoints() throws IOException {
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
HoodieTimeline timeline = activeTimeline.getSavePointTimeline().filterCompletedInstants();
List<HoodieInstant> commits = timeline.getInstants().collect(Collectors.toList());
String[][] rows = new String[commits.size()][];
Collections.reverse(commits);
for (int i = 0; i < commits.size(); i++) {
HoodieInstant commit = commits.get(i);
rows[i] = new String[]{commit.getTimestamp()};
}
return HoodiePrintHelper.print(new String[]{"SavepointTime"}, rows);
}
@CliCommand(value = "savepoint create", help = "Savepoint a commit")
public String savepoint(
@CliOption(key = {"commit"}, help = "Commit to savepoint")
final String commitTime,
@CliOption(key = {"user"}, help = "User who is creating the savepoint")
final String user,
@CliOption(key = {"comments"}, help = "Comments for creating the savepoint")
final String comments) throws Exception {
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants();
HoodieInstant
commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
if (!timeline.containsInstant(commitInstant)) {
return "Commit " + commitTime + " not found in Commits " + timeline;
}
@CliAvailabilityIndicator({"savepoints refresh"})
public boolean isRefreshAvailable() {
return HoodieCLI.tableMetadata != null;
HoodieWriteClient client = createHoodieClient(null, HoodieCLI.tableMetadata.getBasePath());
if (client.savepoint(commitTime, user, comments)) {
// Refresh the current
refreshMetaClient();
return String.format("The commit \"%s\" has been savepointed.", commitTime);
}
return String.format("Failed: Could not savepoint commit \"%s\".", commitTime);
}
@CliCommand(value = "savepoint rollback", help = "Savepoint a commit")
public String rollbackToSavepoint(
@CliOption(key = {"savepoint"}, help = "Savepoint to rollback")
final String commitTime,
@CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path")
final String sparkPropertiesPath) throws Exception {
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants();
HoodieInstant
commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
if (!timeline.containsInstant(commitInstant)) {
return "Commit " + commitTime + " not found in Commits " + timeline;
}
@CliAvailabilityIndicator({"savepoint create"})
public boolean isCreateSavepointAvailable() {
return HoodieCLI.tableMetadata != null;
}
@CliAvailabilityIndicator({"savepoint rollback"})
public boolean isRollbackToSavepointAvailable() {
return HoodieCLI.tableMetadata != null && !HoodieCLI.tableMetadata.getActiveTimeline().getSavePointTimeline().filterCompletedInstants().empty();
}
@CliCommand(value = "savepoints show", help = "Show the savepoints")
public String showSavepoints() throws IOException {
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
HoodieTimeline timeline = activeTimeline.getSavePointTimeline().filterCompletedInstants();
List<HoodieInstant> commits = timeline.getInstants().collect(Collectors.toList());
String[][] rows = new String[commits.size()][];
Collections.reverse(commits);
for (int i = 0; i < commits.size(); i++) {
HoodieInstant commit = commits.get(i);
rows[i] = new String[] {commit.getTimestamp()};
}
return HoodiePrintHelper.print(new String[] {"SavepointTime"}, rows);
}
@CliCommand(value = "savepoint create", help = "Savepoint a commit")
public String savepoint(
@CliOption(key = {"commit"}, help = "Commit to savepoint")
final String commitTime,
@CliOption(key = {"user"}, help = "User who is creating the savepoint")
final String user,
@CliOption(key = {"comments"}, help = "Comments for creating the savepoint")
final String comments) throws Exception {
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants();
HoodieInstant
commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
if (!timeline.containsInstant(commitInstant)) {
return "Commit " + commitTime + " not found in Commits " + timeline;
}
HoodieWriteClient client = createHoodieClient(null, HoodieCLI.tableMetadata.getBasePath());
if (client.savepoint(commitTime, user, comments)) {
// Refresh the current
refreshMetaClient();
return String.format("The commit \"%s\" has been savepointed.", commitTime);
}
return String.format("Failed: Could not savepoint commit \"%s\".", commitTime);
}
@CliCommand(value = "savepoint rollback", help = "Savepoint a commit")
public String rollbackToSavepoint(
@CliOption(key = {"savepoint"}, help = "Savepoint to rollback")
final String commitTime,
@CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path")
final String sparkPropertiesPath) throws Exception {
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants();
HoodieInstant
commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
if (!timeline.containsInstant(commitInstant)) {
return "Commit " + commitTime + " not found in Commits " + timeline;
}
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
sparkLauncher.addAppArgs(SparkMain.SparkCommand.ROLLBACK_TO_SAVEPOINT.toString(),
commitTime,
HoodieCLI.tableMetadata.getBasePath());
Process process = sparkLauncher.launch();
InputStreamConsumer.captureOutput(process);
int exitCode = process.waitFor();
// Refresh the current
refreshMetaClient();
if (exitCode != 0) {
return "Savepoint " + commitTime + " failed to roll back";
}
return "Savepoint " + commitTime + " rolled back";
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
sparkLauncher.addAppArgs(SparkMain.SparkCommand.ROLLBACK_TO_SAVEPOINT.toString(),
commitTime,
HoodieCLI.tableMetadata.getBasePath());
Process process = sparkLauncher.launch();
InputStreamConsumer.captureOutput(process);
int exitCode = process.waitFor();
// Refresh the current
refreshMetaClient();
if (exitCode != 0) {
return "Savepoint " + commitTime + " failed to roll back";
}
return "Savepoint " + commitTime + " rolled back";
}
@CliCommand(value = "savepoints refresh", help = "Refresh the savepoints")
public String refreshMetaClient() throws IOException {
HoodieTableMetaClient metadata =
new HoodieTableMetaClient(HoodieCLI.fs, HoodieCLI.tableMetadata.getBasePath());
HoodieCLI.setTableMetadata(metadata);
return "Metadata for table " + metadata.getTableConfig().getTableName() + " refreshed.";
}
private static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath)
throws Exception {
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath)
.withIndexConfig(
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
.build();
return new HoodieWriteClient(jsc, config, false);
}
@CliCommand(value = "savepoints refresh", help = "Refresh the savepoints")
public String refreshMetaClient() throws IOException {
HoodieTableMetaClient metadata =
new HoodieTableMetaClient(HoodieCLI.fs, HoodieCLI.tableMetadata.getBasePath());
HoodieCLI.setTableMetadata(metadata);
return "Metadata for table " + metadata.getTableConfig().getTableName() + " refreshed.";
}
private static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath)
throws Exception {
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath)
.withIndexConfig(
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
.build();
return new HoodieWriteClient(jsc, config, false);
}
}

View File

@@ -30,109 +30,110 @@ import org.apache.spark.sql.SQLContext;
public class SparkMain {
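  // Entry point for the Spark jobs launched by the CLI. The first program argument selects the
  // SparkCommand; the remaining arguments are positional parameters for that command.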
  protected final static Logger LOG = Logger.getLogger(SparkMain.class);

  /**
   * Commands
   */
  enum SparkCommand {
    ROLLBACK,
    DEDUPLICATE,
    ROLLBACK_TO_SAVEPOINT,
    SAVEPOINT,
    IMPORT
  }

  public static void main(String[] args) throws Exception {
    String command = args[0];
    LOG.info("Invoking SparkMain:" + command);

    SparkCommand cmd = SparkCommand.valueOf(command);

    JavaSparkContext jsc = SparkUtil.initJavaSparkConf("hoodie-cli-" + command);
    int returnCode = 0;
    switch (cmd) {
      case ROLLBACK:
        assert (args.length == 3);
        returnCode = rollback(jsc, args[1], args[2]);
        break;
      case DEDUPLICATE:
        assert (args.length == 4);
        returnCode = deduplicatePartitionPath(jsc, args[1], args[2], args[3]);
        break;
      case ROLLBACK_TO_SAVEPOINT:
        assert (args.length == 3);
        returnCode = rollbackToSavepoint(jsc, args[1], args[2]);
        break;
      case IMPORT:
        assert (args.length == 11);
        returnCode = dataImport(jsc, args[1], args[2], args[3], args[4], args[5], args[6],
            Integer.parseInt(args[7]), args[8], SparkUtil.DEFUALT_SPARK_MASTER, args[9],
            Integer.parseInt(args[10]));
        break;
    }

    System.exit(returnCode);
  }

  private static int dataImport(JavaSparkContext jsc, String srcPath, String targetPath,
      String tableName, String tableType, String rowKey, String partitionKey, int parallelism,
      String schemaFile, String sparkMaster, String sparkMemory, int retry) throws Exception {
    HDFSParquetImporter.Config cfg = new HDFSParquetImporter.Config();
    cfg.srcPath = srcPath;
    cfg.targetPath = targetPath;
    cfg.tableName = tableName;
    cfg.tableType = tableType;
    cfg.rowKey = rowKey;
    cfg.partitionKey = partitionKey;
    cfg.parallelism = parallelism;
    cfg.schemaFile = schemaFile;
    jsc.getConf().set("spark.executor.memory", sparkMemory);
    return new HDFSParquetImporter(cfg).dataImport(jsc, retry);
  }

  private static int deduplicatePartitionPath(JavaSparkContext jsc,
      String duplicatedPartitionPath,
      String repairedOutputPath,
      String basePath)
      throws Exception {
    DedupeSparkJob job = new DedupeSparkJob(basePath,
        duplicatedPartitionPath, repairedOutputPath, new SQLContext(jsc), FSUtils.getFs());
    job.fixDuplicates(true);
    return 0;
  }

  private static int rollback(JavaSparkContext jsc, String commitTime, String basePath)
      throws Exception {
    HoodieWriteClient client = createHoodieClient(jsc, basePath);
    if (client.rollback(commitTime)) {
      LOG.info(String.format("The commit \"%s\" rolled back.", commitTime));
      return 0;
    } else {
      LOG.info(String.format("The commit \"%s\" failed to roll back.", commitTime));
      return -1;
    }
  }

  private static int rollbackToSavepoint(JavaSparkContext jsc, String savepointTime,
      String basePath)
      throws Exception {
    HoodieWriteClient client = createHoodieClient(jsc, basePath);
    if (client.rollbackToSavepoint(savepointTime)) {
      LOG.info(String.format("The commit \"%s\" rolled back.", savepointTime));
      return 0;
    } else {
      LOG.info(String.format("The commit \"%s\" failed to roll back.", savepointTime));
      return -1;
    }
  }

  private static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath)
      throws Exception {
    HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath)
        .withIndexConfig(
            HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
        .build();
    return new HoodieWriteClient(jsc, config);
  }
}

View File

@@ -28,7 +28,10 @@ import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.NumericUtils;
import java.io.IOException;
import java.text.DecimalFormat;
import java.util.HashMap;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
@@ -38,106 +41,105 @@ import org.springframework.shell.core.annotation.CliCommand;
import org.springframework.shell.core.annotation.CliOption;
import org.springframework.stereotype.Component;
@Component
public class StatsCommand implements CommandMarker {
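  // Reports write amplification (records written vs. records upserted) per commit and summary
  // statistics on file sizes grouped by commit time.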
@CliAvailabilityIndicator({"stats wa"})
public boolean isWriteAmpAvailable() {
return HoodieCLI.tableMetadata != null;
@CliAvailabilityIndicator({"stats wa"})
public boolean isWriteAmpAvailable() {
return HoodieCLI.tableMetadata != null;
}
@CliCommand(value = "stats wa", help = "Write Amplification. Ratio of how many records were upserted to how many records were actually written")
public String writeAmplificationStats() throws IOException {
long totalRecordsUpserted = 0;
long totalRecordsWritten = 0;
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants();
String[][] rows = new String[new Long(timeline.countInstants()).intValue() + 1][];
int i = 0;
DecimalFormat df = new DecimalFormat("#.00");
for (HoodieInstant commitTime : timeline.getInstants().collect(
Collectors.toList())) {
String waf = "0";
HoodieCommitMetadata commit = HoodieCommitMetadata
.fromBytes(activeTimeline.getInstantDetails(commitTime).get());
if (commit.fetchTotalUpdateRecordsWritten() > 0) {
waf = df.format(
(float) commit.fetchTotalRecordsWritten() / commit
.fetchTotalUpdateRecordsWritten());
}
rows[i++] = new String[]{commitTime.getTimestamp(),
String.valueOf(commit.fetchTotalUpdateRecordsWritten()),
String.valueOf(commit.fetchTotalRecordsWritten()), waf};
totalRecordsUpserted += commit.fetchTotalUpdateRecordsWritten();
totalRecordsWritten += commit.fetchTotalRecordsWritten();
}
String waf = "0";
if (totalRecordsUpserted > 0) {
waf = df.format((float) totalRecordsWritten / totalRecordsUpserted);
}
rows[i] = new String[]{"Total", String.valueOf(totalRecordsUpserted),
String.valueOf(totalRecordsWritten), waf};
return HoodiePrintHelper.print(
new String[]{"CommitTime", "Total Upserted", "Total Written",
"Write Amplifiation Factor"}, rows);
}
private String[] printFileSizeHistogram(String commitTime, Snapshot s) {
return new String[]{
commitTime,
NumericUtils.humanReadableByteCount(s.getMin()),
NumericUtils.humanReadableByteCount(s.getValue(0.1)),
NumericUtils.humanReadableByteCount(s.getMedian()),
NumericUtils.humanReadableByteCount(s.getMean()),
NumericUtils.humanReadableByteCount(s.get95thPercentile()),
NumericUtils.humanReadableByteCount(s.getMax()),
String.valueOf(s.size()),
NumericUtils.humanReadableByteCount(s.getStdDev())
};
}
@CliCommand(value = "stats filesizes", help = "File Sizes. Display summary stats on sizes of files")
public String fileSizeStats(
@CliOption(key = {
"partitionPath"}, help = "regex to select files, eg: 2016/08/02", unspecifiedDefaultValue = "*/*/*")
final String globRegex) throws IOException {
FileSystem fs = HoodieCLI.fs;
String globPath = String.format("%s/%s/*",
HoodieCLI.tableMetadata.getBasePath(),
globRegex);
FileStatus[] statuses = fs.globStatus(new Path(globPath));
// max, min, #small files < 10MB, 50th, avg, 95th
final int MAX_FILES = 1000000;
Histogram globalHistogram = new Histogram(new UniformReservoir(MAX_FILES));
HashMap<String, Histogram> commitHistoMap = new HashMap<String, Histogram>();
for (FileStatus fileStatus : statuses) {
String commitTime = FSUtils.getCommitTime(fileStatus.getPath().getName());
long sz = fileStatus.getLen();
if (!commitHistoMap.containsKey(commitTime)) {
commitHistoMap.put(commitTime, new Histogram(new UniformReservoir(MAX_FILES)));
}
commitHistoMap.get(commitTime).update(sz);
globalHistogram.update(sz);
}
@CliCommand(value = "stats wa", help = "Write Amplification. Ratio of how many records were upserted to how many records were actually written")
public String writeAmplificationStats() throws IOException {
long totalRecordsUpserted = 0;
long totalRecordsWritten = 0;
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants();
String[][] rows = new String[new Long(timeline.countInstants()).intValue() + 1][];
int i = 0;
DecimalFormat df = new DecimalFormat("#.00");
for (HoodieInstant commitTime : timeline.getInstants().collect(
Collectors.toList())) {
String waf = "0";
HoodieCommitMetadata commit = HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(commitTime).get());
if (commit.fetchTotalUpdateRecordsWritten() > 0) {
waf = df.format(
(float) commit.fetchTotalRecordsWritten() / commit
.fetchTotalUpdateRecordsWritten());
}
rows[i++] = new String[] {commitTime.getTimestamp(),
String.valueOf(commit.fetchTotalUpdateRecordsWritten()),
String.valueOf(commit.fetchTotalRecordsWritten()), waf};
totalRecordsUpserted += commit.fetchTotalUpdateRecordsWritten();
totalRecordsWritten += commit.fetchTotalRecordsWritten();
}
String waf = "0";
if (totalRecordsUpserted > 0) {
waf = df.format((float) totalRecordsWritten / totalRecordsUpserted);
}
rows[i] = new String[] {"Total", String.valueOf(totalRecordsUpserted),
String.valueOf(totalRecordsWritten), waf};
return HoodiePrintHelper.print(
new String[] {"CommitTime", "Total Upserted", "Total Written",
"Write Amplifiation Factor"}, rows);
String[][] rows = new String[commitHistoMap.size() + 1][];
int ind = 0;
for (String commitTime : commitHistoMap.keySet()) {
Snapshot s = commitHistoMap.get(commitTime).getSnapshot();
rows[ind++] = printFileSizeHistogram(commitTime, s);
}
Snapshot s = globalHistogram.getSnapshot();
rows[ind++] = printFileSizeHistogram("ALL", s);
private String[] printFileSizeHistogram(String commitTime, Snapshot s) {
return new String[]{
commitTime,
NumericUtils.humanReadableByteCount(s.getMin()),
NumericUtils.humanReadableByteCount(s.getValue(0.1)),
NumericUtils.humanReadableByteCount(s.getMedian()),
NumericUtils.humanReadableByteCount(s.getMean()),
NumericUtils.humanReadableByteCount(s.get95thPercentile()),
NumericUtils.humanReadableByteCount(s.getMax()),
String.valueOf(s.size()),
NumericUtils.humanReadableByteCount(s.getStdDev())
};
}
@CliCommand(value = "stats filesizes", help = "File Sizes. Display summary stats on sizes of files")
public String fileSizeStats(
@CliOption(key = {"partitionPath"}, help = "regex to select files, eg: 2016/08/02", unspecifiedDefaultValue = "*/*/*")
final String globRegex) throws IOException {
FileSystem fs = HoodieCLI.fs;
String globPath = String.format("%s/%s/*",
HoodieCLI.tableMetadata.getBasePath(),
globRegex);
FileStatus[] statuses = fs.globStatus(new Path(globPath));
// max, min, #small files < 10MB, 50th, avg, 95th
final int MAX_FILES = 1000000;
Histogram globalHistogram = new Histogram(new UniformReservoir(MAX_FILES));
HashMap<String, Histogram> commitHistoMap = new HashMap<String, Histogram>();
for (FileStatus fileStatus: statuses) {
String commitTime = FSUtils.getCommitTime(fileStatus.getPath().getName());
long sz = fileStatus.getLen();
if (!commitHistoMap.containsKey(commitTime)) {
commitHistoMap.put(commitTime, new Histogram(new UniformReservoir(MAX_FILES)));
}
commitHistoMap.get(commitTime).update(sz);
globalHistogram.update(sz);
}
String[][] rows = new String[commitHistoMap.size() + 1][];
int ind = 0;
for (String commitTime: commitHistoMap.keySet()) {
Snapshot s = commitHistoMap.get(commitTime).getSnapshot();
rows[ind++] = printFileSizeHistogram(commitTime, s);
}
Snapshot s = globalHistogram.getSnapshot();
rows[ind++] = printFileSizeHistogram("ALL", s);
return HoodiePrintHelper.print(
new String[] {"CommitTime", "Min", "10th", "50th", "avg", "95th", "Max", "NumFiles", "StdDev"}, rows);
}
return HoodiePrintHelper.print(
new String[]{"CommitTime", "Min", "10th", "50th", "avg", "95th", "Max", "NumFiles",
"StdDev"}, rows);
}
}

View File

@@ -23,12 +23,13 @@ import org.springframework.stereotype.Component;
@Component
public class UtilsCommand implements CommandMarker {
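  // Resolves a class on the CLI classpath and prints the location it was loaded from, which is
  // handy for debugging dependency conflicts.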
@CliCommand(value = "utils loadClass", help = "Load a class" )
public String loadClass(
@CliOption(key = {"class"}, help = "Check mode" ) final String clazz
) throws Exception {
Class klass = Class.forName(clazz);
return klass.getProtectionDomain().getCodeSource().getLocation().toExternalForm();
}
@CliCommand(value = "utils loadClass", help = "Load a class")
public String loadClass(
@CliOption(key = {"class"}, help = "Check mode") final String clazz
) throws Exception {
Class klass = Class.forName(clazz);
return klass.getProtectionDomain().getCodeSource().getLocation().toExternalForm();
}
}

View File

@@ -20,21 +20,22 @@ import com.uber.hoodie.common.model.HoodieCommitMetadata;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import java.io.IOException;
import java.util.List;
public class CommitUtil {
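  // Counts the net-new records (total written minus updates) across the given commits.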
public static long countNewRecords(HoodieTableMetaClient target, List<String> commitsToCatchup)
throws IOException {
long totalNew = 0;
HoodieTimeline timeline = target.getActiveTimeline().reload().getCommitTimeline()
.filterCompletedInstants();
for (String commit : commitsToCatchup) {
HoodieCommitMetadata c = HoodieCommitMetadata.fromBytes(timeline
.getInstantDetails(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commit))
.get());
totalNew += c.fetchTotalRecordsWritten() - c.fetchTotalUpdateRecordsWritten();
}
return totalNew;
}
}

View File

@@ -17,107 +17,112 @@
package com.uber.hoodie.cli.utils;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import javax.sql.DataSource;
import org.apache.commons.dbcp.BasicDataSource;
import org.joda.time.DateTime;
public class HiveUtil {
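  // JDBC helpers that count rows in the Hive table registered for a Hoodie dataset, either over
  // the whole table or restricted to the last N date partitions.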
  private static String driverName = "org.apache.hive.jdbc.HiveDriver";

  static {
    try {
      Class.forName(driverName);
    } catch (ClassNotFoundException e) {
      throw new IllegalStateException("Could not find " + driverName + " in classpath. ", e);
    }
  }

  private static Connection connection;

  private static Connection getConnection(String jdbcUrl, String user, String pass)
      throws SQLException {
    DataSource ds = getDatasource(jdbcUrl, user, pass);
    return ds.getConnection();
  }

  private static DataSource getDatasource(String jdbcUrl, String user, String pass) {
    BasicDataSource ds = new BasicDataSource();
    ds.setDriverClassName(driverName);
    ds.setUrl(jdbcUrl);
    ds.setUsername(user);
    ds.setPassword(pass);
    return ds;
  }

  public static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String dbName,
      String user, String pass) throws SQLException {
    Connection conn = HiveUtil.getConnection(jdbcUrl, user, pass);
    ResultSet rs = null;
    Statement stmt = conn.createStatement();
    try {
      //stmt.execute("set mapred.job.queue.name=<queue_name>");
      stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat");
      stmt.execute("set hive.stats.autogather=false");
      rs = stmt.executeQuery(
          "select count(`_hoodie_commit_time`) as cnt from " + dbName + "." + source
              .getTableConfig()
              .getTableName());
      long count = -1;
      if (rs.next()) {
        count = rs.getLong("cnt");
      }
      System.out
          .println("Total records in " + source.getTableConfig().getTableName() + " is " + count);
      return count;
    } finally {
      if (rs != null) {
        rs.close();
      }
      if (stmt != null) {
        stmt.close();
      }
    }
  }

  public static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String srcDb,
      int partitions, String user, String pass) throws SQLException {
    DateTime dateTime = DateTime.now();
    String endDateStr =
        dateTime.getYear() + "-" + String.format("%02d", dateTime.getMonthOfYear()) + "-"
            + String.format("%02d", dateTime.getDayOfMonth());
    dateTime = dateTime.minusDays(partitions);
    String startDateStr =
        dateTime.getYear() + "-" + String.format("%02d", dateTime.getMonthOfYear()) + "-"
            + String.format("%02d", dateTime.getDayOfMonth());
    System.out.println("Start date " + startDateStr + " and end date " + endDateStr);
    return countRecords(jdbcUrl, source, srcDb, startDateStr, endDateStr, user, pass);
  }

  private static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String srcDb,
      String startDateStr,
      String endDateStr, String user, String pass) throws SQLException {
    Connection conn = HiveUtil.getConnection(jdbcUrl, user, pass);
    ResultSet rs = null;
    Statement stmt = conn.createStatement();
    try {
      //stmt.execute("set mapred.job.queue.name=<queue_name>");
      stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat");
      stmt.execute("set hive.stats.autogather=false");
      rs = stmt.executeQuery(
          "select count(`_hoodie_commit_time`) as cnt from " + srcDb + "." + source.getTableConfig()
              .getTableName() + " where datestr>'" + startDateStr + "' and datestr<='"
              + endDateStr + "'");
      if (rs.next()) {
        return rs.getLong("cnt");
      }
      return -1;
    } finally {
      if (rs != null) {
        rs.close();
      }
      if (stmt != null) {
        stmt.close();
      }
    }
  }
}

View File

@@ -23,34 +23,37 @@ import java.io.InputStreamReader;
import java.util.logging.Logger;
public class InputStreamConsumer extends Thread {
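  // Drains a process stream on a background thread and forwards each line to the logger, so that
  // launched Spark jobs cannot block on a full stdout/stderr pipe.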
  protected final static Logger LOG = Logger.getLogger(InputStreamConsumer.class.getName());
  private InputStream is;

  public InputStreamConsumer(InputStream is) {
    this.is = is;
  }

  @Override
  public void run() {
    try {
      InputStreamReader isr = new InputStreamReader(is);
      BufferedReader br = new BufferedReader(isr);
      String line;
      while ((line = br.readLine()) != null) {
        LOG.info(line);
      }
    } catch (IOException ioe) {
      LOG.severe(ioe.toString());
      ioe.printStackTrace();
    }
  }

  public static void captureOutput(Process p) {
    InputStreamConsumer stdout;
    InputStreamConsumer errout;
    errout = new InputStreamConsumer(p.getErrorStream());
    stdout = new InputStreamConsumer(p.getInputStream());
    errout.start();
    stdout.start();
  }
}

View File

@@ -18,59 +18,54 @@ package com.uber.hoodie.cli.utils;
import com.uber.hoodie.HoodieWriteClient;
import com.uber.hoodie.cli.commands.SparkMain;
import java.io.File;
import java.net.URISyntaxException;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.launcher.SparkLauncher;
public class SparkUtil {
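  // Helpers for launching CLI Spark jobs: initLauncher points a SparkLauncher at SparkMain and
  // ships the CLI's lib/ jars; initJavaSparkConf builds a JavaSparkContext with Kryo serialization
  // and gzip output compression configured.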
  public static Logger logger = Logger.getLogger(SparkUtil.class);
  public static final String DEFUALT_SPARK_MASTER = "yarn-client";

  /**
   * TODO: Need to fix a bunch of hardcoded stuff here eg: history server, spark distro
   */
  public static SparkLauncher initLauncher(String propertiesFile) throws URISyntaxException {
    String currentJar = new File(
        SparkUtil.class.getProtectionDomain().getCodeSource().getLocation().toURI().getPath())
        .getAbsolutePath();
    SparkLauncher sparkLauncher =
        new SparkLauncher().setAppResource(currentJar)
            .setMainClass(SparkMain.class.getName())
            .setPropertiesFile(propertiesFile);
    File libDirectory = new File(new File(currentJar).getParent(), "lib");
    for (String library : libDirectory.list()) {
      sparkLauncher.addJar(new File(libDirectory, library).getAbsolutePath());
    }
    return sparkLauncher;
  }

  public static JavaSparkContext initJavaSparkConf(String name) {
    SparkConf sparkConf = new SparkConf().setAppName(name);
    sparkConf.setMaster(DEFUALT_SPARK_MASTER);
    sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    sparkConf.set("spark.driver.maxResultSize", "2g");
    sparkConf.set("spark.eventLog.overwrite", "true");
    sparkConf.set("spark.eventLog.enabled", "true");

    // Configure hadoop conf
    sparkConf.set("spark.hadoop.mapred.output.compress", "true");
    sparkConf.set("spark.hadoop.mapred.output.compression.codec", "true");
    sparkConf.set("spark.hadoop.mapred.output.compression.codec",
        "org.apache.hadoop.io.compress.GzipCodec");
    sparkConf.set("spark.hadoop.mapred.output.compression.type", "BLOCK");

    sparkConf = HoodieWriteClient.registerClasses(sparkConf);
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    jsc.hadoopConfiguration().setBoolean("parquet.enable.summary-metadata", false);
    return jsc;
  }
}

View File

@@ -16,11 +16,11 @@
-->
<beans xmlns="http://www.springframework.org/schema/beans"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:context="http://www.springframework.org/schema/context"
xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans.xsd
http://www.springframework.org/schema/context http://www.springframework.org/schema/context/spring-context.xsd">
<context:component-scan base-package="com.uber.hoodie.cli"/>
</beans>

View File

@@ -34,11 +34,11 @@ import scala.collection.mutable._
/**
* Spark job to de-duplicate data present in a partition path
*/
class DedupeSparkJob(basePath: String,
duplicatedPartitionPath: String,
repairOutputPath: String,
sqlContext: SQLContext,
fs: FileSystem) {
val sparkHelper = new SparkHelper(sqlContext, fs)
@@ -50,8 +50,9 @@ class DedupeSparkJob (basePath: String,
* @param tblName
* @return
*/
def getDupeKeyDF(tblName: String): DataFrame = {
val dupeSql =
s"""
select `${HoodieRecord.RECORD_KEY_METADATA_FIELD}` as dupe_key,
count(*) as dupe_cnt
from ${tblName}
@@ -69,7 +70,7 @@ class DedupeSparkJob (basePath: String,
*
* @return
*/
private def planDuplicateFix(): HashMap[String, HashSet[String]] = {
val tmpTableName = s"htbl_${System.currentTimeMillis()}"
val dedupeTblName = s"${tmpTableName}_dupeKeys"
@@ -78,17 +79,18 @@ class DedupeSparkJob (basePath: String,
val allFiles = fs.listStatus(new org.apache.hadoop.fs.Path(s"${basePath}/${duplicatedPartitionPath}"))
val fsView = new HoodieTableFileSystemView(metadata, metadata.getActiveTimeline.getCommitTimeline.filterCompletedInstants(), allFiles)
val latestFiles: java.util.List[HoodieDataFile] = fsView.getLatestDataFiles().collect(Collectors.toList[HoodieDataFile]())
val filteredStatuses = latestFiles.map(f => f.getPath)
LOG.info(s" List of files under partition: ${} => ${filteredStatuses.mkString(" ")}")
val df = sqlContext.parquetFile(filteredStatuses: _*)
df.registerTempTable(tmpTableName)
val dupeKeyDF = getDupeKeyDF(tmpTableName)
dupeKeyDF.registerTempTable(dedupeTblName)
// Obtain necessary satellite information for duplicate rows
val dupeDataSql =
s"""
SELECT `_hoodie_record_key`, `_hoodie_partition_path`, `_hoodie_file_name`, `_hoodie_commit_time`
FROM ${tmpTableName} h
JOIN ${dedupeTblName} d
@@ -111,9 +113,9 @@ class DedupeSparkJob (basePath: String,
rows.foreach(r => {
val c = r(3).asInstanceOf[String].toLong
if (c != maxCommit) {
val f = r(2).asInstanceOf[String].split("_")(0)
if (!fileToDeleteKeyMap.contains(f)) {
fileToDeleteKeyMap(f) = HashSet[String]()
}
fileToDeleteKeyMap(f).add(key)
@@ -130,28 +132,30 @@ class DedupeSparkJob (basePath: String,
val allFiles = fs.listStatus(new Path(s"${basePath}/${duplicatedPartitionPath}"))
val fsView = new HoodieTableFileSystemView(metadata, metadata.getActiveTimeline.getCommitTimeline.filterCompletedInstants(), allFiles)
val latestFiles: java.util.List[HoodieDataFile] = fsView.getLatestDataFiles().collect(Collectors.toList[HoodieDataFile]())
val fileNameToPathMap = latestFiles.map(f => (f.getFileId, new Path(f.getPath))).toMap
val dupeFixPlan = planDuplicateFix()
// 1. Copy all latest files into the temp fix path
fileNameToPathMap.foreach { case (fileName, filePath) => {
val badSuffix = if (dupeFixPlan.contains(fileName)) ".bad" else ""
val dstPath = new Path(s"${repairOutputPath}/${filePath.getName}${badSuffix}")
LOG.info(s"Copying from ${filePath} to ${dstPath}")
FileUtil.copy(fs, filePath, fs, dstPath, false, true, fs.getConf)
}
}
// 2. Remove duplicates from the bad files
dupeFixPlan.foreach { case (fileName, keysToSkip) => {
val commitTime = FSUtils.getCommitTime(fileNameToPathMap(fileName).getName)
val badFilePath = new Path(s"${repairOutputPath}/${fileNameToPathMap(fileName).getName}.bad")
val newFilePath = new Path(s"${repairOutputPath}/${fileNameToPathMap(fileName).getName}")
LOG.info(" Skipping and writing new file for : " + fileName)
SparkHelpers.skipKeysAndWriteNewFile(commitTime, fs, badFilePath, newFilePath, dupeFixPlan(fileName))
fs.delete(badFilePath, false)
}
}
// 3. Check that there are no duplicates anymore.
val df = sqlContext.read.parquet(s"${repairOutputPath}/*.parquet")
@@ -186,6 +190,7 @@ class DedupeSparkJob (basePath: String,
LOG.info(s"[FOR REAL!!!] Copying from ${srcPath} to ${dstPath}")
FileUtil.copy(fs, srcPath, fs, dstPath, false, true, fs.getConf)
}
}
}
}
}

View File

@@ -17,9 +17,9 @@
package com.uber.hoodie.cli
import com.uber.hoodie.avro.HoodieAvroWriteSupport
import com.uber.hoodie.common.model.HoodieRecord
import com.uber.hoodie.common.util.ParquetUtils
import com.uber.hoodie.common.{BloomFilter, HoodieJsonPayload}
import com.uber.hoodie.config.{HoodieIndexConfig, HoodieStorageConfig}
import com.uber.hoodie.io.storage.{HoodieParquetConfig, HoodieParquetWriter}
import org.apache.avro.Schema
@@ -107,7 +107,7 @@ class SparkHelper(sqlContext: SQLContext, fs: FileSystem) {
* @param file
* @param sqlContext
*/
def getKeyCount(file: String, sqlContext: org.apache.spark.sql.SQLContext) = {
println(getRowKeyDF(file).collect().size)
}
@@ -122,7 +122,7 @@ class SparkHelper(sqlContext: SQLContext, fs: FileSystem) {
* @param file
* @return
*/
def fileKeysAgainstBF(conf: Configuration, sqlContext: SQLContext, file: String): Boolean = {
val bfStr = SparkHelpers.getBloomFilter(file, conf)
val bf = new com.uber.hoodie.common.BloomFilter(bfStr)
val foundCount = sqlContext.parquetFile(file)
@@ -134,7 +134,7 @@ class SparkHelper(sqlContext: SQLContext, fs: FileSystem) {
totalCount == foundCount
}
def getDistinctKeyDF(paths: List[String]): DataFrame = {
sqlContext.read.parquet(paths: _*).select(s"`${HoodieRecord.RECORD_KEY_METADATA_FIELD}`").distinct()
}
}