Reformatting code per Google Code Style all over
This commit is contained in:
committed by
vinoth chandar
parent
5a62480a92
commit
e45679f5e2
@@ -17,38 +17,38 @@
|
||||
package com.uber.hoodie.cli;
|
||||
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import java.io.IOException;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public class HoodieCLI {
|
||||
public static Configuration conf;
|
||||
public static FileSystem fs;
|
||||
public static CLIState state = CLIState.INIT;
|
||||
public static HoodieTableMetaClient tableMetadata;
|
||||
public static HoodieTableMetaClient syncTableMetadata;
|
||||
|
||||
public static Configuration conf;
|
||||
public static FileSystem fs;
|
||||
public static CLIState state = CLIState.INIT;
|
||||
public static HoodieTableMetaClient tableMetadata;
|
||||
public static HoodieTableMetaClient syncTableMetadata;
|
||||
|
||||
|
||||
public enum CLIState {
|
||||
INIT, DATASET, SYNC
|
||||
public enum CLIState {
|
||||
INIT, DATASET, SYNC
|
||||
}
|
||||
|
||||
public static boolean initConf() {
|
||||
if (HoodieCLI.conf == null) {
|
||||
HoodieCLI.conf = new Configuration();
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public static boolean initConf() {
|
||||
if (HoodieCLI.conf == null) {
|
||||
HoodieCLI.conf = new Configuration();
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
public static void initFS(boolean force) throws IOException {
|
||||
if (fs == null || force) {
|
||||
fs = FileSystem.get(conf);
|
||||
}
|
||||
}
|
||||
|
||||
public static void initFS(boolean force) throws IOException {
|
||||
if(fs == null || force) {
|
||||
fs = FileSystem.get(conf);
|
||||
}
|
||||
}
|
||||
|
||||
public static void setTableMetadata(HoodieTableMetaClient tableMetadata) {
|
||||
HoodieCLI.tableMetadata = tableMetadata;
|
||||
}
|
||||
public static void setTableMetadata(HoodieTableMetaClient tableMetadata) {
|
||||
HoodieCLI.tableMetadata = tableMetadata;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -25,13 +25,13 @@ import org.springframework.stereotype.Component;
|
||||
@Order(Ordered.HIGHEST_PRECEDENCE)
|
||||
public class HoodieHistoryFileNameProvider extends DefaultHistoryFileNameProvider {
|
||||
|
||||
public String getHistoryFileName() {
|
||||
return "hoodie-cmd.log";
|
||||
}
|
||||
public String getHistoryFileName() {
|
||||
return "hoodie-cmd.log";
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getProviderName() {
|
||||
return "Hoodie file name provider";
|
||||
}
|
||||
@Override
|
||||
public String getProviderName() {
|
||||
return "Hoodie file name provider";
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -17,18 +17,17 @@
|
||||
package com.uber.hoodie.cli;
|
||||
|
||||
import dnl.utils.text.table.TextTable;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.PrintStream;
|
||||
import java.nio.charset.Charset;
|
||||
|
||||
public class HoodiePrintHelper {
|
||||
|
||||
public static String print(String[] header, String[][] rows) {
|
||||
TextTable textTable = new TextTable(header, rows);
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
PrintStream ps = new PrintStream(baos);
|
||||
textTable.printTable(ps, 4);
|
||||
return new String(baos.toByteArray(), Charset.forName("utf-8"));
|
||||
}
|
||||
public static String print(String[] header, String[][] rows) {
|
||||
TextTable textTable = new TextTable(header, rows);
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
PrintStream ps = new PrintStream(baos);
|
||||
textTable.printTable(ps, 4);
|
||||
return new String(baos.toByteArray(), Charset.forName("utf-8"));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,7 +16,6 @@
|
||||
|
||||
package com.uber.hoodie.cli;
|
||||
|
||||
import com.uber.hoodie.common.table.HoodieTableConfig;
|
||||
import org.springframework.core.Ordered;
|
||||
import org.springframework.core.annotation.Order;
|
||||
import org.springframework.shell.plugin.support.DefaultPromptProvider;
|
||||
@@ -26,27 +25,27 @@ import org.springframework.stereotype.Component;
|
||||
@Order(Ordered.HIGHEST_PRECEDENCE)
|
||||
public class HoodiePrompt extends DefaultPromptProvider {
|
||||
|
||||
@Override
|
||||
public String getPrompt() {
|
||||
if (HoodieCLI.tableMetadata != null) {
|
||||
String tableName = HoodieCLI.tableMetadata.getTableConfig().getTableName();
|
||||
switch (HoodieCLI.state) {
|
||||
case INIT:
|
||||
return "hoodie->";
|
||||
case DATASET:
|
||||
return "hoodie:" + tableName + "->";
|
||||
case SYNC:
|
||||
return "hoodie:" + tableName + " <==> "
|
||||
+ HoodieCLI.syncTableMetadata.getTableConfig().getTableName() + "->";
|
||||
}
|
||||
return "hoodie:" + tableName + "->";
|
||||
}
|
||||
return "hoodie->";
|
||||
@Override
|
||||
public String getPrompt() {
|
||||
if (HoodieCLI.tableMetadata != null) {
|
||||
String tableName = HoodieCLI.tableMetadata.getTableConfig().getTableName();
|
||||
switch (HoodieCLI.state) {
|
||||
case INIT:
|
||||
return "hoodie->";
|
||||
case DATASET:
|
||||
return "hoodie:" + tableName + "->";
|
||||
case SYNC:
|
||||
return "hoodie:" + tableName + " <==> "
|
||||
+ HoodieCLI.syncTableMetadata.getTableConfig().getTableName() + "->";
|
||||
}
|
||||
return "hoodie:" + tableName + "->";
|
||||
}
|
||||
return "hoodie->";
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getProviderName() {
|
||||
return "Hoodie provider";
|
||||
}
|
||||
@Override
|
||||
public String getProviderName() {
|
||||
return "Hoodie provider";
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -22,34 +22,39 @@ import org.springframework.shell.plugin.support.DefaultBannerProvider;
|
||||
import org.springframework.shell.support.util.OsUtils;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
@Component @Order(Ordered.HIGHEST_PRECEDENCE) public class HoodieSplashScreen
|
||||
@Component
|
||||
@Order(Ordered.HIGHEST_PRECEDENCE)
|
||||
public class HoodieSplashScreen
|
||||
extends DefaultBannerProvider {
|
||||
private static String screen = "============================================" + OsUtils.LINE_SEPARATOR +
|
||||
"* *" + OsUtils.LINE_SEPARATOR +
|
||||
"* _ _ _ _ *" + OsUtils.LINE_SEPARATOR +
|
||||
"* | | | | | (_) *" + OsUtils.LINE_SEPARATOR +
|
||||
"* | |__| | ___ ___ __| |_ ___ *" + OsUtils.LINE_SEPARATOR +
|
||||
"* | __ |/ _ \\ / _ \\ / _` | |/ _ \\ *" +
|
||||
OsUtils.LINE_SEPARATOR +
|
||||
"* | | | | (_) | (_) | (_| | | __/ *" + OsUtils.LINE_SEPARATOR +
|
||||
"* |_| |_|\\___/ \\___/ \\__,_|_|\\___| *" +
|
||||
OsUtils.LINE_SEPARATOR +
|
||||
"* *" + OsUtils.LINE_SEPARATOR +
|
||||
"============================================" + OsUtils.LINE_SEPARATOR;
|
||||
|
||||
public String getBanner() {
|
||||
return screen;
|
||||
}
|
||||
private static String screen =
|
||||
"============================================" + OsUtils.LINE_SEPARATOR +
|
||||
"* *" + OsUtils.LINE_SEPARATOR +
|
||||
"* _ _ _ _ *" + OsUtils.LINE_SEPARATOR +
|
||||
"* | | | | | (_) *" + OsUtils.LINE_SEPARATOR +
|
||||
"* | |__| | ___ ___ __| |_ ___ *" + OsUtils.LINE_SEPARATOR +
|
||||
"* | __ |/ _ \\ / _ \\ / _` | |/ _ \\ *" +
|
||||
OsUtils.LINE_SEPARATOR +
|
||||
"* | | | | (_) | (_) | (_| | | __/ *" + OsUtils.LINE_SEPARATOR +
|
||||
"* |_| |_|\\___/ \\___/ \\__,_|_|\\___| *" +
|
||||
OsUtils.LINE_SEPARATOR +
|
||||
"* *" + OsUtils.LINE_SEPARATOR +
|
||||
"============================================" + OsUtils.LINE_SEPARATOR;
|
||||
|
||||
public String getVersion() {
|
||||
return "1.0";
|
||||
}
|
||||
public String getBanner() {
|
||||
return screen;
|
||||
}
|
||||
|
||||
public String getWelcomeMessage() {
|
||||
return "Welcome to Hoodie CLI. Please type help if you are looking for help. ";
|
||||
}
|
||||
public String getVersion() {
|
||||
return "1.0";
|
||||
}
|
||||
|
||||
@Override public String getProviderName() {
|
||||
return "Hoodie Banner";
|
||||
}
|
||||
public String getWelcomeMessage() {
|
||||
return "Welcome to Hoodie CLI. Please type help if you are looking for help. ";
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getProviderName() {
|
||||
return "Hoodie Banner";
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,18 +16,16 @@
|
||||
|
||||
package com.uber.hoodie.cli;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.springframework.shell.Bootstrap;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public class Main {
|
||||
/**
|
||||
* Main class that delegates to Spring Shell's Bootstrap class in order to simplify debugging inside an IDE
|
||||
*
|
||||
* @param args
|
||||
* @throws IOException
|
||||
*/
|
||||
public static void main(String[] args) throws IOException {
|
||||
Bootstrap.main(args);
|
||||
}
|
||||
|
||||
/**
|
||||
* Main class that delegates to Spring Shell's Bootstrap class in order to simplify debugging
|
||||
* inside an IDE
|
||||
*/
|
||||
public static void main(String[] args) throws IOException {
|
||||
Bootstrap.main(args);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -24,6 +24,10 @@ import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
import com.uber.hoodie.common.table.log.HoodieLogFormat;
|
||||
import com.uber.hoodie.common.table.log.block.HoodieAvroDataBlock;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.avro.generic.IndexedRecord;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
@@ -34,90 +38,90 @@ import org.springframework.shell.core.annotation.CliCommand;
|
||||
import org.springframework.shell.core.annotation.CliOption;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Component
|
||||
public class ArchivedCommitsCommand implements CommandMarker {
|
||||
|
||||
@CliAvailabilityIndicator({"show archived commits"})
|
||||
public boolean isShowArchivedCommitAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
@CliAvailabilityIndicator({"show archived commits"})
|
||||
public boolean isShowArchivedCommitAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
}
|
||||
|
||||
@CliCommand(value = "show archived commits", help = "Read commits from archived files and show details")
|
||||
public String showCommits(
|
||||
@CliOption(key = {
|
||||
"limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10")
|
||||
final Integer limit) throws IOException {
|
||||
|
||||
System.out
|
||||
.println("===============> Showing only " + limit + " archived commits <===============");
|
||||
FileStatus[] fsStatuses = FSUtils.getFs().globStatus(
|
||||
new Path(HoodieCLI.tableMetadata.getBasePath() + "/.hoodie/.commits_.archive*"));
|
||||
List<String[]> allCommits = new ArrayList<>();
|
||||
for (FileStatus fs : fsStatuses) {
|
||||
//read the archived file
|
||||
HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(),
|
||||
new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema(), false);
|
||||
|
||||
List<IndexedRecord> readRecords = new ArrayList<>();
|
||||
//read the avro blocks
|
||||
while (reader.hasNext()) {
|
||||
HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next();
|
||||
List<IndexedRecord> records = blk.getRecords();
|
||||
readRecords.addAll(records);
|
||||
}
|
||||
List<String[]> readCommits = readRecords.stream().map(r -> (GenericRecord) r)
|
||||
.map(r -> readCommit(r)).limit(limit).collect(Collectors.toList());
|
||||
allCommits.addAll(readCommits);
|
||||
}
|
||||
return HoodiePrintHelper.print(
|
||||
new String[]{"CommitTime", "CommitType", "CommitDetails"},
|
||||
allCommits.toArray(new String[allCommits.size()][]));
|
||||
}
|
||||
|
||||
@CliCommand(value = "show archived commits", help = "Read commits from archived files and show details")
|
||||
public String showCommits(
|
||||
@CliOption(key = {"limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10")
|
||||
final Integer limit) throws IOException {
|
||||
|
||||
System.out.println("===============> Showing only " + limit + " archived commits <===============");
|
||||
FileStatus [] fsStatuses = FSUtils.getFs().globStatus(new Path(HoodieCLI.tableMetadata.getBasePath() + "/.hoodie/.commits_.archive*"));
|
||||
List<String[]> allCommits = new ArrayList<>();
|
||||
for(FileStatus fs : fsStatuses) {
|
||||
//read the archived file
|
||||
HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(),
|
||||
new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema(), false);
|
||||
|
||||
List<IndexedRecord> readRecords = new ArrayList<>();
|
||||
//read the avro blocks
|
||||
while (reader.hasNext()) {
|
||||
HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next();
|
||||
List<IndexedRecord> records = blk.getRecords();
|
||||
readRecords.addAll(records);
|
||||
}
|
||||
List<String[]> readCommits = readRecords.stream().map(r -> (GenericRecord)r).map(r -> readCommit(r)).limit(limit).collect(Collectors.toList());
|
||||
allCommits.addAll(readCommits);
|
||||
private String[] readCommit(GenericRecord record) {
|
||||
List<String> commitDetails = new ArrayList<>();
|
||||
try {
|
||||
switch (record.get("actionType").toString()) {
|
||||
case HoodieTimeline.CLEAN_ACTION: {
|
||||
commitDetails.add(record.get("commitTime").toString());
|
||||
commitDetails.add(record.get("actionType").toString());
|
||||
commitDetails.add(record.get("hoodieCleanMetadata").toString());
|
||||
break;
|
||||
}
|
||||
return HoodiePrintHelper.print(
|
||||
new String[] {"CommitTime", "CommitType", "CommitDetails"}, allCommits.toArray(new String[allCommits.size()][]));
|
||||
}
|
||||
|
||||
private String[] readCommit(GenericRecord record) {
|
||||
List<String> commitDetails = new ArrayList<>();
|
||||
try {
|
||||
switch (record.get("actionType").toString()) {
|
||||
case HoodieTimeline.CLEAN_ACTION: {
|
||||
commitDetails.add(record.get("commitTime").toString());
|
||||
commitDetails.add(record.get("actionType").toString());
|
||||
commitDetails.add(record.get("hoodieCleanMetadata").toString());
|
||||
break;
|
||||
}
|
||||
case HoodieTimeline.COMMIT_ACTION: {
|
||||
commitDetails.add(record.get("commitTime").toString());
|
||||
commitDetails.add(record.get("actionType").toString());
|
||||
commitDetails.add(record.get("hoodieCommitMetadata").toString());
|
||||
break;
|
||||
}
|
||||
case HoodieTimeline.COMPACTION_ACTION: {
|
||||
commitDetails.add(record.get("commitTime").toString());
|
||||
commitDetails.add(record.get("actionType").toString());
|
||||
commitDetails.add(record.get("hoodieCompactionMetadata").toString());
|
||||
break;
|
||||
}
|
||||
case HoodieTimeline.DELTA_COMMIT_ACTION: {
|
||||
commitDetails.add(record.get("commitTime").toString());
|
||||
commitDetails.add(record.get("actionType").toString());
|
||||
commitDetails.add(record.get("hoodieCommitMetadata").toString());
|
||||
break;
|
||||
}
|
||||
case HoodieTimeline.ROLLBACK_ACTION: {
|
||||
commitDetails.add(record.get("commitTime").toString());
|
||||
commitDetails.add(record.get("actionType").toString());
|
||||
commitDetails.add(record.get("hoodieRollbackMetadata").toString());
|
||||
break;
|
||||
}
|
||||
case HoodieTimeline.SAVEPOINT_ACTION: {
|
||||
commitDetails.add(record.get("commitTime").toString());
|
||||
commitDetails.add(record.get("actionType").toString());
|
||||
commitDetails.add(record.get("hoodieSavePointMetadata").toString());
|
||||
break;
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
case HoodieTimeline.COMMIT_ACTION: {
|
||||
commitDetails.add(record.get("commitTime").toString());
|
||||
commitDetails.add(record.get("actionType").toString());
|
||||
commitDetails.add(record.get("hoodieCommitMetadata").toString());
|
||||
break;
|
||||
}
|
||||
return commitDetails.toArray(new String[commitDetails.size()]);
|
||||
case HoodieTimeline.COMPACTION_ACTION: {
|
||||
commitDetails.add(record.get("commitTime").toString());
|
||||
commitDetails.add(record.get("actionType").toString());
|
||||
commitDetails.add(record.get("hoodieCompactionMetadata").toString());
|
||||
break;
|
||||
}
|
||||
case HoodieTimeline.DELTA_COMMIT_ACTION: {
|
||||
commitDetails.add(record.get("commitTime").toString());
|
||||
commitDetails.add(record.get("actionType").toString());
|
||||
commitDetails.add(record.get("hoodieCommitMetadata").toString());
|
||||
break;
|
||||
}
|
||||
case HoodieTimeline.ROLLBACK_ACTION: {
|
||||
commitDetails.add(record.get("commitTime").toString());
|
||||
commitDetails.add(record.get("actionType").toString());
|
||||
commitDetails.add(record.get("hoodieRollbackMetadata").toString());
|
||||
break;
|
||||
}
|
||||
case HoodieTimeline.SAVEPOINT_ACTION: {
|
||||
commitDetails.add(record.get("commitTime").toString());
|
||||
commitDetails.add(record.get("actionType").toString());
|
||||
commitDetails.add(record.get("hoodieSavePointMetadata").toString());
|
||||
break;
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
return commitDetails.toArray(new String[commitDetails.size()]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -24,89 +24,90 @@ import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
|
||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||
import com.uber.hoodie.common.util.AvroUtils;
|
||||
import org.springframework.shell.core.CommandMarker;
|
||||
import org.springframework.shell.core.annotation.CliAvailabilityIndicator;
|
||||
import org.springframework.shell.core.annotation.CliCommand;
|
||||
import org.springframework.shell.core.annotation.CliOption;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
import org.springframework.shell.core.CommandMarker;
|
||||
import org.springframework.shell.core.annotation.CliAvailabilityIndicator;
|
||||
import org.springframework.shell.core.annotation.CliCommand;
|
||||
import org.springframework.shell.core.annotation.CliOption;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
@Component
|
||||
public class CleansCommand implements CommandMarker {
|
||||
@CliAvailabilityIndicator({"cleans show"})
|
||||
public boolean isShowAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
}
|
||||
|
||||
@CliAvailabilityIndicator({"cleans refresh"})
|
||||
public boolean isRefreshAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
}
|
||||
@CliAvailabilityIndicator({"cleans show"})
|
||||
public boolean isShowAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
}
|
||||
|
||||
@CliAvailabilityIndicator({"clean showpartitions"})
|
||||
public boolean isCommitShowAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
}
|
||||
@CliAvailabilityIndicator({"cleans refresh"})
|
||||
public boolean isRefreshAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
}
|
||||
|
||||
@CliCommand(value = "cleans show", help = "Show the cleans")
|
||||
public String showCleans() throws IOException {
|
||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||
HoodieTimeline timeline = activeTimeline.getCleanerTimeline().filterCompletedInstants();
|
||||
List<HoodieInstant> cleans = timeline.getInstants().collect(Collectors.toList());
|
||||
String[][] rows = new String[cleans.size()][];
|
||||
Collections.reverse(cleans);
|
||||
for (int i = 0; i < cleans.size(); i++) {
|
||||
HoodieInstant clean = cleans.get(i);
|
||||
HoodieCleanMetadata cleanMetadata =
|
||||
AvroUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(clean).get());
|
||||
rows[i] = new String[] {clean.getTimestamp(), cleanMetadata.getEarliestCommitToRetain(),
|
||||
String.valueOf(cleanMetadata.getTotalFilesDeleted()),
|
||||
String.valueOf(cleanMetadata.getTimeTakenInMillis())};
|
||||
}
|
||||
return HoodiePrintHelper.print(
|
||||
new String[] {"CleanTime", "EarliestCommandRetained", "Total Files Deleted",
|
||||
"Total Time Taken"}, rows);
|
||||
}
|
||||
@CliAvailabilityIndicator({"clean showpartitions"})
|
||||
public boolean isCommitShowAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
}
|
||||
|
||||
@CliCommand(value = "cleans refresh", help = "Refresh the commits")
|
||||
public String refreshCleans() throws IOException {
|
||||
HoodieTableMetaClient metadata =
|
||||
new HoodieTableMetaClient(HoodieCLI.fs, HoodieCLI.tableMetadata.getBasePath());
|
||||
HoodieCLI.setTableMetadata(metadata);
|
||||
return "Metadata for table " + metadata.getTableConfig().getTableName() + " refreshed.";
|
||||
@CliCommand(value = "cleans show", help = "Show the cleans")
|
||||
public String showCleans() throws IOException {
|
||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||
HoodieTimeline timeline = activeTimeline.getCleanerTimeline().filterCompletedInstants();
|
||||
List<HoodieInstant> cleans = timeline.getInstants().collect(Collectors.toList());
|
||||
String[][] rows = new String[cleans.size()][];
|
||||
Collections.reverse(cleans);
|
||||
for (int i = 0; i < cleans.size(); i++) {
|
||||
HoodieInstant clean = cleans.get(i);
|
||||
HoodieCleanMetadata cleanMetadata =
|
||||
AvroUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(clean).get());
|
||||
rows[i] = new String[]{clean.getTimestamp(), cleanMetadata.getEarliestCommitToRetain(),
|
||||
String.valueOf(cleanMetadata.getTotalFilesDeleted()),
|
||||
String.valueOf(cleanMetadata.getTimeTakenInMillis())};
|
||||
}
|
||||
return HoodiePrintHelper.print(
|
||||
new String[]{"CleanTime", "EarliestCommandRetained", "Total Files Deleted",
|
||||
"Total Time Taken"}, rows);
|
||||
}
|
||||
|
||||
@CliCommand(value = "clean showpartitions", help = "Show partition level details of a clean")
|
||||
public String showCleanPartitions(
|
||||
@CliOption(key = {"clean"}, help = "clean to show")
|
||||
final String commitTime) throws Exception {
|
||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||
HoodieTimeline timeline = activeTimeline.getCleanerTimeline().filterCompletedInstants();
|
||||
HoodieInstant cleanInstant =
|
||||
new HoodieInstant(false, HoodieTimeline.CLEAN_ACTION, commitTime);
|
||||
@CliCommand(value = "cleans refresh", help = "Refresh the commits")
|
||||
public String refreshCleans() throws IOException {
|
||||
HoodieTableMetaClient metadata =
|
||||
new HoodieTableMetaClient(HoodieCLI.fs, HoodieCLI.tableMetadata.getBasePath());
|
||||
HoodieCLI.setTableMetadata(metadata);
|
||||
return "Metadata for table " + metadata.getTableConfig().getTableName() + " refreshed.";
|
||||
}
|
||||
|
||||
if (!timeline.containsInstant(cleanInstant)) {
|
||||
return "Clean " + commitTime + " not found in metadata " + timeline;
|
||||
}
|
||||
HoodieCleanMetadata cleanMetadata =
|
||||
AvroUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(cleanInstant).get());
|
||||
List<String[]> rows = new ArrayList<>();
|
||||
for (Map.Entry<String, HoodieCleanPartitionMetadata> entry : cleanMetadata.getPartitionMetadata().entrySet()) {
|
||||
String path = entry.getKey();
|
||||
HoodieCleanPartitionMetadata stats = entry.getValue();
|
||||
String policy = stats.getPolicy();
|
||||
String totalSuccessDeletedFiles = String.valueOf(stats.getSuccessDeleteFiles().size());
|
||||
String totalFailedDeletedFiles = String.valueOf(stats.getFailedDeleteFiles().size());
|
||||
rows.add(new String[] {path, policy, totalSuccessDeletedFiles, totalFailedDeletedFiles});
|
||||
}
|
||||
return HoodiePrintHelper.print(
|
||||
new String[] {"Partition Path", "Cleaning policy", "Total Files Successfully Deleted",
|
||||
"Total Failed Deletions"}, rows.toArray(new String[rows.size()][]));
|
||||
@CliCommand(value = "clean showpartitions", help = "Show partition level details of a clean")
|
||||
public String showCleanPartitions(
|
||||
@CliOption(key = {"clean"}, help = "clean to show")
|
||||
final String commitTime) throws Exception {
|
||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||
HoodieTimeline timeline = activeTimeline.getCleanerTimeline().filterCompletedInstants();
|
||||
HoodieInstant cleanInstant =
|
||||
new HoodieInstant(false, HoodieTimeline.CLEAN_ACTION, commitTime);
|
||||
|
||||
if (!timeline.containsInstant(cleanInstant)) {
|
||||
return "Clean " + commitTime + " not found in metadata " + timeline;
|
||||
}
|
||||
HoodieCleanMetadata cleanMetadata =
|
||||
AvroUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(cleanInstant).get());
|
||||
List<String[]> rows = new ArrayList<>();
|
||||
for (Map.Entry<String, HoodieCleanPartitionMetadata> entry : cleanMetadata
|
||||
.getPartitionMetadata().entrySet()) {
|
||||
String path = entry.getKey();
|
||||
HoodieCleanPartitionMetadata stats = entry.getValue();
|
||||
String policy = stats.getPolicy();
|
||||
String totalSuccessDeletedFiles = String.valueOf(stats.getSuccessDeleteFiles().size());
|
||||
String totalFailedDeletedFiles = String.valueOf(stats.getFailedDeleteFiles().size());
|
||||
rows.add(new String[]{path, policy, totalSuccessDeletedFiles, totalFailedDeletedFiles});
|
||||
}
|
||||
return HoodiePrintHelper.print(
|
||||
new String[]{"Partition Path", "Cleaning policy", "Total Files Successfully Deleted",
|
||||
"Total Failed Deletions"}, rows.toArray(new String[rows.size()][]));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,7 +27,12 @@ import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
|
||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||
import com.uber.hoodie.common.util.NumericUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.spark.launcher.SparkLauncher;
|
||||
import org.springframework.shell.core.CommandMarker;
|
||||
import org.springframework.shell.core.annotation.CliAvailabilityIndicator;
|
||||
@@ -35,228 +40,236 @@ import org.springframework.shell.core.annotation.CliCommand;
|
||||
import org.springframework.shell.core.annotation.CliOption;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Component
|
||||
public class CommitsCommand implements CommandMarker {
|
||||
@CliAvailabilityIndicator({"commits show"})
|
||||
public boolean isShowAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
|
||||
@CliAvailabilityIndicator({"commits show"})
|
||||
public boolean isShowAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
}
|
||||
|
||||
@CliAvailabilityIndicator({"commits refresh"})
|
||||
public boolean isRefreshAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
}
|
||||
|
||||
@CliAvailabilityIndicator({"commit rollback"})
|
||||
public boolean isRollbackAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
}
|
||||
|
||||
@CliAvailabilityIndicator({"commit show"})
|
||||
public boolean isCommitShowAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
}
|
||||
|
||||
@CliCommand(value = "commits show", help = "Show the commits")
|
||||
public String showCommits(
|
||||
@CliOption(key = {
|
||||
"limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10")
|
||||
final Integer limit) throws IOException {
|
||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||
HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline()
|
||||
.filterCompletedInstants();
|
||||
List<HoodieInstant> commits = timeline.getInstants().collect(Collectors.toList());
|
||||
String[][] rows = new String[commits.size()][];
|
||||
Collections.reverse(commits);
|
||||
for (int i = 0; i < commits.size(); i++) {
|
||||
HoodieInstant commit = commits.get(i);
|
||||
HoodieCommitMetadata commitMetadata =
|
||||
HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(commit).get());
|
||||
rows[i] = new String[]{commit.getTimestamp(),
|
||||
NumericUtils.humanReadableByteCount(commitMetadata.fetchTotalBytesWritten()),
|
||||
String.valueOf(commitMetadata.fetchTotalFilesInsert()),
|
||||
String.valueOf(commitMetadata.fetchTotalFilesUpdated()),
|
||||
String.valueOf(commitMetadata.fetchTotalPartitionsWritten()),
|
||||
String.valueOf(commitMetadata.fetchTotalRecordsWritten()),
|
||||
String.valueOf(commitMetadata.fetchTotalUpdateRecordsWritten()),
|
||||
String.valueOf(commitMetadata.fetchTotalWriteErrors())};
|
||||
}
|
||||
return HoodiePrintHelper.print(
|
||||
new String[]{"CommitTime", "Total Written (B)", "Total Files Added",
|
||||
"Total Files Updated", "Total Partitions Written", "Total Records Written",
|
||||
"Total Update Records Written", "Total Errors"}, rows);
|
||||
}
|
||||
|
||||
@CliCommand(value = "commits refresh", help = "Refresh the commits")
|
||||
public String refreshCommits() throws IOException {
|
||||
HoodieTableMetaClient metadata =
|
||||
new HoodieTableMetaClient(HoodieCLI.fs, HoodieCLI.tableMetadata.getBasePath());
|
||||
HoodieCLI.setTableMetadata(metadata);
|
||||
return "Metadata for table " + metadata.getTableConfig().getTableName() + " refreshed.";
|
||||
}
|
||||
|
||||
@CliCommand(value = "commit rollback", help = "Rollback a commit")
|
||||
public String rollbackCommit(
|
||||
@CliOption(key = {"commit"}, help = "Commit to rollback")
|
||||
final String commitTime,
|
||||
@CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path")
|
||||
final String sparkPropertiesPath) throws Exception {
|
||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||
HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline()
|
||||
.filterCompletedInstants();
|
||||
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION,
|
||||
commitTime);
|
||||
|
||||
if (!timeline.containsInstant(commitInstant)) {
|
||||
return "Commit " + commitTime + " not found in Commits " + timeline;
|
||||
}
|
||||
|
||||
@CliAvailabilityIndicator({"commits refresh"})
|
||||
public boolean isRefreshAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
||||
sparkLauncher.addAppArgs(SparkMain.SparkCommand.ROLLBACK.toString(),
|
||||
commitTime,
|
||||
HoodieCLI.tableMetadata.getBasePath());
|
||||
Process process = sparkLauncher.launch();
|
||||
InputStreamConsumer.captureOutput(process);
|
||||
int exitCode = process.waitFor();
|
||||
// Refresh the current
|
||||
refreshCommits();
|
||||
if (exitCode != 0) {
|
||||
return "Commit " + commitTime + " failed to roll back";
|
||||
}
|
||||
return "Commit " + commitTime + " rolled back";
|
||||
}
|
||||
|
||||
@CliAvailabilityIndicator({"commit rollback"})
|
||||
public boolean isRollbackAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
@CliCommand(value = "commit showpartitions", help = "Show partition level details of a commit")
|
||||
public String showCommitPartitions(
|
||||
@CliOption(key = {"commit"}, help = "Commit to show")
|
||||
final String commitTime) throws Exception {
|
||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||
HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline()
|
||||
.filterCompletedInstants();
|
||||
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION,
|
||||
commitTime);
|
||||
|
||||
if (!timeline.containsInstant(commitInstant)) {
|
||||
return "Commit " + commitTime + " not found in Commits " + timeline;
|
||||
}
|
||||
|
||||
@CliAvailabilityIndicator({"commit show"})
|
||||
public boolean isCommitShowAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
}
|
||||
|
||||
@CliCommand(value = "commits show", help = "Show the commits")
|
||||
public String showCommits(
|
||||
@CliOption(key = {
|
||||
"limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10")
|
||||
final Integer limit) throws IOException {
|
||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||
HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline().filterCompletedInstants();
|
||||
List<HoodieInstant> commits = timeline.getInstants().collect(Collectors.toList());
|
||||
String[][] rows = new String[commits.size()][];
|
||||
Collections.reverse(commits);
|
||||
for (int i = 0; i < commits.size(); i++) {
|
||||
HoodieInstant commit = commits.get(i);
|
||||
HoodieCommitMetadata commitMetadata =
|
||||
HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(commit).get());
|
||||
rows[i] = new String[] {commit.getTimestamp(),
|
||||
NumericUtils.humanReadableByteCount(commitMetadata.fetchTotalBytesWritten()),
|
||||
String.valueOf(commitMetadata.fetchTotalFilesInsert()),
|
||||
String.valueOf(commitMetadata.fetchTotalFilesUpdated()),
|
||||
String.valueOf(commitMetadata.fetchTotalPartitionsWritten()),
|
||||
String.valueOf(commitMetadata.fetchTotalRecordsWritten()),
|
||||
String.valueOf(commitMetadata.fetchTotalUpdateRecordsWritten()),
|
||||
String.valueOf(commitMetadata.fetchTotalWriteErrors())};
|
||||
}
|
||||
return HoodiePrintHelper.print(
|
||||
new String[] {"CommitTime", "Total Written (B)", "Total Files Added",
|
||||
"Total Files Updated", "Total Partitions Written", "Total Records Written",
|
||||
"Total Update Records Written", "Total Errors"}, rows);
|
||||
}
|
||||
|
||||
@CliCommand(value = "commits refresh", help = "Refresh the commits")
|
||||
public String refreshCommits() throws IOException {
|
||||
HoodieTableMetaClient metadata =
|
||||
new HoodieTableMetaClient(HoodieCLI.fs, HoodieCLI.tableMetadata.getBasePath());
|
||||
HoodieCLI.setTableMetadata(metadata);
|
||||
return "Metadata for table " + metadata.getTableConfig().getTableName() + " refreshed.";
|
||||
}
|
||||
|
||||
@CliCommand(value = "commit rollback", help = "Rollback a commit")
|
||||
public String rollbackCommit(
|
||||
@CliOption(key = {"commit"}, help = "Commit to rollback")
|
||||
final String commitTime,
|
||||
@CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path")
|
||||
final String sparkPropertiesPath) throws Exception {
|
||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||
HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline().filterCompletedInstants();
|
||||
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
|
||||
|
||||
if (!timeline.containsInstant(commitInstant)) {
|
||||
return "Commit " + commitTime + " not found in Commits " + timeline;
|
||||
}
|
||||
|
||||
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
||||
sparkLauncher.addAppArgs(SparkMain.SparkCommand.ROLLBACK.toString(),
|
||||
commitTime,
|
||||
HoodieCLI.tableMetadata.getBasePath());
|
||||
Process process = sparkLauncher.launch();
|
||||
InputStreamConsumer.captureOutput(process);
|
||||
int exitCode = process.waitFor();
|
||||
// Refresh the current
|
||||
refreshCommits();
|
||||
if (exitCode != 0) {
|
||||
return "Commit " + commitTime + " failed to roll back";
|
||||
}
|
||||
return "Commit " + commitTime + " rolled back";
|
||||
}
|
||||
|
||||
@CliCommand(value = "commit showpartitions", help = "Show partition level details of a commit")
|
||||
public String showCommitPartitions(
|
||||
@CliOption(key = {"commit"}, help = "Commit to show")
|
||||
final String commitTime) throws Exception {
|
||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||
HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline().filterCompletedInstants();
|
||||
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
|
||||
|
||||
if (!timeline.containsInstant(commitInstant)) {
|
||||
return "Commit " + commitTime + " not found in Commits " + timeline;
|
||||
}
|
||||
HoodieCommitMetadata meta =
|
||||
HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(commitInstant).get());
|
||||
List<String[]> rows = new ArrayList<String[]>();
|
||||
for (Map.Entry<String, List<HoodieWriteStat>> entry : meta.getPartitionToWriteStats()
|
||||
.entrySet()) {
|
||||
String path = entry.getKey();
|
||||
List<HoodieWriteStat> stats = entry.getValue();
|
||||
long totalFilesAdded = 0;
|
||||
long totalFilesUpdated = 0;
|
||||
long totalRecordsUpdated = 0;
|
||||
long totalRecordsInserted = 0;
|
||||
long totalBytesWritten = 0;
|
||||
long totalWriteErrors = 0;
|
||||
for (HoodieWriteStat stat : stats) {
|
||||
if (stat.getPrevCommit().equals(HoodieWriteStat.NULL_COMMIT)) {
|
||||
totalFilesAdded += 1;
|
||||
totalRecordsInserted += stat.getNumWrites();
|
||||
} else {
|
||||
totalFilesUpdated += 1;
|
||||
totalRecordsUpdated += stat.getNumUpdateWrites();
|
||||
}
|
||||
totalBytesWritten += stat.getTotalWriteBytes();
|
||||
totalWriteErrors += stat.getTotalWriteErrors();
|
||||
}
|
||||
rows.add(new String[] {path, String.valueOf(totalFilesAdded),
|
||||
String.valueOf(totalFilesUpdated), String.valueOf(totalRecordsInserted),
|
||||
String.valueOf(totalRecordsUpdated),
|
||||
NumericUtils.humanReadableByteCount(totalBytesWritten),
|
||||
String.valueOf(totalWriteErrors)});
|
||||
|
||||
}
|
||||
return HoodiePrintHelper.print(
|
||||
new String[] {"Partition Path", "Total Files Added", "Total Files Updated",
|
||||
"Total Records Inserted", "Total Records Updated", "Total Bytes Written",
|
||||
"Total Errors"}, rows.toArray(new String[rows.size()][]));
|
||||
}
|
||||
|
||||
@CliCommand(value = "commit showfiles", help = "Show file level details of a commit")
|
||||
public String showCommitFiles(
|
||||
@CliOption(key = {"commit"}, help = "Commit to show")
|
||||
final String commitTime) throws Exception {
|
||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||
HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline().filterCompletedInstants();
|
||||
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
|
||||
|
||||
if (!timeline.containsInstant(commitInstant)) {
|
||||
return "Commit " + commitTime + " not found in Commits " + timeline;
|
||||
}
|
||||
HoodieCommitMetadata meta =
|
||||
HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(commitInstant).get());
|
||||
List<String[]> rows = new ArrayList<String[]>();
|
||||
for (Map.Entry<String, List<HoodieWriteStat>> entry : meta.getPartitionToWriteStats()
|
||||
.entrySet()) {
|
||||
String path = entry.getKey();
|
||||
List<HoodieWriteStat> stats = entry.getValue();
|
||||
for (HoodieWriteStat stat : stats) {
|
||||
rows.add(new String[] {path, stat.getFileId(), stat.getPrevCommit(),
|
||||
String.valueOf(stat.getNumUpdateWrites()), String.valueOf(stat.getNumWrites()),
|
||||
String.valueOf(stat.getTotalWriteBytes()),
|
||||
String.valueOf(stat.getTotalWriteErrors())});
|
||||
}
|
||||
}
|
||||
return HoodiePrintHelper.print(
|
||||
new String[] {"Partition Path", "File ID", "Previous Commit", "Total Records Updated",
|
||||
"Total Records Written", "Total Bytes Written", "Total Errors"},
|
||||
rows.toArray(new String[rows.size()][]));
|
||||
}
|
||||
|
||||
@CliAvailabilityIndicator({"commits compare"})
|
||||
public boolean isCompareCommitsAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
}
|
||||
|
||||
@CliCommand(value = "commits compare", help = "Compare commits with another Hoodie dataset")
|
||||
public String compareCommits(
|
||||
@CliOption(key = {"path"}, help = "Path of the dataset to compare to")
|
||||
final String path) throws Exception {
|
||||
HoodieTableMetaClient target = new HoodieTableMetaClient(HoodieCLI.fs, path);
|
||||
HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants();;
|
||||
HoodieTableMetaClient source = HoodieCLI.tableMetadata;
|
||||
HoodieTimeline sourceTimeline = source.getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants();;
|
||||
String targetLatestCommit =
|
||||
targetTimeline.getInstants().iterator().hasNext() ? "0" : targetTimeline.lastInstant().get().getTimestamp();
|
||||
String sourceLatestCommit =
|
||||
sourceTimeline.getInstants().iterator().hasNext() ? "0" : sourceTimeline.lastInstant().get().getTimestamp();
|
||||
|
||||
if (sourceLatestCommit != null &&
|
||||
HoodieTimeline.compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) {
|
||||
// source is behind the target
|
||||
List<String> commitsToCatchup =
|
||||
targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE)
|
||||
.getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList());
|
||||
return "Source " + source.getTableConfig().getTableName() + " is behind by "
|
||||
+ commitsToCatchup.size() + " commits. Commits to catch up - " + commitsToCatchup;
|
||||
HoodieCommitMetadata meta =
|
||||
HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(commitInstant).get());
|
||||
List<String[]> rows = new ArrayList<String[]>();
|
||||
for (Map.Entry<String, List<HoodieWriteStat>> entry : meta.getPartitionToWriteStats()
|
||||
.entrySet()) {
|
||||
String path = entry.getKey();
|
||||
List<HoodieWriteStat> stats = entry.getValue();
|
||||
long totalFilesAdded = 0;
|
||||
long totalFilesUpdated = 0;
|
||||
long totalRecordsUpdated = 0;
|
||||
long totalRecordsInserted = 0;
|
||||
long totalBytesWritten = 0;
|
||||
long totalWriteErrors = 0;
|
||||
for (HoodieWriteStat stat : stats) {
|
||||
if (stat.getPrevCommit().equals(HoodieWriteStat.NULL_COMMIT)) {
|
||||
totalFilesAdded += 1;
|
||||
totalRecordsInserted += stat.getNumWrites();
|
||||
} else {
|
||||
List<String> commitsToCatchup =
|
||||
sourceTimeline.findInstantsAfter(targetLatestCommit, Integer.MAX_VALUE)
|
||||
.getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList());
|
||||
return "Source " + source.getTableConfig().getTableName() + " is ahead by "
|
||||
+ commitsToCatchup.size() + " commits. Commits to catch up - " + commitsToCatchup;
|
||||
totalFilesUpdated += 1;
|
||||
totalRecordsUpdated += stat.getNumUpdateWrites();
|
||||
}
|
||||
}
|
||||
totalBytesWritten += stat.getTotalWriteBytes();
|
||||
totalWriteErrors += stat.getTotalWriteErrors();
|
||||
}
|
||||
rows.add(new String[]{path, String.valueOf(totalFilesAdded),
|
||||
String.valueOf(totalFilesUpdated), String.valueOf(totalRecordsInserted),
|
||||
String.valueOf(totalRecordsUpdated),
|
||||
NumericUtils.humanReadableByteCount(totalBytesWritten),
|
||||
String.valueOf(totalWriteErrors)});
|
||||
|
||||
@CliAvailabilityIndicator({"commits sync"})
|
||||
public boolean isSyncCommitsAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
}
|
||||
return HoodiePrintHelper.print(
|
||||
new String[]{"Partition Path", "Total Files Added", "Total Files Updated",
|
||||
"Total Records Inserted", "Total Records Updated", "Total Bytes Written",
|
||||
"Total Errors"}, rows.toArray(new String[rows.size()][]));
|
||||
}
|
||||
|
||||
@CliCommand(value = "commits sync", help = "Compare commits with another Hoodie dataset")
|
||||
public String syncCommits(
|
||||
@CliOption(key = {"path"}, help = "Path of the dataset to compare to")
|
||||
final String path) throws Exception {
|
||||
HoodieCLI.syncTableMetadata = new HoodieTableMetaClient(HoodieCLI.fs, path);
|
||||
HoodieCLI.state = HoodieCLI.CLIState.SYNC;
|
||||
return "Load sync state between " + HoodieCLI.tableMetadata.getTableConfig().getTableName()
|
||||
+ " and " + HoodieCLI.syncTableMetadata.getTableConfig().getTableName();
|
||||
@CliCommand(value = "commit showfiles", help = "Show file level details of a commit")
|
||||
public String showCommitFiles(
|
||||
@CliOption(key = {"commit"}, help = "Commit to show")
|
||||
final String commitTime) throws Exception {
|
||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||
HoodieTimeline timeline = activeTimeline.getCommitsAndCompactionsTimeline()
|
||||
.filterCompletedInstants();
|
||||
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION,
|
||||
commitTime);
|
||||
|
||||
if (!timeline.containsInstant(commitInstant)) {
|
||||
return "Commit " + commitTime + " not found in Commits " + timeline;
|
||||
}
|
||||
HoodieCommitMetadata meta =
|
||||
HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(commitInstant).get());
|
||||
List<String[]> rows = new ArrayList<String[]>();
|
||||
for (Map.Entry<String, List<HoodieWriteStat>> entry : meta.getPartitionToWriteStats()
|
||||
.entrySet()) {
|
||||
String path = entry.getKey();
|
||||
List<HoodieWriteStat> stats = entry.getValue();
|
||||
for (HoodieWriteStat stat : stats) {
|
||||
rows.add(new String[]{path, stat.getFileId(), stat.getPrevCommit(),
|
||||
String.valueOf(stat.getNumUpdateWrites()), String.valueOf(stat.getNumWrites()),
|
||||
String.valueOf(stat.getTotalWriteBytes()),
|
||||
String.valueOf(stat.getTotalWriteErrors())});
|
||||
}
|
||||
}
|
||||
return HoodiePrintHelper.print(
|
||||
new String[]{"Partition Path", "File ID", "Previous Commit", "Total Records Updated",
|
||||
"Total Records Written", "Total Bytes Written", "Total Errors"},
|
||||
rows.toArray(new String[rows.size()][]));
|
||||
}
|
||||
|
||||
@CliAvailabilityIndicator({"commits compare"})
|
||||
public boolean isCompareCommitsAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
}
|
||||
|
||||
@CliCommand(value = "commits compare", help = "Compare commits with another Hoodie dataset")
|
||||
public String compareCommits(
|
||||
@CliOption(key = {"path"}, help = "Path of the dataset to compare to")
|
||||
final String path) throws Exception {
|
||||
HoodieTableMetaClient target = new HoodieTableMetaClient(HoodieCLI.fs, path);
|
||||
HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsAndCompactionsTimeline()
|
||||
.filterCompletedInstants();
|
||||
;
|
||||
HoodieTableMetaClient source = HoodieCLI.tableMetadata;
|
||||
HoodieTimeline sourceTimeline = source.getActiveTimeline().getCommitsAndCompactionsTimeline()
|
||||
.filterCompletedInstants();
|
||||
;
|
||||
String targetLatestCommit =
|
||||
targetTimeline.getInstants().iterator().hasNext() ? "0"
|
||||
: targetTimeline.lastInstant().get().getTimestamp();
|
||||
String sourceLatestCommit =
|
||||
sourceTimeline.getInstants().iterator().hasNext() ? "0"
|
||||
: sourceTimeline.lastInstant().get().getTimestamp();
|
||||
|
||||
if (sourceLatestCommit != null &&
|
||||
HoodieTimeline
|
||||
.compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) {
|
||||
// source is behind the target
|
||||
List<String> commitsToCatchup =
|
||||
targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE)
|
||||
.getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList());
|
||||
return "Source " + source.getTableConfig().getTableName() + " is behind by "
|
||||
+ commitsToCatchup.size() + " commits. Commits to catch up - " + commitsToCatchup;
|
||||
} else {
|
||||
List<String> commitsToCatchup =
|
||||
sourceTimeline.findInstantsAfter(targetLatestCommit, Integer.MAX_VALUE)
|
||||
.getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList());
|
||||
return "Source " + source.getTableConfig().getTableName() + " is ahead by "
|
||||
+ commitsToCatchup.size() + " commits. Commits to catch up - " + commitsToCatchup;
|
||||
}
|
||||
}
|
||||
|
||||
@CliAvailabilityIndicator({"commits sync"})
|
||||
public boolean isSyncCommitsAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
}
|
||||
|
||||
@CliCommand(value = "commits sync", help = "Compare commits with another Hoodie dataset")
|
||||
public String syncCommits(
|
||||
@CliOption(key = {"path"}, help = "Path of the dataset to compare to")
|
||||
final String path) throws Exception {
|
||||
HoodieCLI.syncTableMetadata = new HoodieTableMetaClient(HoodieCLI.fs, path);
|
||||
HoodieCLI.state = HoodieCLI.CLIState.SYNC;
|
||||
return "Load sync state between " + HoodieCLI.tableMetadata.getTableConfig().getTableName()
|
||||
+ " and " + HoodieCLI.syncTableMetadata.getTableConfig().getTableName();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -18,24 +18,24 @@ package com.uber.hoodie.cli.commands;
|
||||
|
||||
import com.uber.hoodie.cli.HoodieCLI;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import java.io.IOException;
|
||||
import org.springframework.shell.core.CommandMarker;
|
||||
import org.springframework.shell.core.annotation.CliCommand;
|
||||
import org.springframework.shell.core.annotation.CliOption;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
@Component
|
||||
public class DatasetsCommand implements CommandMarker {
|
||||
@CliCommand(value = "connect", help = "Connect to a hoodie dataset")
|
||||
public String connect(
|
||||
@CliOption(key = {"path"}, mandatory = true, help = "Base Path of the dataset")
|
||||
final String path) throws IOException {
|
||||
boolean initialized = HoodieCLI.initConf();
|
||||
HoodieCLI.initFS(initialized);
|
||||
HoodieCLI.setTableMetadata(new HoodieTableMetaClient(HoodieCLI.fs, path));
|
||||
HoodieCLI.state = HoodieCLI.CLIState.DATASET;
|
||||
return "Metadata for table " + HoodieCLI.tableMetadata.getTableConfig().getTableName()
|
||||
+ " loaded";
|
||||
}
|
||||
|
||||
@CliCommand(value = "connect", help = "Connect to a hoodie dataset")
|
||||
public String connect(
|
||||
@CliOption(key = {"path"}, mandatory = true, help = "Base Path of the dataset")
|
||||
final String path) throws IOException {
|
||||
boolean initialized = HoodieCLI.initConf();
|
||||
HoodieCLI.initFS(initialized);
|
||||
HoodieCLI.setTableMetadata(new HoodieTableMetaClient(HoodieCLI.fs, path));
|
||||
HoodieCLI.state = HoodieCLI.CLIState.DATASET;
|
||||
return "Metadata for table " + HoodieCLI.tableMetadata.getTableConfig().getTableName()
|
||||
+ " loaded";
|
||||
}
|
||||
}
|
||||
|
||||
@@ -33,58 +33,59 @@ import org.springframework.stereotype.Component;
|
||||
@Component
|
||||
public class HDFSParquetImportCommand implements CommandMarker {
|
||||
|
||||
private static Logger log = LogManager.getLogger(HDFSParquetImportCommand.class);
|
||||
private static Logger log = LogManager.getLogger(HDFSParquetImportCommand.class);
|
||||
|
||||
@CliCommand(value = "hdfsparquetimport", help = "Imports hdfs dataset to a hoodie dataset")
|
||||
public String convert(
|
||||
@CliOption(key = "srcPath", mandatory = true, help = "Base path for the input dataset")
|
||||
final String srcPath,
|
||||
@CliOption(key = "srcType", mandatory = true, help = "Source type for the input dataset")
|
||||
final String srcType,
|
||||
@CliOption(key = "targetPath", mandatory = true, help = "Base path for the target hoodie dataset")
|
||||
final String targetPath,
|
||||
@CliOption(key = "tableName", mandatory = true, help = "Table name")
|
||||
final String tableName,
|
||||
@CliOption(key = "tableType", mandatory = true, help = "Table type")
|
||||
final String tableType,
|
||||
@CliOption(key = "rowKeyField", mandatory = true, help = "Row key field name")
|
||||
final String rowKeyField,
|
||||
@CliOption(key = "partitionPathField", mandatory = true, help = "Partition path field name")
|
||||
final String partitionPathField,
|
||||
@CliOption(key = {"parallelism"}, mandatory = true, help = "Parallelism for hoodie insert")
|
||||
final String parallelism,
|
||||
@CliOption(key = "schemaFilePath", mandatory = true, help = "Path for Avro schema file")
|
||||
final String schemaFilePath,
|
||||
@CliOption(key = "format", mandatory = true, help = "Format for the input data")
|
||||
final String format,
|
||||
@CliOption(key = "sparkMemory", mandatory = true, help = "Spark executor memory")
|
||||
final String sparkMemory,
|
||||
@CliOption(key = "retry", mandatory = true, help = "Number of retries")
|
||||
final String retry)
|
||||
throws Exception {
|
||||
@CliCommand(value = "hdfsparquetimport", help = "Imports hdfs dataset to a hoodie dataset")
|
||||
public String convert(
|
||||
@CliOption(key = "srcPath", mandatory = true, help = "Base path for the input dataset")
|
||||
final String srcPath,
|
||||
@CliOption(key = "srcType", mandatory = true, help = "Source type for the input dataset")
|
||||
final String srcType,
|
||||
@CliOption(key = "targetPath", mandatory = true, help = "Base path for the target hoodie dataset")
|
||||
final String targetPath,
|
||||
@CliOption(key = "tableName", mandatory = true, help = "Table name")
|
||||
final String tableName,
|
||||
@CliOption(key = "tableType", mandatory = true, help = "Table type")
|
||||
final String tableType,
|
||||
@CliOption(key = "rowKeyField", mandatory = true, help = "Row key field name")
|
||||
final String rowKeyField,
|
||||
@CliOption(key = "partitionPathField", mandatory = true, help = "Partition path field name")
|
||||
final String partitionPathField,
|
||||
@CliOption(key = {"parallelism"}, mandatory = true, help = "Parallelism for hoodie insert")
|
||||
final String parallelism,
|
||||
@CliOption(key = "schemaFilePath", mandatory = true, help = "Path for Avro schema file")
|
||||
final String schemaFilePath,
|
||||
@CliOption(key = "format", mandatory = true, help = "Format for the input data")
|
||||
final String format,
|
||||
@CliOption(key = "sparkMemory", mandatory = true, help = "Spark executor memory")
|
||||
final String sparkMemory,
|
||||
@CliOption(key = "retry", mandatory = true, help = "Number of retries")
|
||||
final String retry)
|
||||
throws Exception {
|
||||
|
||||
validate(format, srcType);
|
||||
validate(format, srcType);
|
||||
|
||||
boolean initialized = HoodieCLI.initConf();
|
||||
HoodieCLI.initFS(initialized);
|
||||
String sparkPropertiesPath = Utils
|
||||
.getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
|
||||
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
||||
boolean initialized = HoodieCLI.initConf();
|
||||
HoodieCLI.initFS(initialized);
|
||||
String sparkPropertiesPath = Utils
|
||||
.getDefaultPropertiesFile(
|
||||
scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
|
||||
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
||||
|
||||
sparkLauncher.addAppArgs(SparkCommand.IMPORT.toString(), srcPath, targetPath, tableName,
|
||||
tableType, rowKeyField, partitionPathField, parallelism, schemaFilePath, sparkMemory,
|
||||
retry);
|
||||
Process process = sparkLauncher.launch();
|
||||
InputStreamConsumer.captureOutput(process);
|
||||
int exitCode = process.waitFor();
|
||||
if (exitCode != 0) {
|
||||
return "Failed to import dataset to hoodie format";
|
||||
}
|
||||
return "Dataset imported to hoodie format";
|
||||
sparkLauncher.addAppArgs(SparkCommand.IMPORT.toString(), srcPath, targetPath, tableName,
|
||||
tableType, rowKeyField, partitionPathField, parallelism, schemaFilePath, sparkMemory,
|
||||
retry);
|
||||
Process process = sparkLauncher.launch();
|
||||
InputStreamConsumer.captureOutput(process);
|
||||
int exitCode = process.waitFor();
|
||||
if (exitCode != 0) {
|
||||
return "Failed to import dataset to hoodie format";
|
||||
}
|
||||
return "Dataset imported to hoodie format";
|
||||
}
|
||||
|
||||
private void validate(String format, String srcType) {
|
||||
(new HDFSParquetImporter.FormatValidator()).validate("format", format);
|
||||
(new HDFSParquetImporter.SourceTypeValidator()).validate("srcType", srcType);
|
||||
}
|
||||
private void validate(String format, String srcType) {
|
||||
(new HDFSParquetImporter.FormatValidator()).validate("format", format);
|
||||
(new HDFSParquetImporter.SourceTypeValidator()).validate("srcType", srcType);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,105 +16,109 @@
|
||||
|
||||
package com.uber.hoodie.cli.commands;
|
||||
|
||||
import com.uber.hoodie.cli.HoodieCLI;
|
||||
import com.uber.hoodie.cli.utils.CommitUtil;
|
||||
import com.uber.hoodie.cli.utils.HiveUtil;
|
||||
import com.uber.hoodie.cli.HoodieCLI;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import org.springframework.shell.core.CommandMarker;
|
||||
import org.springframework.shell.core.annotation.CliAvailabilityIndicator;
|
||||
import org.springframework.shell.core.annotation.CliCommand;
|
||||
import org.springframework.shell.core.annotation.CliOption;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Component
|
||||
public class HoodieSyncCommand implements CommandMarker {
|
||||
@CliAvailabilityIndicator({"sync validate"})
|
||||
public boolean isSyncVerificationAvailable() {
|
||||
return HoodieCLI.tableMetadata != null && HoodieCLI.syncTableMetadata != null;
|
||||
|
||||
@CliAvailabilityIndicator({"sync validate"})
|
||||
public boolean isSyncVerificationAvailable() {
|
||||
return HoodieCLI.tableMetadata != null && HoodieCLI.syncTableMetadata != null;
|
||||
}
|
||||
|
||||
@CliCommand(value = "sync validate", help = "Validate the sync by counting the number of records")
|
||||
public String validateSync(
|
||||
@CliOption(key = {"mode"}, unspecifiedDefaultValue = "complete", help = "Check mode")
|
||||
final String mode,
|
||||
@CliOption(key = {
|
||||
"sourceDb"}, unspecifiedDefaultValue = "rawdata", help = "source database")
|
||||
final String srcDb,
|
||||
@CliOption(key = {
|
||||
"targetDb"}, unspecifiedDefaultValue = "dwh_hoodie", help = "target database")
|
||||
final String tgtDb,
|
||||
@CliOption(key = {
|
||||
"partitionCount"}, unspecifiedDefaultValue = "5", help = "total number of recent partitions to validate")
|
||||
final int partitionCount,
|
||||
@CliOption(key = {
|
||||
"hiveServerUrl"}, mandatory = true, help = "hiveServerURL to connect to")
|
||||
final String hiveServerUrl,
|
||||
@CliOption(key = {
|
||||
"hiveUser"}, mandatory = false, unspecifiedDefaultValue = "", help = "hive username to connect to")
|
||||
final String hiveUser,
|
||||
@CliOption(key = {
|
||||
"hivePass"}, mandatory = true, unspecifiedDefaultValue = "", help = "hive password to connect to")
|
||||
final String hivePass) throws Exception {
|
||||
HoodieTableMetaClient target = HoodieCLI.syncTableMetadata;
|
||||
HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsAndCompactionsTimeline();
|
||||
HoodieTableMetaClient source = HoodieCLI.tableMetadata;
|
||||
HoodieTimeline sourceTimeline = source.getActiveTimeline().getCommitsAndCompactionsTimeline();
|
||||
long sourceCount = 0;
|
||||
long targetCount = 0;
|
||||
if ("complete".equals(mode)) {
|
||||
sourceCount = HiveUtil.countRecords(hiveServerUrl, source, srcDb, hiveUser, hivePass);
|
||||
targetCount = HiveUtil.countRecords(hiveServerUrl, target, tgtDb, hiveUser, hivePass);
|
||||
} else if ("latestPartitions".equals(mode)) {
|
||||
sourceCount = HiveUtil
|
||||
.countRecords(hiveServerUrl, source, srcDb, partitionCount, hiveUser, hivePass);
|
||||
targetCount = HiveUtil
|
||||
.countRecords(hiveServerUrl, target, tgtDb, partitionCount, hiveUser, hivePass);
|
||||
}
|
||||
|
||||
@CliCommand(value = "sync validate", help = "Validate the sync by counting the number of records")
|
||||
public String validateSync(
|
||||
@CliOption(key = {"mode"}, unspecifiedDefaultValue = "complete", help = "Check mode")
|
||||
final String mode,
|
||||
@CliOption(key = {
|
||||
"sourceDb"}, unspecifiedDefaultValue = "rawdata", help = "source database")
|
||||
final String srcDb,
|
||||
@CliOption(key = {
|
||||
"targetDb"}, unspecifiedDefaultValue = "dwh_hoodie", help = "target database")
|
||||
final String tgtDb,
|
||||
@CliOption(key = {
|
||||
"partitionCount"}, unspecifiedDefaultValue = "5", help = "total number of recent partitions to validate")
|
||||
final int partitionCount,
|
||||
@CliOption(key = {
|
||||
"hiveServerUrl"}, mandatory = true, help = "hiveServerURL to connect to")
|
||||
final String hiveServerUrl,
|
||||
@CliOption(key = {
|
||||
"hiveUser"}, mandatory = false, unspecifiedDefaultValue = "", help = "hive username to connect to")
|
||||
final String hiveUser,
|
||||
@CliOption(key = {
|
||||
"hivePass"}, mandatory = true, unspecifiedDefaultValue = "", help = "hive password to connect to")
|
||||
final String hivePass) throws Exception {
|
||||
HoodieTableMetaClient target = HoodieCLI.syncTableMetadata;
|
||||
HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsAndCompactionsTimeline();
|
||||
HoodieTableMetaClient source = HoodieCLI.tableMetadata;
|
||||
HoodieTimeline sourceTimeline = source.getActiveTimeline().getCommitsAndCompactionsTimeline();
|
||||
long sourceCount = 0;
|
||||
long targetCount = 0;
|
||||
if ("complete".equals(mode)) {
|
||||
sourceCount = HiveUtil.countRecords(hiveServerUrl, source, srcDb, hiveUser, hivePass);
|
||||
targetCount = HiveUtil.countRecords(hiveServerUrl, target, tgtDb, hiveUser, hivePass);
|
||||
} else if ("latestPartitions".equals(mode)) {
|
||||
sourceCount = HiveUtil.countRecords(hiveServerUrl, source, srcDb, partitionCount, hiveUser, hivePass);
|
||||
targetCount = HiveUtil.countRecords(hiveServerUrl, target, tgtDb, partitionCount, hiveUser, hivePass);
|
||||
}
|
||||
String targetLatestCommit =
|
||||
targetTimeline.getInstants().iterator().hasNext() ? "0"
|
||||
: targetTimeline.lastInstant().get().getTimestamp();
|
||||
String sourceLatestCommit =
|
||||
sourceTimeline.getInstants().iterator().hasNext() ? "0"
|
||||
: sourceTimeline.lastInstant().get().getTimestamp();
|
||||
|
||||
String targetLatestCommit =
|
||||
targetTimeline.getInstants().iterator().hasNext() ? "0" : targetTimeline.lastInstant().get().getTimestamp();
|
||||
String sourceLatestCommit =
|
||||
sourceTimeline.getInstants().iterator().hasNext() ? "0" : sourceTimeline.lastInstant().get().getTimestamp();
|
||||
if (sourceLatestCommit != null && HoodieTimeline
|
||||
.compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) {
|
||||
// source is behind the target
|
||||
List<HoodieInstant> commitsToCatchup =
|
||||
targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE).getInstants()
|
||||
.collect(Collectors.toList());
|
||||
if (commitsToCatchup.isEmpty()) {
|
||||
return "Count difference now is (count(" + target.getTableConfig().getTableName()
|
||||
+ ") - count(" + source.getTableConfig().getTableName() + ") == " + (targetCount
|
||||
- sourceCount);
|
||||
} else {
|
||||
long newInserts = CommitUtil.countNewRecords(target,
|
||||
commitsToCatchup.stream().map(HoodieInstant::getTimestamp)
|
||||
.collect(Collectors.toList()));
|
||||
return "Count difference now is (count(" + target.getTableConfig().getTableName()
|
||||
+ ") - count(" + source.getTableConfig().getTableName() + ") == " + (targetCount
|
||||
- sourceCount) + ". Catch up count is " + newInserts;
|
||||
}
|
||||
} else {
|
||||
List<HoodieInstant> commitsToCatchup =
|
||||
sourceTimeline.findInstantsAfter(targetLatestCommit, Integer.MAX_VALUE).getInstants()
|
||||
.collect(Collectors.toList());
|
||||
if (commitsToCatchup.isEmpty()) {
|
||||
return "Count difference now is (count(" + source.getTableConfig().getTableName()
|
||||
+ ") - count(" + target.getTableConfig().getTableName() + ") == " + (sourceCount
|
||||
- targetCount);
|
||||
} else {
|
||||
long newInserts = CommitUtil.countNewRecords(source,
|
||||
commitsToCatchup.stream().map(HoodieInstant::getTimestamp)
|
||||
.collect(Collectors.toList()));
|
||||
return "Count difference now is (count(" + source.getTableConfig().getTableName()
|
||||
+ ") - count(" + target.getTableConfig().getTableName() + ") == " + (sourceCount
|
||||
- targetCount) + ". Catch up count is " + newInserts;
|
||||
}
|
||||
|
||||
if (sourceLatestCommit != null && HoodieTimeline
|
||||
.compareTimestamps(targetLatestCommit, sourceLatestCommit, HoodieTimeline.GREATER)) {
|
||||
// source is behind the target
|
||||
List<HoodieInstant> commitsToCatchup =
|
||||
targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE).getInstants()
|
||||
.collect(Collectors.toList());
|
||||
if (commitsToCatchup.isEmpty()) {
|
||||
return "Count difference now is (count(" + target.getTableConfig().getTableName()
|
||||
+ ") - count(" + source.getTableConfig().getTableName() + ") == " + (targetCount
|
||||
- sourceCount);
|
||||
} else {
|
||||
long newInserts = CommitUtil.countNewRecords(target,
|
||||
commitsToCatchup.stream().map(HoodieInstant::getTimestamp)
|
||||
.collect(Collectors.toList()));
|
||||
return "Count difference now is (count(" + target.getTableConfig().getTableName()
|
||||
+ ") - count(" + source.getTableConfig().getTableName() + ") == " + (targetCount
|
||||
- sourceCount) + ". Catch up count is " + newInserts;
|
||||
}
|
||||
} else {
|
||||
List<HoodieInstant> commitsToCatchup =
|
||||
sourceTimeline.findInstantsAfter(targetLatestCommit, Integer.MAX_VALUE).getInstants()
|
||||
.collect(Collectors.toList());
|
||||
if (commitsToCatchup.isEmpty()) {
|
||||
return "Count difference now is (count(" + source.getTableConfig().getTableName()
|
||||
+ ") - count(" + target.getTableConfig().getTableName() + ") == " + (sourceCount
|
||||
- targetCount);
|
||||
} else {
|
||||
long newInserts = CommitUtil.countNewRecords(source,
|
||||
commitsToCatchup.stream().map(HoodieInstant::getTimestamp)
|
||||
.collect(Collectors.toList()));
|
||||
return "Count difference now is (count(" + source.getTableConfig().getTableName()
|
||||
+ ") - count(" + target.getTableConfig().getTableName() + ") == " + (sourceCount
|
||||
- targetCount) + ". Catch up count is " + newInserts;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -22,7 +22,8 @@ import com.uber.hoodie.cli.utils.InputStreamConsumer;
|
||||
import com.uber.hoodie.cli.utils.SparkUtil;
|
||||
import com.uber.hoodie.common.model.HoodiePartitionMetadata;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.spark.launcher.SparkLauncher;
|
||||
import org.springframework.shell.core.CommandMarker;
|
||||
@@ -31,80 +32,80 @@ import org.springframework.shell.core.annotation.CliCommand;
|
||||
import org.springframework.shell.core.annotation.CliOption;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
@Component
|
||||
public class RepairsCommand implements CommandMarker {
|
||||
|
||||
@CliAvailabilityIndicator({"repair deduplicate"})
|
||||
public boolean isRepairDeduplicateAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
@CliAvailabilityIndicator({"repair deduplicate"})
|
||||
public boolean isRepairDeduplicateAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
}
|
||||
|
||||
@CliAvailabilityIndicator({"repair addpartitionmeta"})
|
||||
public boolean isRepairAddPartitionMetaAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
}
|
||||
|
||||
@CliCommand(value = "repair deduplicate", help = "De-duplicate a partition path contains duplicates & produce repaired files to replace with")
|
||||
public String deduplicate(
|
||||
@CliOption(key = {
|
||||
"duplicatedPartitionPath"}, help = "Partition Path containing the duplicates", mandatory = true)
|
||||
final String duplicatedPartitionPath,
|
||||
@CliOption(key = {
|
||||
"repairedOutputPath"}, help = "Location to place the repaired files", mandatory = true)
|
||||
final String repairedOutputPath,
|
||||
@CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path", mandatory = true)
|
||||
final String sparkPropertiesPath) throws Exception {
|
||||
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
||||
sparkLauncher
|
||||
.addAppArgs(SparkMain.SparkCommand.DEDUPLICATE.toString(), duplicatedPartitionPath,
|
||||
repairedOutputPath, HoodieCLI.tableMetadata.getBasePath());
|
||||
Process process = sparkLauncher.launch();
|
||||
InputStreamConsumer.captureOutput(process);
|
||||
int exitCode = process.waitFor();
|
||||
|
||||
if (exitCode != 0) {
|
||||
return "Deduplicated files placed in: " + repairedOutputPath;
|
||||
}
|
||||
return "Deduplication failed ";
|
||||
}
|
||||
|
||||
@CliAvailabilityIndicator({"repair addpartitionmeta"})
|
||||
public boolean isRepairAddPartitionMetaAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
}
|
||||
|
||||
@CliCommand(value = "repair deduplicate", help = "De-duplicate a partition path contains duplicates & produce repaired files to replace with")
|
||||
public String deduplicate(
|
||||
@CliOption(key = {
|
||||
"duplicatedPartitionPath"}, help = "Partition Path containing the duplicates", mandatory = true)
|
||||
final String duplicatedPartitionPath,
|
||||
@CliOption(key = {"repairedOutputPath"}, help = "Location to place the repaired files", mandatory = true)
|
||||
final String repairedOutputPath,
|
||||
@CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path", mandatory = true)
|
||||
final String sparkPropertiesPath) throws Exception {
|
||||
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
||||
sparkLauncher
|
||||
.addAppArgs(SparkMain.SparkCommand.DEDUPLICATE.toString(), duplicatedPartitionPath,
|
||||
repairedOutputPath, HoodieCLI.tableMetadata.getBasePath());
|
||||
Process process = sparkLauncher.launch();
|
||||
InputStreamConsumer.captureOutput(process);
|
||||
int exitCode = process.waitFor();
|
||||
@CliCommand(value = "repair addpartitionmeta", help = "Add partition metadata to a dataset, if not present")
|
||||
public String addPartitionMeta(
|
||||
@CliOption(key = {"dryrun"},
|
||||
help = "Should we actually add or just print what would be done",
|
||||
unspecifiedDefaultValue = "true")
|
||||
final boolean dryRun) throws IOException {
|
||||
|
||||
if (exitCode != 0) {
|
||||
return "Deduplicated files placed in: " + repairedOutputPath;
|
||||
String latestCommit = HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline()
|
||||
.lastInstant().get().getTimestamp();
|
||||
List<String> partitionPaths = FSUtils.getAllFoldersThreeLevelsDown(HoodieCLI.fs,
|
||||
HoodieCLI.tableMetadata.getBasePath());
|
||||
Path basePath = new Path(HoodieCLI.tableMetadata.getBasePath());
|
||||
String[][] rows = new String[partitionPaths.size() + 1][];
|
||||
|
||||
int ind = 0;
|
||||
for (String partition : partitionPaths) {
|
||||
Path partitionPath = new Path(basePath, partition);
|
||||
String[] row = new String[3];
|
||||
row[0] = partition;
|
||||
row[1] = "Yes";
|
||||
row[2] = "None";
|
||||
if (!HoodiePartitionMetadata.hasPartitionMetadata(HoodieCLI.fs, partitionPath)) {
|
||||
row[1] = "No";
|
||||
if (!dryRun) {
|
||||
HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(
|
||||
HoodieCLI.fs,
|
||||
latestCommit,
|
||||
basePath,
|
||||
partitionPath);
|
||||
partitionMetadata.trySave(0);
|
||||
}
|
||||
return "Deduplication failed ";
|
||||
}
|
||||
rows[ind++] = row;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@CliCommand(value = "repair addpartitionmeta", help = "Add partition metadata to a dataset, if not present")
|
||||
public String addPartitionMeta(
|
||||
@CliOption(key = {"dryrun"},
|
||||
help = "Should we actually add or just print what would be done",
|
||||
unspecifiedDefaultValue = "true")
|
||||
final boolean dryRun) throws IOException {
|
||||
|
||||
String latestCommit = HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline().lastInstant().get().getTimestamp();
|
||||
List<String> partitionPaths = FSUtils.getAllFoldersThreeLevelsDown(HoodieCLI.fs,
|
||||
HoodieCLI.tableMetadata.getBasePath());
|
||||
Path basePath = new Path(HoodieCLI.tableMetadata.getBasePath());
|
||||
String[][] rows = new String[partitionPaths.size() + 1][];
|
||||
|
||||
int ind = 0;
|
||||
for (String partition: partitionPaths) {
|
||||
Path partitionPath = new Path(basePath, partition);
|
||||
String[] row = new String[3];
|
||||
row[0] = partition; row[1] = "Yes"; row[2] = "None";
|
||||
if (!HoodiePartitionMetadata.hasPartitionMetadata(HoodieCLI.fs, partitionPath)) {
|
||||
row[1] = "No";
|
||||
if (!dryRun) {
|
||||
HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(
|
||||
HoodieCLI.fs,
|
||||
latestCommit,
|
||||
basePath,
|
||||
partitionPath);
|
||||
partitionMetadata.trySave(0);
|
||||
}
|
||||
}
|
||||
rows[ind++] = row;
|
||||
}
|
||||
|
||||
return HoodiePrintHelper.print(
|
||||
new String[] {"Partition Path", "Metadata Present?", "Action"}, rows);
|
||||
}
|
||||
return HoodiePrintHelper.print(
|
||||
new String[]{"Partition Path", "Metadata Present?", "Action"}, rows);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,6 +27,10 @@ import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||
import com.uber.hoodie.config.HoodieIndexConfig;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.index.HoodieIndex;
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.launcher.SparkLauncher;
|
||||
import org.springframework.shell.core.CommandMarker;
|
||||
@@ -35,122 +39,118 @@ import org.springframework.shell.core.annotation.CliCommand;
|
||||
import org.springframework.shell.core.annotation.CliOption;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Component
|
||||
public class SavepointsCommand implements CommandMarker {
|
||||
@CliAvailabilityIndicator({"savepoints show"})
|
||||
public boolean isShowAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
|
||||
@CliAvailabilityIndicator({"savepoints show"})
|
||||
public boolean isShowAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
}
|
||||
|
||||
@CliAvailabilityIndicator({"savepoints refresh"})
|
||||
public boolean isRefreshAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
}
|
||||
|
||||
|
||||
@CliAvailabilityIndicator({"savepoint create"})
|
||||
public boolean isCreateSavepointAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
}
|
||||
|
||||
@CliAvailabilityIndicator({"savepoint rollback"})
|
||||
public boolean isRollbackToSavepointAvailable() {
|
||||
return HoodieCLI.tableMetadata != null && !HoodieCLI.tableMetadata.getActiveTimeline()
|
||||
.getSavePointTimeline().filterCompletedInstants().empty();
|
||||
}
|
||||
|
||||
@CliCommand(value = "savepoints show", help = "Show the savepoints")
|
||||
public String showSavepoints() throws IOException {
|
||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||
HoodieTimeline timeline = activeTimeline.getSavePointTimeline().filterCompletedInstants();
|
||||
List<HoodieInstant> commits = timeline.getInstants().collect(Collectors.toList());
|
||||
String[][] rows = new String[commits.size()][];
|
||||
Collections.reverse(commits);
|
||||
for (int i = 0; i < commits.size(); i++) {
|
||||
HoodieInstant commit = commits.get(i);
|
||||
rows[i] = new String[]{commit.getTimestamp()};
|
||||
}
|
||||
return HoodiePrintHelper.print(new String[]{"SavepointTime"}, rows);
|
||||
}
|
||||
|
||||
@CliCommand(value = "savepoint create", help = "Savepoint a commit")
|
||||
public String savepoint(
|
||||
@CliOption(key = {"commit"}, help = "Commit to savepoint")
|
||||
final String commitTime,
|
||||
@CliOption(key = {"user"}, help = "User who is creating the savepoint")
|
||||
final String user,
|
||||
@CliOption(key = {"comments"}, help = "Comments for creating the savepoint")
|
||||
final String comments) throws Exception {
|
||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||
HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants();
|
||||
HoodieInstant
|
||||
commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
|
||||
|
||||
if (!timeline.containsInstant(commitInstant)) {
|
||||
return "Commit " + commitTime + " not found in Commits " + timeline;
|
||||
}
|
||||
|
||||
@CliAvailabilityIndicator({"savepoints refresh"})
|
||||
public boolean isRefreshAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
HoodieWriteClient client = createHoodieClient(null, HoodieCLI.tableMetadata.getBasePath());
|
||||
if (client.savepoint(commitTime, user, comments)) {
|
||||
// Refresh the current
|
||||
refreshMetaClient();
|
||||
return String.format("The commit \"%s\" has been savepointed.", commitTime);
|
||||
}
|
||||
return String.format("Failed: Could not savepoint commit \"%s\".", commitTime);
|
||||
}
|
||||
|
||||
@CliCommand(value = "savepoint rollback", help = "Savepoint a commit")
|
||||
public String rollbackToSavepoint(
|
||||
@CliOption(key = {"savepoint"}, help = "Savepoint to rollback")
|
||||
final String commitTime,
|
||||
@CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path")
|
||||
final String sparkPropertiesPath) throws Exception {
|
||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||
HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants();
|
||||
HoodieInstant
|
||||
commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
|
||||
|
||||
if (!timeline.containsInstant(commitInstant)) {
|
||||
return "Commit " + commitTime + " not found in Commits " + timeline;
|
||||
}
|
||||
|
||||
|
||||
@CliAvailabilityIndicator({"savepoint create"})
|
||||
public boolean isCreateSavepointAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
}
|
||||
|
||||
@CliAvailabilityIndicator({"savepoint rollback"})
|
||||
public boolean isRollbackToSavepointAvailable() {
|
||||
return HoodieCLI.tableMetadata != null && !HoodieCLI.tableMetadata.getActiveTimeline().getSavePointTimeline().filterCompletedInstants().empty();
|
||||
}
|
||||
|
||||
@CliCommand(value = "savepoints show", help = "Show the savepoints")
|
||||
public String showSavepoints() throws IOException {
|
||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||
HoodieTimeline timeline = activeTimeline.getSavePointTimeline().filterCompletedInstants();
|
||||
List<HoodieInstant> commits = timeline.getInstants().collect(Collectors.toList());
|
||||
String[][] rows = new String[commits.size()][];
|
||||
Collections.reverse(commits);
|
||||
for (int i = 0; i < commits.size(); i++) {
|
||||
HoodieInstant commit = commits.get(i);
|
||||
rows[i] = new String[] {commit.getTimestamp()};
|
||||
}
|
||||
return HoodiePrintHelper.print(new String[] {"SavepointTime"}, rows);
|
||||
}
|
||||
|
||||
@CliCommand(value = "savepoint create", help = "Savepoint a commit")
|
||||
public String savepoint(
|
||||
@CliOption(key = {"commit"}, help = "Commit to savepoint")
|
||||
final String commitTime,
|
||||
@CliOption(key = {"user"}, help = "User who is creating the savepoint")
|
||||
final String user,
|
||||
@CliOption(key = {"comments"}, help = "Comments for creating the savepoint")
|
||||
final String comments) throws Exception {
|
||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||
HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants();
|
||||
HoodieInstant
|
||||
commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
|
||||
|
||||
if (!timeline.containsInstant(commitInstant)) {
|
||||
return "Commit " + commitTime + " not found in Commits " + timeline;
|
||||
}
|
||||
|
||||
HoodieWriteClient client = createHoodieClient(null, HoodieCLI.tableMetadata.getBasePath());
|
||||
if (client.savepoint(commitTime, user, comments)) {
|
||||
// Refresh the current
|
||||
refreshMetaClient();
|
||||
return String.format("The commit \"%s\" has been savepointed.", commitTime);
|
||||
}
|
||||
return String.format("Failed: Could not savepoint commit \"%s\".", commitTime);
|
||||
}
|
||||
|
||||
@CliCommand(value = "savepoint rollback", help = "Savepoint a commit")
|
||||
public String rollbackToSavepoint(
|
||||
@CliOption(key = {"savepoint"}, help = "Savepoint to rollback")
|
||||
final String commitTime,
|
||||
@CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path")
|
||||
final String sparkPropertiesPath) throws Exception {
|
||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||
HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants();
|
||||
HoodieInstant
|
||||
commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
|
||||
|
||||
if (!timeline.containsInstant(commitInstant)) {
|
||||
return "Commit " + commitTime + " not found in Commits " + timeline;
|
||||
}
|
||||
|
||||
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
||||
sparkLauncher.addAppArgs(SparkMain.SparkCommand.ROLLBACK_TO_SAVEPOINT.toString(),
|
||||
commitTime,
|
||||
HoodieCLI.tableMetadata.getBasePath());
|
||||
Process process = sparkLauncher.launch();
|
||||
InputStreamConsumer.captureOutput(process);
|
||||
int exitCode = process.waitFor();
|
||||
// Refresh the current
|
||||
refreshMetaClient();
|
||||
if (exitCode != 0) {
|
||||
return "Savepoint " + commitTime + " failed to roll back";
|
||||
}
|
||||
return "Savepoint " + commitTime + " rolled back";
|
||||
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
||||
sparkLauncher.addAppArgs(SparkMain.SparkCommand.ROLLBACK_TO_SAVEPOINT.toString(),
|
||||
commitTime,
|
||||
HoodieCLI.tableMetadata.getBasePath());
|
||||
Process process = sparkLauncher.launch();
|
||||
InputStreamConsumer.captureOutput(process);
|
||||
int exitCode = process.waitFor();
|
||||
// Refresh the current
|
||||
refreshMetaClient();
|
||||
if (exitCode != 0) {
|
||||
return "Savepoint " + commitTime + " failed to roll back";
|
||||
}
|
||||
return "Savepoint " + commitTime + " rolled back";
|
||||
}
|
||||
|
||||
|
||||
@CliCommand(value = "savepoints refresh", help = "Refresh the savepoints")
|
||||
public String refreshMetaClient() throws IOException {
|
||||
HoodieTableMetaClient metadata =
|
||||
new HoodieTableMetaClient(HoodieCLI.fs, HoodieCLI.tableMetadata.getBasePath());
|
||||
HoodieCLI.setTableMetadata(metadata);
|
||||
return "Metadata for table " + metadata.getTableConfig().getTableName() + " refreshed.";
|
||||
}
|
||||
|
||||
private static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath)
|
||||
throws Exception {
|
||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath)
|
||||
.withIndexConfig(
|
||||
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
|
||||
.build();
|
||||
return new HoodieWriteClient(jsc, config, false);
|
||||
}
|
||||
@CliCommand(value = "savepoints refresh", help = "Refresh the savepoints")
|
||||
public String refreshMetaClient() throws IOException {
|
||||
HoodieTableMetaClient metadata =
|
||||
new HoodieTableMetaClient(HoodieCLI.fs, HoodieCLI.tableMetadata.getBasePath());
|
||||
HoodieCLI.setTableMetadata(metadata);
|
||||
return "Metadata for table " + metadata.getTableConfig().getTableName() + " refreshed.";
|
||||
}
|
||||
|
||||
private static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath)
|
||||
throws Exception {
|
||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath)
|
||||
.withIndexConfig(
|
||||
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
|
||||
.build();
|
||||
return new HoodieWriteClient(jsc, config, false);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@@ -30,109 +30,110 @@ import org.apache.spark.sql.SQLContext;
|
||||
|
||||
public class SparkMain {
|
||||
|
||||
protected final static Logger LOG = Logger.getLogger(SparkMain.class);
|
||||
protected final static Logger LOG = Logger.getLogger(SparkMain.class);
|
||||
|
||||
|
||||
/**
|
||||
* Commands
|
||||
*/
|
||||
enum SparkCommand {
|
||||
ROLLBACK,
|
||||
DEDUPLICATE,
|
||||
ROLLBACK_TO_SAVEPOINT,
|
||||
SAVEPOINT,
|
||||
IMPORT
|
||||
/**
 * Commands that can be passed as the first application argument of the Spark entry point.
 * The constant name is used as the argument's textual form.
 */
enum SparkCommand {
  /** Roll back a commit. */
  ROLLBACK,
  /** De-duplicate a partition path. */
  DEDUPLICATE,
  /** Roll back to a savepoint. */
  ROLLBACK_TO_SAVEPOINT,
  /** Savepoint command. */
  SAVEPOINT,
  /** Data import command. */
  IMPORT
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String command = args[0];
|
||||
LOG.info("Invoking SparkMain:" + command);
|
||||
|
||||
SparkCommand cmd = SparkCommand.valueOf(command);
|
||||
|
||||
JavaSparkContext jsc = SparkUtil.initJavaSparkConf("hoodie-cli-" + command);
|
||||
int returnCode = 0;
|
||||
switch (cmd) {
|
||||
case ROLLBACK:
|
||||
assert (args.length == 3);
|
||||
returnCode = rollback(jsc, args[1], args[2]);
|
||||
break;
|
||||
case DEDUPLICATE:
|
||||
assert (args.length == 4);
|
||||
returnCode = deduplicatePartitionPath(jsc, args[1], args[2], args[3]);
|
||||
break;
|
||||
case ROLLBACK_TO_SAVEPOINT:
|
||||
assert (args.length == 3);
|
||||
returnCode = rollbackToSavepoint(jsc, args[1], args[2]);
|
||||
break;
|
||||
case IMPORT:
|
||||
assert (args.length == 11);
|
||||
returnCode = dataImport(jsc, args[1], args[2], args[3], args[4], args[5], args[6],
|
||||
Integer.parseInt(args[7]), args[8], SparkUtil.DEFUALT_SPARK_MASTER, args[9],
|
||||
Integer.parseInt(args[10]));
|
||||
break;
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String command = args[0];
|
||||
LOG.info("Invoking SparkMain:" + command);
|
||||
System.exit(returnCode);
|
||||
}
|
||||
|
||||
SparkCommand cmd = SparkCommand.valueOf(command);
|
||||
private static int dataImport(JavaSparkContext jsc, String srcPath, String targetPath,
|
||||
String tableName, String tableType, String rowKey, String partitionKey, int parallelism,
|
||||
String schemaFile, String sparkMaster, String sparkMemory, int retry) throws Exception {
|
||||
HDFSParquetImporter.Config cfg = new HDFSParquetImporter.Config();
|
||||
cfg.srcPath = srcPath;
|
||||
cfg.targetPath = targetPath;
|
||||
cfg.tableName = tableName;
|
||||
cfg.tableType = tableType;
|
||||
cfg.rowKey = rowKey;
|
||||
cfg.partitionKey = partitionKey;
|
||||
cfg.parallelism = parallelism;
|
||||
cfg.schemaFile = schemaFile;
|
||||
jsc.getConf().set("spark.executor.memory", sparkMemory);
|
||||
return new HDFSParquetImporter(cfg).dataImport(jsc, retry);
|
||||
}
|
||||
|
||||
JavaSparkContext jsc = SparkUtil.initJavaSparkConf("hoodie-cli-" + command);
|
||||
int returnCode = 0;
|
||||
switch(cmd) {
|
||||
case ROLLBACK:
|
||||
assert (args.length == 3);
|
||||
returnCode = rollback(jsc, args[1], args[2]);
|
||||
break;
|
||||
case DEDUPLICATE:
|
||||
assert (args.length == 4);
|
||||
returnCode = deduplicatePartitionPath(jsc, args[1], args[2], args[3]);
|
||||
break;
|
||||
case ROLLBACK_TO_SAVEPOINT:
|
||||
assert (args.length == 3);
|
||||
returnCode = rollbackToSavepoint(jsc, args[1], args[2]);
|
||||
break;
|
||||
case IMPORT:
|
||||
assert (args.length == 11);
|
||||
returnCode = dataImport(jsc, args[1], args[2], args[3], args[4], args[5], args[6],
|
||||
Integer.parseInt(args[7]), args[8], SparkUtil.DEFUALT_SPARK_MASTER, args[9],
|
||||
Integer.parseInt(args[10]));
|
||||
break;
|
||||
}
|
||||
private static int deduplicatePartitionPath(JavaSparkContext jsc,
|
||||
String duplicatedPartitionPath,
|
||||
String repairedOutputPath,
|
||||
String basePath)
|
||||
throws Exception {
|
||||
DedupeSparkJob job = new DedupeSparkJob(basePath,
|
||||
duplicatedPartitionPath, repairedOutputPath, new SQLContext(jsc), FSUtils.getFs());
|
||||
job.fixDuplicates(true);
|
||||
return 0;
|
||||
}
|
||||
|
||||
System.exit(returnCode);
|
||||
private static int rollback(JavaSparkContext jsc, String commitTime, String basePath)
|
||||
throws Exception {
|
||||
HoodieWriteClient client = createHoodieClient(jsc, basePath);
|
||||
if (client.rollback(commitTime)) {
|
||||
LOG.info(String.format("The commit \"%s\" rolled back.", commitTime));
|
||||
return 0;
|
||||
} else {
|
||||
LOG.info(String.format("The commit \"%s\" failed to roll back.", commitTime));
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
private static int dataImport(JavaSparkContext jsc, String srcPath, String targetPath,
|
||||
String tableName, String tableType, String rowKey, String partitionKey, int parallelism,
|
||||
String schemaFile, String sparkMaster, String sparkMemory, int retry) throws Exception {
|
||||
HDFSParquetImporter.Config cfg = new HDFSParquetImporter.Config();
|
||||
cfg.srcPath = srcPath;
|
||||
cfg.targetPath = targetPath;
|
||||
cfg.tableName = tableName;
|
||||
cfg.tableType = tableType;
|
||||
cfg.rowKey = rowKey;
|
||||
cfg.partitionKey = partitionKey;
|
||||
cfg.parallelism = parallelism;
|
||||
cfg.schemaFile = schemaFile;
|
||||
jsc.getConf().set("spark.executor.memory", sparkMemory);
|
||||
return new HDFSParquetImporter(cfg).dataImport(jsc, retry);
|
||||
private static int rollbackToSavepoint(JavaSparkContext jsc, String savepointTime,
|
||||
String basePath)
|
||||
throws Exception {
|
||||
HoodieWriteClient client = createHoodieClient(jsc, basePath);
|
||||
if (client.rollbackToSavepoint(savepointTime)) {
|
||||
LOG.info(String.format("The commit \"%s\" rolled back.", savepointTime));
|
||||
return 0;
|
||||
} else {
|
||||
LOG.info(String.format("The commit \"%s\" failed to roll back.", savepointTime));
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
private static int deduplicatePartitionPath(JavaSparkContext jsc,
|
||||
String duplicatedPartitionPath,
|
||||
String repairedOutputPath,
|
||||
String basePath)
|
||||
throws Exception {
|
||||
DedupeSparkJob job = new DedupeSparkJob(basePath,
|
||||
duplicatedPartitionPath,repairedOutputPath,new SQLContext(jsc), FSUtils.getFs());
|
||||
job.fixDuplicates(true);
|
||||
return 0;
|
||||
}
|
||||
|
||||
private static int rollback(JavaSparkContext jsc, String commitTime, String basePath)
|
||||
throws Exception {
|
||||
HoodieWriteClient client = createHoodieClient(jsc, basePath);
|
||||
if (client.rollback(commitTime)) {
|
||||
LOG.info(String.format("The commit \"%s\" rolled back.", commitTime));
|
||||
return 0;
|
||||
} else {
|
||||
LOG.info(String.format("The commit \"%s\" failed to roll back.", commitTime));
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
private static int rollbackToSavepoint(JavaSparkContext jsc, String savepointTime, String basePath)
|
||||
throws Exception {
|
||||
HoodieWriteClient client = createHoodieClient(jsc, basePath);
|
||||
if (client.rollbackToSavepoint(savepointTime)) {
|
||||
LOG.info(String.format("The commit \"%s\" rolled back.", savepointTime));
|
||||
return 0;
|
||||
} else {
|
||||
LOG.info(String.format("The commit \"%s\" failed to roll back.", savepointTime));
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
private static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath)
|
||||
throws Exception {
|
||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath)
|
||||
.withIndexConfig(
|
||||
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
|
||||
.build();
|
||||
return new HoodieWriteClient(jsc, config);
|
||||
}
|
||||
private static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath)
|
||||
throws Exception {
|
||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath)
|
||||
.withIndexConfig(
|
||||
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
|
||||
.build();
|
||||
return new HoodieWriteClient(jsc, config);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -28,7 +28,10 @@ import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
|
||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import com.uber.hoodie.common.util.NumericUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.text.DecimalFormat;
|
||||
import java.util.HashMap;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
@@ -38,106 +41,105 @@ import org.springframework.shell.core.annotation.CliCommand;
|
||||
import org.springframework.shell.core.annotation.CliOption;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.text.DecimalFormat;
|
||||
import java.util.HashMap;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Component
|
||||
public class StatsCommand implements CommandMarker {
|
||||
@CliAvailabilityIndicator({"stats wa"})
|
||||
public boolean isWriteAmpAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
|
||||
@CliAvailabilityIndicator({"stats wa"})
|
||||
public boolean isWriteAmpAvailable() {
|
||||
return HoodieCLI.tableMetadata != null;
|
||||
}
|
||||
|
||||
@CliCommand(value = "stats wa", help = "Write Amplification. Ratio of how many records were upserted to how many records were actually written")
|
||||
public String writeAmplificationStats() throws IOException {
|
||||
long totalRecordsUpserted = 0;
|
||||
long totalRecordsWritten = 0;
|
||||
|
||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||
HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants();
|
||||
|
||||
String[][] rows = new String[new Long(timeline.countInstants()).intValue() + 1][];
|
||||
int i = 0;
|
||||
DecimalFormat df = new DecimalFormat("#.00");
|
||||
for (HoodieInstant commitTime : timeline.getInstants().collect(
|
||||
Collectors.toList())) {
|
||||
String waf = "0";
|
||||
HoodieCommitMetadata commit = HoodieCommitMetadata
|
||||
.fromBytes(activeTimeline.getInstantDetails(commitTime).get());
|
||||
if (commit.fetchTotalUpdateRecordsWritten() > 0) {
|
||||
waf = df.format(
|
||||
(float) commit.fetchTotalRecordsWritten() / commit
|
||||
.fetchTotalUpdateRecordsWritten());
|
||||
}
|
||||
rows[i++] = new String[]{commitTime.getTimestamp(),
|
||||
String.valueOf(commit.fetchTotalUpdateRecordsWritten()),
|
||||
String.valueOf(commit.fetchTotalRecordsWritten()), waf};
|
||||
totalRecordsUpserted += commit.fetchTotalUpdateRecordsWritten();
|
||||
totalRecordsWritten += commit.fetchTotalRecordsWritten();
|
||||
}
|
||||
String waf = "0";
|
||||
if (totalRecordsUpserted > 0) {
|
||||
waf = df.format((float) totalRecordsWritten / totalRecordsUpserted);
|
||||
}
|
||||
rows[i] = new String[]{"Total", String.valueOf(totalRecordsUpserted),
|
||||
String.valueOf(totalRecordsWritten), waf};
|
||||
return HoodiePrintHelper.print(
|
||||
new String[]{"CommitTime", "Total Upserted", "Total Written",
|
||||
"Write Amplifiation Factor"}, rows);
|
||||
|
||||
}
|
||||
|
||||
|
||||
private String[] printFileSizeHistogram(String commitTime, Snapshot s) {
|
||||
return new String[]{
|
||||
commitTime,
|
||||
NumericUtils.humanReadableByteCount(s.getMin()),
|
||||
NumericUtils.humanReadableByteCount(s.getValue(0.1)),
|
||||
NumericUtils.humanReadableByteCount(s.getMedian()),
|
||||
NumericUtils.humanReadableByteCount(s.getMean()),
|
||||
NumericUtils.humanReadableByteCount(s.get95thPercentile()),
|
||||
NumericUtils.humanReadableByteCount(s.getMax()),
|
||||
String.valueOf(s.size()),
|
||||
NumericUtils.humanReadableByteCount(s.getStdDev())
|
||||
};
|
||||
}
|
||||
|
||||
@CliCommand(value = "stats filesizes", help = "File Sizes. Display summary stats on sizes of files")
|
||||
public String fileSizeStats(
|
||||
@CliOption(key = {
|
||||
"partitionPath"}, help = "regex to select files, eg: 2016/08/02", unspecifiedDefaultValue = "*/*/*")
|
||||
final String globRegex) throws IOException {
|
||||
|
||||
FileSystem fs = HoodieCLI.fs;
|
||||
String globPath = String.format("%s/%s/*",
|
||||
HoodieCLI.tableMetadata.getBasePath(),
|
||||
globRegex);
|
||||
FileStatus[] statuses = fs.globStatus(new Path(globPath));
|
||||
|
||||
// max, min, #small files < 10MB, 50th, avg, 95th
|
||||
final int MAX_FILES = 1000000;
|
||||
Histogram globalHistogram = new Histogram(new UniformReservoir(MAX_FILES));
|
||||
HashMap<String, Histogram> commitHistoMap = new HashMap<String, Histogram>();
|
||||
for (FileStatus fileStatus : statuses) {
|
||||
String commitTime = FSUtils.getCommitTime(fileStatus.getPath().getName());
|
||||
long sz = fileStatus.getLen();
|
||||
if (!commitHistoMap.containsKey(commitTime)) {
|
||||
commitHistoMap.put(commitTime, new Histogram(new UniformReservoir(MAX_FILES)));
|
||||
}
|
||||
commitHistoMap.get(commitTime).update(sz);
|
||||
globalHistogram.update(sz);
|
||||
}
|
||||
|
||||
@CliCommand(value = "stats wa", help = "Write Amplification. Ratio of how many records were upserted to how many records were actually written")
|
||||
public String writeAmplificationStats() throws IOException {
|
||||
long totalRecordsUpserted = 0;
|
||||
long totalRecordsWritten = 0;
|
||||
|
||||
HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
|
||||
HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants();
|
||||
|
||||
String[][] rows = new String[new Long(timeline.countInstants()).intValue() + 1][];
|
||||
int i = 0;
|
||||
DecimalFormat df = new DecimalFormat("#.00");
|
||||
for (HoodieInstant commitTime : timeline.getInstants().collect(
|
||||
Collectors.toList())) {
|
||||
String waf = "0";
|
||||
HoodieCommitMetadata commit = HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(commitTime).get());
|
||||
if (commit.fetchTotalUpdateRecordsWritten() > 0) {
|
||||
waf = df.format(
|
||||
(float) commit.fetchTotalRecordsWritten() / commit
|
||||
.fetchTotalUpdateRecordsWritten());
|
||||
}
|
||||
rows[i++] = new String[] {commitTime.getTimestamp(),
|
||||
String.valueOf(commit.fetchTotalUpdateRecordsWritten()),
|
||||
String.valueOf(commit.fetchTotalRecordsWritten()), waf};
|
||||
totalRecordsUpserted += commit.fetchTotalUpdateRecordsWritten();
|
||||
totalRecordsWritten += commit.fetchTotalRecordsWritten();
|
||||
}
|
||||
String waf = "0";
|
||||
if (totalRecordsUpserted > 0) {
|
||||
waf = df.format((float) totalRecordsWritten / totalRecordsUpserted);
|
||||
}
|
||||
rows[i] = new String[] {"Total", String.valueOf(totalRecordsUpserted),
|
||||
String.valueOf(totalRecordsWritten), waf};
|
||||
return HoodiePrintHelper.print(
|
||||
new String[] {"CommitTime", "Total Upserted", "Total Written",
|
||||
"Write Amplifiation Factor"}, rows);
|
||||
|
||||
String[][] rows = new String[commitHistoMap.size() + 1][];
|
||||
int ind = 0;
|
||||
for (String commitTime : commitHistoMap.keySet()) {
|
||||
Snapshot s = commitHistoMap.get(commitTime).getSnapshot();
|
||||
rows[ind++] = printFileSizeHistogram(commitTime, s);
|
||||
}
|
||||
Snapshot s = globalHistogram.getSnapshot();
|
||||
rows[ind++] = printFileSizeHistogram("ALL", s);
|
||||
|
||||
|
||||
private String[] printFileSizeHistogram(String commitTime, Snapshot s) {
|
||||
return new String[]{
|
||||
commitTime,
|
||||
NumericUtils.humanReadableByteCount(s.getMin()),
|
||||
NumericUtils.humanReadableByteCount(s.getValue(0.1)),
|
||||
NumericUtils.humanReadableByteCount(s.getMedian()),
|
||||
NumericUtils.humanReadableByteCount(s.getMean()),
|
||||
NumericUtils.humanReadableByteCount(s.get95thPercentile()),
|
||||
NumericUtils.humanReadableByteCount(s.getMax()),
|
||||
String.valueOf(s.size()),
|
||||
NumericUtils.humanReadableByteCount(s.getStdDev())
|
||||
};
|
||||
}
|
||||
|
||||
@CliCommand(value = "stats filesizes", help = "File Sizes. Display summary stats on sizes of files")
|
||||
public String fileSizeStats(
|
||||
@CliOption(key = {"partitionPath"}, help = "regex to select files, eg: 2016/08/02", unspecifiedDefaultValue = "*/*/*")
|
||||
final String globRegex) throws IOException {
|
||||
|
||||
FileSystem fs = HoodieCLI.fs;
|
||||
String globPath = String.format("%s/%s/*",
|
||||
HoodieCLI.tableMetadata.getBasePath(),
|
||||
globRegex);
|
||||
FileStatus[] statuses = fs.globStatus(new Path(globPath));
|
||||
|
||||
// max, min, #small files < 10MB, 50th, avg, 95th
|
||||
final int MAX_FILES = 1000000;
|
||||
Histogram globalHistogram = new Histogram(new UniformReservoir(MAX_FILES));
|
||||
HashMap<String, Histogram> commitHistoMap = new HashMap<String, Histogram>();
|
||||
for (FileStatus fileStatus: statuses) {
|
||||
String commitTime = FSUtils.getCommitTime(fileStatus.getPath().getName());
|
||||
long sz = fileStatus.getLen();
|
||||
if (!commitHistoMap.containsKey(commitTime)) {
|
||||
commitHistoMap.put(commitTime, new Histogram(new UniformReservoir(MAX_FILES)));
|
||||
}
|
||||
commitHistoMap.get(commitTime).update(sz);
|
||||
globalHistogram.update(sz);
|
||||
}
|
||||
|
||||
String[][] rows = new String[commitHistoMap.size() + 1][];
|
||||
int ind = 0;
|
||||
for (String commitTime: commitHistoMap.keySet()) {
|
||||
Snapshot s = commitHistoMap.get(commitTime).getSnapshot();
|
||||
rows[ind++] = printFileSizeHistogram(commitTime, s);
|
||||
}
|
||||
Snapshot s = globalHistogram.getSnapshot();
|
||||
rows[ind++] = printFileSizeHistogram("ALL", s);
|
||||
|
||||
return HoodiePrintHelper.print(
|
||||
new String[] {"CommitTime", "Min", "10th", "50th", "avg", "95th", "Max", "NumFiles", "StdDev"}, rows);
|
||||
}
|
||||
return HoodiePrintHelper.print(
|
||||
new String[]{"CommitTime", "Min", "10th", "50th", "avg", "95th", "Max", "NumFiles",
|
||||
"StdDev"}, rows);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -23,12 +23,13 @@ import org.springframework.stereotype.Component;
|
||||
|
||||
@Component
|
||||
public class UtilsCommand implements CommandMarker {
|
||||
@CliCommand(value = "utils loadClass", help = "Load a class" )
|
||||
public String loadClass(
|
||||
@CliOption(key = {"class"}, help = "Check mode" ) final String clazz
|
||||
) throws Exception {
|
||||
Class klass = Class.forName(clazz);
|
||||
return klass.getProtectionDomain().getCodeSource().getLocation().toExternalForm();
|
||||
}
|
||||
|
||||
@CliCommand(value = "utils loadClass", help = "Load a class")
|
||||
public String loadClass(
|
||||
@CliOption(key = {"class"}, help = "Check mode") final String clazz
|
||||
) throws Exception {
|
||||
Class klass = Class.forName(clazz);
|
||||
return klass.getProtectionDomain().getCodeSource().getLocation().toExternalForm();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -20,21 +20,22 @@ import com.uber.hoodie.common.model.HoodieCommitMetadata;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
public class CommitUtil {
|
||||
public static long countNewRecords(HoodieTableMetaClient target, List<String> commitsToCatchup)
|
||||
throws IOException {
|
||||
long totalNew = 0;
|
||||
HoodieTimeline timeline = target.getActiveTimeline().reload().getCommitTimeline().filterCompletedInstants();
|
||||
for(String commit:commitsToCatchup) {
|
||||
HoodieCommitMetadata c = HoodieCommitMetadata.fromBytes(timeline
|
||||
.getInstantDetails(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commit))
|
||||
.get());
|
||||
totalNew += c.fetchTotalRecordsWritten() - c.fetchTotalUpdateRecordsWritten();
|
||||
}
|
||||
return totalNew;
|
||||
|
||||
public static long countNewRecords(HoodieTableMetaClient target, List<String> commitsToCatchup)
|
||||
throws IOException {
|
||||
long totalNew = 0;
|
||||
HoodieTimeline timeline = target.getActiveTimeline().reload().getCommitTimeline()
|
||||
.filterCompletedInstants();
|
||||
for (String commit : commitsToCatchup) {
|
||||
HoodieCommitMetadata c = HoodieCommitMetadata.fromBytes(timeline
|
||||
.getInstantDetails(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commit))
|
||||
.get());
|
||||
totalNew += c.fetchTotalRecordsWritten() - c.fetchTotalUpdateRecordsWritten();
|
||||
}
|
||||
return totalNew;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,107 +17,112 @@
|
||||
package com.uber.hoodie.cli.utils;
|
||||
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import org.apache.commons.dbcp.BasicDataSource;
|
||||
import org.joda.time.DateTime;
|
||||
|
||||
import javax.sql.DataSource;
|
||||
import java.sql.Connection;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.sql.Statement;
|
||||
import javax.sql.DataSource;
|
||||
import org.apache.commons.dbcp.BasicDataSource;
|
||||
import org.joda.time.DateTime;
|
||||
|
||||
public class HiveUtil {
|
||||
private static String driverName = "org.apache.hive.jdbc.HiveDriver";
|
||||
|
||||
static {
|
||||
try {
|
||||
Class.forName(driverName);
|
||||
} catch (ClassNotFoundException e) {
|
||||
throw new IllegalStateException("Could not find " + driverName + " in classpath. ", e);
|
||||
}
|
||||
private static String driverName = "org.apache.hive.jdbc.HiveDriver";
|
||||
|
||||
static {
|
||||
try {
|
||||
Class.forName(driverName);
|
||||
} catch (ClassNotFoundException e) {
|
||||
throw new IllegalStateException("Could not find " + driverName + " in classpath. ", e);
|
||||
}
|
||||
}
|
||||
|
||||
private static Connection connection;
|
||||
private static Connection connection;
|
||||
|
||||
private static Connection getConnection(String jdbcUrl, String user, String pass) throws SQLException {
|
||||
DataSource ds = getDatasource(jdbcUrl, user, pass);
|
||||
return ds.getConnection();
|
||||
private static Connection getConnection(String jdbcUrl, String user, String pass)
|
||||
throws SQLException {
|
||||
DataSource ds = getDatasource(jdbcUrl, user, pass);
|
||||
return ds.getConnection();
|
||||
}
|
||||
|
||||
private static DataSource getDatasource(String jdbcUrl, String user, String pass) {
|
||||
BasicDataSource ds = new BasicDataSource();
|
||||
ds.setDriverClassName(driverName);
|
||||
ds.setUrl(jdbcUrl);
|
||||
ds.setUsername(user);
|
||||
ds.setPassword(pass);
|
||||
return ds;
|
||||
}
|
||||
|
||||
public static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String dbName,
|
||||
String user, String pass) throws SQLException {
|
||||
Connection conn = HiveUtil.getConnection(jdbcUrl, user, pass);
|
||||
ResultSet rs = null;
|
||||
Statement stmt = conn.createStatement();
|
||||
try {
|
||||
//stmt.execute("set mapred.job.queue.name=<queue_name>");
|
||||
stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat");
|
||||
stmt.execute("set hive.stats.autogather=false");
|
||||
rs = stmt.executeQuery(
|
||||
"select count(`_hoodie_commit_time`) as cnt from " + dbName + "." + source
|
||||
.getTableConfig()
|
||||
.getTableName());
|
||||
long count = -1;
|
||||
if (rs.next()) {
|
||||
count = rs.getLong("cnt");
|
||||
}
|
||||
System.out
|
||||
.println("Total records in " + source.getTableConfig().getTableName() + " is " + count);
|
||||
return count;
|
||||
} finally {
|
||||
if (rs != null) {
|
||||
rs.close();
|
||||
}
|
||||
if (stmt != null) {
|
||||
stmt.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static DataSource getDatasource(String jdbcUrl, String user, String pass) {
|
||||
BasicDataSource ds = new BasicDataSource();
|
||||
ds.setDriverClassName(driverName);
|
||||
ds.setUrl(jdbcUrl);
|
||||
ds.setUsername(user);
|
||||
ds.setPassword(pass);
|
||||
return ds;
|
||||
}
|
||||
public static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String srcDb,
|
||||
int partitions, String user, String pass) throws SQLException {
|
||||
DateTime dateTime = DateTime.now();
|
||||
String endDateStr =
|
||||
dateTime.getYear() + "-" + String.format("%02d", dateTime.getMonthOfYear()) + "-" +
|
||||
String.format("%02d", dateTime.getDayOfMonth());
|
||||
dateTime = dateTime.minusDays(partitions);
|
||||
String startDateStr =
|
||||
dateTime.getYear() + "-" + String.format("%02d", dateTime.getMonthOfYear()) + "-" +
|
||||
String.format("%02d", dateTime.getDayOfMonth());
|
||||
System.out.println("Start date " + startDateStr + " and end date " + endDateStr);
|
||||
return countRecords(jdbcUrl, source, srcDb, startDateStr, endDateStr, user, pass);
|
||||
}
|
||||
|
||||
public static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String dbName, String user, String pass) throws SQLException {
|
||||
Connection conn = HiveUtil.getConnection(jdbcUrl, user, pass);
|
||||
ResultSet rs = null;
|
||||
Statement stmt = conn.createStatement();
|
||||
try {
|
||||
//stmt.execute("set mapred.job.queue.name=<queue_name>");
|
||||
stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat" );
|
||||
stmt.execute("set hive.stats.autogather=false" );
|
||||
rs = stmt.executeQuery(
|
||||
"select count(`_hoodie_commit_time`) as cnt from " + dbName + "." + source.getTableConfig()
|
||||
.getTableName());
|
||||
long count = -1;
|
||||
if(rs.next()) {
|
||||
count = rs.getLong("cnt");
|
||||
}
|
||||
System.out.println("Total records in " + source.getTableConfig().getTableName() + " is " + count);
|
||||
return count;
|
||||
} finally {
|
||||
if (rs != null) {
|
||||
rs.close();
|
||||
}
|
||||
if (stmt != null) {
|
||||
stmt.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String srcDb,
|
||||
int partitions, String user, String pass) throws SQLException {
|
||||
DateTime dateTime = DateTime.now();
|
||||
String endDateStr =
|
||||
dateTime.getYear() + "-" + String.format("%02d", dateTime.getMonthOfYear()) + "-" +
|
||||
String.format("%02d", dateTime.getDayOfMonth());
|
||||
dateTime = dateTime.minusDays(partitions);
|
||||
String startDateStr =
|
||||
dateTime.getYear() + "-" + String.format("%02d", dateTime.getMonthOfYear()) + "-" +
|
||||
String.format("%02d", dateTime.getDayOfMonth());
|
||||
System.out.println("Start date " + startDateStr + " and end date " + endDateStr);
|
||||
return countRecords(jdbcUrl, source, srcDb, startDateStr, endDateStr, user, pass);
|
||||
}
|
||||
|
||||
private static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String srcDb, String startDateStr,
|
||||
String endDateStr, String user, String pass) throws SQLException {
|
||||
Connection conn = HiveUtil.getConnection(jdbcUrl, user, pass);
|
||||
ResultSet rs = null;
|
||||
Statement stmt = conn.createStatement();
|
||||
try {
|
||||
//stmt.execute("set mapred.job.queue.name=<queue_name>");
|
||||
stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat");
|
||||
stmt.execute("set hive.stats.autogather=false");
|
||||
rs = stmt.executeQuery(
|
||||
"select count(`_hoodie_commit_time`) as cnt from " + srcDb + "." + source.getTableConfig()
|
||||
.getTableName() + " where datestr>'" + startDateStr + "' and datestr<='"
|
||||
+ endDateStr + "'");
|
||||
if(rs.next()) {
|
||||
return rs.getLong("cnt");
|
||||
}
|
||||
return -1;
|
||||
} finally {
|
||||
if (rs != null) {
|
||||
rs.close();
|
||||
}
|
||||
if (stmt != null) {
|
||||
stmt.close();
|
||||
}
|
||||
}
|
||||
private static long countRecords(String jdbcUrl, HoodieTableMetaClient source, String srcDb,
|
||||
String startDateStr,
|
||||
String endDateStr, String user, String pass) throws SQLException {
|
||||
Connection conn = HiveUtil.getConnection(jdbcUrl, user, pass);
|
||||
ResultSet rs = null;
|
||||
Statement stmt = conn.createStatement();
|
||||
try {
|
||||
//stmt.execute("set mapred.job.queue.name=<queue_name>");
|
||||
stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat");
|
||||
stmt.execute("set hive.stats.autogather=false");
|
||||
rs = stmt.executeQuery(
|
||||
"select count(`_hoodie_commit_time`) as cnt from " + srcDb + "." + source.getTableConfig()
|
||||
.getTableName() + " where datestr>'" + startDateStr + "' and datestr<='"
|
||||
+ endDateStr + "'");
|
||||
if (rs.next()) {
|
||||
return rs.getLong("cnt");
|
||||
}
|
||||
return -1;
|
||||
} finally {
|
||||
if (rs != null) {
|
||||
rs.close();
|
||||
}
|
||||
if (stmt != null) {
|
||||
stmt.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -23,34 +23,37 @@ import java.io.InputStreamReader;
|
||||
import java.util.logging.Logger;
|
||||
|
||||
/**
 * Thread that drains an {@link InputStream} line by line and forwards every line to the logger.
 * Used to consume the output of a child process.
 */
public class InputStreamConsumer extends Thread {

  protected static final Logger LOG = Logger.getLogger(InputStreamConsumer.class.getName());
  private final InputStream is;

  /**
   * @param is the stream to drain; it is closed once it has been read to the end
   */
  public InputStreamConsumer(InputStream is) {
    this.is = is;
  }

  /**
   * Reads the stream until end-of-file, logging each line at INFO level. The reader (and with it
   * the underlying stream) is closed when reading finishes or fails. I/O errors are logged and
   * not propagated.
   */
  @Override
  public void run() {
    try (BufferedReader br = new BufferedReader(new InputStreamReader(is))) {
      String line;
      while ((line = br.readLine()) != null) {
        LOG.info(line);
      }
    } catch (IOException ioe) {
      LOG.severe(ioe.toString());
      ioe.printStackTrace();
    }
  }

  /**
   * Starts two consumer threads, one for the error stream and one for the standard output of the
   * given process, so that both are drained concurrently.
   *
   * @param p the process whose output should be logged
   */
  public static void captureOutput(Process p) {
    InputStreamConsumer errout = new InputStreamConsumer(p.getErrorStream());
    InputStreamConsumer stdout = new InputStreamConsumer(p.getInputStream());
    errout.start();
    stdout.start();
  }

}
|
||||
|
||||
@@ -18,59 +18,54 @@ package com.uber.hoodie.cli.utils;
|
||||
|
||||
import com.uber.hoodie.HoodieWriteClient;
|
||||
import com.uber.hoodie.cli.commands.SparkMain;
|
||||
|
||||
import java.io.File;
|
||||
import java.net.URISyntaxException;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.launcher.SparkLauncher;
|
||||
|
||||
import java.io.File;
|
||||
import java.net.URISyntaxException;
|
||||
|
||||
public class SparkUtil {
|
||||
|
||||
public static Logger logger = Logger.getLogger(SparkUtil.class);
|
||||
public static final String DEFUALT_SPARK_MASTER = "yarn-client";
|
||||
public static Logger logger = Logger.getLogger(SparkUtil.class);
|
||||
public static final String DEFUALT_SPARK_MASTER = "yarn-client";
|
||||
|
||||
/**
|
||||
*
|
||||
* TODO: Need to fix a bunch of hardcoded stuff here eg: history server, spark distro
|
||||
*
|
||||
* @return
|
||||
* @throws URISyntaxException
|
||||
*/
|
||||
public static SparkLauncher initLauncher(String propertiesFile) throws URISyntaxException {
|
||||
String currentJar = new File(
|
||||
SparkUtil.class.getProtectionDomain().getCodeSource().getLocation().toURI().getPath())
|
||||
.getAbsolutePath();
|
||||
SparkLauncher sparkLauncher =
|
||||
new SparkLauncher().setAppResource(currentJar)
|
||||
.setMainClass(SparkMain.class.getName())
|
||||
.setPropertiesFile(propertiesFile);
|
||||
File libDirectory = new File(new File(currentJar).getParent(), "lib");
|
||||
for (String library : libDirectory.list()) {
|
||||
sparkLauncher.addJar(new File(libDirectory, library).getAbsolutePath());
|
||||
}
|
||||
return sparkLauncher;
|
||||
/**
|
||||
* TODO: Need to fix a bunch of hardcoded stuff here eg: history server, spark distro
|
||||
*/
|
||||
public static SparkLauncher initLauncher(String propertiesFile) throws URISyntaxException {
|
||||
String currentJar = new File(
|
||||
SparkUtil.class.getProtectionDomain().getCodeSource().getLocation().toURI().getPath())
|
||||
.getAbsolutePath();
|
||||
SparkLauncher sparkLauncher =
|
||||
new SparkLauncher().setAppResource(currentJar)
|
||||
.setMainClass(SparkMain.class.getName())
|
||||
.setPropertiesFile(propertiesFile);
|
||||
File libDirectory = new File(new File(currentJar).getParent(), "lib");
|
||||
for (String library : libDirectory.list()) {
|
||||
sparkLauncher.addJar(new File(libDirectory, library).getAbsolutePath());
|
||||
}
|
||||
return sparkLauncher;
|
||||
}
|
||||
|
||||
public static JavaSparkContext initJavaSparkConf(String name) {
|
||||
SparkConf sparkConf = new SparkConf().setAppName(name);
|
||||
sparkConf.setMaster(DEFUALT_SPARK_MASTER);
|
||||
sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
|
||||
sparkConf.set("spark.driver.maxResultSize", "2g");
|
||||
sparkConf.set("spark.eventLog.overwrite", "true");
|
||||
sparkConf.set("spark.eventLog.enabled", "true");
|
||||
public static JavaSparkContext initJavaSparkConf(String name) {
|
||||
SparkConf sparkConf = new SparkConf().setAppName(name);
|
||||
sparkConf.setMaster(DEFUALT_SPARK_MASTER);
|
||||
sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
|
||||
sparkConf.set("spark.driver.maxResultSize", "2g");
|
||||
sparkConf.set("spark.eventLog.overwrite", "true");
|
||||
sparkConf.set("spark.eventLog.enabled", "true");
|
||||
|
||||
// Configure hadoop conf
|
||||
sparkConf.set("spark.hadoop.mapred.output.compress", "true");
|
||||
sparkConf.set("spark.hadoop.mapred.output.compression.codec", "true");
|
||||
sparkConf.set("spark.hadoop.mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
|
||||
sparkConf.set("spark.hadoop.mapred.output.compression.type", "BLOCK");
|
||||
// Configure hadoop conf
|
||||
sparkConf.set("spark.hadoop.mapred.output.compress", "true");
|
||||
sparkConf.set("spark.hadoop.mapred.output.compression.codec", "true");
|
||||
sparkConf.set("spark.hadoop.mapred.output.compression.codec",
|
||||
"org.apache.hadoop.io.compress.GzipCodec");
|
||||
sparkConf.set("spark.hadoop.mapred.output.compression.type", "BLOCK");
|
||||
|
||||
sparkConf = HoodieWriteClient.registerClasses(sparkConf);
|
||||
JavaSparkContext jsc = new JavaSparkContext(sparkConf);
|
||||
jsc.hadoopConfiguration().setBoolean("parquet.enable.summary-metadata", false);
|
||||
return jsc;
|
||||
}
|
||||
sparkConf = HoodieWriteClient.registerClasses(sparkConf);
|
||||
JavaSparkContext jsc = new JavaSparkContext(sparkConf);
|
||||
jsc.hadoopConfiguration().setBoolean("parquet.enable.summary-metadata", false);
|
||||
return jsc;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,11 +16,11 @@
|
||||
-->
|
||||
|
||||
<beans xmlns="http://www.springframework.org/schema/beans"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xmlns:context="http://www.springframework.org/schema/context"
|
||||
xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans.xsd
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xmlns:context="http://www.springframework.org/schema/context"
|
||||
xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans.xsd
|
||||
http://www.springframework.org/schema/context http://www.springframework.org/schema/context/spring-context.xsd">
|
||||
|
||||
<context:component-scan base-package="com.uber.hoodie.cli" />
|
||||
<context:component-scan base-package="com.uber.hoodie.cli"/>
|
||||
|
||||
</beans>
|
||||
|
||||
@@ -34,11 +34,11 @@ import scala.collection.mutable._
|
||||
/**
|
||||
* Spark job to de-duplicate data present in a partition path
|
||||
*/
|
||||
class DedupeSparkJob (basePath: String,
|
||||
duplicatedPartitionPath: String,
|
||||
repairOutputPath: String,
|
||||
sqlContext: SQLContext,
|
||||
fs: FileSystem) {
|
||||
class DedupeSparkJob(basePath: String,
|
||||
duplicatedPartitionPath: String,
|
||||
repairOutputPath: String,
|
||||
sqlContext: SQLContext,
|
||||
fs: FileSystem) {
|
||||
|
||||
|
||||
val sparkHelper = new SparkHelper(sqlContext, fs)
|
||||
@@ -50,8 +50,9 @@ class DedupeSparkJob (basePath: String,
|
||||
* @param tblName
|
||||
* @return
|
||||
*/
|
||||
def getDupeKeyDF(tblName: String) : DataFrame = {
|
||||
val dupeSql = s"""
|
||||
def getDupeKeyDF(tblName: String): DataFrame = {
|
||||
val dupeSql =
|
||||
s"""
|
||||
select `${HoodieRecord.RECORD_KEY_METADATA_FIELD}` as dupe_key,
|
||||
count(*) as dupe_cnt
|
||||
from ${tblName}
|
||||
@@ -69,7 +70,7 @@ class DedupeSparkJob (basePath: String,
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
private def planDuplicateFix() : HashMap[String, HashSet[String]] = {
|
||||
private def planDuplicateFix(): HashMap[String, HashSet[String]] = {
|
||||
|
||||
val tmpTableName = s"htbl_${System.currentTimeMillis()}"
|
||||
val dedupeTblName = s"${tmpTableName}_dupeKeys"
|
||||
@@ -78,17 +79,18 @@ class DedupeSparkJob (basePath: String,
|
||||
|
||||
val allFiles = fs.listStatus(new org.apache.hadoop.fs.Path(s"${basePath}/${duplicatedPartitionPath}"))
|
||||
val fsView = new HoodieTableFileSystemView(metadata, metadata.getActiveTimeline.getCommitTimeline.filterCompletedInstants(), allFiles)
|
||||
val latestFiles:java.util.List[HoodieDataFile] = fsView.getLatestDataFiles().collect(Collectors.toList[HoodieDataFile]())
|
||||
val latestFiles: java.util.List[HoodieDataFile] = fsView.getLatestDataFiles().collect(Collectors.toList[HoodieDataFile]())
|
||||
val filteredStatuses = latestFiles.map(f => f.getPath)
|
||||
LOG.info(s" List of files under partition: ${} => ${filteredStatuses.mkString(" ")}")
|
||||
|
||||
val df = sqlContext.parquetFile(filteredStatuses:_*)
|
||||
val df = sqlContext.parquetFile(filteredStatuses: _*)
|
||||
df.registerTempTable(tmpTableName)
|
||||
val dupeKeyDF = getDupeKeyDF(tmpTableName)
|
||||
dupeKeyDF.registerTempTable(dedupeTblName)
|
||||
|
||||
// Obtain necessary satellite information for duplicate rows
|
||||
val dupeDataSql = s"""
|
||||
val dupeDataSql =
|
||||
s"""
|
||||
SELECT `_hoodie_record_key`, `_hoodie_partition_path`, `_hoodie_file_name`, `_hoodie_commit_time`
|
||||
FROM ${tmpTableName} h
|
||||
JOIN ${dedupeTblName} d
|
||||
@@ -111,9 +113,9 @@ class DedupeSparkJob (basePath: String,
|
||||
|
||||
rows.foreach(r => {
|
||||
val c = r(3).asInstanceOf[String].toLong
|
||||
if (c != maxCommit){
|
||||
if (c != maxCommit) {
|
||||
val f = r(2).asInstanceOf[String].split("_")(0)
|
||||
if (!fileToDeleteKeyMap.contains(f)){
|
||||
if (!fileToDeleteKeyMap.contains(f)) {
|
||||
fileToDeleteKeyMap(f) = HashSet[String]()
|
||||
}
|
||||
fileToDeleteKeyMap(f).add(key)
|
||||
@@ -130,28 +132,30 @@ class DedupeSparkJob (basePath: String,
|
||||
val allFiles = fs.listStatus(new Path(s"${basePath}/${duplicatedPartitionPath}"))
|
||||
val fsView = new HoodieTableFileSystemView(metadata, metadata.getActiveTimeline.getCommitTimeline.filterCompletedInstants(), allFiles)
|
||||
|
||||
val latestFiles:java.util.List[HoodieDataFile] = fsView.getLatestDataFiles().collect(Collectors.toList[HoodieDataFile]())
|
||||
val latestFiles: java.util.List[HoodieDataFile] = fsView.getLatestDataFiles().collect(Collectors.toList[HoodieDataFile]())
|
||||
|
||||
val fileNameToPathMap = latestFiles.map(f => (f.getFileId, new Path(f.getPath))).toMap
|
||||
val dupeFixPlan = planDuplicateFix()
|
||||
|
||||
// 1. Copy all latest files into the temp fix path
|
||||
fileNameToPathMap.foreach{ case(fileName, filePath) => {
|
||||
fileNameToPathMap.foreach { case (fileName, filePath) => {
|
||||
val badSuffix = if (dupeFixPlan.contains(fileName)) ".bad" else ""
|
||||
val dstPath = new Path(s"${repairOutputPath}/${filePath.getName}${badSuffix}")
|
||||
LOG.info(s"Copying from ${filePath} to ${dstPath}")
|
||||
FileUtil.copy(fs, filePath, fs, dstPath, false, true, fs.getConf)
|
||||
}}
|
||||
}
|
||||
}
|
||||
|
||||
// 2. Remove duplicates from the bad files
|
||||
dupeFixPlan.foreach{case(fileName, keysToSkip) => {
|
||||
dupeFixPlan.foreach { case (fileName, keysToSkip) => {
|
||||
val commitTime = FSUtils.getCommitTime(fileNameToPathMap(fileName).getName)
|
||||
val badFilePath = new Path(s"${repairOutputPath}/${fileNameToPathMap(fileName).getName}.bad")
|
||||
val newFilePath = new Path(s"${repairOutputPath}/${fileNameToPathMap(fileName).getName}")
|
||||
LOG.info(" Skipping and writing new file for : " + fileName)
|
||||
SparkHelpers.skipKeysAndWriteNewFile(commitTime, fs, badFilePath, newFilePath, dupeFixPlan(fileName))
|
||||
fs.delete(badFilePath, false)
|
||||
}}
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Check that there are no duplicates anymore.
|
||||
val df = sqlContext.read.parquet(s"${repairOutputPath}/*.parquet")
|
||||
@@ -186,6 +190,7 @@ class DedupeSparkJob (basePath: String,
|
||||
LOG.info(s"[FOR REAL!!!] Copying from ${srcPath} to ${dstPath}")
|
||||
FileUtil.copy(fs, srcPath, fs, dstPath, false, true, fs.getConf)
|
||||
}
|
||||
}}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,9 +17,9 @@
|
||||
package com.uber.hoodie.cli
|
||||
|
||||
import com.uber.hoodie.avro.HoodieAvroWriteSupport
|
||||
import com.uber.hoodie.common.{BloomFilter, HoodieJsonPayload}
|
||||
import com.uber.hoodie.common.model.HoodieRecord
|
||||
import com.uber.hoodie.common.util.ParquetUtils
|
||||
import com.uber.hoodie.common.{BloomFilter, HoodieJsonPayload}
|
||||
import com.uber.hoodie.config.{HoodieIndexConfig, HoodieStorageConfig}
|
||||
import com.uber.hoodie.io.storage.{HoodieParquetConfig, HoodieParquetWriter}
|
||||
import org.apache.avro.Schema
|
||||
@@ -107,7 +107,7 @@ class SparkHelper(sqlContext: SQLContext, fs: FileSystem) {
|
||||
* @param file
|
||||
* @param sqlContext
|
||||
*/
|
||||
def getKeyCount(file: String, sqlContext: org.apache.spark.sql.SQLContext) ={
|
||||
def getKeyCount(file: String, sqlContext: org.apache.spark.sql.SQLContext) = {
|
||||
println(getRowKeyDF(file).collect().size)
|
||||
}
|
||||
|
||||
@@ -122,7 +122,7 @@ class SparkHelper(sqlContext: SQLContext, fs: FileSystem) {
|
||||
* @param file
|
||||
* @return
|
||||
*/
|
||||
def fileKeysAgainstBF(conf: Configuration, sqlContext: SQLContext, file: String) : Boolean = {
|
||||
def fileKeysAgainstBF(conf: Configuration, sqlContext: SQLContext, file: String): Boolean = {
|
||||
val bfStr = SparkHelpers.getBloomFilter(file, conf)
|
||||
val bf = new com.uber.hoodie.common.BloomFilter(bfStr)
|
||||
val foundCount = sqlContext.parquetFile(file)
|
||||
@@ -134,7 +134,7 @@ class SparkHelper(sqlContext: SQLContext, fs: FileSystem) {
|
||||
totalCount == foundCount
|
||||
}
|
||||
|
||||
def getDistinctKeyDF(paths: List[String]) : DataFrame = {
|
||||
sqlContext.read.parquet(paths:_*).select(s"`${HoodieRecord.RECORD_KEY_METADATA_FIELD}`").distinct()
|
||||
def getDistinctKeyDF(paths: List[String]): DataFrame = {
|
||||
sqlContext.read.parquet(paths: _*).select(s"`${HoodieRecord.RECORD_KEY_METADATA_FIELD}`").distinct()
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user