1
0

Create .hoodie_partition_metadata in each partition, linking back to basepath

- Concurreny handled via taskID, failure recovery handled via renames
 - Falls back to search 3 levels up
 - Cli tool has command to add this to existing tables
This commit is contained in:
Vinoth Chandar
2017-03-21 23:57:30 -07:00
committed by vinoth chandar
parent 1e802ad4f2
commit 3129770fd0
10 changed files with 291 additions and 86 deletions

View File

@@ -1,78 +0,0 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.cli.commands;
import com.uber.hoodie.cli.HoodieCLI;
import com.uber.hoodie.cli.utils.InputStreamConsumer;
import com.uber.hoodie.cli.utils.SparkUtil;
import org.apache.spark.launcher.SparkLauncher;
import org.springframework.shell.core.CommandMarker;
import org.springframework.shell.core.annotation.CliAvailabilityIndicator;
import org.springframework.shell.core.annotation.CliCommand;
import org.springframework.shell.core.annotation.CliOption;
import org.springframework.stereotype.Component;
@Component
public class RecordsCommand implements CommandMarker {
@CliAvailabilityIndicator({"records deduplicate"})
public boolean isRecordsDeduplicateAvailable() {
return HoodieCLI.tableMetadata != null;
}
@CliCommand(value = "records deduplicate", help = "De-duplicate a partition path contains duplicates & produce repaired files to replace with")
public String deduplicate(
@CliOption(key = {
"duplicatedPartitionPath"}, help = "Partition Path containing the duplicates")
final String duplicatedPartitionPath,
@CliOption(key = {"repairedOutputPath"}, help = "Location to place the repaired files")
final String repairedOutputPath,
@CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path")
final String sparkPropertiesPath) throws Exception {
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
sparkLauncher
.addAppArgs(SparkMain.SparkCommand.DEDUPLICATE.toString(), duplicatedPartitionPath,
repairedOutputPath, HoodieCLI.tableMetadata.getBasePath());
Process process = sparkLauncher.launch();
InputStreamConsumer.captureOutput(process);
int exitCode = process.waitFor();
if (exitCode != 0) {
return "Deduplicated files placed in: " + repairedOutputPath;
}
return "Deduplication failed ";
}
// @CliCommand(value = "records find", help = "Find Records in a hoodie dataset")
// public String findRecords(
// @CliOption(key = {"keys"}, help = "Keys To Find (Comma seperated)")
// final String hoodieKeys,
// @CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path")
// final String sparkPropertiesPath) throws Exception {
// SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
// sparkLauncher
// .addAppArgs(SparkMain.RECORD_FIND, hoodieKeys, HoodieCLI.tableMetadata.getBasePath());
// Process process = sparkLauncher.launch();
// InputStreamConsumer.captureOutput(process);
// int exitCode = process.waitFor();
//
// if (exitCode != 0) {
// return "Deduplicated files placed in: " + repairedOutputPath;
// }
// return "Deduplication failed ";
// }
}

View File

@@ -0,0 +1,110 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.cli.commands;
import com.uber.hoodie.cli.HoodieCLI;
import com.uber.hoodie.cli.HoodiePrintHelper;
import com.uber.hoodie.cli.utils.InputStreamConsumer;
import com.uber.hoodie.cli.utils.SparkUtil;
import com.uber.hoodie.common.model.HoodiePartitionMetadata;
import com.uber.hoodie.common.util.FSUtils;
import org.apache.hadoop.fs.Path;
import org.apache.spark.launcher.SparkLauncher;
import org.springframework.shell.core.CommandMarker;
import org.springframework.shell.core.annotation.CliAvailabilityIndicator;
import org.springframework.shell.core.annotation.CliCommand;
import org.springframework.shell.core.annotation.CliOption;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.util.List;
@Component
public class RepairsCommand implements CommandMarker {
@CliAvailabilityIndicator({"repair deduplicate"})
public boolean isRepairDeduplicateAvailable() {
return HoodieCLI.tableMetadata != null;
}
@CliAvailabilityIndicator({"repair addpartitionmeta"})
public boolean isRepairAddPartitionMetaAvailable() {
return HoodieCLI.tableMetadata != null;
}
@CliCommand(value = "repair deduplicate", help = "De-duplicate a partition path contains duplicates & produce repaired files to replace with")
public String deduplicate(
@CliOption(key = {
"duplicatedPartitionPath"}, help = "Partition Path containing the duplicates")
final String duplicatedPartitionPath,
@CliOption(key = {"repairedOutputPath"}, help = "Location to place the repaired files")
final String repairedOutputPath,
@CliOption(key = {"sparkProperties"}, help = "Spark Properites File Path")
final String sparkPropertiesPath) throws Exception {
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
sparkLauncher
.addAppArgs(SparkMain.SparkCommand.DEDUPLICATE.toString(), duplicatedPartitionPath,
repairedOutputPath, HoodieCLI.tableMetadata.getBasePath());
Process process = sparkLauncher.launch();
InputStreamConsumer.captureOutput(process);
int exitCode = process.waitFor();
if (exitCode != 0) {
return "Deduplicated files placed in: " + repairedOutputPath;
}
return "Deduplication failed ";
}
@CliCommand(value = "repair addpartitionmeta", help = "Add partition metadata to a dataset, if not present")
public String addPartitionMeta(
@CliOption(key = {"dryrun"},
help = "Should we actually add or just print what would be done",
unspecifiedDefaultValue = "true")
final boolean dryRun) throws IOException {
String latestCommit = HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline().lastInstant().get().getTimestamp();
List<String> partitionPaths = FSUtils.getAllPartitionPaths(HoodieCLI.fs,
HoodieCLI.tableMetadata.getBasePath());
Path basePath = new Path(HoodieCLI.tableMetadata.getBasePath());
String[][] rows = new String[partitionPaths.size() + 1][];
int ind = 0;
for (String partition: partitionPaths) {
Path partitionPath = new Path(basePath, partition);
String[] row = new String[3];
row[0] = partition; row[1] = "Yes"; row[2] = "None";
if (!HoodiePartitionMetadata.hasPartitionMetadata(HoodieCLI.fs, partitionPath)) {
row[1] = "No";
if (!dryRun) {
HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(
HoodieCLI.fs,
latestCommit,
basePath,
partitionPath);
partitionMetadata.trySave(0);
}
}
rows[ind++] = row;
}
return HoodiePrintHelper.print(
new String[] {"Partition Path", "Metadata Present?", "Action"}, rows);
}
}