1
0

Create .hoodie_partition_metadata in each partition, linking back to basepath

- Concurrency handled via taskID, failure recovery handled via renames
- Falls back to searching 3 levels up
 - Cli tool has command to add this to existing tables
This commit is contained in:
Vinoth Chandar
2017-03-21 23:57:30 -07:00
committed by vinoth chandar
parent 1e802ad4f2
commit 3129770fd0
10 changed files with 291 additions and 86 deletions

View File

@@ -16,6 +16,7 @@
package com.uber.hoodie.hadoop;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
@@ -32,6 +33,7 @@ public class HoodieHiveUtil {
public static final String DEFAULT_SCAN_MODE = LATEST_SCAN_MODE;
public static final int DEFAULT_MAX_COMMITS = 1;
public static final int MAX_COMMIT_ALL = -1;
public static final int DEFAULT_LEVELS_TO_BASEPATH = 3;
public static Integer readMaxCommits(JobContext job, String tableName) {
String maxCommitName = String.format(HOODIE_MAX_COMMIT_PATTERN, tableName);
@@ -55,4 +57,12 @@ public class HoodieHiveUtil {
LOG.info(modePropertyName + ": " + mode);
return mode;
}
/**
 * Walks up the directory tree and returns the n-th parent of the given path,
 * e.g. {@code getNthParent(new Path("/a/b/c"), 2)} returns {@code /a}.
 *
 * @param path the starting path
 * @param n    number of levels to walk up; values <= 0 return {@code path} itself
 * @return the ancestor {@code n} levels above {@code path}, or {@code null} if the
 *         filesystem root is reached first (path has fewer than {@code n} ancestors)
 */
public static Path getNthParent(Path path, int n) {
    Path parent = path;
    // Path.getParent() returns null once the root is reached; stop early instead of
    // dereferencing null on the next iteration.
    for (int i = 0; i < n && parent != null; i++) {
        parent = parent.getParent();
    }
    return parent;
}
}

View File

@@ -17,6 +17,7 @@
package com.uber.hoodie.hadoop;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodiePartitionMetadata;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
@@ -259,8 +260,13 @@ public class HoodieInputFormat extends MapredParquetInputFormat
*/
private HoodieTableMetaClient getTableMetaClient(Path dataPath) throws IOException {
FileSystem fs = dataPath.getFileSystem(conf);
// TODO - remove this hard-coding. Pass this in job conf, somehow. Or read the Table Location
Path baseDir = dataPath.getParent().getParent().getParent();
int levels = HoodieHiveUtil.DEFAULT_LEVELS_TO_BASEPATH;
if (HoodiePartitionMetadata.hasPartitionMetadata(fs, dataPath)) {
HoodiePartitionMetadata metadata = new HoodiePartitionMetadata(fs, dataPath);
metadata.readFromFS();
levels = metadata.getPartitionDepth();
}
Path baseDir = HoodieHiveUtil.getNthParent(dataPath, levels);
LOG.info("Reading hoodie metadata from path " + baseDir.toString());
return new HoodieTableMetaClient(fs, baseDir.toString());

View File

@@ -16,6 +16,7 @@
package com.uber.hoodie.hadoop;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodiePartitionMetadata;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.view.HoodieTableFileSystemView;
import com.uber.hoodie.exception.DatasetNotFoundException;
@@ -117,7 +118,15 @@ public class HoodieROTablePathFilter implements PathFilter, Serializable {
}
// Perform actual checking.
Path baseDir = safeGetParentsParent(folder);
Path baseDir;
if (HoodiePartitionMetadata.hasPartitionMetadata(fs, folder)) {
HoodiePartitionMetadata metadata = new HoodiePartitionMetadata(fs, folder);
metadata.readFromFS();
baseDir = HoodieHiveUtil.getNthParent(folder, metadata.getPartitionDepth());
} else {
baseDir = safeGetParentsParent(folder);
}
if (baseDir != null) {
try {
HoodieTableMetaClient metaClient =