1
0

Writes relative paths to .commit files instead of absolute paths

Clean up code

Removed commented out code

Fixed merge conflict with master
This commit is contained in:
gekath
2017-06-02 11:28:47 -04:00
committed by prazanna
parent 0ed3fac5e3
commit db7311f85e
12 changed files with 92 additions and 34 deletions

View File

@@ -210,9 +210,9 @@ public class HoodieReadClient implements Serializable {
HoodieCommitMetadata metadata = HoodieCommitMetadata metadata =
HoodieCommitMetadata.fromBytes(commitTimeline.getInstantDetails(commit).get()); HoodieCommitMetadata.fromBytes(commitTimeline.getInstantDetails(commit).get());
// get files from each commit, and replace any previous versions // get files from each commit, and replace any previous versions
fileIdToFullPath.putAll(metadata.getFileIdAndFullPaths()); String basePath = hoodieTable.getMetaClient().getBasePath();
fileIdToFullPath.putAll(metadata.getFileIdAndFullPaths(basePath));
} }
return sqlContextOpt.get().read() return sqlContextOpt.get().read()
.parquet(fileIdToFullPath.values().toArray(new String[fileIdToFullPath.size()])) .parquet(fileIdToFullPath.values().toArray(new String[fileIdToFullPath.size()]))
.filter(String.format("%s >'%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTimestamp)); .filter(String.format("%s >'%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTimestamp));
@@ -234,11 +234,12 @@ public class HoodieReadClient implements Serializable {
} }
try { try {
HoodieCommitMetadata commitMetdata = HoodieCommitMetadata commitMetadata =
HoodieCommitMetadata.fromBytes(commitTimeline.getInstantDetails(commitInstant).get()); HoodieCommitMetadata.fromBytes(commitTimeline.getInstantDetails(commitInstant).get());
Collection<String> paths = commitMetdata.getFileIdAndFullPaths().values(); String basePath = hoodieTable.getMetaClient().getBasePath();
HashMap<String, String> paths = commitMetadata.getFileIdAndFullPaths(basePath);
return sqlContextOpt.get().read() return sqlContextOpt.get().read()
.parquet(paths.toArray(new String[paths.size()])) .parquet(paths.values().toArray(new String[paths.size()]))
.filter(String.format("%s ='%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitTime)); .filter(String.format("%s ='%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitTime));
} catch (Exception e) { } catch (Exception e) {
throw new HoodieException("Error reading commit " + commitTime, e); throw new HoodieException("Error reading commit " + commitTime, e);

View File

@@ -80,6 +80,8 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOH
fileSystemView.getLatestDataFilesForFileId(record.getPartitionPath(), fileId) fileSystemView.getLatestDataFilesForFileId(record.getPartitionPath(), fileId)
.findFirst().get().getFileName(); .findFirst().get().getFileName();
String baseCommitTime = FSUtils.getCommitTime(latestValidFilePath); String baseCommitTime = FSUtils.getCommitTime(latestValidFilePath);
Path path = new Path(record.getPartitionPath(),
FSUtils.makeDataFileName(commitTime, TaskContext.getPartitionId(), fileId));
writeStatus.getStat().setPrevCommit(baseCommitTime); writeStatus.getStat().setPrevCommit(baseCommitTime);
writeStatus.setFileId(fileId); writeStatus.setFileId(fileId);
writeStatus.setPartitionPath(record.getPartitionPath()); writeStatus.setPartitionPath(record.getPartitionPath());
@@ -103,7 +105,7 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOH
+ " on commit " + commitTime + " on HDFS path " + hoodieTable + " on commit " + commitTime + " on HDFS path " + hoodieTable
.getMetaClient().getBasePath() + partitionPath, e); .getMetaClient().getBasePath() + partitionPath, e);
} }
writeStatus.getStat().setFullPath(currentLogFile.getPath().toString()); writeStatus.getStat().setPath(path.toString());
} }
// update the new location of the record, so we know where to find it next // update the new location of the record, so we know where to find it next
record.setNewLocation(new HoodieRecordLocation(commitTime, fileId)); record.setNewLocation(new HoodieRecordLocation(commitTime, fileId));

View File

@@ -123,12 +123,14 @@ public class HoodieInsertHandle<T extends HoodieRecordPayload> extends HoodieIOH
try { try {
storageWriter.close(); storageWriter.close();
String relativePath = path.toString().replace(new Path(config.getBasePath()) + "/", "");
HoodieWriteStat stat = new HoodieWriteStat(); HoodieWriteStat stat = new HoodieWriteStat();
stat.setNumWrites(recordsWritten); stat.setNumWrites(recordsWritten);
stat.setNumDeletes(recordsDeleted); stat.setNumDeletes(recordsDeleted);
stat.setPrevCommit(HoodieWriteStat.NULL_COMMIT); stat.setPrevCommit(HoodieWriteStat.NULL_COMMIT);
stat.setFileId(status.getFileId()); stat.setFileId(status.getFileId());
stat.setFullPath(path.toString()); stat.setPath(relativePath);
stat.setTotalWriteBytes(FSUtils.getFileSize(fs, path)); stat.setTotalWriteBytes(FSUtils.getFileSize(fs, path));
stat.setTotalWriteErrors(status.getFailedRecords().size()); stat.setTotalWriteErrors(status.getFailedRecords().size());
status.setStat(stat); status.setStat(stat);

View File

@@ -93,9 +93,9 @@ public class HoodieUpdateHandle <T extends HoodieRecordPayload> extends HoodieIO
oldFilePath = new Path( oldFilePath = new Path(
config.getBasePath() + "/" + record.getPartitionPath() + "/" config.getBasePath() + "/" + record.getPartitionPath() + "/"
+ latestValidFilePath); + latestValidFilePath);
newFilePath = new Path( String relativePath = new Path( record.getPartitionPath() + "/" + FSUtils
config.getBasePath() + "/" + record.getPartitionPath() + "/" + FSUtils .makeDataFileName(commitTime, TaskContext.getPartitionId(), fileId)).toString();
.makeDataFileName(commitTime, TaskContext.getPartitionId(), fileId)); newFilePath = new Path(config.getBasePath(), relativePath);
// handle cases of partial failures, for update task // handle cases of partial failures, for update task
if (fs.exists(newFilePath)) { if (fs.exists(newFilePath)) {
@@ -108,7 +108,7 @@ public class HoodieUpdateHandle <T extends HoodieRecordPayload> extends HoodieIO
writeStatus.setFileId(fileId); writeStatus.setFileId(fileId);
writeStatus.setPartitionPath(record.getPartitionPath()); writeStatus.setPartitionPath(record.getPartitionPath());
writeStatus.getStat().setFileId(fileId); writeStatus.getStat().setFileId(fileId);
writeStatus.getStat().setFullPath(newFilePath.toString()); writeStatus.getStat().setPath(relativePath);
} }
keyToNewRecords.put(record.getRecordKey(), record); keyToNewRecords.put(record.getRecordKey(), record);
// update the new location of the record, so we know where to find it next // update the new location of the record, so we know where to find it next

View File

@@ -114,6 +114,7 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
for (CompactionWriteStat stat : updateStatusMap) { for (CompactionWriteStat stat : updateStatusMap) {
metadata.addWriteStat(stat.getPartitionPath(), stat); metadata.addWriteStat(stat.getPartitionPath(), stat);
} }
log.info("Compaction finished with result " + metadata); log.info("Compaction finished with result " + metadata);
//noinspection ConstantConditions //noinspection ConstantConditions

View File

@@ -45,6 +45,7 @@ import com.uber.hoodie.table.HoodieTable;
import java.util.Map; import java.util.Map;
import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.GenericRecord;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
@@ -60,6 +61,7 @@ import org.junit.Test;
import org.junit.rules.TemporaryFolder; import org.junit.rules.TemporaryFolder;
import java.io.File; import java.io.File;
import java.io.FileInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.Serializable; import java.io.Serializable;
import java.util.ArrayList; import java.util.ArrayList;
@@ -101,7 +103,6 @@ public class TestHoodieClient implements Serializable {
folder.create(); folder.create();
basePath = folder.getRoot().getAbsolutePath(); basePath = folder.getRoot().getAbsolutePath();
HoodieTestUtils.init(basePath); HoodieTestUtils.init(basePath);
dataGen = new HoodieTestDataGenerator(); dataGen = new HoodieTestDataGenerator();
} }
@@ -616,7 +617,7 @@ public class TestHoodieClient implements Serializable {
if (!fileIdToVersions.containsKey(wstat.getFileId())) { if (!fileIdToVersions.containsKey(wstat.getFileId())) {
fileIdToVersions.put(wstat.getFileId(), new TreeSet<>()); fileIdToVersions.put(wstat.getFileId(), new TreeSet<>());
} }
fileIdToVersions.get(wstat.getFileId()).add(FSUtils.getCommitTime(new Path(wstat.getFullPath()).getName())); fileIdToVersions.get(wstat.getFileId()).add(FSUtils.getCommitTime(new Path(wstat.getPath()).getName()));
} }
} }
@@ -1136,7 +1137,6 @@ public class TestHoodieClient implements Serializable {
List<HoodieCleanStat> hoodieCleanStatsFour = table.clean(jsc); List<HoodieCleanStat> hoodieCleanStatsFour = table.clean(jsc);
assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsFour, partitionPaths[0]).getSuccessDeleteFiles().size()); assertEquals("Must not clean any files" , 0, getCleanStat(hoodieCleanStatsFour, partitionPaths[0]).getSuccessDeleteFiles().size());
assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "002", file3P0C2)); assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "002", file3P0C2));
}
@Test @Test
public void testKeepLatestCommits() throws IOException { public void testKeepLatestCommits() throws IOException {
@@ -1298,6 +1298,47 @@ public class TestHoodieClient implements Serializable {
stageOneShuffleReadTaskRecordsCountMap.values().stream().filter(a -> a > 10 && a < 100).count() == 3); stageOneShuffleReadTaskRecordsCountMap.values().stream().filter(a -> a > 10 && a < 100).count() == 3);
} }
public void testCommitWritesRelativePaths() throws Exception {
HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).build();
HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
FileSystem fs = FSUtils.getFs();
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
HoodieTable table = HoodieTable.getHoodieTable(metaClient, cfg);
String commitTime = "000";
List<HoodieRecord> records = dataGen.generateInserts(commitTime, 200);
JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
JavaRDD<WriteStatus> result = client.bulkInsert(writeRecords, commitTime);
assertTrue("Commit should succeed", client.commit(commitTime, result));
assertTrue("After explicit commit, commit file should be created",
HoodieTestUtils.doesCommitExist(basePath, commitTime));
// Get parquet file paths from commit metadata
String actionType = table.getCompactedCommitActionType();
HoodieInstant commitInstant =
new HoodieInstant(false, actionType, commitTime);
HoodieTimeline commitTimeline = table.getCompletedCompactionCommitTimeline();
HoodieCommitMetadata commitMetadata =
HoodieCommitMetadata.fromBytes(commitTimeline.getInstantDetails(commitInstant).get());
String basePath = table.getMetaClient().getBasePath();
Collection<String> commitPathNames = commitMetadata.getFileIdAndFullPaths(basePath).values();
// Read from commit file
String filename = HoodieTestUtils.getCommitFilePath(basePath, commitTime);
FileInputStream inputStream = new FileInputStream(filename);
String everything = IOUtils.toString(inputStream);
HoodieCommitMetadata metadata = HoodieCommitMetadata.fromJsonString(everything.toString());
HashMap<String, String> paths = metadata.getFileIdAndFullPaths(basePath);
inputStream.close();
// Compare values in both to make sure they are equal.
for (String pathName: paths.values()) {
assertTrue(commitPathNames.contains(pathName));
}
private HoodieCleanStat getCleanStat(List<HoodieCleanStat> hoodieCleanStatsTwo, private HoodieCleanStat getCleanStat(List<HoodieCleanStat> hoodieCleanStatsTwo,
String partitionPath) { String partitionPath) {
return hoodieCleanStatsTwo.stream() return hoodieCleanStatsTwo.stream()

View File

@@ -129,8 +129,9 @@ public class TestHoodieCompactor {
HoodieCompactionMetadata result = HoodieCompactionMetadata result =
compactor.compact(jsc, getConfig(), table); compactor.compact(jsc, getConfig(), table);
String basePath = table.getMetaClient().getBasePath();
assertTrue("If there is nothing to compact, result will be empty", assertTrue("If there is nothing to compact, result will be empty",
result.getFileIdAndFullPaths().isEmpty()); result.getFileIdAndFullPaths(basePath).isEmpty());
} }
@Test @Test

View File

@@ -19,6 +19,7 @@ package com.uber.hoodie.common.model;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties; import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.DeserializationFeature;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager; import org.apache.log4j.LogManager;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.codehaus.jackson.annotate.JsonAutoDetect; import org.codehaus.jackson.annotate.JsonAutoDetect;
@@ -72,17 +73,25 @@ public class HoodieCommitMetadata implements Serializable {
return extraMetadataMap.get(metaKey); return extraMetadataMap.get(metaKey);
} }
public HashMap<String, String> getFileIdAndFullPaths() { public HashMap<String, String> getFileIdAndRelativePaths() {
HashMap<String, String> filePaths = new HashMap<>(); HashMap<String, String> filePaths = new HashMap<>();
// list all partitions paths // list all partitions paths
for (Map.Entry<String, List<HoodieWriteStat>> entry: getPartitionToWriteStats().entrySet()) { for (Map.Entry<String, List<HoodieWriteStat>> entry: getPartitionToWriteStats().entrySet()) {
for (HoodieWriteStat stat: entry.getValue()) { for (HoodieWriteStat stat: entry.getValue()) {
filePaths.put(stat.getFileId(), stat.getFullPath()); filePaths.put(stat.getFileId(), stat.getPath());
} }
} }
return filePaths; return filePaths;
} }
/**
 * Resolves the relative path recorded for every file id against {@code basePath}.
 *
 * @param basePath base path that each recorded relative path is resolved against
 * @return map from file id to the resolved full path; a file id whose recorded path is
 *         {@code null} maps to {@code null}
 */
public HashMap<String, String> getFileIdAndFullPaths(String basePath) {
  HashMap<String, String> fullPaths = new HashMap<>();
  for (Map.Entry<String, String> entry : getFileIdAndRelativePaths().entrySet()) {
    String relativePath = entry.getValue();
    // Path rejects a null child, so a write stat with no path recorded is passed through
    // as null instead of failing the lookup for every other file id.
    String fullPath = (relativePath == null) ? null : new Path(basePath, relativePath).toString();
    fullPaths.put(entry.getKey(), fullPath);
  }
  return fullPaths;
}
public String toJsonString() throws IOException { public String toJsonString() throws IOException {
if(partitionToWriteStats.containsKey(null)) { if(partitionToWriteStats.containsKey(null)) {

View File

@@ -35,9 +35,9 @@ public class HoodieWriteStat implements Serializable {
private String fileId; private String fileId;
/** /**
* Full path to the file on underlying file system * Relative path to the file from the base path
*/ */
private String fullPath; private String path;
/** /**
* The previous version of the file. (null if this is the first version. i.e insert) * The previous version of the file. (null if this is the first version. i.e insert)
@@ -79,9 +79,7 @@ public class HoodieWriteStat implements Serializable {
this.fileId = fileId; this.fileId = fileId;
} }
public void setFullPath(String fullFilePath) { public void setPath(String path) { this.path = path; }
this.fullPath = fullFilePath;
}
public void setPrevCommit(String prevCommit) { public void setPrevCommit(String prevCommit) {
this.prevCommit = prevCommit; this.prevCommit = prevCommit;
@@ -131,15 +129,14 @@ public class HoodieWriteStat implements Serializable {
return fileId; return fileId;
} }
public String getFullPath() { public String getPath() { return path; }
return fullPath;
}
@Override @Override
public String toString() { public String toString() {
return new StringBuilder() return new StringBuilder()
.append("HoodieWriteStat {") .append("HoodieWriteStat {")
.append("fullPath='" + fullPath + '\'') .append("path=" + path)
.append(", prevCommit='" + prevCommit + '\'') .append(", prevCommit='" + prevCommit + '\'')
.append(", numWrites=" + numWrites) .append(", numWrites=" + numWrites)
.append(", numDeletes=" + numDeletes) .append(", numDeletes=" + numDeletes)
@@ -157,7 +154,7 @@ public class HoodieWriteStat implements Serializable {
return false; return false;
HoodieWriteStat that = (HoodieWriteStat) o; HoodieWriteStat that = (HoodieWriteStat) o;
if (!fullPath.equals(that.fullPath)) if (!path.equals(that.path))
return false; return false;
return prevCommit.equals(that.prevCommit); return prevCommit.equals(that.prevCommit);
@@ -165,7 +162,7 @@ public class HoodieWriteStat implements Serializable {
@Override @Override
public int hashCode() { public int hashCode() {
int result = fullPath.hashCode(); int result = path.hashCode();
result = 31 * result + prevCommit.hashCode(); result = 31 * result + prevCommit.hashCode();
return result; return result;
} }

View File

@@ -116,6 +116,10 @@ public class HoodieTestUtils {
return basePath + "/" + partitionPath + "/" + FSUtils.makeDataFileName(commitTime, DEFAULT_TASK_PARTITIONID, fileID); return basePath + "/" + partitionPath + "/" + FSUtils.makeDataFileName(commitTime, DEFAULT_TASK_PARTITIONID, fileID);
} }
/**
 * Builds the location of the commit file for {@code commitTime} inside the meta folder
 * under {@code basePath}.
 *
 * @param basePath   base path that contains the meta folder
 * @param commitTime commit time used as the commit file's name
 * @return {@code basePath}, the meta folder name, and {@code commitTime} followed by the
 *         commit extension, joined with "/"
 * @throws IOException declared in the signature; this body performs no I/O itself
 */
public static final String getCommitFilePath(String basePath, String commitTime) throws IOException {
  String commitFileName = commitTime + HoodieTimeline.COMMIT_EXTENSION;
  String metaFolder = basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME;
  return metaFolder + "/" + commitFileName;
}
public static final boolean doesDataFileExist(String basePath, String partitionPath, String commitTime, String fileID) throws IOException { public static final boolean doesDataFileExist(String basePath, String partitionPath, String commitTime, String fileID) throws IOException {
return new File(getDataFilePath(basePath, partitionPath, commitTime, fileID)).exists(); return new File(getDataFilePath(basePath, partitionPath, commitTime, fileID)).exists();
} }

View File

@@ -313,7 +313,7 @@ public class HoodieHiveClient {
.orElseThrow(() -> new InvalidDatasetException(syncConfig.basePath)); .orElseThrow(() -> new InvalidDatasetException(syncConfig.basePath));
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
.fromBytes(activeTimeline.getInstantDetails(lastCommit).get()); .fromBytes(activeTimeline.getInstantDetails(lastCommit).get());
String filePath = commitMetadata.getFileIdAndFullPaths().values().stream().findAny() String filePath = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny()
.orElseThrow(() -> new IllegalArgumentException( .orElseThrow(() -> new IllegalArgumentException(
"Could not find any data file written for commit " + lastCommit "Could not find any data file written for commit " + lastCommit
+ ", could not get schema for dataset " + metaClient.getBasePath())); + ", could not get schema for dataset " + metaClient.getBasePath()));
@@ -340,7 +340,7 @@ public class HoodieHiveClient {
// read from the log file wrote // read from the log file wrote
commitMetadata = HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
.fromBytes(activeTimeline.getInstantDetails(lastDeltaCommit).get()); .fromBytes(activeTimeline.getInstantDetails(lastDeltaCommit).get());
filePath = commitMetadata.getFileIdAndFullPaths().values().stream().filter(s -> s.contains( filePath = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().filter(s -> s.contains(
HoodieLogFile.DELTA_EXTENSION)).findAny() HoodieLogFile.DELTA_EXTENSION)).findAny()
.orElseThrow(() -> new IllegalArgumentException( .orElseThrow(() -> new IllegalArgumentException(
"Could not find any data file written for commit " + lastDeltaCommit "Could not find any data file written for commit " + lastDeltaCommit
@@ -377,7 +377,7 @@ public class HoodieHiveClient {
// Read from the compacted file wrote // Read from the compacted file wrote
HoodieCompactionMetadata compactionMetadata = HoodieCompactionMetadata HoodieCompactionMetadata compactionMetadata = HoodieCompactionMetadata
.fromBytes(activeTimeline.getInstantDetails(lastCompactionCommit).get()); .fromBytes(activeTimeline.getInstantDetails(lastCompactionCommit).get());
String filePath = compactionMetadata.getFileIdAndFullPaths().values().stream().findAny() String filePath = compactionMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny()
.orElseThrow(() -> new IllegalArgumentException( .orElseThrow(() -> new IllegalArgumentException(
"Could not find any data file written for compaction " + lastCompactionCommit "Could not find any data file written for compaction " + lastCompactionCommit
+ ", could not get schema for dataset " + metaClient.getBasePath())); + ", could not get schema for dataset " + metaClient.getBasePath()));

View File

@@ -217,12 +217,12 @@ public class TestUtil {
for (Entry<String, List<HoodieWriteStat>> wEntry : partitionWriteStats.entrySet()) { for (Entry<String, List<HoodieWriteStat>> wEntry : partitionWriteStats.entrySet()) {
String partitionPath = wEntry.getKey(); String partitionPath = wEntry.getKey();
for (HoodieWriteStat wStat : wEntry.getValue()) { for (HoodieWriteStat wStat : wEntry.getValue()) {
Path path = new Path(wStat.getFullPath()); Path path = new Path(wStat.getPath());
HoodieDataFile dataFile = new HoodieDataFile(fileSystem.getFileStatus(path)); HoodieDataFile dataFile = new HoodieDataFile(fileSystem.getFileStatus(path));
HoodieLogFile logFile = generateLogData(path, isLogSchemaSimple); HoodieLogFile logFile = generateLogData(path, isLogSchemaSimple);
HoodieDeltaWriteStat writeStat = new HoodieDeltaWriteStat(); HoodieDeltaWriteStat writeStat = new HoodieDeltaWriteStat();
writeStat.setFileId(dataFile.getFileId()); writeStat.setFileId(dataFile.getFileId());
writeStat.setFullPath(logFile.getPath().toString()); writeStat.setPath(logFile.getPath().toString());
commitMetadata.addWriteStat(partitionPath, writeStat); commitMetadata.addWriteStat(partitionPath, writeStat);
} }
} }
@@ -258,7 +258,7 @@ public class TestUtil {
generateParquetData(filePath, isParquetSchemaSimple); generateParquetData(filePath, isParquetSchemaSimple);
HoodieWriteStat writeStat = new HoodieWriteStat(); HoodieWriteStat writeStat = new HoodieWriteStat();
writeStat.setFileId(fileId); writeStat.setFileId(fileId);
writeStat.setFullPath(filePath.toString()); writeStat.setPath(filePath.toString());
writeStats.add(writeStat); writeStats.add(writeStat);
} }
return writeStats; return writeStats;