1
0

Refactoring HoodieTableFileSystemView using FileGroups/FileSlices

- Merged all filter* and get* methods
- New constructor takes a FileStatus[]
- All existing tests pass
- A FileGroup is the set of all files that belong to one fileId within a partition
- A FileSlice is one generation of data and log files, starting at a base commit
This commit is contained in:
Vinoth Chandar
2017-06-19 00:34:37 -07:00
committed by prazanna
parent 23e7badd8a
commit c00f1a9ed9
42 changed files with 810 additions and 408 deletions

View File

@@ -16,21 +16,21 @@
package com.uber.hoodie.common.model;
import com.google.common.collect.Lists;
import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.io.Input;
import com.esotericsoftware.kryo.io.Output;
import com.esotericsoftware.kryo.serializers.JavaSerializer;
import com.google.common.collect.Lists;
import com.uber.hoodie.common.table.HoodieTableConfig;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.log.HoodieLogFile;
import com.uber.hoodie.common.table.log.HoodieLogFormat;
import com.uber.hoodie.common.table.log.HoodieLogFormat.Writer;
import com.uber.hoodie.common.table.log.block.HoodieAvroDataBlock;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.HoodieAvroUtils;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;

View File

@@ -17,6 +17,7 @@
package com.uber.hoodie.common.table.log;
import com.uber.hoodie.common.minicluster.MiniClusterUtil;
import com.uber.hoodie.common.model.HoodieLogFile;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.table.log.HoodieLogFormat.Reader;
import com.uber.hoodie.common.table.log.HoodieLogFormat.Writer;

View File

@@ -19,6 +19,7 @@ package com.uber.hoodie.common.table.view;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodieFileGroup;
import com.uber.hoodie.common.model.HoodieTestUtils;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
@@ -43,7 +44,7 @@ import java.util.stream.Collectors;
import static org.junit.Assert.*;
@SuppressWarnings("ResultOfMethodCallIgnored")
public class ReadOptimizedTableViewTest {
public class HoodieTableFileSystemViewTest {
private HoodieTableMetaClient metaClient;
private String basePath;
private TableFileSystemView fsView;
@@ -58,10 +59,16 @@ public class ReadOptimizedTableViewTest {
metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants());
}
private void refreshFsView() {
private void refreshFsView(FileStatus[] statuses) {
metaClient = new HoodieTableMetaClient(HoodieTestUtils.fs, basePath, true);
fsView = new HoodieTableFileSystemView(metaClient,
metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants());
if (statuses != null) {
fsView = new HoodieTableFileSystemView(metaClient,
metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(),
statuses);
} else {
fsView = new HoodieTableFileSystemView(metaClient,
metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants());
}
}
@Test
@@ -71,42 +78,51 @@ public class ReadOptimizedTableViewTest {
String fileId = UUID.randomUUID().toString();
assertFalse("No commit, should not find any data file",
fsView.getLatestDataFilesForFileId(partitionPath, fileId).findFirst().isPresent());
fsView.getLatestDataFiles(partitionPath)
.filter(dfile -> dfile.getFileId().equals(fileId)).findFirst().isPresent());
// Only one commit, but is not safe
String commitTime1 = "1";
String fileName1 = FSUtils.makeDataFileName(commitTime1, 1, fileId);
new File(basePath + "/" + partitionPath + "/" + fileName1).createNewFile();
refreshFsView();
refreshFsView(null);
assertFalse("No commit, should not find any data file",
fsView.getLatestDataFilesForFileId(partitionPath, fileId).findFirst().isPresent());
fsView.getLatestDataFiles(partitionPath)
.filter(dfile -> dfile.getFileId().equals(fileId))
.findFirst().isPresent());
// Make this commit safe
HoodieActiveTimeline commitTimeline = metaClient.getActiveTimeline();
HoodieInstant instant1 =
new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, commitTime1);
commitTimeline.saveAsComplete(instant1, Optional.empty());
refreshFsView();
assertEquals("", fileName1,
fsView.getLatestDataFilesForFileId(partitionPath, fileId).findFirst().get()
refreshFsView(null);
assertEquals("", fileName1, fsView
.getLatestDataFiles(partitionPath)
.filter(dfile -> dfile.getFileId().equals(fileId))
.findFirst().get()
.getFileName());
// Do another commit, but not safe
String commitTime2 = "2";
String fileName2 = FSUtils.makeDataFileName(commitTime2, 1, fileId);
new File(basePath + "/" + partitionPath + "/" + fileName2).createNewFile();
refreshFsView();
assertEquals("", fileName1,
fsView.getLatestDataFilesForFileId(partitionPath, fileId).findFirst().get()
refreshFsView(null);
assertEquals("", fileName1, fsView
.getLatestDataFiles(partitionPath)
.filter(dfile -> dfile.getFileId().equals(fileId))
.findFirst().get()
.getFileName());
// Make it safe
HoodieInstant instant2 =
new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, commitTime2);
commitTimeline.saveAsComplete(instant2, Optional.empty());
refreshFsView();
assertEquals("", fileName2,
fsView.getLatestDataFilesForFileId(partitionPath, fileId).findFirst().get()
refreshFsView(null);
assertEquals("", fileName2, fsView
.getLatestDataFiles(partitionPath)
.filter(dfile -> dfile.getFileId().equals(fileId))
.findFirst().get()
.getFileName());
}
@@ -147,13 +163,13 @@ public class ReadOptimizedTableViewTest {
FileStatus[] statuses = HoodieTestUtils.fs.listStatus(new Path(fullPartitionPath));
assertEquals(statuses.length, 7);
refreshFsView();
List<HoodieDataFile> statuses1 =
fsView.getLatestVersionInPartition("2016/05/01", commitTime4)
refreshFsView(null);
List<HoodieDataFile> dataFileList =
fsView.getLatestDataFilesBeforeOrOn("2016/05/01", commitTime4)
.collect(Collectors.toList());
assertEquals(statuses1.size(), 3);
assertEquals(dataFileList.size(), 3);
Set<String> filenames = Sets.newHashSet();
for (HoodieDataFile status : statuses1) {
for (HoodieDataFile status : dataFileList) {
filenames.add(status.getFileName());
}
assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime4, 1, fileId1)));
@@ -162,7 +178,7 @@ public class ReadOptimizedTableViewTest {
// Reset the max commit time
List<HoodieDataFile> statuses2 =
fsView.getLatestVersionInPartition("2016/05/01", commitTime3)
fsView.getLatestDataFilesBeforeOrOn("2016/05/01", commitTime3)
.collect(Collectors.toList());
assertEquals(statuses2.size(), 3);
filenames = Sets.newHashSet();
@@ -211,18 +227,18 @@ public class ReadOptimizedTableViewTest {
FileStatus[] statuses = HoodieTestUtils.fs.listStatus(new Path(fullPartitionPath));
assertEquals(statuses.length, 7);
refreshFsView();
List<List<HoodieDataFile>> statuses1 =
fsView.getEveryVersionInPartition("2016/05/01").collect(Collectors.toList());
assertEquals(statuses1.size(), 3);
refreshFsView(null);
List<HoodieFileGroup> fileGroups =
fsView.getAllFileGroups("2016/05/01").collect(Collectors.toList());
assertEquals(fileGroups.size(), 3);
for (List<HoodieDataFile> status : statuses1) {
String fileId = status.get(0).getFileId();
for (HoodieFileGroup fileGroup : fileGroups) {
String fileId = fileGroup.getId();
Set<String> filenames = Sets.newHashSet();
for (HoodieDataFile dataFile : status) {
fileGroup.getAllDataFiles().forEach(dataFile -> {
assertEquals("All same fileId should be grouped", fileId, dataFile.getFileId());
filenames.add(dataFile.getFileName());
}
});
if (fileId.equals(fileId1)) {
assertEquals(filenames,
Sets.newHashSet(FSUtils.makeDataFileName(commitTime1, 1, fileId1),
@@ -277,9 +293,9 @@ public class ReadOptimizedTableViewTest {
FileStatus[] statuses = HoodieTestUtils.fs.listStatus(new Path(fullPartitionPath));
assertEquals(statuses.length, 7);
refreshFsView();
refreshFsView(statuses);
List<HoodieDataFile> statuses1 = fsView
.getLatestVersionInRange(statuses, Lists.newArrayList(commitTime2, commitTime3))
.getLatestDataFilesInRange(Lists.newArrayList(commitTime2, commitTime3))
.collect(Collectors.toList());
assertEquals(statuses1.size(), 2);
Set<String> filenames = Sets.newHashSet();
@@ -293,7 +309,8 @@ public class ReadOptimizedTableViewTest {
@Test
public void streamLatestVersionsBefore() throws IOException {
// Put some files in the partition
String fullPartitionPath = basePath + "/2016/05/01/";
String partitionPath = "2016/05/01/";
String fullPartitionPath = basePath + "/" + partitionPath;
new File(fullPartitionPath).mkdirs();
String commitTime1 = "1";
String commitTime2 = "2";
@@ -327,9 +344,9 @@ public class ReadOptimizedTableViewTest {
FileStatus[] statuses = HoodieTestUtils.fs.listStatus(new Path(fullPartitionPath));
assertEquals(statuses.length, 7);
refreshFsView();
refreshFsView(null);
List<HoodieDataFile> statuses1 =
fsView.getLatestVersionsBeforeOrOn(statuses, commitTime2)
fsView.getLatestDataFilesBeforeOrOn(partitionPath, commitTime2)
.collect(Collectors.toList());
assertEquals(statuses1.size(), 2);
Set<String> filenames = Sets.newHashSet();
@@ -344,7 +361,8 @@ public class ReadOptimizedTableViewTest {
@Test
public void streamLatestVersions() throws IOException {
// Put some files in the partition
String fullPartitionPath = basePath + "/2016/05/01/";
String partitionPath = "2016/05/01/";
String fullPartitionPath = basePath + "/" + partitionPath;
new File(fullPartitionPath).mkdirs();
String commitTime1 = "1";
String commitTime2 = "2";
@@ -378,9 +396,9 @@ public class ReadOptimizedTableViewTest {
FileStatus[] statuses = HoodieTestUtils.fs.listStatus(new Path(fullPartitionPath));
assertEquals(statuses.length, 7);
refreshFsView();
refreshFsView(statuses);
List<HoodieDataFile> statuses1 =
fsView.getLatestVersions(statuses).collect(Collectors.toList());
fsView.getLatestDataFiles().collect(Collectors.toList());
assertEquals(statuses1.size(), 3);
Set<String> filenames = Sets.newHashSet();
for (HoodieDataFile status : statuses1) {