1
0

Implement Merge on Read Storage (#76)

1. Create HoodieTable abstraction for commits and fileSystemView
2. Create HoodieMergeOnReadTable
3. The view is now always obtained from the table, and the correct view for the table type is returned
This commit is contained in:
prazanna
2017-02-21 15:24:00 -08:00
committed by Prasanna Rajaperumal
parent 11d2fd3428
commit eb46e7c72b
47 changed files with 1113 additions and 421 deletions

View File

@@ -20,7 +20,7 @@ import java.util.stream.Collectors
import com.uber.hoodie.common.model.{HoodieDataFile, HoodieRecord}
import com.uber.hoodie.common.table.HoodieTableMetaClient
import com.uber.hoodie.common.table.view.ReadOptimizedTableView
import com.uber.hoodie.common.table.view.HoodieTableFileSystemView
import com.uber.hoodie.common.util.FSUtils
import com.uber.hoodie.exception.HoodieException
import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}
@@ -75,7 +75,7 @@ class DedupeSparkJob (basePath: String,
val dedupeTblName = s"${tmpTableName}_dupeKeys"
val metadata = new HoodieTableMetaClient(fs, basePath)
val fsView = new ReadOptimizedTableView(fs, metadata)
val fsView = new HoodieTableFileSystemView(metadata, metadata.getActiveTimeline.getCommitTimeline.filterCompletedInstants())
val allFiles = fs.listStatus(new org.apache.hadoop.fs.Path(s"${basePath}/${duplicatedPartitionPath}"))
val latestFiles:java.util.List[HoodieDataFile] = fsView.getLatestVersions(allFiles).collect(Collectors.toList[HoodieDataFile]())
@@ -126,7 +126,7 @@ class DedupeSparkJob (basePath: String,
def fixDuplicates(dryRun: Boolean = true) = {
val metadata = new HoodieTableMetaClient(fs, basePath)
val fsView = new ReadOptimizedTableView(fs, metadata)
val fsView = new HoodieTableFileSystemView(metadata, metadata.getActiveTimeline.getCommitTimeline.filterCompletedInstants())
val allFiles = fs.listStatus(new Path(s"${basePath}/${duplicatedPartitionPath}"))
val latestFiles:java.util.List[HoodieDataFile] = fsView.getLatestVersions(allFiles).collect(Collectors.toList[HoodieDataFile]())