1
0

CodeStyle formatting to conform to basic Checkstyle rules.

The code-style rules follow google style with some changes:

1. Increase line length from 100 to 120
2. Disable JavaDoc-related Checkstyle rules, as these need more manual work.

Both source and test code are checked for code-style
This commit is contained in:
Balaji Varadarajan
2018-03-20 16:29:20 -07:00
committed by vinoth chandar
parent 987f5d6b96
commit 788e4f2d2e
200 changed files with 6209 additions and 5975 deletions

View File

@@ -75,23 +75,425 @@ import scala.Tuple2;
/**
* Implementation of a very heavily read-optimized Hoodie Table where
*
* <p>
* INSERTS - Produce new files, block aligned to desired size (or) Merge with the smallest existing
* file, to expand it
*
* <p>
* UPDATES - Produce a new version of the file, just replacing the updated records with new values
*/
public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends HoodieTable<T> {
private static Logger logger = LogManager.getLogger(HoodieCopyOnWriteTable.class);
public HoodieCopyOnWriteTable(HoodieWriteConfig config, HoodieTableMetaClient metaClient) {
super(config, metaClient);
}
private static Logger logger = LogManager.getLogger(HoodieCopyOnWriteTable.class);
/**
 * Builds a Spark function that, for one RDD partition of (partitionPath, deletePath) tuples,
 * deletes each path from the filesystem and folds the outcomes into one
 * {@link PartitionCleanStat} per partition path.
 */
private static PairFlatMapFunction<Iterator<Tuple2<String, String>>, String,
    PartitionCleanStat> deleteFilesFunc(
    HoodieTable table) {
  return (PairFlatMapFunction<Iterator<Tuple2<String, String>>, String, PartitionCleanStat>)
      iter -> {
        Map<String, PartitionCleanStat> partitionCleanStatMap = new HashMap<>();
        FileSystem fs = table.getMetaClient().getFs();
        while (iter.hasNext()) {
          Tuple2<String, String> partitionDelFileTuple = iter.next();
          String partitionPath = partitionDelFileTuple._1();
          String deletePathStr = partitionDelFileTuple._2();
          Boolean deletedFileResult = deleteFileAndGetResult(fs, deletePathStr);
          // Lazily create one stat accumulator per distinct partition path seen.
          if (!partitionCleanStatMap.containsKey(partitionPath)) {
            partitionCleanStatMap.put(partitionPath, new PartitionCleanStat(partitionPath));
          }
          PartitionCleanStat partitionCleanStat = partitionCleanStatMap.get(partitionPath);
          partitionCleanStat.addDeleteFilePatterns(deletePathStr);
          partitionCleanStat.addDeletedFileResult(deletePathStr, deletedFileResult);
        }
        // Emit (partitionPath, stat) pairs for the subsequent reduceByKey merge.
        return partitionCleanStatMap.entrySet().stream()
            .map(e -> new Tuple2<>(e.getKey(), e.getValue()))
            .collect(Collectors.toList()).iterator();
      };
}
/**
 * Builds a Spark function that maps a partition path to the (partitionPath, deletePath) pairs
 * the cleaner policy selects for deletion in that partition.
 */
private static PairFlatMapFunction<String, String, String> getFilesToDeleteFunc(HoodieTable table,
    HoodieWriteConfig config) {
  return (PairFlatMapFunction<String, String, String>) partitionPathToClean -> {
    HoodieCleanHelper cleaner = new HoodieCleanHelper(table, config);
    return cleaner.getDeletePaths(partitionPathToClean).stream()
        .map(deleteFile -> new Tuple2<>(partitionPathToClean, deleteFile.toString())).iterator();
  };
}
/**
 * Deletes a single (non-recursive) path on the given filesystem.
 *
 * @return true when the filesystem reports the delete succeeded
 * @throws IOException if the filesystem delete call fails
 */
private static Boolean deleteFileAndGetResult(FileSystem fs, String deletePathStr)
    throws IOException {
  final Path pathToDelete = new Path(deletePathStr);
  logger.debug("Working on delete path :" + pathToDelete);
  final boolean wasDeleted = fs.delete(pathToDelete, false);
  if (wasDeleted) {
    logger.debug("Cleaned file at path :" + pathToDelete);
  }
  return wasDeleted;
}
/**
 * Returns the partitioner used to bucket upserts; requires a workload profile.
 *
 * @throws HoodieUpsertException if no profile is supplied
 */
@Override
public Partitioner getUpsertPartitioner(WorkloadProfile profile) {
  if (profile == null) {
    throw new HoodieUpsertException("Need workload profile to construct the upsert partitioner.");
  }
  return new UpsertPartitioner(profile);
}
/** Inserts reuse the upsert partitioner for this table type. */
@Override
public Partitioner getInsertPartitioner(WorkloadProfile profile) {
  return getUpsertPartitioner(profile);
}
/** Copy-on-write always needs a workload profile (see {@link #getUpsertPartitioner}). */
@Override
public boolean isWorkloadProfileNeeded() {
  return true;
}
/** Compaction does not apply to copy-on-write tables; always throws. */
@Override
public JavaRDD<WriteStatus> compact(JavaSparkContext jsc, String commitTime) {
  throw new HoodieNotSupportedException("Compaction is not supported from a CopyOnWrite table");
}
/**
 * Handles updates arriving as a record iterator: merges them into a new version of the file
 * identified by {@code fileLoc}.
 */
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileLoc,
    Iterator<HoodieRecord<T>> recordItr) throws IOException {
  // these are updates
  HoodieMergeHandle upsertHandle = getUpdateHandle(commitTime, fileLoc, recordItr);
  return handleUpdateInternal(upsertHandle, commitTime, fileLoc);
}
/**
 * Handles updates arriving as a key-to-record map: merges them into a new version of the file
 * identified by {@code fileLoc}.
 */
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileLoc,
    Map<String, HoodieRecord<T>> keyToNewRecords) throws IOException {
  // these are updates
  HoodieMergeHandle upsertHandle = getUpdateHandle(commitTime, fileLoc, keyToNewRecords);
  return handleUpdateInternal(upsertHandle, commitTime, fileLoc);
}
/**
 * Streams every record of the old parquet file through the merge handle, producing a new file
 * version that replaces updated records with their new values.
 *
 * <p>Fix: the original closed the reader and the merge handle in one finally block, so an
 * exception from {@code reader.close()} would leak the merge handle. The reader is now managed
 * by try-with-resources and the handle is closed in its own finally.
 *
 * @throws HoodieUpsertException if the old file path is missing or the old file cannot be read
 */
protected Iterator<List<WriteStatus>> handleUpdateInternal(HoodieMergeHandle upsertHandle,
    String commitTime, String fileLoc) throws IOException {
  if (upsertHandle.getOldFilePath() == null) {
    throw new HoodieUpsertException(
        "Error in finding the old file path at commit " + commitTime + " at fileLoc: " + fileLoc);
  } else {
    AvroReadSupport.setAvroReadSchema(getHadoopConf(), upsertHandle.getSchema());
    try (ParquetReader<IndexedRecord> reader =
        AvroParquetReader.builder(upsertHandle.getOldFilePath())
            .withConf(getHadoopConf()).build()) {
      IndexedRecord record;
      while ((record = reader.read()) != null) {
        // Two types of writes here (new record, and old record).
        // We have already caught exceptions while writing new records.
        // But for old records, we should fail if any exception happens.
        upsertHandle.write((GenericRecord) record);
      }
    } catch (IOException e) {
      throw new HoodieUpsertException(
          "Failed to read record from " + upsertHandle.getOldFilePath() + " with new Schema "
              + upsertHandle.getSchema(), e);
    } finally {
      // Always release the handle, even if closing the reader failed.
      upsertHandle.close();
    }
  }
  //TODO(vc): This needs to be revisited
  if (upsertHandle.getWriteStatus().getPartitionPath() == null) {
    logger.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath() + ", "
        + upsertHandle.getWriteStatus());
  }
  return Collections.singletonList(Collections.singletonList(upsertHandle.getWriteStatus()))
      .iterator();
}
/** Factory for the merge handle driving an iterator-based update; overridable by subclasses. */
protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileLoc,
    Iterator<HoodieRecord<T>> recordItr) {
  return new HoodieMergeHandle<>(config, commitTime, this, recordItr, fileLoc);
}
/** Factory for the merge handle driving a map-based update; overridable by subclasses. */
protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileLoc,
    Map<String, HoodieRecord<T>> keyToNewRecords) {
  return new HoodieMergeHandle<>(config, commitTime, this, keyToNewRecords, fileLoc);
}
/** Handles pure inserts by lazily writing the records into new files as they are consumed. */
public Iterator<List<WriteStatus>> handleInsert(String commitTime,
    Iterator<HoodieRecord<T>> recordItr) throws Exception {
  return new LazyInsertIterable<>(recordItr, config, commitTime, this);
}
/**
 * Dispatches one upsert bucket to the insert or update path based on its bucket type.
 * Any failure is logged and rethrown wrapped in a {@link HoodieUpsertException}.
 */
@SuppressWarnings("unchecked")
@Override
public Iterator<List<WriteStatus>> handleUpsertPartition(String commitTime, Integer partition,
    Iterator recordItr, Partitioner partitioner) {
  UpsertPartitioner upsertPartitioner = (UpsertPartitioner) partitioner;
  BucketInfo bucketInfo = upsertPartitioner.getBucketInfo(partition);
  BucketType bucketType = bucketInfo.bucketType;
  try {
    switch (bucketType) {
      case INSERT:
        return handleInsert(commitTime, recordItr);
      case UPDATE:
        return handleUpdate(commitTime, bucketInfo.fileLoc, recordItr);
      default:
        throw new HoodieUpsertException(
            "Unknown bucketType " + bucketType + " for partition :" + partition);
    }
  } catch (Throwable t) {
    String msg = "Error upserting bucketType " + bucketType + " for partition :" + partition;
    logger.error(msg, t);
    throw new HoodieUpsertException(msg, t);
  }
}
/** Insert partitions are processed through the same code path as upsert partitions. */
@Override
public Iterator<List<WriteStatus>> handleInsertPartition(String commitTime, Integer partition,
    Iterator recordItr, Partitioner partitioner) {
  return handleUpsertPartition(commitTime, partition, recordItr, partitioner);
}
/**
 * Performs cleaning of partition paths according to cleaning policy and returns the number of
 * files cleaned. Handles skews in partitions to clean by making files to clean as the unit of
 * task distribution.
 *
 * @throws IllegalArgumentException if unknown cleaning policy is provided
 */
@Override
public List<HoodieCleanStat> clean(JavaSparkContext jsc) {
  try {
    FileSystem fs = getMetaClient().getFs();
    // Candidate partitions come from listing the table's base path.
    List<String> partitionsToClean = FSUtils
        .getAllPartitionPaths(fs, getMetaClient().getBasePath(),
            config.shouldAssumeDatePartitioning());
    logger.info("Partitions to clean up : " + partitionsToClean + ", with policy " + config
        .getCleanerPolicy());
    if (partitionsToClean.isEmpty()) {
      logger.info("Nothing to clean here mom. It is already clean");
      return Collections.emptyList();
    }
    return cleanPartitionPaths(partitionsToClean, jsc);
  } catch (IOException e) {
    throw new HoodieIOException("Failed to clean up after commit", e);
  }
}
/**
 * Common method used for cleaning out parquet files under a partition path during rollback of a
 * set of commits.
 *
 * <p>Deletes every parquet file in the partition whose encoded commit time is in
 * {@code commits}, and reports per-file success.
 *
 * @return map from each targeted file to whether its delete succeeded
 * @throws IOException if listing or deleting fails at the filesystem level
 */
protected Map<FileStatus, Boolean> deleteCleanedFiles(String partitionPath, List<String> commits)
    throws IOException {
  logger.info("Cleaning path " + partitionPath);
  FileSystem fs = getMetaClient().getFs();
  FileStatus[] toBeDeleted = fs
      .listStatus(new Path(config.getBasePath(), partitionPath), path -> {
        // Only parquet data files whose commit time matches one of the rolled-back commits.
        if (!path.toString().contains(".parquet")) {
          return false;
        }
        String fileCommitTime = FSUtils.getCommitTime(path.getName());
        return commits.contains(fileCommitTime);
      });
  // Plain JDK HashMap instead of Guava's Maps.newHashMap() — no behavior change.
  Map<FileStatus, Boolean> results = new HashMap<>();
  for (FileStatus file : toBeDeleted) {
    boolean success = fs.delete(file.getPath(), false);
    results.put(file, success);
    logger.info("Delete file " + file.getPath() + "\t" + success);
  }
  return results;
}
/**
 * Rolls back the given commits: unpublishes them on the timeline, deletes the parquet files they
 * produced (in parallel, one task per partition), cleans temporary data files, and finally
 * removes the inflight markers.
 *
 * @return per-partition rollback statistics
 */
@Override
public List<HoodieRollbackStat> rollback(JavaSparkContext jsc, List<String> commits)
    throws IOException {
  String actionType = this.getCommitActionType();
  HoodieActiveTimeline activeTimeline = this.getActiveTimeline();
  List<String> inflights = this.getInflightCommitTimeline().getInstants()
      .map(HoodieInstant::getTimestamp).collect(Collectors.toList());
  // Atomically unpublish all the commits
  commits.stream().filter(s -> !inflights.contains(s))
      .map(s -> new HoodieInstant(false, actionType, s))
      .forEach(activeTimeline::revertToInflight);
  logger.info("Unpublished " + commits);
  // delete all the data files for all these commits
  logger.info("Clean out all parquet files generated for commits: " + commits);
  List<HoodieRollbackStat> stats = jsc.parallelize(FSUtils
      .getAllPartitionPaths(metaClient.getFs(), getMetaClient().getBasePath(),
          config.shouldAssumeDatePartitioning()))
      .map((Function<String, HoodieRollbackStat>) partitionPath -> {
        // Scan all partitions files with this commit time
        Map<FileStatus, Boolean> results = deleteCleanedFiles(partitionPath, commits);
        return HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
            .withDeletedFileResults(results).build();
      }).collect();
  // clean temporary data files
  cleanTemporaryDataFiles(jsc);
  // Remove the rolled back inflight commits
  commits.stream().map(s -> new HoodieInstant(true, actionType, s))
      .forEach(activeTimeline::deleteInflight);
  logger.info("Deleted inflight commits " + commits);
  return stats;
}
/**
 * Finalize the written data files.
 *
 * <p>When the temp-folder mode is enabled, each data file was written under a temporary path and
 * is renamed here (in parallel) to its final location; temporary files are then cleaned up.
 *
 * <p>Fix: the rename failure path now chains the underlying {@link IOException} as the cause of
 * the thrown {@link HoodieIOException} instead of dropping it.
 *
 * @param writeStatuses List of WriteStatus
 * @return number of files finalized, or empty when temp-folder mode is disabled
 */
@Override
@SuppressWarnings("unchecked")
public Optional<Integer> finalizeWrite(JavaSparkContext jsc, List writeStatuses) {
  if (!config.shouldUseTempFolderForCopyOnWrite()) {
    return Optional.empty();
  }
  // This is to rename each data file from temporary path to its final location
  List<Tuple2<String, Boolean>> results = jsc
      .parallelize(writeStatuses, config.getFinalizeWriteParallelism()).map(writeStatus -> {
        Tuple2<String, HoodieWriteStat> writeStatTuple2 = (Tuple2<String, HoodieWriteStat>)
            writeStatus;
        HoodieWriteStat writeStat = writeStatTuple2._2();
        final FileSystem fs = getMetaClient().getFs();
        final Path finalPath = new Path(config.getBasePath(), writeStat.getPath());
        if (writeStat.getTempPath() != null) {
          final Path tempPath = new Path(config.getBasePath(), writeStat.getTempPath());
          boolean success;
          try {
            logger.info("Renaming temporary file: " + tempPath + " to " + finalPath);
            success = fs.rename(tempPath, finalPath);
          } catch (IOException e) {
            // Keep the root cause attached for diagnosis.
            throw new HoodieIOException(
                "Failed to rename file: " + tempPath + " to " + finalPath, e);
          }
          if (!success) {
            throw new HoodieIOException(
                "Failed to rename file: " + tempPath + " to " + finalPath);
          }
        }
        return new Tuple2<>(writeStat.getPath(), true);
      }).collect();
  // clean temporary data files
  cleanTemporaryDataFiles(jsc);
  return Optional.of(results.size());
}
/**
 * Clean temporary data files that are produced from previous failed commit or retried spark
 * stages.
 *
 * <p>No-op when temp-folder mode is disabled or the folder does not exist. Deletion runs in
 * parallel; any per-file failure aborts with a {@link HoodieIOException}.
 *
 * <p>Fix: the outer IOException is now chained as the cause of the rethrown
 * {@link HoodieIOException} instead of being dropped.
 */
private void cleanTemporaryDataFiles(JavaSparkContext jsc) {
  if (!config.shouldUseTempFolderForCopyOnWrite()) {
    return;
  }
  final FileSystem fs = getMetaClient().getFs();
  final Path temporaryFolder = new Path(config.getBasePath(),
      HoodieTableMetaClient.TEMPFOLDER_NAME);
  try {
    if (!fs.exists(temporaryFolder)) {
      logger.info("Temporary folder does not exist: " + temporaryFolder);
      return;
    }
    List<FileStatus> fileStatusesList = Arrays.asList(fs.listStatus(temporaryFolder));
    List<Tuple2<String, Boolean>> results = jsc
        .parallelize(fileStatusesList, config.getFinalizeWriteParallelism()).map(fileStatus -> {
          // Re-acquire the FileSystem on the executor side.
          FileSystem fs1 = getMetaClient().getFs();
          boolean success = fs1.delete(fileStatus.getPath(), false);
          logger
              .info("Deleting file in temporary folder" + fileStatus.getPath() + "\t" + success);
          return new Tuple2<>(fileStatus.getPath().toString(), success);
        }).collect();
    for (Tuple2<String, Boolean> result : results) {
      if (!result._2()) {
        logger.info("Failed to delete file: " + result._1());
        throw new HoodieIOException("Failed to delete file in temporary folder: " + result._1());
      }
    }
  } catch (IOException e) {
    // Keep the root cause attached for diagnosis.
    throw new HoodieIOException(
        "Failed to clean data files in temporary folder: " + temporaryFolder, e);
  }
}
/**
 * Runs the cleaner over the given partitions: expands each partition into its files to delete,
 * repartitions to even out skew, deletes in parallel, merges per-partition stats, and returns one
 * {@link HoodieCleanStat} per input partition (empty stat if nothing was cleaned there).
 */
private List<HoodieCleanStat> cleanPartitionPaths(List<String> partitionsToClean,
    JavaSparkContext jsc) {
  int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism());
  logger.info("Using cleanerParallelism: " + cleanerParallelism);
  List<Tuple2<String, PartitionCleanStat>> partitionCleanStats = jsc
      .parallelize(partitionsToClean, cleanerParallelism)
      .flatMapToPair(getFilesToDeleteFunc(this, config))
      .repartition(cleanerParallelism) // repartition to remove skews
      .mapPartitionsToPair(deleteFilesFunc(this)).reduceByKey(
          // merge partition level clean stats below
          (Function2<PartitionCleanStat, PartitionCleanStat, PartitionCleanStat>) (e1, e2) -> e1
              .merge(e2)).collect();
  Map<String, PartitionCleanStat> partitionCleanStatsMap = partitionCleanStats.stream()
      .collect(Collectors.toMap(e -> e._1(), e -> e._2()));
  HoodieCleanHelper cleaner = new HoodieCleanHelper(this, config);
  // Return PartitionCleanStat for each partition passed.
  return partitionsToClean.stream().map(partitionPath -> {
    PartitionCleanStat partitionCleanStat =
        (partitionCleanStatsMap.containsKey(partitionPath)) ? partitionCleanStatsMap
            .get(partitionPath) : new PartitionCleanStat(partitionPath);
    return HoodieCleanStat.newBuilder().withPolicy(config.getCleanerPolicy())
        .withPartitionPath(partitionPath)
        .withEarliestCommitRetained(cleaner.getEarliestCommitToRetain())
        .withDeletePathPattern(partitionCleanStat.deletePathPatterns)
        .withSuccessfulDeletes(partitionCleanStat.successDeleteFiles)
        .withFailedDeletes(partitionCleanStat.failedDeleteFiles).build();
  }).collect(Collectors.toList());
}
/**
 * Type of a write bucket: UPDATE rewrites an existing file, INSERT creates a new one.
 * (The diff rendering duplicated the constant list; restored to a valid two-constant enum.)
 */
enum BucketType {
  UPDATE, INSERT
}
/**
 * Serializable per-partition accumulator of cleaner results: the delete patterns attempted and
 * which deletes succeeded or failed. Instances for the same partition can be merged.
 */
private static class PartitionCleanStat implements Serializable {

  private final String partitionPath;
  private final List<String> deletePathPatterns = new ArrayList<>();
  private final List<String> successDeleteFiles = new ArrayList<>();
  private final List<String> failedDeleteFiles = new ArrayList<>();

  private PartitionCleanStat(String partitionPath) {
    this.partitionPath = partitionPath;
  }

  private void addDeletedFileResult(String deletePathStr, Boolean deletedFileResult) {
    // Route the path into the success or failure bucket.
    (deletedFileResult ? successDeleteFiles : failedDeleteFiles).add(deletePathStr);
  }

  private void addDeleteFilePatterns(String deletePathStr) {
    deletePathPatterns.add(deletePathStr);
  }

  /** Folds another stat for the same partition into this one; throws on a partition mismatch. */
  private PartitionCleanStat merge(PartitionCleanStat other) {
    if (!partitionPath.equals(other.partitionPath)) {
      throw new RuntimeException(String
          .format("partitionPath is not a match: (%s, %s)", partitionPath, other.partitionPath));
    }
    deletePathPatterns.addAll(other.deletePathPatterns);
    successDeleteFiles.addAll(other.successDeleteFiles);
    failedDeleteFiles.addAll(other.failedDeleteFiles);
    return this;
  }
}
/**
@@ -150,45 +552,37 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
}
}
/**
* Packs incoming records to be upserted, into buckets (1 bucket = 1 RDD partition)
*/
class UpsertPartitioner extends Partitioner {
/**
* List of all small files to be corrected
*/
List<SmallFile> smallFiles = new ArrayList<SmallFile>();
/**
* Total number of RDD partitions, is determined by total buckets we want to pack the incoming
* workload into
*/
private int totalBuckets = 0;
/**
* Stat for the current workload. Helps in determining total inserts, upserts etc.
*/
private WorkloadStat globalStat;
/**
* Helps decide which bucket an incoming update should go to.
*/
private HashMap<String, Integer> updateLocationToBucket;
/**
* Helps us pack inserts into 1 or more buckets depending on number of incoming records.
*/
private HashMap<String, List<InsertBucket>> partitionPathToInsertBuckets;
/**
* Remembers what type each bucket is for later.
*/
private HashMap<Integer, BucketInfo> bucketInfoMap;
/**
* List of all small files to be corrected
*/
List<SmallFile> smallFiles = new ArrayList<SmallFile>();
UpsertPartitioner(WorkloadProfile profile) {
updateLocationToBucket = new HashMap<>();
partitionPathToInsertBuckets = new HashMap<>();
@@ -198,16 +592,17 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
assignUpdates(profile);
assignInserts(profile);
logger.info("Total Buckets :" + totalBuckets + ", " +
"buckets info => " + bucketInfoMap + ", \n" +
"Partition to insert buckets => " + partitionPathToInsertBuckets + ", \n" +
"UpdateLocations mapped to buckets =>" + updateLocationToBucket);
logger.info(
"Total Buckets :" + totalBuckets + ", " + "buckets info => " + bucketInfoMap + ", \n"
+ "Partition to insert buckets => " + partitionPathToInsertBuckets + ", \n"
+ "UpdateLocations mapped to buckets =>" + updateLocationToBucket);
}
private void assignUpdates(WorkloadProfile profile) {
// each update location gets a partition
WorkloadStat gStat = profile.getGlobalStat();
for (Map.Entry<String, Pair<String, Long>> updateLocEntry : gStat.getUpdateLocationToCount().entrySet()) {
for (Map.Entry<String, Pair<String, Long>> updateLocEntry : gStat.getUpdateLocationToCount()
.entrySet()) {
addUpdateBucket(updateLocEntry.getKey());
}
}
@@ -270,10 +665,10 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
}
int insertBuckets = (int) Math.max(totalUnassignedInserts / insertRecordsPerBucket, 1L);
logger
.info("After small file assignment: unassignedInserts => " + totalUnassignedInserts
+ ", totalInsertBuckets => " + insertBuckets
+ ", recordsPerBucket => " + insertRecordsPerBucket);
logger.info(
"After small file assignment: unassignedInserts => " + totalUnassignedInserts
+ ", totalInsertBuckets => " + insertBuckets + ", recordsPerBucket => "
+ insertRecordsPerBucket);
for (int b = 0; b < insertBuckets; b++) {
bucketNumbers.add(totalBuckets);
recordsPerBucket.add(totalUnassignedInserts / insertBuckets);
@@ -339,8 +734,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
*/
private long averageBytesPerRecord() {
long avgSize = 0L;
HoodieTimeline commitTimeline =
metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants();
HoodieTimeline commitTimeline = metaClient.getActiveTimeline().getCommitTimeline()
.filterCompletedInstants();
try {
if (!commitTimeline.empty()) {
HoodieInstant latestCommitTime = commitTimeline.lastInstant().get();
@@ -372,7 +767,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
@Override
public int getPartition(Object key) {
Tuple2<HoodieKey, Option<HoodieRecordLocation>> keyLocation = (Tuple2<HoodieKey, Option<HoodieRecordLocation>>) key;
Tuple2<HoodieKey, Option<HoodieRecordLocation>> keyLocation = (Tuple2<HoodieKey,
Option<HoodieRecordLocation>>) key;
if (keyLocation._2().isDefined()) {
HoodieRecordLocation location = keyLocation._2().get();
return updateLocationToBucket.get(location.getFileId());
@@ -396,420 +792,4 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
}
}
}
/** Pre-refactor copy: returns the upsert partitioner; requires a non-null workload profile. */
@Override
public Partitioner getUpsertPartitioner(WorkloadProfile profile) {
  if (profile == null) {
    throw new HoodieUpsertException("Need workload profile to construct the upsert partitioner.");
  }
  return new UpsertPartitioner(profile);
}
/** Inserts reuse the upsert partitioner. */
@Override
public Partitioner getInsertPartitioner(WorkloadProfile profile) {
  return getUpsertPartitioner(profile);
}
/** A workload profile is always required for this table type. */
@Override
public boolean isWorkloadProfileNeeded() {
  return true;
}
/** Compaction is not applicable to copy-on-write tables; always throws. */
@Override
public JavaRDD<WriteStatus> compact(JavaSparkContext jsc, String commitTime) {
  throw new HoodieNotSupportedException("Compaction is not supported from a CopyOnWrite table");
}
/** Merges an iterator of updated records into a new version of file {@code fileLoc}. */
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileLoc,
    Iterator<HoodieRecord<T>> recordItr)
    throws IOException {
  // these are updates
  HoodieMergeHandle upsertHandle = getUpdateHandle(commitTime, fileLoc, recordItr);
  return handleUpdateInternal(upsertHandle, commitTime, fileLoc);
}
/** Merges a key-to-record map of updates into a new version of file {@code fileLoc}. */
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileLoc,
    Map<String, HoodieRecord<T>> keyToNewRecords)
    throws IOException {
  // these are updates
  HoodieMergeHandle upsertHandle = getUpdateHandle(commitTime, fileLoc, keyToNewRecords);
  return handleUpdateInternal(upsertHandle, commitTime, fileLoc);
}
/**
 * Streams every record of the old parquet file through the merge handle to build the new file
 * version, then returns the handle's write status.
 *
 * @throws HoodieUpsertException if the old file path is missing or the old file cannot be read
 */
protected Iterator<List<WriteStatus>> handleUpdateInternal(HoodieMergeHandle upsertHandle, String commitTime, String fileLoc)
    throws IOException {
  if (upsertHandle.getOldFilePath() == null) {
    throw new HoodieUpsertException("Error in finding the old file path at commit " +
        commitTime + " at fileLoc: " + fileLoc);
  } else {
    AvroReadSupport.setAvroReadSchema(getHadoopConf(), upsertHandle.getSchema());
    ParquetReader<IndexedRecord> reader =
        AvroParquetReader.builder(upsertHandle.getOldFilePath()).withConf(getHadoopConf())
            .build();
    try {
      IndexedRecord record;
      while ((record = reader.read()) != null) {
        // Two types of writes here (new record, and old record).
        // We have already catch the exception during writing new records.
        // But for old records, we should fail if any exception happens.
        upsertHandle.write((GenericRecord) record);
      }
    } catch (IOException e) {
      throw new HoodieUpsertException(
          "Failed to read record from " + upsertHandle.getOldFilePath()
              + " with new Schema " + upsertHandle.getSchema(), e);
    } finally {
      // NOTE(review): if reader.close() throws, upsertHandle.close() is skipped — potential leak.
      reader.close();
      upsertHandle.close();
    }
  }
  //TODO(vc): This needs to be revisited
  if (upsertHandle.getWriteStatus().getPartitionPath() == null) {
    logger.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath()
        + ", " + upsertHandle.getWriteStatus());
  }
  return Collections.singletonList(Collections.singletonList(upsertHandle.getWriteStatus()))
      .iterator();
}
/** Factory for the merge handle backing an iterator-based update. */
protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileLoc,
    Iterator<HoodieRecord<T>> recordItr) {
  return new HoodieMergeHandle<>(config, commitTime, this, recordItr, fileLoc);
}
/** Factory for the merge handle backing a map-based update. */
protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileLoc,
    Map<String, HoodieRecord<T>> keyToNewRecords) {
  return new HoodieMergeHandle<>(config, commitTime, this, keyToNewRecords, fileLoc);
}
/** Lazily writes inserted records into new files as the iterator is consumed. */
public Iterator<List<WriteStatus>> handleInsert(String commitTime,
    Iterator<HoodieRecord<T>> recordItr) throws Exception {
  return new LazyInsertIterable<>(recordItr, config, commitTime, this);
}
/**
 * Dispatches one upsert bucket to the insert or update path based on its bucket type; failures
 * are logged and rethrown wrapped in a HoodieUpsertException.
 */
@SuppressWarnings("unchecked")
@Override
public Iterator<List<WriteStatus>> handleUpsertPartition(String commitTime, Integer partition,
    Iterator recordItr, Partitioner partitioner) {
  UpsertPartitioner upsertPartitioner = (UpsertPartitioner) partitioner;
  BucketInfo binfo = upsertPartitioner.getBucketInfo(partition);
  BucketType btype = binfo.bucketType;
  try {
    if (btype.equals(BucketType.INSERT)) {
      return handleInsert(commitTime, recordItr);
    } else if (btype.equals(BucketType.UPDATE)) {
      return handleUpdate(commitTime, binfo.fileLoc, recordItr);
    } else {
      throw new HoodieUpsertException(
          "Unknown bucketType " + btype + " for partition :" + partition);
    }
  } catch (Throwable t) {
    String msg = "Error upserting bucketType " + btype + " for partition :" + partition;
    logger.error(msg, t);
    throw new HoodieUpsertException(msg, t);
  }
}
/** Insert partitions go through the same code path as upsert partitions. */
@Override
public Iterator<List<WriteStatus>> handleInsertPartition(String commitTime, Integer partition,
    Iterator recordItr,
    Partitioner partitioner) {
  return handleUpsertPartition(commitTime, partition, recordItr, partitioner);
}
/**
 * Performs cleaning of partition paths according to cleaning policy and returns the number of
 * files cleaned. Handles skews in partitions to clean by making files to clean as the unit of
 * task distribution.
 *
 * @throws IllegalArgumentException if unknown cleaning policy is provided
 */
@Override
public List<HoodieCleanStat> clean(JavaSparkContext jsc) {
  try {
    FileSystem fs = getMetaClient().getFs();
    // Candidate partitions come from listing the table's base path.
    List<String> partitionsToClean =
        FSUtils.getAllPartitionPaths(fs, getMetaClient().getBasePath(),
            config.shouldAssumeDatePartitioning());
    logger.info("Partitions to clean up : " + partitionsToClean + ", with policy " + config
        .getCleanerPolicy());
    if (partitionsToClean.isEmpty()) {
      logger.info("Nothing to clean here mom. It is already clean");
      return Collections.emptyList();
    }
    return cleanPartitionPaths(partitionsToClean, jsc);
  } catch (IOException e) {
    throw new HoodieIOException("Failed to clean up after commit", e);
  }
}
/**
 * Common method used for cleaning out parquet files under a partition path during rollback of a
 * set of commits.
 *
 * @return map from each targeted file to whether its delete succeeded
 */
protected Map<FileStatus, Boolean> deleteCleanedFiles(String partitionPath, List<String> commits)
    throws IOException {
  logger.info("Cleaning path " + partitionPath);
  FileSystem fs = getMetaClient().getFs();
  FileStatus[] toBeDeleted =
      fs.listStatus(new Path(config.getBasePath(), partitionPath), path -> {
        // Only parquet files whose commit time matches a rolled-back commit.
        if (!path.toString().contains(".parquet")) {
          return false;
        }
        String fileCommitTime = FSUtils.getCommitTime(path.getName());
        return commits.contains(fileCommitTime);
      });
  Map<FileStatus, Boolean> results = Maps.newHashMap();
  for (FileStatus file : toBeDeleted) {
    boolean success = fs.delete(file.getPath(), false);
    results.put(file, success);
    logger.info("Delete file " + file.getPath() + "\t" + success);
  }
  return results;
}
/**
 * Rolls back the given commits: unpublishes them on the timeline, deletes their parquet files in
 * parallel (one task per partition), cleans temporary data files, and removes inflight markers.
 */
@Override
public List<HoodieRollbackStat> rollback(JavaSparkContext jsc, List<String> commits)
    throws IOException {
  String actionType = this.getCommitActionType();
  HoodieActiveTimeline activeTimeline = this.getActiveTimeline();
  List<String> inflights = this.getInflightCommitTimeline().getInstants()
      .map(HoodieInstant::getTimestamp)
      .collect(Collectors.toList());
  // Atomically unpublish all the commits
  commits.stream().filter(s -> !inflights.contains(s))
      .map(s -> new HoodieInstant(false, actionType, s))
      .forEach(activeTimeline::revertToInflight);
  logger.info("Unpublished " + commits);
  // delete all the data files for all these commits
  logger.info("Clean out all parquet files generated for commits: " + commits);
  List<HoodieRollbackStat> stats = jsc.parallelize(
      FSUtils.getAllPartitionPaths(metaClient.getFs(), getMetaClient().getBasePath(),
          config.shouldAssumeDatePartitioning()))
      .map((Function<String, HoodieRollbackStat>) partitionPath -> {
        // Scan all partitions files with this commit time
        Map<FileStatus, Boolean> results = deleteCleanedFiles(partitionPath, commits);
        return HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
            .withDeletedFileResults(results).build();
      }).collect();
  // clean temporary data files
  cleanTemporaryDataFiles(jsc);
  // Remove the rolled back inflight commits
  commits.stream().map(s -> new HoodieInstant(true, actionType, s))
      .forEach(activeTimeline::deleteInflight);
  logger.info("Deleted inflight commits " + commits);
  return stats;
}
/**
 * Finalize the written data files
 *
 * <p>When the temp-folder mode is enabled, renames each data file from its temporary path to its
 * final location (in parallel), then cleans up the temporary folder.
 *
 * @param writeStatuses List of WriteStatus
 * @return number of files finalized
 */
@Override
@SuppressWarnings("unchecked")
public Optional<Integer> finalizeWrite(JavaSparkContext jsc, List writeStatuses) {
  if (!config.shouldUseTempFolderForCopyOnWrite()) {
    return Optional.empty();
  }
  // This is to rename each data file from temporary path to its final location
  List<Tuple2<String, Boolean>> results = jsc.parallelize(writeStatuses, config.getFinalizeWriteParallelism())
      .map(writeStatus -> {
        Tuple2<String, HoodieWriteStat> writeStatTuple2 = (Tuple2<String, HoodieWriteStat>) writeStatus;
        HoodieWriteStat writeStat = writeStatTuple2._2();
        final FileSystem fs = getMetaClient().getFs();
        final Path finalPath = new Path(config.getBasePath(), writeStat.getPath());
        if (writeStat.getTempPath() != null) {
          final Path tempPath = new Path(config.getBasePath(), writeStat.getTempPath());
          boolean success;
          try {
            logger.info("Renaming temporary file: " + tempPath + " to " + finalPath);
            success = fs.rename(tempPath, finalPath);
          } catch (IOException e) {
            // NOTE(review): cause `e` is dropped here — consider chaining it.
            throw new HoodieIOException("Failed to rename file: " + tempPath + " to " + finalPath);
          }
          if (!success) {
            throw new HoodieIOException("Failed to rename file: " + tempPath + " to " + finalPath);
          }
        }
        return new Tuple2<>(writeStat.getPath(), true);
      }).collect();
  // clean temporary data files
  cleanTemporaryDataFiles(jsc);
  return Optional.of(results.size());
}
/**
 * Clean temporary data files that are produced from previous failed commit or retried spark
 * stages.
 *
 * <p>No-op when temp-folder mode is disabled or the folder does not exist; any per-file delete
 * failure aborts with a HoodieIOException.
 */
private void cleanTemporaryDataFiles(JavaSparkContext jsc) {
  if (!config.shouldUseTempFolderForCopyOnWrite()) {
    return;
  }
  final FileSystem fs = getMetaClient().getFs();
  final Path temporaryFolder = new Path(config.getBasePath(),
      HoodieTableMetaClient.TEMPFOLDER_NAME);
  try {
    if (!fs.exists(temporaryFolder)) {
      logger.info("Temporary folder does not exist: " + temporaryFolder);
      return;
    }
    List<FileStatus> fileStatusesList = Arrays.asList(fs.listStatus(temporaryFolder));
    List<Tuple2<String, Boolean>> results = jsc
        .parallelize(fileStatusesList, config.getFinalizeWriteParallelism())
        .map(fileStatus -> {
          // Re-acquire the FileSystem on the executor side.
          FileSystem fs1 = getMetaClient().getFs();
          boolean success = fs1.delete(fileStatus.getPath(), false);
          logger.info("Deleting file in temporary folder" + fileStatus.getPath() + "\t"
              + success);
          return new Tuple2<>(fileStatus.getPath().toString(), success);
        }).collect();
    for (Tuple2<String, Boolean> result : results) {
      if (!result._2()) {
        logger.info("Failed to delete file: " + result._1());
        throw new HoodieIOException(
            "Failed to delete file in temporary folder: " + result._1());
      }
    }
  } catch (IOException e) {
    // NOTE(review): cause `e` is dropped here — consider chaining it.
    throw new HoodieIOException(
        "Failed to clean data files in temporary folder: " + temporaryFolder);
  }
}
/**
 * Serializable per-partition accumulator of cleaner results: delete patterns attempted and which
 * deletes succeeded or failed. Instances for the same partition can be merged.
 */
private static class PartitionCleanStat implements Serializable {

  private final String partitionPath;
  private final List<String> deletePathPatterns = new ArrayList<>();
  private final List<String> successDeleteFiles = new ArrayList<>();
  private final List<String> failedDeleteFiles = new ArrayList<>();

  private PartitionCleanStat(String partitionPath) {
    this.partitionPath = partitionPath;
  }

  private void addDeletedFileResult(String deletePathStr, Boolean deletedFileResult) {
    if (deletedFileResult) {
      successDeleteFiles.add(deletePathStr);
    } else {
      failedDeleteFiles.add(deletePathStr);
    }
  }

  private void addDeleteFilePatterns(String deletePathStr) {
    deletePathPatterns.add(deletePathStr);
  }

  /** Folds another stat for the same partition into this one; throws on a partition mismatch. */
  private PartitionCleanStat merge(PartitionCleanStat other) {
    if (!this.partitionPath.equals(other.partitionPath)) {
      throw new RuntimeException(String.format(
          "partitionPath is not a match: (%s, %s)",
          partitionPath, other.partitionPath));
    }
    successDeleteFiles.addAll(other.successDeleteFiles);
    deletePathPatterns.addAll(other.deletePathPatterns);
    failedDeleteFiles.addAll(other.failedDeleteFiles);
    return this;
  }
}
/**
 * Runs the cleaner over the given partitions: expands partitions into files to delete,
 * repartitions to even out skew, deletes in parallel, merges per-partition stats, and returns
 * one HoodieCleanStat per input partition (an empty stat when nothing was cleaned there).
 */
private List<HoodieCleanStat> cleanPartitionPaths(List<String> partitionsToClean,
    JavaSparkContext jsc) {
  int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism());
  logger.info("Using cleanerParallelism: " + cleanerParallelism);
  List<Tuple2<String, PartitionCleanStat>> partitionCleanStats = jsc
      .parallelize(partitionsToClean, cleanerParallelism)
      .flatMapToPair(getFilesToDeleteFunc(this, config))
      .repartition(cleanerParallelism) // repartition to remove skews
      .mapPartitionsToPair(deleteFilesFunc(this))
      .reduceByKey(
          // merge partition level clean stats below
          (Function2<PartitionCleanStat, PartitionCleanStat, PartitionCleanStat>) (e1, e2) -> e1
              .merge(e2))
      .collect();
  Map<String, PartitionCleanStat> partitionCleanStatsMap = partitionCleanStats
      .stream().collect(Collectors.toMap(e -> e._1(), e -> e._2()));
  HoodieCleanHelper cleaner = new HoodieCleanHelper(this, config);
  // Return PartitionCleanStat for each partition passed.
  return partitionsToClean.stream().map(partitionPath -> {
    PartitionCleanStat partitionCleanStat =
        (partitionCleanStatsMap.containsKey(partitionPath)) ?
            partitionCleanStatsMap.get(partitionPath)
            : new PartitionCleanStat(partitionPath);
    return HoodieCleanStat.newBuilder()
        .withPolicy(config.getCleanerPolicy())
        .withPartitionPath(partitionPath)
        .withEarliestCommitRetained(cleaner.getEarliestCommitToRetain())
        .withDeletePathPattern(partitionCleanStat.deletePathPatterns)
        .withSuccessfulDeletes(partitionCleanStat.successDeleteFiles)
        .withFailedDeletes(partitionCleanStat.failedDeleteFiles)
        .build();
  }).collect(Collectors.toList());
}
/**
 * Returns a Spark function that, for each (partitionPath, deletePath) tuple in a partition
 * of work, deletes the file and accumulates the outcome into one {@link PartitionCleanStat}
 * per partition path, emitting the accumulated stats when the iterator is exhausted.
 */
private static PairFlatMapFunction<Iterator<Tuple2<String, String>>, String, PartitionCleanStat> deleteFilesFunc(
    HoodieTable table) {
  return (PairFlatMapFunction<Iterator<Tuple2<String, String>>, String, PartitionCleanStat>) iter -> {
    Map<String, PartitionCleanStat> statsByPartition = new HashMap<>();
    FileSystem fs = table.getMetaClient().getFs();
    while (iter.hasNext()) {
      Tuple2<String, String> partitionAndFile = iter.next();
      String partitionPath = partitionAndFile._1();
      String deletePathStr = partitionAndFile._2();
      Boolean deleted = deleteFileAndGetResult(fs, deletePathStr);
      // Lazily create the stat for a partition the first time we see it.
      PartitionCleanStat stat = statsByPartition
          .computeIfAbsent(partitionPath, PartitionCleanStat::new);
      stat.addDeleteFilePatterns(deletePathStr);
      stat.addDeletedFileResult(deletePathStr, deleted);
    }
    return statsByPartition.entrySet().stream()
        .map(entry -> new Tuple2<>(entry.getKey(), entry.getValue()))
        .collect(Collectors.toList()).iterator();
  };
}
/**
 * Returns a Spark function that maps a partition path to the (partitionPath, filePath)
 * pairs that the configured cleaning policy has selected for deletion within it.
 */
private static PairFlatMapFunction<String, String, String> getFilesToDeleteFunc(
    HoodieTable table, HoodieWriteConfig config) {
  return (PairFlatMapFunction<String, String, String>) partitionPathToClean -> {
    HoodieCleanHelper cleanHelper = new HoodieCleanHelper(table, config);
    return cleanHelper.getDeletePaths(partitionPathToClean)
        .stream()
        .map(deletePath -> new Tuple2<>(partitionPathToClean, deletePath.toString()))
        .iterator();
  };
}
/**
 * Deletes a single path (non-recursively) on the given filesystem and reports whether the
 * delete succeeded.
 *
 * @throws IOException if the filesystem call itself fails
 */
private static Boolean deleteFileAndGetResult(FileSystem fs, String deletePathStr)
    throws IOException {
  Path deletePath = new Path(deletePathStr);
  logger.debug("Working on delete path :" + deletePath);
  boolean deleted = fs.delete(deletePath, false);
  if (deleted) {
    logger.debug("Cleaned file at path :" + deletePath);
  }
  return deleted;
}
}

View File

@@ -62,19 +62,12 @@ import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
/**
* Implementation of a more real-time read-optimized Hoodie Table where
* <p>
* INSERTS - Same as HoodieCopyOnWriteTable - Produce new files, block aligned to desired size (or)
* Merge with the smallest existing file, to expand it
* </p>
* <p>
* UPDATES - Appends the changes to a rolling log file maintained per file Id. Compaction merges the
* log file into the base file.
* </p>
* <p>
* WARNING - MOR table type does not support nested rollbacks, every rollback must be followed by an
* attempted commit action
* </p>
* Implementation of a more real-time read-optimized Hoodie Table where <p> INSERTS - Same as
* HoodieCopyOnWriteTable - Produce new files, block aligned to desired size (or) Merge with the
* smallest existing file, to expand it </p> <p> UPDATES - Appends the changes to a rolling log file
* maintained per file Id. Compaction merges the log file into the base file. </p> <p> WARNING - MOR
* table type does not support nested rollbacks, every rollback must be followed by an attempted
* commit action </p>
*/
public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
HoodieCopyOnWriteTable<T> {
@@ -88,57 +81,6 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
super(config, metaClient);
}
/**
* UpsertPartitioner for MergeOnRead table type, this allows auto correction of small parquet
* files to larger ones without the need for an index in the logFile.
*/
class MergeOnReadUpsertPartitioner extends HoodieCopyOnWriteTable.UpsertPartitioner {
MergeOnReadUpsertPartitioner(WorkloadProfile profile) {
super(profile);
}
@Override
protected List<SmallFile> getSmallFiles(String partitionPath) {
// smallFiles only for partitionPath
List<SmallFile> smallFileLocations = new ArrayList<>();
// Init here since this class (and member variables) might not have been initialized
HoodieTimeline commitTimeline = getCompletedCommitTimeline();
if (!commitTimeline.empty()) {
HoodieInstant latestCommitTime = commitTimeline.lastInstant().get();
// find smallest file in partition and append to it
Optional<FileSlice> smallFileSlice = getRTFileSystemView()
.getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp())
.filter(fileSlice -> fileSlice.getLogFiles().count() < 1 &&
fileSlice.getDataFile().get().getFileSize() < config.getParquetSmallFileLimit())
.sorted((FileSlice left, FileSlice right) ->
left.getDataFile().get().getFileSize() < right.getDataFile().get().getFileSize() ? -1 : 1)
.findFirst();
if(smallFileSlice.isPresent()) {
String filename = smallFileSlice.get().getDataFile().get().getFileName();
SmallFile sf = new SmallFile();
sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename),
FSUtils.getFileId(filename));
sf.sizeBytes = smallFileSlice.get().getDataFile().get().getFileSize();
smallFileLocations.add(sf);
// Update the global small files list
smallFiles.add(sf);
}
}
return smallFileLocations;
}
public List<String> getSmallFileIds() {
return (List<String>) smallFiles.stream().map(smallFile -> ((SmallFile) smallFile).location.getFileId())
.collect(Collectors.toList());
}
}
@Override
public Partitioner getUpsertPartitioner(WorkloadProfile profile) {
if (profile == null) {
@@ -150,15 +92,16 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
@Override
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileId,
Iterator<HoodieRecord<T>> recordItr) throws IOException {
Iterator<HoodieRecord<T>> recordItr) throws IOException {
logger.info("Merging updates for commit " + commitTime + " for file " + fileId);
if(mergeOnReadUpsertPartitioner.getSmallFileIds().contains(fileId)) {
logger.info("Small file corrections for updates for commit " + commitTime + " for file " + fileId);
if (mergeOnReadUpsertPartitioner.getSmallFileIds().contains(fileId)) {
logger.info(
"Small file corrections for updates for commit " + commitTime + " for file " + fileId);
return super.handleUpdate(commitTime, fileId, recordItr);
} else {
HoodieAppendHandle<T> appendHandle =
new HoodieAppendHandle<>(config, commitTime, this, fileId, recordItr);
HoodieAppendHandle<T> appendHandle = new HoodieAppendHandle<>(config, commitTime, this,
fileId, recordItr);
appendHandle.doAppend();
appendHandle.close();
return Collections.singletonList(Collections.singletonList(appendHandle.getWriteStatus()))
@@ -202,13 +145,11 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
if (commits.size() > 1) {
throw new UnsupportedOperationException("Nested Rollbacks are not supported");
}
Map<String, HoodieInstant> commitsAndCompactions =
this.getActiveTimeline()
.getTimelineOfActions(Sets.newHashSet(HoodieActiveTimeline.COMMIT_ACTION,
HoodieActiveTimeline.DELTA_COMMIT_ACTION))
.getInstants()
.filter(i -> commits.contains(i.getTimestamp()))
.collect(Collectors.toMap(i -> i.getTimestamp(), i -> i));
Map<String, HoodieInstant> commitsAndCompactions = this.getActiveTimeline()
.getTimelineOfActions(Sets.newHashSet(HoodieActiveTimeline.COMMIT_ACTION,
HoodieActiveTimeline.DELTA_COMMIT_ACTION)).getInstants()
.filter(i -> commits.contains(i.getTimestamp()))
.collect(Collectors.toMap(i -> i.getTimestamp(), i -> i));
// Atomically un-publish all non-inflight commits
commitsAndCompactions.entrySet().stream().map(entry -> entry.getValue())
@@ -218,9 +159,9 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
Long startTime = System.currentTimeMillis();
List<HoodieRollbackStat> allRollbackStats = jsc.parallelize
(FSUtils.getAllPartitionPaths(this.metaClient.getFs(),
this.getMetaClient().getBasePath(), config.shouldAssumeDatePartitioning()))
List<HoodieRollbackStat> allRollbackStats = jsc.parallelize(FSUtils
.getAllPartitionPaths(this.metaClient.getFs(), this.getMetaClient().getBasePath(),
config.shouldAssumeDatePartitioning()))
.map((Function<String, List<HoodieRollbackStat>>) partitionPath -> {
return commits.stream().map(commit -> {
HoodieInstant instant = commitsAndCompactions.get(commit);
@@ -228,56 +169,63 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
switch (instant.getAction()) {
case HoodieTimeline.COMMIT_ACTION:
try {
Map<FileStatus, Boolean> results = super.deleteCleanedFiles(partitionPath, Arrays.asList(commit));
hoodieRollbackStats = HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
.withDeletedFileResults(results).build();
Map<FileStatus, Boolean> results = super
.deleteCleanedFiles(partitionPath, Arrays.asList(commit));
hoodieRollbackStats = HoodieRollbackStat.newBuilder()
.withPartitionPath(partitionPath).withDeletedFileResults(results).build();
break;
} catch (IOException io) {
throw new UncheckedIOException("Failed to rollback for commit " + commit, io);
}
case HoodieTimeline.DELTA_COMMIT_ACTION:
try {
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
.fromBytes(this.getCommitTimeline().getInstantDetails(new HoodieInstant(true, instant.getAction(), instant.getTimestamp())).get());
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(
this.getCommitTimeline().getInstantDetails(
new HoodieInstant(true, instant.getAction(), instant.getTimestamp()))
.get());
// read commit file and (either append delete blocks or delete file)
Map<FileStatus, Boolean> filesToDeletedStatus = new HashMap<>();
Map<FileStatus, Long> filesToNumBlocksRollback = new HashMap<>();
// we do not know fileIds for inserts (first inserts are parquet files), delete all parquet files for the corresponding failed commit, if present (same as COW)
// we do not know fileIds for inserts (first inserts are parquet files), delete
// all parquet files for the corresponding failed commit, if present (same as COW)
filesToDeletedStatus = super
.deleteCleanedFiles(partitionPath, Arrays.asList(commit));
// append rollback blocks for updates
if(commitMetadata.getPartitionToWriteStats().containsKey(partitionPath)) {
if (commitMetadata.getPartitionToWriteStats().containsKey(partitionPath)) {
commitMetadata.getPartitionToWriteStats().get(partitionPath).stream()
.filter(wStat -> {
return wStat != null && wStat.getPrevCommit() != HoodieWriteStat.NULL_COMMIT
return wStat != null
&& wStat.getPrevCommit() != HoodieWriteStat.NULL_COMMIT
&& wStat.getPrevCommit() != null;
})
.forEach(wStat -> {
}).forEach(wStat -> {
HoodieLogFormat.Writer writer = null;
try {
writer = HoodieLogFormat.newWriterBuilder()
.onParentPath(new Path(this.getMetaClient().getBasePath(), partitionPath))
writer = HoodieLogFormat.newWriterBuilder().onParentPath(
new Path(this.getMetaClient().getBasePath(), partitionPath))
.withFileId(wStat.getFileId()).overBaseCommit(wStat.getPrevCommit())
.withFs(this.metaClient.getFs())
.withFileExtension(HoodieLogFile.DELTA_EXTENSION).build();
Long numRollbackBlocks = 0L;
// generate metadata
Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap();
Map<HoodieLogBlock.HeaderMetadataType, String> header =
Maps.newHashMap();
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME,
metaClient.getActiveTimeline().lastInstant().get().getTimestamp());
header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, commit);
header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE,
String.valueOf(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal()));
header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME,
commit);
header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE, String
.valueOf(
HoodieCommandBlock.HoodieCommandBlockTypeEnum
.ROLLBACK_PREVIOUS_BLOCK
.ordinal()));
// if update belongs to an existing log file
writer = writer.appendBlock(new HoodieCommandBlock(
header));
writer = writer.appendBlock(new HoodieCommandBlock(header));
numRollbackBlocks++;
filesToNumBlocksRollback
.put(this.getMetaClient().getFs().getFileStatus(writer.getLogFile().getPath()),
numRollbackBlocks);
filesToNumBlocksRollback.put(this.getMetaClient().getFs()
.getFileStatus(writer.getLogFile().getPath()), numRollbackBlocks);
} catch (IOException | InterruptedException io) {
throw new HoodieRollbackException(
"Failed to rollback for commit " + commit, io);
@@ -289,7 +237,8 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
}
}
});
hoodieRollbackStats = HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
hoodieRollbackStats = HoodieRollbackStat.newBuilder()
.withPartitionPath(partitionPath)
.withDeletedFileResults(filesToDeletedStatus)
.withRollbackBlockAppendResults(filesToNumBlocksRollback).build();
}
@@ -297,17 +246,19 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
} catch (IOException io) {
throw new UncheckedIOException("Failed to rollback for commit " + commit, io);
}
default:
break;
}
return hoodieRollbackStats;
}).collect(Collectors.toList());
}).flatMap(x -> x.iterator()).filter(x -> x != null).collect();
commitsAndCompactions.entrySet().stream()
.map(entry -> new HoodieInstant(true, entry.getValue().getAction(),
entry.getValue().getTimestamp()))
.forEach(this.getActiveTimeline()::deleteInflight);
commitsAndCompactions.entrySet().stream().map(
entry -> new HoodieInstant(true, entry.getValue().getAction(),
entry.getValue().getTimestamp())).forEach(this.getActiveTimeline()::deleteInflight);
logger.debug("Time(in ms) taken to finish rollback " + (System.currentTimeMillis() - startTime));
logger
.debug("Time(in ms) taken to finish rollback " + (System.currentTimeMillis() - startTime));
return allRollbackStats;
}
@@ -317,4 +268,56 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
// do nothing for MOR tables
return Optional.empty();
}
/**
* UpsertPartitioner for MergeOnRead table type, this allows auto correction of small parquet
* files to larger ones without the need for an index in the logFile.
*/
class MergeOnReadUpsertPartitioner extends HoodieCopyOnWriteTable.UpsertPartitioner {
MergeOnReadUpsertPartitioner(WorkloadProfile profile) {
super(profile);
}
@Override
protected List<SmallFile> getSmallFiles(String partitionPath) {
// smallFiles only for partitionPath
List<SmallFile> smallFileLocations = new ArrayList<>();
// Init here since this class (and member variables) might not have been initialized
HoodieTimeline commitTimeline = getCompletedCommitTimeline();
if (!commitTimeline.empty()) {
HoodieInstant latestCommitTime = commitTimeline.lastInstant().get();
// find smallest file in partition and append to it
Optional<FileSlice> smallFileSlice = getRTFileSystemView()
.getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp()).filter(
fileSlice -> fileSlice.getLogFiles().count() < 1
&& fileSlice.getDataFile().get().getFileSize() < config
.getParquetSmallFileLimit()).sorted((FileSlice left, FileSlice right) ->
left.getDataFile().get().getFileSize() < right.getDataFile().get().getFileSize()
? -1 : 1).findFirst();
if (smallFileSlice.isPresent()) {
String filename = smallFileSlice.get().getDataFile().get().getFileName();
SmallFile sf = new SmallFile();
sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename),
FSUtils.getFileId(filename));
sf.sizeBytes = smallFileSlice.get().getDataFile().get().getFileSize();
smallFileLocations.add(sf);
// Update the global small files list
smallFiles.add(sf);
}
}
return smallFileLocations;
}
public List<String> getSmallFileIds() {
return (List<String>) smallFiles.stream()
.map(smallFile -> ((SmallFile) smallFile).location.getFileId())
.collect(Collectors.toList());
}
}
}

View File

@@ -60,18 +60,28 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
this.metaClient = metaClient;
}
public static <T extends HoodieRecordPayload> HoodieTable<T> getHoodieTable(
HoodieTableMetaClient metaClient, HoodieWriteConfig config) {
switch (metaClient.getTableType()) {
case COPY_ON_WRITE:
return new HoodieCopyOnWriteTable<>(config, metaClient);
case MERGE_ON_READ:
return new HoodieMergeOnReadTable<>(config, metaClient);
default:
throw new HoodieException("Unsupported table type :" + metaClient.getTableType());
}
}
/**
* Provides a partitioner to perform the upsert operation, based on the workload profile
*/
public abstract Partitioner getUpsertPartitioner(WorkloadProfile profile);
/**
* Provides a partitioner to perform the insert operation, based on the workload profile
*/
public abstract Partitioner getInsertPartitioner(WorkloadProfile profile);
/**
* Return whether this HoodieTable implementation can benefit from workload profiling
*/
@@ -131,7 +141,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
return getCommitsTimeline().filterInflights();
}
/**
* Get only the completed (no-inflights) clean timeline
*/
@@ -162,12 +171,12 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
throw new HoodieSavepointException(
"Could not get data files for savepoint " + savepointTime + ". No such savepoint.");
}
HoodieInstant instant =
new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION, savepointTime);
HoodieInstant instant = new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION,
savepointTime);
HoodieSavepointMetadata metadata = null;
try {
metadata = AvroUtils.deserializeHoodieSavepointMetadata(
getActiveTimeline().getInstantDetails(instant).get());
metadata = AvroUtils
.deserializeHoodieSavepointMetadata(getActiveTimeline().getInstantDetails(instant).get());
} catch (IOException e) {
throw new HoodieSavepointException(
"Could not get savepointed data files for savepoint " + savepointTime, e);
@@ -189,7 +198,8 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
return getActiveTimeline().getCommitTimeline();
case MERGE_ON_READ:
// We need to include the parquet files written out in delta commits
// Include commit action to be able to start doing a MOR over a COW dataset - no migration required
// Include commit action to be able to start doing a MOR over a COW dataset - no
// migration required
return getActiveTimeline().getCommitsTimeline();
default:
throw new HoodieException("Unsupported table type :" + metaClient.getTableType());
@@ -219,9 +229,10 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
return HoodieActiveTimeline.COMMIT_ACTION;
case MERGE_ON_READ:
return HoodieActiveTimeline.DELTA_COMMIT_ACTION;
default:
throw new HoodieCommitException(
"Could not commit on unknown storage type " + metaClient.getTableType());
}
throw new HoodieCommitException(
"Could not commit on unknown storage type " + metaClient.getTableType());
}
/**
@@ -236,21 +247,9 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
public abstract Iterator<List<WriteStatus>> handleInsertPartition(String commitTime,
Integer partition, Iterator<HoodieRecord<T>> recordIterator, Partitioner partitioner);
public static <T extends HoodieRecordPayload> HoodieTable<T> getHoodieTable(
HoodieTableMetaClient metaClient, HoodieWriteConfig config) {
switch (metaClient.getTableType()) {
case COPY_ON_WRITE:
return new HoodieCopyOnWriteTable<>(config, metaClient);
case MERGE_ON_READ:
return new HoodieMergeOnReadTable<>(config, metaClient);
default:
throw new HoodieException("Unsupported table type :" + metaClient.getTableType());
}
}
/**
* Run Compaction on the table.
* Compaction arranges the data so that it is optimized for data access
* Run Compaction on the table. Compaction arranges the data so that it is optimized for data
* access
*/
public abstract JavaRDD<WriteStatus> compact(JavaSparkContext jsc, String commitTime);

View File

@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.table;
import com.uber.hoodie.common.model.HoodieRecord;

View File

@@ -16,7 +16,6 @@
package com.uber.hoodie.table;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordLocation;
import com.uber.hoodie.common.model.HoodieRecordPayload;
@@ -31,7 +30,7 @@ import scala.Tuple2;
/**
* Information about incoming records for upsert/insert obtained either via sampling or
* introspecting the data fully
*
* <p>
* TODO(vc): Think about obtaining this directly from index.tagLocation
*/
public class WorkloadProfile<T extends HoodieRecordPayload> implements Serializable {
@@ -60,11 +59,9 @@ public class WorkloadProfile<T extends HoodieRecordPayload> implements Serializa
private void buildProfile() {
Map<Tuple2<String, Option<HoodieRecordLocation>>, Long> partitionLocationCounts = taggedRecords
.mapToPair(record ->
new Tuple2<>(
new Tuple2<>(record.getPartitionPath(), Option.apply(record.getCurrentLocation())),
record))
.countByKey();
.mapToPair(record -> new Tuple2<>(
new Tuple2<>(record.getPartitionPath(), Option.apply(record.getCurrentLocation())),
record)).countByKey();
for (Map.Entry<Tuple2<String, Option<HoodieRecordLocation>>, Long> e : partitionLocationCounts
.entrySet()) {

View File

@@ -17,10 +17,9 @@
package com.uber.hoodie.table;
import com.uber.hoodie.common.model.HoodieRecordLocation;
import org.apache.commons.lang3.tuple.Pair;
import java.io.Serializable;
import java.util.HashMap;
import org.apache.commons.lang3.tuple.Pair;
/**
* Wraps stats about a single partition path.