HUDI-123 Rename code packages/constants to org.apache.hudi (#830)

- Rename com.uber.hoodie to org.apache.hudi - Flag to pass com.uber.hoodie Input formats for hoodie-sync - Works with HUDI demo. - Also tested for backwards compatibility with datasets built by com.uber.hoodie packages - Migration guide : https://cwiki.apache.org/confluence/display/HUDI/Migration+Guide+From+com.uber.hoodie+to+org.apache.hudi
2019-08-11 17:48:17 -07:00
parent 722b6be04a
commit a4f9d7575f
546 changed files with 3858 additions and 3562 deletions
--- a/hudi-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/HoodieInputFormat.java
+++ b/hudi-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/HoodieInputFormat.java
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.uber.hoodie.hadoop;
+
+/**
+ * Temporary class to allow seamless migration of com.uber.hoodie to org.apache.hudi
+ */
+public class HoodieInputFormat extends org.apache.hudi.hadoop.HoodieInputFormat {
+
+}
--- a/hudi-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/realtime/HoodieRealtimeInputFormat.java
+++ b/hudi-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/realtime/HoodieRealtimeInputFormat.java
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.uber.hoodie.hadoop.realtime;
+
+/**
+ * Temporary class to allow seamless migration of com.uber.hoodie to org.apache.hudi
+ */
+public class HoodieRealtimeInputFormat extends org.apache.hudi.hadoop.realtime.HoodieRealtimeInputFormat {
+
+}
--- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieHiveUtil.java
+++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieHiveUtil.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.hadoop;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+
+public class HoodieHiveUtil {
+
+  public static final Logger LOG = LogManager.getLogger(HoodieHiveUtil.class);
+
+  public static final String HOODIE_CONSUME_MODE_PATTERN = "hoodie.%s.consume.mode";
+  public static final String HOODIE_START_COMMIT_PATTERN = "hoodie.%s.consume.start.timestamp";
+  public static final String HOODIE_MAX_COMMIT_PATTERN = "hoodie.%s.consume.max.commits";
+  public static final String INCREMENTAL_SCAN_MODE = "INCREMENTAL";
+  public static final String LATEST_SCAN_MODE = "LATEST";
+  public static final String DEFAULT_SCAN_MODE = LATEST_SCAN_MODE;
+  public static final int DEFAULT_MAX_COMMITS = 1;
+  public static final int MAX_COMMIT_ALL = -1;
+  public static final int DEFAULT_LEVELS_TO_BASEPATH = 3;
+
+  public static Integer readMaxCommits(JobContext job, String tableName) {
+    String maxCommitName = String.format(HOODIE_MAX_COMMIT_PATTERN, tableName);
+    int maxCommits = job.getConfiguration().getInt(maxCommitName, DEFAULT_MAX_COMMITS);
+    if (maxCommits == MAX_COMMIT_ALL) {
+      maxCommits = Integer.MAX_VALUE;
+    }
+    LOG.info("Read max commits - " + maxCommits);
+    return maxCommits;
+  }
+
+  public static String readStartCommitTime(JobContext job, String tableName) {
+    String startCommitTimestampName = String.format(HOODIE_START_COMMIT_PATTERN, tableName);
+    LOG.info("Read start commit time - " + job.getConfiguration().get(startCommitTimestampName));
+    return job.getConfiguration().get(startCommitTimestampName);
+  }
+
+  public static String readMode(JobContext job, String tableName) {
+    String modePropertyName = String.format(HOODIE_CONSUME_MODE_PATTERN, tableName);
+    String mode = job.getConfiguration().get(modePropertyName, DEFAULT_SCAN_MODE);
+    LOG.info(modePropertyName + ": " + mode);
+    return mode;
+  }
+
+  public static Path getNthParent(Path path, int n) {
+    Path parent = path;
+    for (int i = 0; i < n; i++) {
+      parent = parent.getParent();
+    }
+    return parent;
+  }
+}
--- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieInputFormat.java
+++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieInputFormat.java
@@ -0,0 +1,224 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.hadoop;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat;
+import org.apache.hadoop.io.ArrayWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.mapred.InputSplit;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.RecordReader;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hudi.common.model.HoodieDataFile;
+import org.apache.hudi.common.model.HoodiePartitionMetadata;
+import org.apache.hudi.common.table.HoodieTableMetaClient;
+import org.apache.hudi.common.table.HoodieTimeline;
+import org.apache.hudi.common.table.TableFileSystemView.ReadOptimizedView;
+import org.apache.hudi.common.table.timeline.HoodieInstant;
+import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
+import org.apache.hudi.exception.DatasetNotFoundException;
+import org.apache.hudi.exception.HoodieIOException;
+import org.apache.hudi.exception.InvalidDatasetException;
+
+/**
+ * HoodieInputFormat which understands the Hoodie File Structure and filters files based on the
+ * Hoodie Mode. If paths that does not correspond to a hoodie dataset then they are passed in as is
+ * (as what FileInputFormat.listStatus() would do). The JobConf could have paths from multipe
+ * Hoodie/Non-Hoodie datasets
+ */
+@UseFileSplitsFromInputFormat
+public class HoodieInputFormat extends MapredParquetInputFormat implements Configurable {
+
+  public static final Log LOG = LogFactory.getLog(HoodieInputFormat.class);
+
+  protected Configuration conf;
+
+  @Override
+  public FileStatus[] listStatus(JobConf job) throws IOException {
+    // Get all the file status from FileInputFormat and then do the filter
+    FileStatus[] fileStatuses = super.listStatus(job);
+    Map<HoodieTableMetaClient, List<FileStatus>> groupedFileStatus = groupFileStatus(fileStatuses);
+    LOG.info("Found a total of " + groupedFileStatus.size() + " groups");
+    List<FileStatus> returns = new ArrayList<>();
+    for (Map.Entry<HoodieTableMetaClient, List<FileStatus>> entry : groupedFileStatus.entrySet()) {
+      HoodieTableMetaClient metadata = entry.getKey();
+      if (metadata == null) {
+        // Add all the paths which are not hoodie specific
+        returns.addAll(entry.getValue());
+        continue;
+      }
+
+      FileStatus[] statuses = entry.getValue().toArray(new FileStatus[entry.getValue().size()]);
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Hoodie Metadata initialized with completed commit Ts as :" + metadata);
+      }
+      String tableName = metadata.getTableConfig().getTableName();
+      String mode = HoodieHiveUtil.readMode(Job.getInstance(job), tableName);
+      // Get all commits, delta commits, compactions, as all of them produce a base parquet file
+      // today
+      HoodieTimeline timeline = metadata.getActiveTimeline().getCommitsTimeline()
+          .filterCompletedInstants();
+      ReadOptimizedView roView = new HoodieTableFileSystemView(metadata,
+          timeline, statuses);
+
+      if (HoodieHiveUtil.INCREMENTAL_SCAN_MODE.equals(mode)) {
+        // this is of the form commitTs_partition_sequenceNumber
+        String lastIncrementalTs = HoodieHiveUtil
+            .readStartCommitTime(Job.getInstance(job), tableName);
+        // Total number of commits to return in this batch. Set this to -1 to get all the commits.
+        Integer maxCommits = HoodieHiveUtil.readMaxCommits(Job.getInstance(job), tableName);
+        LOG.info("Last Incremental timestamp was set as " + lastIncrementalTs);
+        List<String> commitsToReturn = timeline.findInstantsAfter(lastIncrementalTs, maxCommits)
+            .getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList());
+        List<HoodieDataFile> filteredFiles = roView.getLatestDataFilesInRange(commitsToReturn)
+            .collect(Collectors.toList());
+        for (HoodieDataFile filteredFile : filteredFiles) {
+          LOG.info("Processing incremental hoodie file - " + filteredFile.getPath());
+          filteredFile = checkFileStatus(filteredFile);
+          returns.add(filteredFile.getFileStatus());
+        }
+        LOG.info("Total paths to process after hoodie incremental filter " + filteredFiles.size());
+      } else {
+        // filter files on the latest commit found
+        List<HoodieDataFile> filteredFiles = roView.getLatestDataFiles()
+            .collect(Collectors.toList());
+        LOG.info("Total paths to process after hoodie filter " + filteredFiles.size());
+        for (HoodieDataFile filteredFile : filteredFiles) {
+          if (LOG.isDebugEnabled()) {
+            LOG.debug("Processing latest hoodie file - " + filteredFile.getPath());
+          }
+          filteredFile = checkFileStatus(filteredFile);
+          returns.add(filteredFile.getFileStatus());
+        }
+      }
+    }
+    return returns.toArray(new FileStatus[returns.size()]);
+
+  }
+
+  /**
+   * Checks the file status for a race condition which can set the file size to 0. 1.
+   * HiveInputFormat does super.listStatus() and gets back a FileStatus[] 2. Then it creates the
+   * HoodieTableMetaClient for the paths listed. 3. Generation of splits looks at FileStatus size to
+   * create splits, which skips this file
+   */
+  private HoodieDataFile checkFileStatus(HoodieDataFile dataFile) throws IOException {
+    Path dataPath = dataFile.getFileStatus().getPath();
+    try {
+      if (dataFile.getFileSize() == 0) {
+        FileSystem fs = dataPath.getFileSystem(conf);
+        LOG.info("Refreshing file status " + dataFile.getPath());
+        return new HoodieDataFile(fs.getFileStatus(dataPath));
+      }
+      return dataFile;
+    } catch (IOException e) {
+      throw new HoodieIOException("Could not get FileStatus on path " + dataPath);
+    }
+  }
+
+  private Map<HoodieTableMetaClient, List<FileStatus>> groupFileStatus(FileStatus[] fileStatuses)
+      throws IOException {
+    // This assumes the paths for different tables are grouped together
+    Map<HoodieTableMetaClient, List<FileStatus>> grouped = new HashMap<>();
+    HoodieTableMetaClient metadata = null;
+    String nonHoodieBasePath = null;
+    for (FileStatus status : fileStatuses) {
+      if (!status.getPath().getName().endsWith(".parquet")) {
+        //FIXME(vc): skip non parquet files for now. This wont be needed once log file name start
+        // with "."
+        continue;
+      }
+      if ((metadata == null && nonHoodieBasePath == null) || (metadata == null && !status.getPath()
+          .toString().contains(nonHoodieBasePath)) || (metadata != null && !status.getPath()
+          .toString().contains(metadata.getBasePath()))) {
+        try {
+          metadata = getTableMetaClient(status.getPath().getFileSystem(conf),
+              status.getPath().getParent());
+          nonHoodieBasePath = null;
+        } catch (DatasetNotFoundException | InvalidDatasetException e) {
+          LOG.info("Handling a non-hoodie path " + status.getPath());
+          metadata = null;
+          nonHoodieBasePath = status.getPath().getParent().toString();
+        }
+        if (!grouped.containsKey(metadata)) {
+          grouped.put(metadata, new ArrayList<>());
+        }
+      }
+      grouped.get(metadata).add(status);
+    }
+    return grouped;
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return conf;
+  }
+
+  @Override
+  public RecordReader<NullWritable, ArrayWritable> getRecordReader(final InputSplit split,
+      final JobConf job, final Reporter reporter) throws IOException {
+    // TODO enable automatic predicate pushdown after fixing issues
+    //        FileSplit fileSplit = (FileSplit) split;
+    //        HoodieTableMetadata metadata = getTableMetadata(fileSplit.getPath().getParent());
+    //        String tableName = metadata.getTableName();
+    //        String mode = HoodieHiveUtil.readMode(job, tableName);
+
+    //        if (HoodieHiveUtil.INCREMENTAL_SCAN_MODE.equals(mode)) {
+    //            FilterPredicate predicate = constructHoodiePredicate(job, tableName, split);
+    //            LOG.info("Setting parquet predicate push down as " + predicate);
+    //            ParquetInputFormat.setFilterPredicate(job, predicate);
+    //clearOutExistingPredicate(job);
+    //        }
+    return super.getRecordReader(split, job, reporter);
+  }
+
+  /**
+   * Read the table metadata from a data path. This assumes certain hierarchy of files which should
+   * be changed once a better way is figured out to pass in the hoodie meta directory
+   */
+  protected static HoodieTableMetaClient getTableMetaClient(FileSystem fs, Path dataPath)
+      throws IOException {
+    int levels = HoodieHiveUtil.DEFAULT_LEVELS_TO_BASEPATH;
+    if (HoodiePartitionMetadata.hasPartitionMetadata(fs, dataPath)) {
+      HoodiePartitionMetadata metadata = new HoodiePartitionMetadata(fs, dataPath);
+      metadata.readFromFS();
+      levels = metadata.getPartitionDepth();
+    }
+    Path baseDir = HoodieHiveUtil.getNthParent(dataPath, levels);
+    LOG.info("Reading hoodie metadata from path " + baseDir.toString());
+    return new HoodieTableMetaClient(fs.getConf(), baseDir.toString());
+  }
+}
--- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieROTablePathFilter.java
+++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieROTablePathFilter.java
@@ -0,0 +1,183 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.hadoop;
+
+import java.io.Serializable;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.stream.Collectors;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.hudi.common.model.HoodieDataFile;
+import org.apache.hudi.common.model.HoodiePartitionMetadata;
+import org.apache.hudi.common.table.HoodieTableMetaClient;
+import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
+import org.apache.hudi.exception.DatasetNotFoundException;
+import org.apache.hudi.exception.HoodieException;
+
+/**
+ * Given a path is a part of - Hoodie dataset = accepts ONLY the latest version of each path -
+ * Non-Hoodie dataset = then always accept
+ * <p>
+ * We can set this filter, on a query engine's Hadoop Config and if it respects path filters, then
+ * you should be able to query both hoodie and non-hoodie datasets as you would normally do.
+ * <p>
+ * hadoopConf.setClass("mapreduce.input.pathFilter.class", org.apache.hudi.hadoop
+ * .HoodieROTablePathFilter.class, org.apache.hadoop.fs.PathFilter.class)
+ */
+public class HoodieROTablePathFilter implements PathFilter, Serializable {
+
+  public static final Log LOG = LogFactory.getLog(HoodieROTablePathFilter.class);
+
+  /**
+   * Its quite common, to have all files from a given partition path be passed into accept(), cache
+   * the check for hoodie metadata for known partition paths and the latest versions of files
+   */
+  private HashMap<String, HashSet<Path>> hoodiePathCache;
+
+  /**
+   * Paths that are known to be non-hoodie datasets.
+   */
+  private HashSet<String> nonHoodiePathCache;
+
+
+  private transient FileSystem fs;
+
+
+  public HoodieROTablePathFilter() {
+    hoodiePathCache = new HashMap<>();
+    nonHoodiePathCache = new HashSet<>();
+  }
+
+  /**
+   * Obtain the path, two levels from provided path
+   *
+   * @return said path if available, null otherwise
+   */
+  private Path safeGetParentsParent(Path path) {
+    if (path.getParent() != null && path.getParent().getParent() != null
+        && path.getParent().getParent().getParent() != null) {
+      return path.getParent().getParent().getParent();
+    }
+    return null;
+  }
+
+  @Override
+  public boolean accept(Path path) {
+
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("Checking acceptance for path " + path);
+    }
+    Path folder = null;
+    try {
+      if (fs == null) {
+        fs = path.getFileSystem(new Configuration());
+      }
+
+      // Assumes path is a file
+      folder = path.getParent(); // get the immediate parent.
+      // Try to use the caches.
+      if (nonHoodiePathCache.contains(folder.toString())) {
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("Accepting non-hoodie path from cache: " + path);
+        }
+        return true;
+      }
+
+      if (hoodiePathCache.containsKey(folder.toString())) {
+        if (LOG.isDebugEnabled()) {
+          LOG.debug(String.format("%s Hoodie path checked against cache, accept => %s \n", path,
+              hoodiePathCache.get(folder.toString()).contains(path)));
+        }
+        return hoodiePathCache.get(folder.toString()).contains(path);
+      }
+
+      // Skip all files that are descendants of .hoodie in its path.
+      String filePath = path.toString();
+      if (filePath.contains("/" + HoodieTableMetaClient.METAFOLDER_NAME + "/")
+          || filePath.endsWith("/" + HoodieTableMetaClient.METAFOLDER_NAME)) {
+        if (LOG.isDebugEnabled()) {
+          LOG.debug(String.format("Skipping Hoodie Metadata file  %s \n", filePath));
+        }
+        return false;
+      }
+
+      // Perform actual checking.
+      Path baseDir;
+      if (HoodiePartitionMetadata.hasPartitionMetadata(fs, folder)) {
+        HoodiePartitionMetadata metadata = new HoodiePartitionMetadata(fs, folder);
+        metadata.readFromFS();
+        baseDir = HoodieHiveUtil.getNthParent(folder, metadata.getPartitionDepth());
+      } else {
+        baseDir = safeGetParentsParent(folder);
+      }
+
+      if (baseDir != null) {
+        try {
+          HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(),
+              baseDir.toString());
+          HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient,
+              metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(),
+              fs.listStatus(folder));
+          List<HoodieDataFile> latestFiles = fsView.getLatestDataFiles()
+              .collect(Collectors.toList());
+          // populate the cache
+          if (!hoodiePathCache.containsKey(folder.toString())) {
+            hoodiePathCache.put(folder.toString(), new HashSet<>());
+          }
+          LOG.info("Based on hoodie metadata from base path: " + baseDir.toString() + ", caching "
+              + latestFiles.size() + " files under " + folder);
+          for (HoodieDataFile lfile : latestFiles) {
+            hoodiePathCache.get(folder.toString()).add(new Path(lfile.getPath()));
+          }
+
+          // accept the path, if its among the latest files.
+          if (LOG.isDebugEnabled()) {
+            LOG.debug(String.format("%s checked after cache population, accept => %s \n", path,
+                hoodiePathCache.get(folder.toString()).contains(path)));
+          }
+          return hoodiePathCache.get(folder.toString()).contains(path);
+        } catch (DatasetNotFoundException e) {
+          // Non-hoodie path, accept it.
+          if (LOG.isDebugEnabled()) {
+            LOG.debug(String.format("(1) Caching non-hoodie path under %s \n", folder.toString()));
+          }
+          nonHoodiePathCache.add(folder.toString());
+          return true;
+        }
+      } else {
+        // files is at < 3 level depth in FS tree, can't be hoodie dataset
+        if (LOG.isDebugEnabled()) {
+          LOG.debug(String.format("(2) Caching non-hoodie path under %s \n", folder.toString()));
+        }
+        nonHoodiePathCache.add(folder.toString());
+        return true;
+      }
+    } catch (Exception e) {
+      String msg = "Error checking path :" + path + ", under folder: " + folder;
+      LOG.error(msg, e);
+      throw new HoodieException(msg, e);
+    }
+  }
+}
--- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/RecordReaderValueIterator.java
+++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/RecordReaderValueIterator.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.hadoop;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.mapred.RecordReader;
+import org.apache.hudi.exception.HoodieException;
+
+/**
+ * Provides Iterator Interface to iterate value entries read from record reader
+ *
+ * @param <K> Key Type
+ * @param <V> Value Type
+ */
+public class RecordReaderValueIterator<K, V> implements Iterator<V> {
+
+  public static final Log LOG = LogFactory.getLog(RecordReaderValueIterator.class);
+
+  private final RecordReader<K, V> reader;
+  private V nextVal = null;
+
+  /**
+   * Construct RecordReaderValueIterator
+   *
+   * @param reader reader
+   */
+  public RecordReaderValueIterator(RecordReader<K, V> reader) {
+    this.reader = reader;
+  }
+
+  @Override
+  public boolean hasNext() {
+    if (nextVal == null) {
+      K key = reader.createKey();
+      V val = reader.createValue();
+      try {
+        boolean notDone = reader.next(key, val);
+        if (!notDone) {
+          return false;
+        }
+        this.nextVal = val;
+      } catch (IOException e) {
+        LOG.error("Got error reading next record from record reader");
+        throw new HoodieException(e);
+      }
+    }
+    return true;
+  }
+
+  @Override
+  public V next() {
+    if (!hasNext()) {
+      throw new NoSuchElementException("Make sure you are following iterator contract.");
+    }
+    V retVal = this.nextVal;
+    this.nextVal = null;
+    return retVal;
+  }
+
+  public void close() throws IOException {
+    this.reader.close();
+  }
+}
--- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/SafeParquetRecordReaderWrapper.java
+++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/SafeParquetRecordReaderWrapper.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.hadoop;
+
+import java.io.IOException;
+import org.apache.hadoop.io.ArrayWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapred.RecordReader;
+
+/**
+ * Record Reader for parquet. Records read from this reader is safe to be
+ * buffered for concurrent processing.
+ *
+ * In concurrent producer/consumer pattern, where the record is read and buffered by one thread and processed in
+ * another thread, we need to ensure new instance of ArrayWritable is buffered. ParquetReader createKey/Value is unsafe
+ * as it gets reused for subsequent fetch. This wrapper makes ParquetReader safe for this use-case.
+ */
+public class SafeParquetRecordReaderWrapper implements RecordReader<NullWritable, ArrayWritable> {
+
+  // real Parquet reader to be wrapped
+  private final RecordReader<NullWritable, ArrayWritable> parquetReader;
+
+  // Value Class
+  private final Class valueClass;
+
+  // Number of fields in Value Schema
+  private final int numValueFields;
+
+
+  public SafeParquetRecordReaderWrapper(RecordReader<NullWritable, ArrayWritable> parquetReader) {
+    this.parquetReader = parquetReader;
+    ArrayWritable arrayWritable = parquetReader.createValue();
+    this.valueClass = arrayWritable.getValueClass();
+    this.numValueFields = arrayWritable.get().length;
+  }
+
+  @Override
+  public boolean next(NullWritable key, ArrayWritable value) throws IOException {
+    return parquetReader.next(key, value);
+  }
+
+  @Override
+  public NullWritable createKey() {
+    return parquetReader.createKey();
+  }
+
+  /**
+   * We could be in concurrent fetch and read env.
+   * We need to ensure new ArrayWritable as ParquetReader implementation reuses same
+   * ArrayWritable for all reads which will cause corruption when buffering.
+   * So, we create a new ArrayWritable here with Value class from parquetReader's value
+   * and an empty array.
+   */
+  @Override
+  public ArrayWritable createValue() {
+    // Call createValue of parquetReader to get size and class type info only
+    Writable[] emptyWritableBuf = new Writable[numValueFields];
+    return new ArrayWritable(valueClass, emptyWritableBuf);
+  }
+
+  @Override
+  public long getPos() throws IOException {
+    return parquetReader.getPos();
+  }
+
+  @Override
+  public void close() throws IOException {
+    parquetReader.close();
+  }
+
+  @Override
+  public float getProgress() throws IOException {
+    return parquetReader.getProgress();
+  }
+}
--- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/UseFileSplitsFromInputFormat.java
+++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/UseFileSplitsFromInputFormat.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.hadoop;
+
+import java.lang.annotation.Documented;
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Inherited;
+import java.lang.annotation.Retention;
+import java.lang.annotation.RetentionPolicy;
+import java.lang.annotation.Target;
+
+/**
+ * When annotated on a InputFormat, informs the query engines, that they should use the FileSplits
+ * provided by the input format to execute the queries
+ */
+@Inherited
+@Documented
+@Target(ElementType.TYPE)
+@Retention(RetentionPolicy.RUNTIME)
+public @interface UseFileSplitsFromInputFormat {
+
+}
--- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/hive/HoodieCombineHiveInputFormat.java
+++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/hive/HoodieCombineHiveInputFormat.java
@@ -0,0 +1,933 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.hadoop.hive;
+
+import com.google.common.annotations.VisibleForTesting;
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.hadoop.hive.common.StringInternUtils;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.hive.ql.io.CombineHiveRecordReader;
+import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils;
+import org.apache.hadoop.hive.ql.io.HiveInputFormat;
+import org.apache.hadoop.hive.ql.io.IOPrepareCache;
+import org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat;
+import org.apache.hadoop.hive.ql.log.PerfLogger;
+import org.apache.hadoop.hive.ql.parse.SplitSample;
+import org.apache.hadoop.hive.ql.plan.OperatorDesc;
+import org.apache.hadoop.hive.ql.plan.PartitionDesc;
+import org.apache.hadoop.hive.ql.plan.TableDesc;
+import org.apache.hadoop.hive.ql.session.SessionState;
+import org.apache.hadoop.hive.shims.HadoopShims.CombineFileInputFormatShim;
+import org.apache.hadoop.hive.shims.HadoopShimsSecure;
+import org.apache.hadoop.hive.shims.HadoopShimsSecure.InputSplitShim;
+import org.apache.hadoop.hive.shims.ShimLoader;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.InputFormat;
+import org.apache.hadoop.mapred.InputSplit;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.RecordReader;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.lib.CombineFileInputFormat;
+import org.apache.hadoop.mapred.lib.CombineFileSplit;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hudi.hadoop.HoodieInputFormat;
+import org.apache.hudi.hadoop.realtime.HoodieRealtimeInputFormat;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This is just a copy of the org.apache.hadoop.hive.ql.io.CombineHiveInputFormat from Hive 2.x
+ * Search for **MOD** to see minor modifications to support custom inputformat in CombineHiveInputFormat.
+ * See https://issues.apache.org/jira/browse/HIVE-9771
+ * <p>
+ * <p>
+ * CombineHiveInputFormat is a parameterized InputFormat which looks at the path
+ * name and determine the correct InputFormat for that path name from
+ * mapredPlan.pathToPartitionInfo(). It can be used to read files with different
+ * input format in the same map-reduce job.
+ *
+ * NOTE : This class is implemented to work with Hive 2.x +
+ */
+public class HoodieCombineHiveInputFormat<K extends WritableComparable, V extends Writable>
+    extends HiveInputFormat<K, V> {
+
+  private static final String CLASS_NAME = HoodieCombineHiveInputFormat.class.getName();
+  public static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME);
+
+  // max number of threads we can use to check non-combinable paths
+  private static final int MAX_CHECK_NONCOMBINABLE_THREAD_NUM = 50;
+  private static final int DEFAULT_NUM_PATH_PER_THREAD = 100;
+
+  private class CheckNonCombinablePathCallable implements Callable<Set<Integer>> {
+
+    private final Path[] paths;
+    private final int start;
+    private final int length;
+    private final JobConf conf;
+
+    public CheckNonCombinablePathCallable(Path[] paths, int start, int length, JobConf conf) {
+      this.paths = paths;
+      this.start = start;
+      this.length = length;
+      this.conf = conf;
+    }
+
+    @Override
+    public Set<Integer> call() throws Exception {
+      Set<Integer> nonCombinablePathIndices = new HashSet<Integer>();
+      for (int i = 0; i < length; i++) {
+        PartitionDesc part =
+            HiveFileFormatUtils.getPartitionDescFromPathRecursively(
+                pathToPartitionInfo, paths[i + start],
+                IOPrepareCache.get().allocatePartitionDescMap());
+        // Use HiveInputFormat if any of the paths is not splittable
+        Class<? extends InputFormat> inputFormatClass = part.getInputFileFormatClass();
+        InputFormat<WritableComparable, Writable> inputFormat =
+            getInputFormatFromCache(inputFormatClass, conf);
+        if (inputFormat instanceof AvoidSplitCombination
+            && ((AvoidSplitCombination) inputFormat).shouldSkipCombine(paths[i + start], conf)) {
+          if (LOG.isDebugEnabled()) {
+            LOG.debug("The path [" + paths[i + start]
+                + "] is being parked for HiveInputFormat.getSplits");
+          }
+          nonCombinablePathIndices.add(i + start);
+        }
+      }
+      return nonCombinablePathIndices;
+    }
+  }
+
+  /**
+   * CombineHiveInputSplit encapsulates an InputSplit with its corresponding
+   * inputFormatClassName. A CombineHiveInputSplit comprises of multiple chunks
+   * from different files. Since, they belong to a single directory, there is a
+   * single inputformat for all the chunks.
+   */
+  public static class CombineHiveInputSplit extends InputSplitShim {
+
+    private String inputFormatClassName;
+    private CombineFileSplit inputSplitShim;
+    private Map<Path, PartitionDesc> pathToPartitionInfo;
+
+    public CombineHiveInputSplit() throws IOException {
+      this(ShimLoader.getHadoopShims().getCombineFileInputFormat()
+          .getInputSplitShim());
+    }
+
+    public CombineHiveInputSplit(CombineFileSplit inputSplitShim) throws IOException {
+      this(inputSplitShim.getJob(), inputSplitShim);
+    }
+
+    public CombineHiveInputSplit(JobConf job, CombineFileSplit inputSplitShim)
+        throws IOException {
+      this(job, inputSplitShim, null);
+    }
+
+    public CombineHiveInputSplit(JobConf job, CombineFileSplit inputSplitShim,
+        Map<Path, PartitionDesc> pathToPartitionInfo) throws IOException {
+      this.inputSplitShim = inputSplitShim;
+      this.pathToPartitionInfo = pathToPartitionInfo;
+      if (job != null) {
+        if (this.pathToPartitionInfo == null) {
+          this.pathToPartitionInfo = Utilities.getMapWork(job).getPathToPartitionInfo();
+        }
+
+        // extract all the inputFormatClass names for each chunk in the
+        // CombinedSplit.
+        Path[] ipaths = inputSplitShim.getPaths();
+        if (ipaths.length > 0) {
+          PartitionDesc part = HiveFileFormatUtils
+              .getPartitionDescFromPathRecursively(this.pathToPartitionInfo,
+                  ipaths[0], IOPrepareCache.get().getPartitionDescMap());
+          inputFormatClassName = part.getInputFileFormatClass().getName();
+        }
+      }
+    }
+
+    public CombineFileSplit getInputSplitShim() {
+      return inputSplitShim;
+    }
+
+    /**
+     * Returns the inputFormat class name for the i-th chunk.
+     */
+    public String inputFormatClassName() {
+      return inputFormatClassName;
+    }
+
+    public void setInputFormatClassName(String inputFormatClassName) {
+      this.inputFormatClassName = inputFormatClassName;
+    }
+
+    @Override
+    public JobConf getJob() {
+      return inputSplitShim.getJob();
+    }
+
+    @Override
+    public long getLength() {
+      return inputSplitShim.getLength();
+    }
+
+    /**
+     * Returns an array containing the startoffsets of the files in the split.
+     */
+    @Override
+    public long[] getStartOffsets() {
+      return inputSplitShim.getStartOffsets();
+    }
+
+    /**
+     * Returns an array containing the lengths of the files in the split.
+     */
+    @Override
+    public long[] getLengths() {
+      return inputSplitShim.getLengths();
+    }
+
+    /**
+     * Returns the start offset of the i<sup>th</sup> Path.
+     */
+    @Override
+    public long getOffset(int i) {
+      return inputSplitShim.getOffset(i);
+    }
+
+    /**
+     * Returns the length of the i<sup>th</sup> Path.
+     */
+    @Override
+    public long getLength(int i) {
+      return inputSplitShim.getLength(i);
+    }
+
+    /**
+     * Returns the number of Paths in the split.
+     */
+    @Override
+    public int getNumPaths() {
+      return inputSplitShim.getNumPaths();
+    }
+
+    /**
+     * Returns the i<sup>th</sup> Path.
+     */
+    @Override
+    public Path getPath(int i) {
+      return inputSplitShim.getPath(i);
+    }
+
+    /**
+     * Returns all the Paths in the split.
+     */
+    @Override
+    public Path[] getPaths() {
+      return inputSplitShim.getPaths();
+    }
+
+    /**
+     * Returns all the Paths where this input-split resides.
+     */
+    @Override
+    public String[] getLocations() throws IOException {
+      return inputSplitShim.getLocations();
+    }
+
+    /**
+     * Prints this obejct as a string.
+     */
+    @Override
+    public String toString() {
+      StringBuilder sb = new StringBuilder();
+      sb.append(inputSplitShim.toString());
+      sb.append("InputFormatClass: " + inputFormatClassName);
+      sb.append("\n");
+      return sb.toString();
+    }
+
+    /**
+     * Writable interface.
+     */
+    @Override
+    public void readFields(DataInput in) throws IOException {
+      inputSplitShim.readFields(in);
+      inputFormatClassName = in.readUTF();
+    }
+
+    /**
+     * Writable interface.
+     */
+    @Override
+    public void write(DataOutput out) throws IOException {
+      inputSplitShim.write(out);
+      if (inputFormatClassName == null) {
+        if (pathToPartitionInfo == null) {
+          pathToPartitionInfo = Utilities.getMapWork(getJob()).getPathToPartitionInfo();
+        }
+
+        // extract all the inputFormatClass names for each chunk in the
+        // CombinedSplit.
+        PartitionDesc part =
+            HiveFileFormatUtils.getPartitionDescFromPathRecursively(pathToPartitionInfo,
+                inputSplitShim.getPath(0), IOPrepareCache.get().getPartitionDescMap());
+
+        // create a new InputFormat instance if this is the first time to see
+        // this class
+        inputFormatClassName = part.getInputFileFormatClass().getName();
+      }
+
+      out.writeUTF(inputFormatClassName);
+    }
+  }
+
+  // Splits are not shared across different partitions with different input formats.
+  // For example, 2 partitions (1 sequencefile and 1 rcfile) will have 2 different splits
+  private static class CombinePathInputFormat {
+
+    private final List<Operator<? extends OperatorDesc>> opList;
+    private final String inputFormatClassName;
+    private final String deserializerClassName;
+
+    public CombinePathInputFormat(List<Operator<? extends OperatorDesc>> opList,
+        String inputFormatClassName,
+        String deserializerClassName) {
+      this.opList = opList;
+      this.inputFormatClassName = inputFormatClassName;
+      this.deserializerClassName = deserializerClassName;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+      if (o instanceof CombinePathInputFormat) {
+        CombinePathInputFormat mObj = (CombinePathInputFormat) o;
+        return (opList.equals(mObj.opList))
+            && (inputFormatClassName.equals(mObj.inputFormatClassName))
+            && (deserializerClassName == null ? (mObj.deserializerClassName == null) :
+            deserializerClassName.equals(mObj.deserializerClassName));
+      }
+      return false;
+    }
+
+    @Override
+    public int hashCode() {
+      return (opList == null) ? 0 : opList.hashCode();
+    }
+  }
+
+  /**
+   * Create Hive splits based on CombineFileSplit.
+   */
+  private InputSplit[] getCombineSplits(JobConf job, int numSplits,
+      Map<Path, PartitionDesc> pathToPartitionInfo)
+      throws IOException {
+    init(job);
+    Map<Path, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
+    Map<String, Operator<? extends OperatorDesc>> aliasToWork =
+        mrwork.getAliasToWork();
+    /** MOD - Initialize a custom combine input format shim that will call listStatus on the custom inputFormat **/
+    HoodieCombineHiveInputFormat.HoodieCombineFileInputFormatShim
+        combine = new HoodieCombineHiveInputFormat.HoodieCombineFileInputFormatShim();
+
+    InputSplit[] splits = null;
+    if (combine == null) {
+      splits = super.getSplits(job, numSplits);
+      return splits;
+    }
+
+    if (combine.getInputPathsShim(job).length == 0) {
+      throw new IOException("No input paths specified in job");
+    }
+    ArrayList<InputSplit> result = new ArrayList<InputSplit>();
+
+    // combine splits only from same tables and same partitions. Do not combine splits from multiple
+    // tables or multiple partitions.
+    Path[] paths = StringInternUtils.internUriStringsInPathArray(combine.getInputPathsShim(job));
+
+    List<Path> inpDirs = new ArrayList<Path>();
+    List<Path> inpFiles = new ArrayList<Path>();
+    Map<CombinePathInputFormat, CombineFilter> poolMap =
+        new HashMap<CombinePathInputFormat, CombineFilter>();
+    Set<Path> poolSet = new HashSet<Path>();
+
+    for (Path path : paths) {
+      PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(
+          pathToPartitionInfo, path, IOPrepareCache.get().allocatePartitionDescMap());
+      TableDesc tableDesc = part.getTableDesc();
+      if ((tableDesc != null) && tableDesc.isNonNative()) {
+        return super.getSplits(job, numSplits);
+      }
+
+      // Use HiveInputFormat if any of the paths is not splittable
+      Class inputFormatClass = part.getInputFileFormatClass();
+      String inputFormatClassName = inputFormatClass.getName();
+      InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
+      LOG.info("Input Format => " + inputFormatClass.getName());
+      // **MOD** Set the hoodie filter in the combine
+      if (inputFormatClass.getName().equals(HoodieInputFormat.class.getName())) {
+        combine.setHoodieFilter(true);
+      } else if (inputFormatClass.getName().equals(HoodieRealtimeInputFormat.class.getName())) {
+        LOG.info("Setting hoodie filter and realtime input format");
+        combine.setHoodieFilter(true);
+        combine.setRealTime(true);
+      }
+      String deserializerClassName = null;
+      try {
+        deserializerClassName = part.getDeserializer(job).getClass().getName();
+      } catch (Exception e) {
+        // ignore
+      }
+      FileSystem inpFs = path.getFileSystem(job);
+
+      //don't combine if inputformat is a SymlinkTextInputFormat
+      if (inputFormat instanceof SymlinkTextInputFormat) {
+        splits = super.getSplits(job, numSplits);
+        return splits;
+      }
+
+      Path filterPath = path;
+
+      // Does a pool exist for this path already
+      CombineFilter f = null;
+      List<Operator<? extends OperatorDesc>> opList = null;
+
+      if (!mrwork.isMapperCannotSpanPartns()) {
+        //if mapper can span partitions, make sure a splits does not contain multiple
+        // opList + inputFormatClassName + deserializerClassName combination
+        // This is done using the Map of CombinePathInputFormat to PathFilter
+
+        opList = HiveFileFormatUtils.doGetWorksFromPath(
+            pathToAliases, aliasToWork, filterPath);
+        CombinePathInputFormat combinePathInputFormat =
+            new CombinePathInputFormat(opList, inputFormatClassName, deserializerClassName);
+        f = poolMap.get(combinePathInputFormat);
+        if (f == null) {
+          f = new CombineFilter(filterPath);
+          LOG.info("CombineHiveInputSplit creating pool for "
+              + path + "; using filter path " + filterPath);
+          combine.createPool(job, f);
+          poolMap.put(combinePathInputFormat, f);
+        } else {
+          LOG.info("CombineHiveInputSplit: pool is already created for "
+              + path + "; using filter path " + filterPath);
+          f.addPath(filterPath);
+        }
+      } else {
+        // In the case of tablesample, the input paths are pointing to files rather than directories.
+        // We need to get the parent directory as the filtering path so that all files in the same
+        // parent directory will be grouped into one pool but not files from different parent
+        // directories. This guarantees that a split will combine all files in the same partition
+        // but won't cross multiple partitions if the user has asked so.
+        if (!path.getFileSystem(job).getFileStatus(path).isDir()) { // path is not directory
+          filterPath = path.getParent();
+          inpFiles.add(path);
+          poolSet.add(filterPath);
+        } else {
+          inpDirs.add(path);
+        }
+      }
+    }
+
+    // Processing directories
+    List<CombineFileSplit> iss = new ArrayList<CombineFileSplit>();
+    if (!mrwork.isMapperCannotSpanPartns()) {
+      //mapper can span partitions
+      //combine into as few as one split, subject to the PathFilters set
+      // using combine.createPool.
+      iss = Arrays.asList(combine.getSplits(job, 1));
+    } else {
+      for (Path path : inpDirs) {
+        processPaths(job, combine, iss, path);
+      }
+
+      if (inpFiles.size() > 0) {
+        // Processing files
+        for (Path filterPath : poolSet) {
+          combine.createPool(job, new CombineFilter(filterPath));
+        }
+        processPaths(job, combine, iss, inpFiles.toArray(new Path[0]));
+      }
+    }
+
+    if (mrwork.getNameToSplitSample() != null && !mrwork.getNameToSplitSample().isEmpty()) {
+      iss = sampleSplits(iss);
+    }
+
+    for (CombineFileSplit is : iss) {
+      CombineHiveInputSplit csplit = new CombineHiveInputSplit(job, is, pathToPartitionInfo);
+      result.add(csplit);
+    }
+
+    LOG.info("number of splits " + result.size());
+    return result.toArray(new CombineHiveInputSplit[result.size()]);
+  }
+
+  /**
+   * Gets all the path indices that should not be combined
+   */
+  @VisibleForTesting
+  public Set<Integer> getNonCombinablePathIndices(JobConf job, Path[] paths, int numThreads)
+      throws ExecutionException, InterruptedException {
+    LOG.info("Total number of paths: " + paths.length
+        + ", launching " + numThreads + " threads to check non-combinable ones.");
+    int numPathPerThread = (int) Math.ceil((double) paths.length / numThreads);
+
+    ExecutorService executor = Executors.newFixedThreadPool(numThreads);
+    List<Future<Set<Integer>>> futureList = new ArrayList<Future<Set<Integer>>>(numThreads);
+    try {
+      for (int i = 0; i < numThreads; i++) {
+        int start = i * numPathPerThread;
+        int length = i != numThreads - 1 ? numPathPerThread : paths.length - start;
+        futureList.add(executor.submit(
+            new CheckNonCombinablePathCallable(paths, start, length, job)));
+      }
+      Set<Integer> nonCombinablePathIndices = new HashSet<Integer>();
+      for (Future<Set<Integer>> future : futureList) {
+        nonCombinablePathIndices.addAll(future.get());
+      }
+      return nonCombinablePathIndices;
+    } finally {
+      executor.shutdownNow();
+    }
+  }
+
+  /**
+   * Create Hive splits based on CombineFileSplit.
+   */
+  @Override
+  public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
+    PerfLogger perfLogger = SessionState.getPerfLogger();
+    perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.GET_SPLITS);
+    init(job);
+
+    ArrayList<InputSplit> result = new ArrayList<InputSplit>();
+
+    Path[] paths = getInputPaths(job);
+
+    List<Path> nonCombinablePaths = new ArrayList<Path>(paths.length / 2);
+    List<Path> combinablePaths = new ArrayList<Path>(paths.length / 2);
+
+    int numThreads = Math.min(MAX_CHECK_NONCOMBINABLE_THREAD_NUM,
+        (int) Math.ceil((double) paths.length / DEFAULT_NUM_PATH_PER_THREAD));
+
+    // This check is necessary because for Spark branch, the result array from
+    // getInputPaths() above could be empty, and therefore numThreads could be 0.
+    // In that case, Executors.newFixedThreadPool will fail.
+    if (numThreads > 0) {
+      try {
+        Set<Integer> nonCombinablePathIndices = getNonCombinablePathIndices(job, paths, numThreads);
+        for (int i = 0; i < paths.length; i++) {
+          if (nonCombinablePathIndices.contains(i)) {
+            nonCombinablePaths.add(paths[i]);
+          } else {
+            combinablePaths.add(paths[i]);
+          }
+        }
+      } catch (Exception e) {
+        LOG.error("Error checking non-combinable path", e);
+        perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
+        throw new IOException(e);
+      }
+    }
+
+    // Store the previous value for the path specification
+    String oldPaths = job.get(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR);
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("The received input paths are: [" + oldPaths
+          + "] against the property "
+          + org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR);
+    }
+
+    // Process the normal splits
+    if (nonCombinablePaths.size() > 0) {
+      FileInputFormat.setInputPaths(job,
+          nonCombinablePaths.toArray(new Path[nonCombinablePaths.size()]));
+      InputSplit[] splits = super.getSplits(job, numSplits);
+      for (InputSplit split : splits) {
+        result.add(split);
+      }
+    }
+
+    // Process the combine splits
+    if (combinablePaths.size() > 0) {
+      FileInputFormat.setInputPaths(job,
+          combinablePaths.toArray(new Path[combinablePaths.size()]));
+      Map<Path, PartitionDesc> pathToPartitionInfo = this.pathToPartitionInfo != null
+          ? this.pathToPartitionInfo : Utilities.getMapWork(job).getPathToPartitionInfo();
+      InputSplit[] splits = getCombineSplits(job, numSplits, pathToPartitionInfo);
+      for (InputSplit split : splits) {
+        result.add(split);
+      }
+    }
+
+    // Restore the old path information back
+    // This is just to prevent incompatibilities with previous versions Hive
+    // if some application depends on the original value being set.
+    if (oldPaths != null) {
+      job.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR, oldPaths);
+    }
+
+    // clear work from ThreadLocal after splits generated in case of thread is reused in pool.
+    Utilities.clearWorkMapForConf(job);
+
+    LOG.info("Number of all splits " + result.size());
+    perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
+    return result.toArray(new InputSplit[result.size()]);
+  }
+
+  private void processPaths(JobConf job, CombineFileInputFormatShim combine,
+      List<CombineFileSplit> iss, Path... path) throws IOException {
+    JobConf currJob = new JobConf(job);
+    FileInputFormat.setInputPaths(currJob, path);
+    iss.addAll(Arrays.asList(combine.getSplits(currJob, 1)));
+  }
+
+  /**
+   * MOD - Just added this for visibility
+   **/
+  Path[] getInputPaths(JobConf job) throws IOException {
+    Path[] dirs = FileInputFormat.getInputPaths(job);
+    if (dirs.length == 0) {
+      // on tez we're avoiding to duplicate the file info in FileInputFormat.
+      if (HiveConf.getVar(job, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
+        try {
+          List<Path> paths = Utilities.getInputPathsTez(job, mrwork);
+          dirs = paths.toArray(new Path[paths.size()]);
+        } catch (Exception e) {
+          throw new IOException("Could not create input files", e);
+        }
+      } else {
+        throw new IOException("No input paths specified in job");
+      }
+    }
+    return dirs;
+  }
+
+  /**
+   * This function is used to sample inputs for clauses like "TABLESAMPLE(1 PERCENT)"
+   * <p>
+   * First, splits are grouped by alias they are for. If one split serves more than one
+   * alias or not for any sampled alias, we just directly add it to returned list.
+   * Then we find a list of exclusive splits for every alias to be sampled.
+   * For each alias, we start from position of seedNumber%totalNumber, and keep add
+   * splits until the total size hits percentage.
+   *
+   * @return the sampled splits
+   */
+  private List<CombineFileSplit> sampleSplits(List<CombineFileSplit> splits) {
+    HashMap<String, SplitSample> nameToSamples = mrwork.getNameToSplitSample();
+    List<CombineFileSplit> retLists = new ArrayList<CombineFileSplit>();
+    Map<String, ArrayList<CombineFileSplit>> aliasToSplitList =
+        new HashMap<String, ArrayList<CombineFileSplit>>();
+    Map<Path, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
+    Map<Path, ArrayList<String>> pathToAliasesNoScheme = removeScheme(pathToAliases);
+
+    // Populate list of exclusive splits for every sampled alias
+    //
+    for (CombineFileSplit split : splits) {
+      String alias = null;
+      for (Path path : split.getPaths()) {
+        boolean schemeless = path.toUri().getScheme() == null;
+        List<String> l = HiveFileFormatUtils.doGetAliasesFromPath(
+            schemeless ? pathToAliasesNoScheme : pathToAliases, path);
+        // a path for a split unqualified the split from being sampled if:
+        // 1. it serves more than one alias
+        // 2. the alias it serves is not sampled
+        // 3. it serves different alias than another path for the same split
+        if (l.size() != 1 || !nameToSamples.containsKey(l.get(0))
+            || (alias != null && l.get(0) != alias)) {
+          alias = null;
+          break;
+        }
+        alias = l.get(0);
+      }
+
+      if (alias != null) {
+        // split exclusively serves alias, which needs to be sampled
+        // add it to the split list of the alias.
+        if (!aliasToSplitList.containsKey(alias)) {
+          aliasToSplitList.put(alias, new ArrayList<CombineFileSplit>());
+        }
+        aliasToSplitList.get(alias).add(split);
+      } else {
+        // The split doesn't exclusively serve one alias
+        retLists.add(split);
+      }
+    }
+
+    // for every sampled alias, we figure out splits to be sampled and add
+    // them to return list
+    //
+    for (Map.Entry<String, ArrayList<CombineFileSplit>> entry : aliasToSplitList.entrySet()) {
+      ArrayList<CombineFileSplit> splitList = entry.getValue();
+      long totalSize = 0;
+      for (CombineFileSplit split : splitList) {
+        totalSize += split.getLength();
+      }
+
+      SplitSample splitSample = nameToSamples.get(entry.getKey());
+
+      long targetSize = splitSample.getTargetSize(totalSize);
+      int startIndex = splitSample.getSeedNum() % splitList.size();
+      long size = 0;
+      for (int i = 0; i < splitList.size(); i++) {
+        CombineFileSplit split = splitList.get((startIndex + i) % splitList.size());
+        retLists.add(split);
+        long splitgLength = split.getLength();
+        if (size + splitgLength >= targetSize) {
+          LOG.info("Sample alias " + entry.getValue() + " using " + (i + 1) + "splits");
+          if (size + splitgLength > targetSize) {
+            ((InputSplitShim) split).shrinkSplit(targetSize - size);
+          }
+          break;
+        }
+        size += splitgLength;
+      }
+
+    }
+
+    return retLists;
+  }
+
+  Map<Path, ArrayList<String>> removeScheme(Map<Path, ArrayList<String>> pathToAliases) {
+    Map<Path, ArrayList<String>> result = new HashMap<>();
+    for (Map.Entry<Path, ArrayList<String>> entry : pathToAliases.entrySet()) {
+      Path newKey = Path.getPathWithoutSchemeAndAuthority(entry.getKey());
+      StringInternUtils.internUriStringsInPath(newKey);
+      result.put(newKey, entry.getValue());
+    }
+    return result;
+  }
+
+  /**
+   * Create a generic Hive RecordReader than can iterate over all chunks in a
+   * CombinedFileSplit.
+   */
+  @Override
+  public RecordReader getRecordReader(InputSplit split, JobConf job,
+      Reporter reporter) throws IOException {
+    if (!(split instanceof CombineHiveInputSplit)) {
+      return super.getRecordReader(split, job, reporter);
+    }
+
+    CombineHiveInputSplit hsplit = (CombineHiveInputSplit) split;
+
+    String inputFormatClassName = null;
+    Class inputFormatClass = null;
+    try {
+      inputFormatClassName = hsplit.inputFormatClassName();
+      inputFormatClass = job.getClassByName(inputFormatClassName);
+    } catch (Exception e) {
+      throw new IOException("cannot find class " + inputFormatClassName);
+    }
+
+    pushProjectionsAndFilters(job, inputFormatClass, hsplit.getPath(0));
+
+    return ShimLoader.getHadoopShims().getCombineFileInputFormat()
+        .getRecordReader(job,
+            (CombineFileSplit) split, reporter,
+            CombineHiveRecordReader.class);
+  }
+
+  static class CombineFilter implements PathFilter {
+
+    private final Set<String> pStrings = new HashSet<String>();
+
+    // store a path prefix in this TestFilter
+    // PRECONDITION: p should always be a directory
+    public CombineFilter(Path p) {
+      // we need to keep the path part only because the Hadoop CombineFileInputFormat will
+      // pass the path part only to accept().
+      // Trailing the path with a separator to prevent partial matching.
+      addPath(p);
+    }
+
+    public void addPath(Path p) {
+      String pString = p.toUri().getPath();
+      pStrings.add(pString);
+    }
+
+    // returns true if the specified path matches the prefix stored
+    // in this TestFilter.
+    @Override
+    public boolean accept(Path path) {
+      boolean find = false;
+      while (path != null && !find) {
+        if (pStrings.contains(path.toUri().getPath())) {
+          find = true;
+          break;
+        }
+        path = path.getParent();
+      }
+      return find;
+    }
+
+    @Override
+    public String toString() {
+      StringBuilder s = new StringBuilder();
+      s.append("PathFilter: ");
+      for (String pString : pStrings) {
+        s.append(pString + " ");
+      }
+      return s.toString();
+    }
+  }
+
+  /**
+   * This is a marker interface that is used to identify the formats where
+   * combine split generation is not applicable
+   */
+  public interface AvoidSplitCombination {
+
+    boolean shouldSkipCombine(Path path, Configuration conf) throws IOException;
+  }
+
+  /**
+   * **MOD** this is the implementation of CombineFileInputFormat which is a copy of
+   * org.apache.hadoop.hive.shims.HadoopShimsSecure.CombineFileInputFormatShim
+   * with changes in listStatus
+   */
+  public static class HoodieCombineFileInputFormatShim<K, V> extends CombineFileInputFormat<K, V>
+      implements org.apache.hadoop.hive.shims.HadoopShims.CombineFileInputFormatShim<K, V> {
+
+    private boolean hoodieFilter = false;
+    private boolean isRealTime = false;
+
+    public HoodieCombineFileInputFormatShim() {
+    }
+
+    public Path[] getInputPathsShim(JobConf conf) {
+      try {
+        return FileInputFormat.getInputPaths(conf);
+      } catch (Exception var3) {
+        throw new RuntimeException(var3);
+      }
+    }
+
+    public void createPool(JobConf conf, PathFilter... filters) {
+      super.createPool(conf, filters);
+    }
+
+    @Override
+    public RecordReader<K, V> getRecordReader(InputSplit split, JobConf job, Reporter reporter)
+        throws IOException {
+      throw new IOException("CombineFileInputFormat.getRecordReader not needed.");
+    }
+
+    protected List<FileStatus> listStatus(JobContext job) throws IOException {
+      LOG.info("Listing status in HoodieCombineHiveInputFormat.HoodieCombineFileInputFormatShim");
+      List<FileStatus> result;
+      if (hoodieFilter) {
+        HoodieInputFormat input;
+        if (isRealTime) {
+          LOG.info("Using HoodieRealtimeInputFormat");
+          input = new HoodieRealtimeInputFormat();
+        } else {
+          LOG.info("Using HoodieInputFormat");
+          input = new HoodieInputFormat();
+        }
+        input.setConf(job.getConfiguration());
+        result = new ArrayList<FileStatus>(
+            Arrays.asList(input.listStatus(new JobConf(job.getConfiguration()))));
+      } else {
+        result = super.listStatus(job);
+      }
+
+      Iterator it = result.iterator();
+
+      while (it.hasNext()) {
+        FileStatus stat = (FileStatus) it.next();
+        if (!stat.isFile()) {
+          it.remove();
+        }
+      }
+      return result;
+    }
+
+    public CombineFileSplit[] getSplits(JobConf job, int numSplits) throws IOException {
+      long minSize = job.getLong(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MINSIZE, 0L);
+      if (job.getLong("mapreduce.input.fileinputformat.split.minsize.per.node", 0L) == 0L) {
+        super.setMinSplitSizeNode(minSize);
+      }
+
+      if (job.getLong("mapreduce.input.fileinputformat.split.minsize.per.rack", 0L) == 0L) {
+        super.setMinSplitSizeRack(minSize);
+      }
+
+      if (job.getLong(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MAXSIZE, 0L) == 0L) {
+        super.setMaxSplitSize(minSize);
+      }
+
+      InputSplit[] splits = super.getSplits(job, numSplits);
+      ArrayList inputSplitShims = new ArrayList();
+
+      for (int pos = 0; pos < splits.length; ++pos) {
+        CombineFileSplit split = (CombineFileSplit) splits[pos];
+        if (split.getPaths().length > 0) {
+          inputSplitShims.add(
+              new HadoopShimsSecure.InputSplitShim(job, split.getPaths(), split.getStartOffsets(),
+                  split.getLengths(), split.getLocations()));
+        }
+      }
+
+      return (CombineFileSplit[]) inputSplitShims
+          .toArray(new HadoopShimsSecure.InputSplitShim[inputSplitShims.size()]);
+    }
+
+    public HadoopShimsSecure.InputSplitShim getInputSplitShim() throws IOException {
+      return new HadoopShimsSecure.InputSplitShim();
+    }
+
+    public RecordReader getRecordReader(JobConf job, CombineFileSplit split, Reporter reporter,
+        Class<RecordReader<K, V>> rrClass) throws IOException {
+      return new HadoopShimsSecure.CombineFileRecordReader(job, split, reporter, rrClass);
+    }
+
+    public void setHoodieFilter(boolean hoodieFilter) {
+      this.hoodieFilter = hoodieFilter;
+    }
+
+    public void setRealTime(boolean realTime) {
+      isRealTime = realTime;
+    }
+  }
+}
--- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/AbstractRealtimeRecordReader.java
+++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/AbstractRealtimeRecordReader.java
@@ -0,0 +1,359 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.hadoop.realtime;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.stream.Collectors;
+import org.apache.avro.Schema;
+import org.apache.avro.Schema.Field;
+import org.apache.avro.generic.GenericArray;
+import org.apache.avro.generic.GenericFixed;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
+import org.apache.hadoop.hive.serde2.io.DoubleWritable;
+import org.apache.hadoop.io.ArrayWritable;
+import org.apache.hadoop.io.BooleanWritable;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hudi.common.model.HoodieAvroPayload;
+import org.apache.hudi.common.table.HoodieTableMetaClient;
+import org.apache.hudi.common.util.HoodieAvroUtils;
+import org.apache.hudi.common.util.LogReaderUtils;
+import org.apache.hudi.common.util.collection.Pair;
+import org.apache.hudi.exception.HoodieException;
+import org.apache.hudi.exception.HoodieIOException;
+import parquet.avro.AvroSchemaConverter;
+import parquet.hadoop.ParquetFileReader;
+import parquet.schema.MessageType;
+
+/**
+ * Record Reader implementation to merge fresh avro data with base parquet data, to support real
+ * time queries.
+ */
+public abstract class AbstractRealtimeRecordReader {
+
+  // Fraction of mapper/reducer task memory used for compaction of log files
+  public static final String COMPACTION_MEMORY_FRACTION_PROP = "compaction.memory.fraction";
+  public static final String DEFAULT_COMPACTION_MEMORY_FRACTION = "0.75";
+  // used to choose a trade off between IO vs Memory when performing compaction process
+  // Depending on outputfile size and memory provided, choose true to avoid OOM for large file
+  // size + small memory
+  public static final String COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP =
+      "compaction.lazy.block.read.enabled";
+  public static final String DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED = "true";
+
+  // Property to set the max memory for dfs inputstream buffer size
+  public static final String MAX_DFS_STREAM_BUFFER_SIZE_PROP = "hoodie.memory.dfs.buffer.max.size";
+  // Setting this to lower value of 1 MB since no control over how many RecordReaders will be started in a mapper
+  public static final int DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE = 1 * 1024 * 1024; // 1 MB
+  // Property to set file path prefix for spillable file
+  public static final String SPILLABLE_MAP_BASE_PATH_PROP = "hoodie.memory.spillable.map.path";
+  // Default file path prefix for spillable file
+  public static final String DEFAULT_SPILLABLE_MAP_BASE_PATH = "/tmp/";
+
+  public static final Log LOG = LogFactory.getLog(AbstractRealtimeRecordReader.class);
+  protected final HoodieRealtimeFileSplit split;
+  protected final JobConf jobConf;
+  private final MessageType baseFileSchema;
+  protected final boolean usesCustomPayload;
+  // Schema handles
+  private Schema readerSchema;
+  private Schema writerSchema;
+
+  public AbstractRealtimeRecordReader(HoodieRealtimeFileSplit split, JobConf job) {
+    this.split = split;
+    this.jobConf = job;
+    LOG.info("cfg ==> " + job.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR));
+    LOG.info("columnIds ==> " + job.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR));
+    LOG.info("partitioningColumns ==> " + job.get("partition_columns", ""));
+    try {
+      this.usesCustomPayload = usesCustomPayload();
+      LOG.info("usesCustomPayload ==> " + this.usesCustomPayload);
+      baseFileSchema = readSchema(jobConf, split.getPath());
+      init();
+    } catch (IOException e) {
+      throw new HoodieIOException(
+          "Could not create HoodieRealtimeRecordReader on path " + this.split.getPath(), e);
+    }
+  }
+
+  private boolean usesCustomPayload() {
+    HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jobConf, split.getBasePath());
+    return !(metaClient.getTableConfig().getPayloadClass().contains(HoodieAvroPayload.class.getName())
+        || metaClient.getTableConfig().getPayloadClass().contains("org.apache.hudi.OverwriteWithLatestAvroPayload"));
+  }
+
+  /**
+   * Reads the schema from the parquet file. This is different from ParquetUtils as it uses the
+   * twitter parquet to support hive 1.1.0
+   */
+  private static MessageType readSchema(Configuration conf, Path parquetFilePath) {
+    try {
+      return ParquetFileReader.readFooter(conf, parquetFilePath).getFileMetaData().getSchema();
+    } catch (IOException e) {
+      throw new HoodieIOException("Failed to read footer for parquet " + parquetFilePath, e);
+    }
+  }
+
+  /**
+   * Prints a JSON representation of the ArrayWritable for easier debuggability
+   */
+  protected static String arrayWritableToString(ArrayWritable writable) {
+    if (writable == null) {
+      return "null";
+    }
+    StringBuilder builder = new StringBuilder();
+    Writable[] values = writable.get();
+    builder.append("\"values_" + Math.random() + "_" + values.length + "\": {");
+    int i = 0;
+    for (Writable w : values) {
+      if (w instanceof ArrayWritable) {
+        builder.append(arrayWritableToString((ArrayWritable) w)).append(",");
+      } else {
+        builder.append("\"value" + i + "\":" + "\"" + w + "\"").append(",");
+        if (w == null) {
+          builder.append("\"type" + i + "\":" + "\"unknown\"").append(",");
+        } else {
+          builder.append("\"type" + i + "\":" + "\"" + w.getClass().getSimpleName() + "\"").append(",");
+        }
+      }
+      i++;
+    }
+    builder.deleteCharAt(builder.length() - 1);
+    builder.append("}");
+    return builder.toString();
+  }
+
+  /**
+   * Given a comma separated list of field names and positions at which they appear on Hive, return
+   * a ordered list of field names, that can be passed onto storage.
+   */
+  private static List<String> orderFields(String fieldNameCsv, String fieldOrderCsv, List<String> partitioningFields) {
+
+    String[] fieldOrders = fieldOrderCsv.split(",");
+    List<String> fieldNames = Arrays.stream(fieldNameCsv.split(","))
+        .filter(fn -> !partitioningFields.contains(fn)).collect(Collectors.toList());
+
+    // Hive does not provide ids for partitioning fields, so check for lengths excluding that.
+    if (fieldNames.size() != fieldOrders.length) {
+      throw new HoodieException(String
+          .format("Error ordering fields for storage read. #fieldNames: %d, #fieldPositions: %d",
+              fieldNames.size(), fieldOrders.length));
+    }
+    TreeMap<Integer, String> orderedFieldMap = new TreeMap<>();
+    for (int ox = 0; ox < fieldOrders.length; ox++) {
+      orderedFieldMap.put(Integer.parseInt(fieldOrders[ox]), fieldNames.get(ox));
+    }
+    return new ArrayList<>(orderedFieldMap.values());
+  }
+
+  /**
+   * Generate a reader schema off the provided writeSchema, to just project out the provided
+   * columns
+   */
+  public static Schema generateProjectionSchema(Schema writeSchema, List<String> fieldNames) {
+    /**
+     * Avro & Presto field names seems to be case sensitive (support fields differing only in case)
+     * whereas Hive/Impala/SparkSQL(default) are case-insensitive. Spark allows this to be configurable
+     * using spark.sql.caseSensitive=true
+     *
+     * For a RT table setup with no delta-files (for a latest file-slice) -> we translate parquet schema to Avro
+     * Here the field-name case is dependent on parquet schema. Hive (1.x/2.x/CDH) translate column projections
+     * to lower-cases
+     *
+     */
+    List<Schema.Field> projectedFields = new ArrayList<>();
+    Map<String, Schema.Field> schemaFieldsMap = writeSchema.getFields().stream()
+        .map(r -> Pair.of(r.name().toLowerCase(), r)).collect(Collectors.toMap(Pair::getLeft, Pair::getRight));
+    for (String fn : fieldNames) {
+      Schema.Field field = schemaFieldsMap.get(fn.toLowerCase());
+      if (field == null) {
+        throw new HoodieException("Field " + fn + " not found in log schema. Query cannot proceed! "
+            + "Derived Schema Fields: "
+            + new ArrayList<>(schemaFieldsMap.keySet()));
+      } else {
+        projectedFields
+            .add(new Schema.Field(field.name(), field.schema(), field.doc(), field.defaultValue()));
+      }
+    }
+
+    Schema projectedSchema = Schema
+        .createRecord(writeSchema.getName(), writeSchema.getDoc(), writeSchema.getNamespace(), writeSchema.isError());
+    projectedSchema.setFields(projectedFields);
+    return projectedSchema;
+  }
+
+  /**
+   * Convert the projected read from delta record into an array writable
+   */
+  public static Writable avroToArrayWritable(Object value, Schema schema) {
+
+    if (value == null) {
+      return null;
+    }
+
+    switch (schema.getType()) {
+      case STRING:
+        return new Text(value.toString());
+      case BYTES:
+        return new BytesWritable((byte[]) value);
+      case INT:
+        return new IntWritable((Integer) value);
+      case LONG:
+        return new LongWritable((Long) value);
+      case FLOAT:
+        return new FloatWritable((Float) value);
+      case DOUBLE:
+        return new DoubleWritable((Double) value);
+      case BOOLEAN:
+        return new BooleanWritable((Boolean) value);
+      case NULL:
+        return null;
+      case RECORD:
+        GenericRecord record = (GenericRecord) value;
+        Writable[] recordValues = new Writable[schema.getFields().size()];
+        int recordValueIndex = 0;
+        for (Schema.Field field : schema.getFields()) {
+          recordValues[recordValueIndex++] = avroToArrayWritable(record.get(field.name()), field.schema());
+        }
+        return new ArrayWritable(Writable.class, recordValues);
+      case ENUM:
+        return new Text(value.toString());
+      case ARRAY:
+        GenericArray arrayValue = (GenericArray) value;
+        Writable[] arrayValues = new Writable[arrayValue.size()];
+        int arrayValueIndex = 0;
+        for (Object obj : arrayValue) {
+          arrayValues[arrayValueIndex++] = avroToArrayWritable(obj, schema.getElementType());
+        }
+        // Hive 1.x will fail here, it requires values2 to be wrapped into another ArrayWritable
+        return new ArrayWritable(Writable.class, arrayValues);
+      case MAP:
+        Map mapValue = (Map) value;
+        Writable[] mapValues = new Writable[mapValue.size()];
+        int mapValueIndex = 0;
+        for (Object entry : mapValue.entrySet()) {
+          Map.Entry mapEntry = (Map.Entry) entry;
+          Writable[] nestedMapValues = new Writable[2];
+          nestedMapValues[0] = new Text(mapEntry.getKey().toString());
+          nestedMapValues[1] = avroToArrayWritable(mapEntry.getValue(), schema.getValueType());
+          mapValues[mapValueIndex++] = new ArrayWritable(Writable.class, nestedMapValues);
+        }
+        // Hive 1.x will fail here, it requires values3 to be wrapped into another ArrayWritable
+        return new ArrayWritable(Writable.class, mapValues);
+      case UNION:
+        List<Schema> types = schema.getTypes();
+        if (types.size() != 2) {
+          throw new IllegalArgumentException("Only support union with 2 fields");
+        }
+        Schema s1 = types.get(0);
+        Schema s2 = types.get(1);
+        if (s1.getType() == Schema.Type.NULL) {
+          return avroToArrayWritable(value, s2);
+        } else if (s2.getType() == Schema.Type.NULL) {
+          return avroToArrayWritable(value, s1);
+        } else {
+          throw new IllegalArgumentException("Only support union with null");
+        }
+      case FIXED:
+        return new BytesWritable(((GenericFixed) value).bytes());
+      default:
+        return null;
+    }
+  }
+
+  /**
+   * Hive implementation of ParquetRecordReader results in partition columns not present in the original parquet file
+   * to also be part of the projected schema. Hive expects the record reader implementation to return the row in its
+   * entirety (with un-projected column having null values). As we use writerSchema for this, make sure writer schema
+   * also includes partition columns
+   *
+   * @param schema Schema to be changed
+   */
+  private static Schema addPartitionFields(Schema schema, List<String> partitioningFields) {
+    final Set<String> firstLevelFieldNames = schema.getFields().stream().map(Field::name)
+        .map(String::toLowerCase).collect(Collectors.toSet());
+    List<String> fieldsToAdd = partitioningFields.stream().map(String::toLowerCase)
+        .filter(x -> !firstLevelFieldNames.contains(x)).collect(Collectors.toList());
+
+    return HoodieAvroUtils.appendNullSchemaFields(schema, fieldsToAdd);
+  }
+
+  /**
+   * Goes through the log files in reverse order and finds the schema from the last available data block. If not, falls
+   * back to the schema from the latest parquet file. Finally, sets the partition column and projection fields into
+   * the job conf.
+   */
+  private void init() throws IOException {
+    Schema schemaFromLogFile = LogReaderUtils
+        .readLatestSchemaFromLogFiles(split.getBasePath(), split.getDeltaFilePaths(), jobConf);
+    if (schemaFromLogFile == null) {
+      writerSchema = new AvroSchemaConverter().convert(baseFileSchema);
+      LOG.debug("Writer Schema From Parquet => " + writerSchema.getFields());
+    } else {
+      writerSchema = schemaFromLogFile;
+      LOG.debug("Writer Schema From Log => " + writerSchema.getFields());
+    }
+    // Add partitioning fields to writer schema for resulting row to contain null values for these fields
+    String partitionFields = jobConf.get("partition_columns", "");
+    List<String> partitioningFields =
+        partitionFields.length() > 0 ? Arrays.stream(partitionFields.split(",")).collect(Collectors.toList())
+            : new ArrayList<>();
+    writerSchema = addPartitionFields(writerSchema, partitioningFields);
+    List<String> projectionFields = orderFields(
+        jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR),
+        jobConf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR),
+        partitioningFields);
+    // TODO(vc): In the future, the reader schema should be updated based on log files & be able
+    // to null out fields not present before
+    readerSchema = generateProjectionSchema(writerSchema, projectionFields);
+    LOG.info(String.format("About to read compacted logs %s for base split %s, projecting cols %s",
+        split.getDeltaFilePaths(), split.getPath(), projectionFields));
+  }
+
+  public Schema getReaderSchema() {
+    return readerSchema;
+  }
+
+  public Schema getWriterSchema() {
+    return writerSchema;
+  }
+
+  public long getMaxCompactionMemoryInBytes() {
+    // jobConf.getMemoryForMapTask() returns in MB
+    return (long) Math.ceil(Double
+        .valueOf(jobConf.get(COMPACTION_MEMORY_FRACTION_PROP, DEFAULT_COMPACTION_MEMORY_FRACTION))
+        * jobConf.getMemoryForMapTask() * 1024 * 1024L);
+  }
+}
--- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetSerde.java
+++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetSerde.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.hadoop.realtime;
+
+import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;
+
+/**
+ * Simply extends ParquetHiveSerDe
+ */
+public class HoodieParquetSerde extends ParquetHiveSerDe {
+
+  public HoodieParquetSerde() {
+    super();
+  }
+}
--- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeFileSplit.java
+++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeFileSplit.java
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.hadoop.realtime;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.hadoop.mapred.FileSplit;
+
+/**
+ * Filesplit that wraps the base split and a list of log files to merge deltas from.
+ */
+public class HoodieRealtimeFileSplit extends FileSplit {
+
+  private List<String> deltaFilePaths;
+
+  private String maxCommitTime;
+
+  private String basePath;
+
+  public HoodieRealtimeFileSplit() {
+    super();
+  }
+
+  public HoodieRealtimeFileSplit(FileSplit baseSplit, String basePath, List<String> deltaLogFiles,
+      String maxCommitTime) throws IOException {
+    super(baseSplit.getPath(), baseSplit.getStart(), baseSplit.getLength(),
+        baseSplit.getLocations());
+    this.deltaFilePaths = deltaLogFiles;
+    this.maxCommitTime = maxCommitTime;
+    this.basePath = basePath;
+  }
+
+  public List<String> getDeltaFilePaths() {
+    return deltaFilePaths;
+  }
+
+  public String getMaxCommitTime() {
+    return maxCommitTime;
+  }
+
+  public String getBasePath() {
+    return basePath;
+  }
+
+  private static void writeString(String str, DataOutput out) throws IOException {
+    byte[] bytes = str.getBytes(StandardCharsets.UTF_8);
+    out.writeInt(bytes.length);
+    out.write(bytes);
+  }
+
+  private static String readString(DataInput in) throws IOException {
+    byte[] bytes = new byte[in.readInt()];
+    in.readFully(bytes);
+    return new String(bytes, StandardCharsets.UTF_8);
+  }
+
+
+  @Override
+  public void write(DataOutput out) throws IOException {
+    super.write(out);
+    writeString(basePath, out);
+    writeString(maxCommitTime, out);
+    out.writeInt(deltaFilePaths.size());
+    for (String logFilePath : deltaFilePaths) {
+      writeString(logFilePath, out);
+    }
+  }
+
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    super.readFields(in);
+    basePath = readString(in);
+    maxCommitTime = readString(in);
+    int totalLogFiles = in.readInt();
+    deltaFilePaths = new ArrayList<>(totalLogFiles);
+    for (int i = 0; i < totalLogFiles; i++) {
+      deltaFilePaths.add(readString(in));
+    }
+  }
+
+  @Override
+  public String toString() {
+    return "HoodieRealtimeFileSplit{"
+        + "DataPath=" + getPath()
+        + ", deltaFilePaths=" + deltaFilePaths
+        + ", maxCommitTime='" + maxCommitTime + '\''
+        + ", basePath='" + basePath + '\''
+        + '}';
+  }
+}
--- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeInputFormat.java
+++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeInputFormat.java
@@ -0,0 +1,241 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.hadoop.realtime;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Sets;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
+import org.apache.hadoop.io.ArrayWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.mapred.FileSplit;
+import org.apache.hadoop.mapred.InputSplit;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.RecordReader;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hudi.common.model.FileSlice;
+import org.apache.hudi.common.model.HoodieLogFile;
+import org.apache.hudi.common.model.HoodieRecord;
+import org.apache.hudi.common.table.HoodieTableMetaClient;
+import org.apache.hudi.common.table.HoodieTimeline;
+import org.apache.hudi.common.table.timeline.HoodieInstant;
+import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
+import org.apache.hudi.common.util.FSUtils;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.exception.HoodieException;
+import org.apache.hudi.exception.HoodieIOException;
+import org.apache.hudi.hadoop.HoodieInputFormat;
+import org.apache.hudi.hadoop.UseFileSplitsFromInputFormat;
+
+/**
+ * Input Format, that provides a real-time view of data in a Hoodie dataset
+ */
+@UseFileSplitsFromInputFormat
+public class HoodieRealtimeInputFormat extends HoodieInputFormat implements Configurable {
+
+  public static final Log LOG = LogFactory.getLog(HoodieRealtimeInputFormat.class);
+
+  // These positions have to be deterministic across all tables
+  public static final int HOODIE_COMMIT_TIME_COL_POS = 0;
+  public static final int HOODIE_RECORD_KEY_COL_POS = 2;
+  public static final int HOODIE_PARTITION_PATH_COL_POS = 3;
+  // Hive on Spark queries do not work with RT tables. Our theory is that due to
+  // {@link org.apache.hadoop.hive.ql.io.parquet.ProjectionPusher}
+  // not handling empty list correctly, the ParquetRecordReaderWrapper ends up adding the same column ids multiple
+  // times which ultimately breaks the query.
+
+  @Override
+  public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
+
+    Stream<FileSplit> fileSplits = Arrays.stream(super.getSplits(job, numSplits))
+        .map(is -> (FileSplit) is);
+
+    // obtain all unique parent folders for splits
+    Map<Path, List<FileSplit>> partitionsToParquetSplits = fileSplits
+        .collect(Collectors.groupingBy(split -> split.getPath().getParent()));
+    // TODO(vc): Should we handle also non-hoodie splits here?
+    Map<String, HoodieTableMetaClient> metaClientMap = new HashMap<>();
+    Map<Path, HoodieTableMetaClient> partitionsToMetaClient = partitionsToParquetSplits.keySet()
+        .stream().collect(Collectors.toMap(Function.identity(), p -> {
+          // find if we have a metaclient already for this partition.
+          Option<String> matchingBasePath = Option.fromJavaOptional(metaClientMap.keySet().stream()
+              .filter(basePath -> p.toString().startsWith(basePath)).findFirst());
+          if (matchingBasePath.isPresent()) {
+            return metaClientMap.get(matchingBasePath.get());
+          }
+
+          try {
+            HoodieTableMetaClient metaClient = getTableMetaClient(p.getFileSystem(conf), p);
+            metaClientMap.put(metaClient.getBasePath(), metaClient);
+            return metaClient;
+          } catch (IOException e) {
+            throw new HoodieIOException("Error creating hoodie meta client against : " + p, e);
+          }
+        }));
+
+    // for all unique split parents, obtain all delta files based on delta commit timeline,
+    // grouped on file id
+    List<HoodieRealtimeFileSplit> rtSplits = new ArrayList<>();
+    partitionsToParquetSplits.keySet().stream().forEach(partitionPath -> {
+      // for each partition path obtain the data & log file groupings, then map back to inputsplits
+      HoodieTableMetaClient metaClient = partitionsToMetaClient.get(partitionPath);
+      HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient,
+          metaClient.getActiveTimeline());
+      String relPartitionPath = FSUtils
+          .getRelativePartitionPath(new Path(metaClient.getBasePath()), partitionPath);
+
+      try {
+        // Both commit and delta-commits are included - pick the latest completed one
+        Option<HoodieInstant> latestCompletedInstant =
+            metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().lastInstant();
+
+        Stream<FileSlice> latestFileSlices = latestCompletedInstant.map(instant ->
+            fsView.getLatestMergedFileSlicesBeforeOrOn(relPartitionPath, instant.getTimestamp()))
+            .orElse(Stream.empty());
+
+        // subgroup splits again by file id & match with log files.
+        Map<String, List<FileSplit>> groupedInputSplits = partitionsToParquetSplits
+            .get(partitionPath).stream()
+            .collect(Collectors.groupingBy(split -> FSUtils.getFileId(split.getPath().getName())));
+        latestFileSlices.forEach(fileSlice -> {
+          List<FileSplit> dataFileSplits = groupedInputSplits.get(fileSlice.getFileId());
+          dataFileSplits.forEach(split -> {
+            try {
+              List<String> logFilePaths = fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator())
+                  .map(logFile -> logFile.getPath().toString()).collect(Collectors.toList());
+              // Get the maxCommit from the last delta or compaction or commit - when
+              // bootstrapped from COW table
+              String maxCommitTime = metaClient.getActiveTimeline().getTimelineOfActions(
+                  Sets.newHashSet(HoodieTimeline.COMMIT_ACTION, HoodieTimeline.ROLLBACK_ACTION,
+                      HoodieTimeline.DELTA_COMMIT_ACTION)).filterCompletedInstants().lastInstant()
+                  .get().getTimestamp();
+              rtSplits.add(
+                  new HoodieRealtimeFileSplit(split, metaClient.getBasePath(), logFilePaths,
+                      maxCommitTime));
+            } catch (IOException e) {
+              throw new HoodieIOException("Error creating hoodie real time split ", e);
+            }
+          });
+        });
+      } catch (Exception e) {
+        throw new HoodieException("Error obtaining data file/log file grouping: " + partitionPath,
+            e);
+      }
+    });
+    LOG.info("Returning a total splits of " + rtSplits.size());
+    return rtSplits.toArray(new InputSplit[rtSplits.size()]);
+  }
+
+
+  @Override
+  public FileStatus[] listStatus(JobConf job) throws IOException {
+    // Call the HoodieInputFormat::listStatus to obtain all latest parquet files, based on commit
+    // timeline.
+    return super.listStatus(job);
+  }
+
+  /**
+   * Add a field to the existing fields projected
+   */
+  private static Configuration addProjectionField(Configuration conf, String fieldName,
+      int fieldIndex) {
+    String readColNames = conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, "");
+    String readColIds = conf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "");
+
+    String readColNamesPrefix = readColNames + ",";
+    if (readColNames == null || readColNames.isEmpty()) {
+      readColNamesPrefix = "";
+    }
+    String readColIdsPrefix = readColIds + ",";
+    if (readColIds == null || readColIds.isEmpty()) {
+      readColIdsPrefix = "";
+    }
+
+    if (!readColNames.contains(fieldName)) {
+      // If not already in the list - then add it
+      conf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, readColNamesPrefix + fieldName);
+      conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, readColIdsPrefix + fieldIndex);
+      if (LOG.isDebugEnabled()) {
+        LOG.debug(String.format(
+            "Adding extra column " + fieldName + ", to enable log merging cols (%s) ids (%s) ",
+            conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR),
+            conf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR)));
+      }
+    }
+    return conf;
+  }
+
+  private static synchronized Configuration addRequiredProjectionFields(Configuration configuration) {
+    // Need this to do merge records in HoodieRealtimeRecordReader
+    configuration = addProjectionField(configuration, HoodieRecord.RECORD_KEY_METADATA_FIELD,
+        HOODIE_RECORD_KEY_COL_POS);
+    configuration = addProjectionField(configuration, HoodieRecord.COMMIT_TIME_METADATA_FIELD,
+        HOODIE_COMMIT_TIME_COL_POS);
+    configuration = addProjectionField(configuration, HoodieRecord.PARTITION_PATH_METADATA_FIELD,
+        HOODIE_PARTITION_PATH_COL_POS);
+    return configuration;
+  }
+
+  @Override
+  public RecordReader<NullWritable, ArrayWritable> getRecordReader(final InputSplit split,
+      final JobConf job, final Reporter reporter) throws IOException {
+
+    LOG.info("Before adding Hoodie columns, Projections :" + job
+        .get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR) + ", Ids :"
+        + job.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR));
+
+    // Hive (across all versions) fails for queries like select count(`_hoodie_commit_time`) from table;
+    // In this case, the projection fields gets removed. Looking at HiveInputFormat implementation, in some cases
+    // hoodie additional projection columns are reset after calling setConf and only natural projections
+    // (one found in select queries) are set. things would break because of this.
+    // For e:g _hoodie_record_key would be missing and merge step would throw exceptions.
+    // TO fix this, hoodie columns are appended late at the time record-reader gets built instead of construction time.
+    this.conf = addRequiredProjectionFields(job);
+
+    LOG.info("Creating record reader with readCols :" + job
+        .get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR) + ", Ids :"
+        + job.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR));
+    // sanity check
+    Preconditions.checkArgument(split instanceof HoodieRealtimeFileSplit,
+        "HoodieRealtimeRecordReader can only work on HoodieRealtimeFileSplit and not with "
+            + split);
+
+    return new HoodieRealtimeRecordReader((HoodieRealtimeFileSplit) split, job,
+        super.getRecordReader(split, job, reporter));
+  }
+
+  @Override
+  public Configuration getConf() {
+    return conf;
+  }
+}
--- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeRecordReader.java
+++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeRecordReader.java
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.hadoop.realtime;
+
+import java.io.IOException;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.io.ArrayWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.RecordReader;
+import org.apache.hudi.exception.HoodieException;
+
+/**
+ * Realtime Record Reader which can do compacted (merge-on-read) record reading or
+ * unmerged reading (parquet and log files read in parallel) based on job configuration.
+ */
+public class HoodieRealtimeRecordReader implements RecordReader<NullWritable, ArrayWritable> {
+
+  // Property to enable parallel reading of parquet and log files without merging.
+  public static final String REALTIME_SKIP_MERGE_PROP = "hoodie.realtime.merge.skip";
+  // By default, we do merged-reading
+  public static final String DEFAULT_REALTIME_SKIP_MERGE = "false";
+  public static final Log LOG = LogFactory.getLog(HoodieRealtimeRecordReader.class);
+  private final RecordReader<NullWritable, ArrayWritable> reader;
+
+  public HoodieRealtimeRecordReader(HoodieRealtimeFileSplit split, JobConf job,
+      RecordReader<NullWritable, ArrayWritable> realReader) {
+    this.reader = constructRecordReader(split, job, realReader);
+  }
+
+  public static boolean canSkipMerging(JobConf jobConf) {
+    return Boolean.valueOf(jobConf.get(REALTIME_SKIP_MERGE_PROP, DEFAULT_REALTIME_SKIP_MERGE));
+  }
+
+  /**
+   * Construct record reader based on job configuration
+   *
+   * @param split      File Split
+   * @param jobConf    Job Configuration
+   * @param realReader Parquet Record Reader
+   * @return Realtime Reader
+   */
+  private static RecordReader<NullWritable, ArrayWritable> constructRecordReader(HoodieRealtimeFileSplit split,
+      JobConf jobConf, RecordReader<NullWritable, ArrayWritable> realReader) {
+    try {
+      if (canSkipMerging(jobConf)) {
+        LOG.info("Enabling un-merged reading of realtime records");
+        return new RealtimeUnmergedRecordReader(split, jobConf, realReader);
+      }
+      return new RealtimeCompactedRecordReader(split, jobConf, realReader);
+    } catch (IOException ex) {
+      LOG.error("Got exception when constructing record reader", ex);
+      throw new HoodieException(ex);
+    }
+  }
+
+  @Override
+  public boolean next(NullWritable key, ArrayWritable value) throws IOException {
+    return this.reader.next(key, value);
+  }
+
+  @Override
+  public NullWritable createKey() {
+    return this.reader.createKey();
+  }
+
+  @Override
+  public ArrayWritable createValue() {
+    return this.reader.createValue();
+  }
+
+  @Override
+  public long getPos() throws IOException {
+    return this.reader.getPos();
+  }
+
+  @Override
+  public void close() throws IOException {
+    this.reader.close();
+  }
+
+  @Override
+  public float getProgress() throws IOException {
+    return this.reader.getProgress();
+  }
+}
--- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeCompactedRecordReader.java
+++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeCompactedRecordReader.java
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.hadoop.realtime;
+
+import java.io.IOException;
+import java.util.Map;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.hadoop.io.ArrayWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.RecordReader;
+import org.apache.hudi.common.model.HoodieRecord;
+import org.apache.hudi.common.model.HoodieRecordPayload;
+import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner;
+import org.apache.hudi.common.util.FSUtils;
+import org.apache.hudi.common.util.HoodieAvroUtils;
+import org.apache.hudi.common.util.Option;
+
+class RealtimeCompactedRecordReader extends AbstractRealtimeRecordReader implements
+    RecordReader<NullWritable, ArrayWritable> {
+
+  protected final RecordReader<NullWritable, ArrayWritable> parquetReader;
+  private final Map<String, HoodieRecord<? extends HoodieRecordPayload>> deltaRecordMap;
+
+  public RealtimeCompactedRecordReader(HoodieRealtimeFileSplit split, JobConf job,
+      RecordReader<NullWritable, ArrayWritable> realReader) throws IOException {
+    super(split, job);
+    this.parquetReader = realReader;
+    this.deltaRecordMap = getMergedLogRecordScanner().getRecords();
+  }
+
+  /**
+   * Goes through the log files and populates a map with latest version of each key logged, since
+   * the base split was written.
+   */
+  private HoodieMergedLogRecordScanner getMergedLogRecordScanner() throws IOException {
+    // NOTE: HoodieCompactedLogRecordScanner will not return records for an in-flight commit
+    // but can return records for completed commits > the commit we are trying to read (if using
+    // readCommit() API)
+    return new HoodieMergedLogRecordScanner(
+        FSUtils.getFs(split.getPath().toString(), jobConf), split.getBasePath(),
+        split.getDeltaFilePaths(), usesCustomPayload ? getWriterSchema() : getReaderSchema(), split.getMaxCommitTime(),
+        getMaxCompactionMemoryInBytes(),
+        Boolean.valueOf(jobConf.get(COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP,
+            DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED)),
+        false, jobConf.getInt(MAX_DFS_STREAM_BUFFER_SIZE_PROP, DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE),
+        jobConf.get(SPILLABLE_MAP_BASE_PATH_PROP, DEFAULT_SPILLABLE_MAP_BASE_PATH));
+  }
+
+  @Override
+  public boolean next(NullWritable aVoid, ArrayWritable arrayWritable) throws IOException {
+    // Call the underlying parquetReader.next - which may replace the passed in ArrayWritable
+    // with a new block of values
+    boolean result = this.parquetReader.next(aVoid, arrayWritable);
+    if (!result) {
+      // if the result is false, then there are no more records
+      return false;
+    } else {
+      // TODO(VC): Right now, we assume all records in log, have a matching base record. (which
+      // would be true until we have a way to index logs too)
+      // return from delta records map if we have some match.
+      String key = arrayWritable.get()[HoodieRealtimeInputFormat.HOODIE_RECORD_KEY_COL_POS]
+          .toString();
+      if (deltaRecordMap.containsKey(key)) {
+        // TODO(NA): Invoke preCombine here by converting arrayWritable to Avro. This is required since the
+        // deltaRecord may not be a full record and needs values of columns from the parquet
+        Option<GenericRecord> rec;
+        if (usesCustomPayload) {
+          rec = deltaRecordMap.get(key).getData().getInsertValue(getWriterSchema());
+        } else {
+          rec = deltaRecordMap.get(key).getData().getInsertValue(getReaderSchema());
+        }
+        if (!rec.isPresent()) {
+          // If the record is not present, this is a delete record using an empty payload so skip this base record
+          // and move to the next record
+          return next(aVoid, arrayWritable);
+        }
+        GenericRecord recordToReturn = rec.get();
+        if (usesCustomPayload) {
+          // If using a custom payload, return only the projection fields
+          recordToReturn = HoodieAvroUtils.rewriteRecordWithOnlyNewSchemaFields(rec.get(), getReaderSchema());
+        }
+        // we assume, a later safe record in the log, is newer than what we have in the map &
+        // replace it.
+        ArrayWritable aWritable = (ArrayWritable) avroToArrayWritable(recordToReturn, getWriterSchema());
+        Writable[] replaceValue = aWritable.get();
+        if (LOG.isDebugEnabled()) {
+          LOG.debug(String.format("key %s, base values: %s, log values: %s", key,
+              arrayWritableToString(arrayWritable), arrayWritableToString(aWritable)));
+        }
+        Writable[] originalValue = arrayWritable.get();
+        try {
+          System.arraycopy(replaceValue, 0, originalValue, 0, originalValue.length);
+          arrayWritable.set(originalValue);
+        } catch (RuntimeException re) {
+          LOG.error("Got exception when doing array copy", re);
+          LOG.error("Base record :" + arrayWritableToString(arrayWritable));
+          LOG.error("Log record :" + arrayWritableToString(aWritable));
+          throw re;
+        }
+      }
+      return true;
+    }
+  }
+
+  @Override
+  public NullWritable createKey() {
+    return parquetReader.createKey();
+  }
+
+  @Override
+  public ArrayWritable createValue() {
+    return parquetReader.createValue();
+  }
+
+  @Override
+  public long getPos() throws IOException {
+    return parquetReader.getPos();
+  }
+
+  @Override
+  public void close() throws IOException {
+    parquetReader.close();
+  }
+
+  @Override
+  public float getProgress() throws IOException {
+    return parquetReader.getProgress();
+  }
+}
--- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeUnmergedRecordReader.java
+++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeUnmergedRecordReader.java
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.hadoop.realtime;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.hadoop.io.ArrayWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.RecordReader;
+import org.apache.hudi.common.table.log.HoodieUnMergedLogRecordScanner;
+import org.apache.hudi.common.util.DefaultSizeEstimator;
+import org.apache.hudi.common.util.FSUtils;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor;
+import org.apache.hudi.common.util.queue.BoundedInMemoryQueueProducer;
+import org.apache.hudi.common.util.queue.FunctionBasedQueueProducer;
+import org.apache.hudi.common.util.queue.IteratorBasedQueueProducer;
+import org.apache.hudi.hadoop.RecordReaderValueIterator;
+import org.apache.hudi.hadoop.SafeParquetRecordReaderWrapper;
+
+class RealtimeUnmergedRecordReader extends AbstractRealtimeRecordReader implements
+    RecordReader<NullWritable, ArrayWritable> {
+
+  // Log Record unmerged scanner
+  private final HoodieUnMergedLogRecordScanner logRecordScanner;
+
+  // Parquet record reader
+  private final RecordReader<NullWritable, ArrayWritable> parquetReader;
+
+  // Parquet record iterator wrapper for the above reader
+  private final RecordReaderValueIterator<NullWritable, ArrayWritable> parquetRecordsIterator;
+
+  // Executor that runs the above producers in parallel
+  private final BoundedInMemoryExecutor<ArrayWritable, ArrayWritable, ?> executor;
+
+  // Iterator for the buffer consumer
+  private final Iterator<ArrayWritable> iterator;
+
+  /**
+   * Construct a Unmerged record reader that parallely consumes both parquet and log records and buffers for upstream
+   * clients to consume
+   *
+   * @param split      File split
+   * @param job        Job Configuration
+   * @param realReader Parquet Reader
+   */
+  public RealtimeUnmergedRecordReader(HoodieRealtimeFileSplit split, JobConf job,
+      RecordReader<NullWritable, ArrayWritable> realReader) {
+    super(split, job);
+    this.parquetReader = new SafeParquetRecordReaderWrapper(realReader);
+    // Iterator for consuming records from parquet file
+    this.parquetRecordsIterator = new RecordReaderValueIterator<>(this.parquetReader);
+    this.executor = new BoundedInMemoryExecutor<>(getMaxCompactionMemoryInBytes(), getParallelProducers(),
+        Option.empty(), x -> x, new DefaultSizeEstimator<>());
+    // Consumer of this record reader
+    this.iterator = this.executor.getQueue().iterator();
+    this.logRecordScanner = new HoodieUnMergedLogRecordScanner(
+        FSUtils.getFs(split.getPath().toString(), jobConf), split.getBasePath(),
+        split.getDeltaFilePaths(), getReaderSchema(), split.getMaxCommitTime(), Boolean.valueOf(jobConf
+        .get(COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP, DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED)),
+        false, jobConf.getInt(MAX_DFS_STREAM_BUFFER_SIZE_PROP, DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE),
+        record -> {
+          // convert Hoodie log record to Hadoop AvroWritable and buffer
+          GenericRecord rec = (GenericRecord) record.getData().getInsertValue(getReaderSchema()).get();
+          ArrayWritable aWritable = (ArrayWritable) avroToArrayWritable(rec, getWriterSchema());
+          this.executor.getQueue().insertRecord(aWritable);
+        });
+    // Start reading and buffering
+    this.executor.startProducers();
+  }
+
+  /**
+   * Setup log and parquet reading in parallel. Both write to central buffer.
+   */
+  @SuppressWarnings("unchecked")
+  private List<BoundedInMemoryQueueProducer<ArrayWritable>> getParallelProducers() {
+    List<BoundedInMemoryQueueProducer<ArrayWritable>> producers = new ArrayList<>();
+    producers.add(new FunctionBasedQueueProducer<>(buffer -> {
+      logRecordScanner.scan();
+      return null;
+    }));
+    producers.add(new IteratorBasedQueueProducer<>(parquetRecordsIterator));
+    return producers;
+  }
+
+  @Override
+  public boolean next(NullWritable key, ArrayWritable value) throws IOException {
+    if (!iterator.hasNext()) {
+      return false;
+    }
+    // Copy from buffer iterator and set to passed writable
+    value.set(iterator.next().get());
+    return true;
+  }
+
+  @Override
+  public NullWritable createKey() {
+    return parquetReader.createKey();
+  }
+
+  @Override
+  public ArrayWritable createValue() {
+    return parquetReader.createValue();
+  }
+
+  @Override
+  public long getPos() throws IOException {
+    //TODO: vb - No logical way to represent parallel stream pos in a single long.
+    // Should we just return invalid (-1). Where is it used ?
+    return 0;
+  }
+
+  @Override
+  public void close() throws IOException {
+    this.parquetRecordsIterator.close();
+    this.executor.shutdownNow();
+  }
+
+  @Override
+  public float getProgress() throws IOException {
+    return Math.min(parquetReader.getProgress(), logRecordScanner.getProgress());
+  }
+}